diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/functional.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/functional.h new file mode 100644 index 0000000000000000000000000000000000000000..675f6dc663bd80f4518ffc301dd10694f5a79f86 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/functional.h @@ -0,0 +1,9 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include +#include + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/functional_base.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/functional_base.h new file mode 100644 index 0000000000000000000000000000000000000000..b81c80ac1efbf8ea2d24e9c0d524e12c75a3e061 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/functional_base.h @@ -0,0 +1,480 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// DO NOT DEFINE STATIC DATA IN THIS HEADER! +// See Note [Do not compile initializers with AVX] + +#include +#include + +namespace at { +namespace detail { +// We prefer to convert through float for reduced-precision floating +// point types if we have a Vectorized specialization for float and we +// don't have one for the actual type in question. 
+template +struct should_prefer_converting_through_float + : std::bool_constant< + is_reduced_floating_point_v && + vec::is_vec_specialized_for_v && + !vec::is_vec_specialized_for_v> {}; + +template +constexpr auto should_prefer_converting_through_float_v = + should_prefer_converting_through_float::value; +} // namespace detail + +namespace vec { +// slow path +template +inline scalar_t vec_reduce_all( + const Op& vec_fun, + vec::Vectorized acc_vec, + int64_t size) { + using Vec = vec::Vectorized; + scalar_t acc_arr[Vec::size()]; + acc_vec.store(acc_arr); + for (const auto i : c10::irange(1, size)) { + std::array acc_arr_next = {0}; + acc_arr_next[0] = acc_arr[i]; + Vec acc_vec_next = Vec::loadu(acc_arr_next.data()); + acc_vec = vec_fun(acc_vec, acc_vec_next); + } + acc_vec.store(acc_arr); + return acc_arr[0]; +} + +template +struct VecReduceAllSIMD { + static inline scalar_t apply( + const Op& vec_fun, + const Vectorized& acc_vec) { + return vec_reduce_all(vec_fun, acc_vec, Vectorized::size()); + } +}; + +#if defined(__GNUC__) && (__GNUC__ > 5) && !defined(_MSC_VER) && \ + !defined(C10_MOBILE) +#if defined(CPU_CAPABILITY_AVX2) +template +struct VecReduceAllSIMD { + static inline float apply( + const Op& vec_fun, + const Vectorized& acc_vec) { + using Vec = Vectorized; + Vec v = acc_vec; + // 128-bit shuffle + Vec v1 = _mm256_permute2f128_ps(v, v, 0x1); + v = vec_fun(v, v1); + // 64-bit shuffle + v1 = _mm256_shuffle_ps(v, v, 0x4E); + v = vec_fun(v, v1); + // 32-bit shuffle + v1 = _mm256_shuffle_ps(v, v, 0xB1); + v = vec_fun(v, v1); + return _mm256_cvtss_f32(v); + } +}; +#endif // defined(CPU_CAPABILITY_AVX2) +#if defined(CPU_CAPABILITY_AVX512) +template +struct VecReduceAllSIMD { + static inline float apply( + const Op& vec_fun, + const Vectorized& acc_vec) { + using Vec = Vectorized; + Vec v = acc_vec; + // 256-bit shuffle + Vec v1 = _mm512_shuffle_f32x4(v, v, 0x4E); + v = vec_fun(v, v1); + // 128-bit shuffle + v1 = _mm512_shuffle_f32x4(v, v, 0xB1); + v = 
vec_fun(v, v1); + // 64-bit shuffle + v1 = _mm512_shuffle_ps(v, v, 0x4E); + v = vec_fun(v, v1); + // 32-bit shuffle + v1 = _mm512_shuffle_ps(v, v, 0xB1); + v = vec_fun(v, v1); + return _mm512_cvtss_f32(v); + } +}; +#endif // defined(CPU_CAPABILITY_AVX512) +#endif // defined(__GNUC__) && (__GNUC__ > 5) && !defined(_MSC_VER) && + // !defined(C10_MOBILE) + +#if defined(__aarch64__) && !defined(C10_MOBILE) && !defined(__CUDACC__) && \ + !defined(CPU_CAPABILITY_SVE) +template +struct VecReduceAllSIMD { + static inline float apply( + const Op& vec_fun, + const Vectorized& acc_vec) { + using Vec = Vectorized; + Vec v = acc_vec; + + // 64-bit shuffle: [a1+a5, a2+a6, a3+a7, a4+a8, -, -, -, -] -> [a3+a7, + // a4+a8, a1+a5, a2+a6, -, -, -, -] + float32x4_t v1_1 = vextq_f32(v, v, 2); + Vec v1 = v1_1; + // [a1+a3+a5+a7, a2+a4+a6+a8, a1+a3+a5+a7, a2+a4+a6+a8, -, -, -, -] + v = vec_fun(v, v1); + + // 32-bit shuffle: [a1+a3+a5+a7, a2+a4+a6+a8, a1+a3+a5+a7, a2+a4+a6+a8, -, + // -, -, -] -> [a2+a4+a6+a8, a1+a3+a5+a7, a2+a4+a6+a8, a1+a3+a5+a7, -, -, -, + // -] + v1_1 = vrev64q_f32(v); + v1 = v1_1; + // [a1+a2+a3+a4+a5+a6+a7+a8, a1+a2+a3+a4+a5+a6+a7+a8, + // a1+a2+a3+a4+a5+a6+a7+a8, a1+a2+a3+a4+a5+a6+a7+a8, -, -, -, -] + v = vec_fun(v, v1); + + return v[0]; + } +}; + +template <> +struct VecReduceAllSIMD>> { + static inline float apply( + const std::plus>& vec_fun, + const Vectorized& acc_vec) { + return vaddvq_f32(acc_vec); + } +}; +#endif // defined(__aarch64__) && !defined(C10_MOBILE) && !defined(__CUDACC__) + // && !defined(CPU_CAPABILITY_SVE) + +#if defined(__aarch64__) && !defined(C10_MOBILE) && !defined(__CUDACC__) && \ + defined(CPU_CAPABILITY_SVE256) +template +struct VecReduceAllSIMD { + static inline float apply( + const Op& vec_fun, + const Vectorized& acc_vec) { + using Vec = Vectorized; + Vec v = acc_vec; + // 128-bit shuffle + svuint32_t ind = svdupq_n_u32(4, 5, 6, 7); + Vec v1 = svtbl_f32(v, ind); + v = vec_fun(v, v1); + // 64-bit shuffle + ind = svdupq_n_u32(2, 3, 0, 
1); + v1 = svtbl_f32(v, ind); + v = vec_fun(v, v1); + // 32-bit shuffle + ind = svdupq_n_u32(1, 0, 2, 3); + v1 = svtbl_f32(v, ind); + v = vec_fun(v, v1); + return svlasta(svpfalse(), v); + } +}; +#endif // defined(__aarch64__) && !defined(C10_MOBILE) && !defined(__CUDACC__) + // && defined(CPU_CAPABILITY_SVE256) + +template +inline scalar_t vec_reduce_all( + const Op& vec_fun, + const Vectorized& acc_vec) { + return VecReduceAllSIMD::apply(vec_fun, acc_vec); +} + +template < + typename scalar_t, + typename Op, + typename std::enable_if_t, int> = 0> +inline scalar_t reduce_all( + const Op& vec_fun, + const scalar_t* data, + int64_t size) { + using Vec = vec::Vectorized; + if (size < Vec::size()) + return vec_reduce_all(vec_fun, Vec::loadu(data, size), size); + int64_t d = Vec::size(); + Vec acc_vec = Vec::loadu(data); + for (; d < size - (size % Vec::size()); d += Vec::size()) { + Vec data_vec = Vec::loadu(data + d); + acc_vec = vec_fun(acc_vec, data_vec); + } + if (size - d > 0) { + Vec data_vec = Vec::loadu(data + d, size - d); + acc_vec = Vec::set(acc_vec, vec_fun(acc_vec, data_vec), size - d); + } + return vec_reduce_all(vec_fun, acc_vec); +} + +// similar to reduce_all, but reduces into two outputs +template < + typename scalar_t, + typename Op1, + typename Op2, + typename std::enable_if_t, int> = 0> +inline std::pair reduce2_all( + const Op1& vec_fun1, + const Op2& vec_fun2, + const scalar_t* data, + int64_t size) { + using Vec = vec::Vectorized; + if (size < Vec::size()) { + auto loaded_data = Vec::loadu(data, size); + return std::pair( + vec_reduce_all(vec_fun1, loaded_data, size), + vec_reduce_all(vec_fun2, loaded_data, size)); + } + int64_t d = Vec::size(); + Vec acc_vec1 = Vec::loadu(data); + Vec acc_vec2 = Vec::loadu(data); + for (; d < size - (size % Vec::size()); d += Vec::size()) { + Vec data_vec = Vec::loadu(data + d); + acc_vec1 = vec_fun1(acc_vec1, data_vec); + acc_vec2 = vec_fun2(acc_vec2, data_vec); + } + if (size - d > 0) { + Vec data_vec = 
Vec::loadu(data + d, size - d); + acc_vec1 = Vec::set(acc_vec1, vec_fun1(acc_vec1, data_vec), size - d); + acc_vec2 = Vec::set(acc_vec2, vec_fun2(acc_vec2, data_vec), size - d); + } + return std::pair( + vec_reduce_all(vec_fun1, acc_vec1), vec_reduce_all(vec_fun2, acc_vec2)); +} + +template < + typename scalar_t, + typename MapOp, + typename ReduceOp, + typename std::enable_if_t, int> = 0> +inline scalar_t map_reduce_all( + const MapOp& map_fun, + const ReduceOp& red_fun, + const scalar_t* data, + int64_t size) { + using Vec = vec::Vectorized; + if (size < Vec::size()) + return vec_reduce_all(red_fun, map_fun(Vec::loadu(data, size)), size); + int64_t d = Vec::size(); + Vec acc_vec = map_fun(Vec::loadu(data)); + for (; d < size - (size % Vec::size()); d += Vec::size()) { + Vec data_vec = Vec::loadu(data + d); + data_vec = map_fun(data_vec); + acc_vec = red_fun(acc_vec, data_vec); + } + if (size - d > 0) { + Vec data_vec = Vec::loadu(data + d, size - d); + data_vec = map_fun(data_vec); + acc_vec = Vec::set(acc_vec, red_fun(acc_vec, data_vec), size - d); + } + return vec_reduce_all(red_fun, acc_vec); +} + +template < + typename scalar_t, + typename MapOp, + typename ReduceOp, + typename std::enable_if_t, int> = 0> +inline scalar_t map2_reduce_all( + const MapOp& map_fun, + const ReduceOp& red_fun, + const scalar_t* data, + const scalar_t* data2, + int64_t size) { + using Vec = vec::Vectorized; + if (size < Vec::size()) { + Vec data_vec = Vec::loadu(data, size); + Vec data2_vec = Vec::loadu(data2, size); + data_vec = map_fun(data_vec, data2_vec); + return vec_reduce_all(red_fun, data_vec, size); + } + int64_t d = Vec::size(); + Vec acc_vec = map_fun(Vec::loadu(data), Vec::loadu(data2)); + for (; d < size - (size % Vec::size()); d += Vec::size()) { + Vec data_vec = Vec::loadu(data + d); + Vec data2_vec = Vec::loadu(data2 + d); + data_vec = map_fun(data_vec, data2_vec); + acc_vec = red_fun(acc_vec, data_vec); + } + if (size - d > 0) { + Vec data_vec = Vec::loadu(data + 
d, size - d); + Vec data2_vec = Vec::loadu(data2 + d, size - d); + data_vec = map_fun(data_vec, data2_vec); + acc_vec = Vec::set(acc_vec, red_fun(acc_vec, data_vec), size - d); + } + return vec_reduce_all(red_fun, acc_vec); +} + +template < + typename scalar_t, + typename MapOp, + typename ReduceOp, + typename std::enable_if_t, int> = 0> +inline scalar_t map3_reduce_all( + const MapOp& map_fun, + const ReduceOp& red_fun, + const scalar_t* data, + const scalar_t* data2, + const scalar_t* data3, + int64_t size) { + using Vec = vec::Vectorized; + if (size < Vec::size()) { + Vec data_vec = Vec::loadu(data, size); + Vec data2_vec = Vec::loadu(data2, size); + Vec data3_vec = Vec::loadu(data3, size); + data_vec = map_fun(data_vec, data2_vec, data3_vec); + return vec_reduce_all(red_fun, data_vec, size); + } + + int64_t d = Vec::size(); + Vec acc_vec = map_fun(Vec::loadu(data), Vec::loadu(data2), Vec::loadu(data3)); + for (; d < size - (size % Vec::size()); d += Vec::size()) { + Vec data_vec = Vec::loadu(data + d); + Vec data2_vec = Vec::loadu(data2 + d); + Vec data3_vec = Vec::loadu(data3 + d); + data_vec = map_fun(data_vec, data2_vec, data3_vec); + acc_vec = red_fun(acc_vec, data_vec); + } + if (size - d > 0) { + Vec data_vec = Vec::loadu(data + d, size - d); + Vec data2_vec = Vec::loadu(data2 + d, size - d); + Vec data3_vec = Vec::loadu(data3 + d, size - d); + data_vec = map_fun(data_vec, data2_vec, data3_vec); + acc_vec = Vec::set(acc_vec, red_fun(acc_vec, data_vec), size - d); + } + return vec_reduce_all(red_fun, acc_vec); +} + +template < + typename scalar_t, + typename Op, + typename std::enable_if_t< + !detail::should_prefer_converting_through_float_v && + std::is_invocable_v>, + int> = 0> +inline void map( + const Op& vec_fun, + scalar_t* output_data, + const scalar_t* input_data, + int64_t size) { + using Vec = vec::Vectorized; + int64_t d = 0; + for (; d < size - (size % Vec::size()); d += Vec::size()) { + Vec output_vec = vec_fun(Vec::loadu(input_data + d)); + 
output_vec.store(output_data + d); + } + if (size - d > 0) { + Vec output_vec = vec_fun(Vec::loadu(input_data + d, size - d)); + output_vec.store(output_data + d, size - d); + } +} + +template < + typename scalar_t, + typename Op, + typename std::enable_if_t< + !detail::should_prefer_converting_through_float_v && + std::is_invocable_v< + Op, + vec::Vectorized, + vec::Vectorized>, + int> = 0> +inline void map2( + const Op& vec_fun, + scalar_t* output_data, + const scalar_t* input_data, + const scalar_t* input_data2, + int64_t size) { + using Vec = vec::Vectorized; + int64_t d = 0; + for (; d < size - (size % Vec::size()); d += Vec::size()) { + Vec data_vec = Vec::loadu(input_data + d); + Vec data_vec2 = Vec::loadu(input_data2 + d); + Vec output_vec = vec_fun(data_vec, data_vec2); + output_vec.store(output_data + d); + } + if (size - d > 0) { + Vec data_vec = Vec::loadu(input_data + d, size - d); + Vec data_vec2 = Vec::loadu(input_data2 + d, size - d); + Vec output_vec = vec_fun(data_vec, data_vec2); + output_vec.store(output_data + d, size - d); + } +} + +template < + typename scalar_t, + typename Op, + typename std::enable_if_t< + !detail::should_prefer_converting_through_float_v && + std::is_invocable_v< + Op, + vec::Vectorized, + vec::Vectorized, + vec::Vectorized>, + int> = 0> +inline void map3( + const Op& vec_fun, + scalar_t* output_data, + const scalar_t* input_data1, + const scalar_t* input_data2, + const scalar_t* input_data3, + int64_t size) { + using Vec = vec::Vectorized; + int64_t d = 0; + for (; d < size - (size % Vec::size()); d += Vec::size()) { + Vec data_vec1 = Vec::loadu(input_data1 + d); + Vec data_vec2 = Vec::loadu(input_data2 + d); + Vec data_vec3 = Vec::loadu(input_data3 + d); + Vec output_vec = vec_fun(data_vec1, data_vec2, data_vec3); + output_vec.store(output_data + d); + } + if (size - d > 0) { + Vec data_vec1 = Vec::loadu(input_data1 + d, size - d); + Vec data_vec2 = Vec::loadu(input_data2 + d, size - d); + Vec data_vec3 = 
Vec::loadu(input_data3 + d, size - d); + Vec output_vec = vec_fun(data_vec1, data_vec2, data_vec3); + output_vec.store(output_data + d, size - d); + } +} + +template < + typename scalar_t, + typename Op, + typename std::enable_if_t< + !detail::should_prefer_converting_through_float_v && + std::is_invocable_v< + Op, + vec::Vectorized, + vec::Vectorized, + vec::Vectorized, + vec::Vectorized>, + int> = 0> +inline void map4( + const Op& vec_fun, + scalar_t* output_data, + const scalar_t* input_data1, + const scalar_t* input_data2, + const scalar_t* input_data3, + const scalar_t* input_data4, + int64_t size) { + using Vec = vec::Vectorized; + int64_t d = 0; + for (; d < size - (size % Vec::size()); d += Vec::size()) { + Vec data_vec1 = Vec::loadu(input_data1 + d); + Vec data_vec2 = Vec::loadu(input_data2 + d); + Vec data_vec3 = Vec::loadu(input_data3 + d); + Vec data_vec4 = Vec::loadu(input_data4 + d); + Vec output_vec = vec_fun(data_vec1, data_vec2, data_vec3, data_vec4); + output_vec.store(output_data + d); + } + if (size - d > 0) { + Vec data_vec1 = Vec::loadu(input_data1 + d, size - d); + Vec data_vec2 = Vec::loadu(input_data2 + d, size - d); + Vec data_vec3 = Vec::loadu(input_data3 + d, size - d); + Vec data_vec4 = Vec::loadu(input_data4 + d, size - d); + Vec output_vec = vec_fun(data_vec1, data_vec2, data_vec3, data_vec4); + output_vec.store(output_data + d, size - d); + } +} + +} // namespace vec +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/functional_bfloat16.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/functional_bfloat16.h new file mode 100644 index 0000000000000000000000000000000000000000..ad7daa651fd0c2a685cd52c5ef03b3994ffe1554 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/functional_bfloat16.h @@ -0,0 +1,652 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// DO NOT DEFINE STATIC DATA IN THIS HEADER! +// See Note [Do not compile initializers with AVX] + +#include + +namespace at::vec { +// BFloat16 specification +template +struct VecScalarType { + using type = scalar_t; +}; +template <> +struct VecScalarType { + using type = float; +}; +template <> +struct VecScalarType { + using type = float; +}; + +// This is different from at::acc_type since we only need to specialize BFloat16 +template +using vec_scalar_t = typename VecScalarType::type; + +// Vector conversion between float and bfloat16/half +template <> +inline std::tuple, Vectorized> convert_to_float< + BFloat16>(const Vectorized& a) { + return convert_bfloat16_float(a); +} + +template <> +inline std::tuple, Vectorized> convert_to_float( + const Vectorized& a) { + return convert_half_float(a); +} + +template <> +inline Vectorized convert_from_float( + const Vectorized& a, + const Vectorized& b) { + return convert_float_bfloat16(a, b); +} + +template <> +inline Vectorized convert_from_float( + const Vectorized& a, + const Vectorized& b) { + return convert_float_half(a, b); +} + +template < + typename scalar_t, + typename std::enable_if_t, int> = 0> +inline void load_to_float( + const scalar_t* data, + Vectorized& out1, + Vectorized& out2); + +template <> +inline void load_to_float( + const BFloat16* data, + Vectorized& out1, + 
Vectorized& out2) { + load_fp32_from_bf16(data, out1, out2); +} + +template <> +inline void load_to_float( + const Half* data, + Vectorized& out1, + Vectorized& out2) { + load_fp32_from_fp16(data, out1, out2); +} + +template < + typename scalar_t, + typename std::enable_if_t, int> = 0> +inline void load_to_float(const scalar_t* data, Vectorized& out); + +template <> +inline void load_to_float( + const BFloat16* data, + Vectorized& out) { + load_fp32_from_bf16(data, out); +} + +template <> +inline void load_to_float(const Half* data, Vectorized& out) { + load_fp32_from_fp16(data, out); +} + +// Note that we already have specialized member of Vectorized for +// BFloat16 so the following functions would run smoothly: +// using Vec = Vectorized; +// Vec one = Vec(BFloat16(1)); +// vec::map([](Vec x) { return one / (one + x.exp()); }, y_ptr, x_ptr, N); +// +// Then why we still need to specialize "functional"? +// If we do specialization at Vectorized<> level, the above example would need +// 3 pairs of conversion of bf16->fp32/fp32->bf16, each for ".exp()", "+" and +// "/". If we do specialization at vec::map<>() level, we have only 1 pair of +// conversion of bf16->fp32/fp32->bf16, for the input and output BFloat16 +// vector only. +// +// The following BFloat16 functionality will only do data type conversion for +// input and output vector (reduce functionality will only convert the final +// scalar back to bf16). Compared to Vectorized<> specialization, +// 1. better performance since we have less data type conversion; +// 2. less rounding error since immediate results are kept in fp32; +// 3. accumulation done on data type of fp32. 
+// +// If you plan to extend this file, please ensure adding unit tests at +// aten/src/ATen/test/vec_test_all_types.cpp +// +template < + typename scalar_t, + typename Op, + typename std::enable_if_t, int> = 0> +inline float reduce_all(const Op& vec_fun, const scalar_t* data, int64_t size) { + using bVec = vec::Vectorized; + using fVec = vec::Vectorized; + if (size < bVec::size()) { + bVec data_bvec = bVec::loadu(data, size); + auto [data_fvec0, data_fvec1] = convert_to_float(data_bvec); + if (size > fVec::size()) { + data_fvec0 = fVec::set( + data_fvec0, vec_fun(data_fvec0, data_fvec1), size - fVec::size()); + return vec_reduce_all(vec_fun, data_fvec0, fVec::size()); + } else { + return vec_reduce_all(vec_fun, data_fvec0, size); + } + } + int64_t d = bVec::size(); + bVec acc_bvec = bVec::loadu(data); + auto [acc_fvec0, acc_fvec1] = convert_to_float(acc_bvec); + for (; d < size - (size % bVec::size()); d += bVec::size()) { + bVec data_bvec = bVec::loadu(data + d); + auto [data_fvec0, data_fvec1] = convert_to_float(data_bvec); + acc_fvec0 = vec_fun(acc_fvec0, data_fvec0); + acc_fvec1 = vec_fun(acc_fvec1, data_fvec1); + } + if (size - d > 0) { + bVec data_bvec = bVec::loadu(data + d, size - d); + auto [data_fvec0, data_fvec1] = convert_to_float(data_bvec); + if (size - d > fVec::size()) { + acc_fvec0 = vec_fun(acc_fvec0, data_fvec0); + acc_fvec1 = fVec::set( + acc_fvec1, vec_fun(acc_fvec1, data_fvec1), size - d - fVec::size()); + } else { + acc_fvec0 = + fVec::set(acc_fvec0, vec_fun(acc_fvec0, data_fvec0), size - d); + } + } + acc_fvec0 = vec_fun(acc_fvec0, acc_fvec1); + return vec_reduce_all(vec_fun, acc_fvec0); +} + +template < + typename scalar_t, + typename Op1, + typename Op2, + typename std::enable_if_t, int> = 0> +inline std::pair reduce2_all( + const Op1& vec_fun1, + const Op2& vec_fun2, + const scalar_t* data, + int64_t size) { + using bVec = vec::Vectorized; + using fVec = vec::Vectorized; + if (size < bVec::size()) { + bVec data_bvec = bVec::loadu(data, 
size); + auto [data_fvec0, data_fvec1] = convert_to_float(data_bvec); + if (size > fVec::size()) { + fVec acc1_fvec = fVec::set( + data_fvec0, vec_fun1(data_fvec0, data_fvec1), size - fVec::size()); + fVec acc2_fvec = fVec::set( + data_fvec0, vec_fun2(data_fvec0, data_fvec1), size - fVec::size()); + return std::pair( + vec_reduce_all(vec_fun1, acc1_fvec, fVec::size()), + vec_reduce_all(vec_fun2, acc2_fvec, fVec::size())); + } else { + return std::pair( + vec_reduce_all(vec_fun1, data_fvec0, size), + vec_reduce_all(vec_fun2, data_fvec0, size)); + } + } + int64_t d = bVec::size(); + bVec acc_bvec = bVec::loadu(data); + auto [acc1_fvec0, acc1_fvec1] = convert_to_float(acc_bvec); + auto [acc2_fvec0, acc2_fvec1] = convert_to_float(acc_bvec); + for (; d < size - (size % bVec::size()); d += bVec::size()) { + bVec data_bvec = bVec::loadu(data + d); + auto [data_fvec0, data_fvec1] = convert_to_float(data_bvec); + acc1_fvec0 = vec_fun1(acc1_fvec0, data_fvec0); + acc1_fvec1 = vec_fun1(acc1_fvec1, data_fvec1); + acc2_fvec0 = vec_fun2(acc2_fvec0, data_fvec0); + acc2_fvec1 = vec_fun2(acc2_fvec1, data_fvec1); + } + if (size - d > 0) { + bVec data_bvec = bVec::loadu(data + d, size - d); + auto [data_fvec0, data_fvec1] = convert_to_float(data_bvec); + if (size - d > fVec::size()) { + acc1_fvec0 = vec_fun1(acc1_fvec0, data_fvec0); + acc1_fvec1 = fVec::set( + acc1_fvec1, + vec_fun1(acc1_fvec1, data_fvec1), + size - d - fVec::size()); + acc2_fvec0 = vec_fun2(acc2_fvec0, data_fvec0); + acc2_fvec1 = fVec::set( + acc2_fvec1, + vec_fun2(acc2_fvec1, data_fvec1), + size - d - fVec::size()); + } else { + acc1_fvec0 = + fVec::set(acc1_fvec0, vec_fun1(acc1_fvec0, data_fvec0), size - d); + acc2_fvec0 = + fVec::set(acc2_fvec0, vec_fun2(acc2_fvec0, data_fvec0), size - d); + } + } + acc1_fvec0 = vec_fun1(acc1_fvec0, acc1_fvec1); + acc2_fvec0 = vec_fun2(acc2_fvec0, acc2_fvec1); + return std::pair( + vec_reduce_all(vec_fun1, acc1_fvec0), + vec_reduce_all(vec_fun2, acc2_fvec0)); +} + +template < + 
typename scalar_t, + typename MapOp, + typename ReduceOp, + typename std::enable_if_t, int> = 0> +inline float map_reduce_all( + const MapOp& map_fun, + const ReduceOp& red_fun, + const scalar_t* data, + int64_t size) { + using bVec = vec::Vectorized; + using fVec = vec::Vectorized; + if (size < bVec::size()) { + bVec data_bvec = bVec::loadu(data, size); + auto [data_fvec0, data_fvec1] = convert_to_float(data_bvec); + if (size > fVec::size()) { + data_fvec0 = map_fun(data_fvec0); + data_fvec1 = map_fun(data_fvec1); + data_fvec0 = fVec::set( + data_fvec0, red_fun(data_fvec0, data_fvec1), size - fVec::size()); + return vec_reduce_all(red_fun, data_fvec0, fVec::size()); + } else { + data_fvec0 = map_fun(data_fvec0); + return vec_reduce_all(red_fun, data_fvec0, size); + } + } + int64_t d = bVec::size(); + bVec acc_bvec = bVec::loadu(data); + auto [acc_fvec0, acc_fvec1] = convert_to_float(acc_bvec); + acc_fvec0 = map_fun(acc_fvec0); + acc_fvec1 = map_fun(acc_fvec1); + for (; d < size - (size % bVec::size()); d += bVec::size()) { + bVec data_bvec = bVec::loadu(data + d); + auto [data_fvec0, data_fvec1] = convert_to_float(data_bvec); + data_fvec0 = map_fun(data_fvec0); + data_fvec1 = map_fun(data_fvec1); + acc_fvec0 = red_fun(acc_fvec0, data_fvec0); + acc_fvec1 = red_fun(acc_fvec1, data_fvec1); + } + if (size - d > 0) { + bVec data_bvec = bVec::loadu(data + d, size - d); + auto [data_fvec0, data_fvec1] = convert_to_float(data_bvec); + if (size - d > fVec::size()) { + data_fvec0 = map_fun(data_fvec0); + data_fvec1 = map_fun(data_fvec1); + acc_fvec0 = red_fun(acc_fvec0, data_fvec0); + acc_fvec1 = fVec::set( + acc_fvec1, red_fun(acc_fvec1, data_fvec1), size - d - fVec::size()); + } else { + data_fvec0 = map_fun(data_fvec0); + acc_fvec0 = + fVec::set(acc_fvec0, red_fun(acc_fvec0, data_fvec0), size - d); + } + } + acc_fvec0 = red_fun(acc_fvec0, acc_fvec1); + return vec_reduce_all(red_fun, acc_fvec0); +} + +template < + typename scalar_t, + typename MapOp, + typename ReduceOp, 
+ typename std::enable_if_t, int> = 0> +inline float map2_reduce_all( + const MapOp& map_fun, + const ReduceOp& red_fun, + const scalar_t* data, + const scalar_t* data2, + int64_t size) { + using bVec = vec::Vectorized; + using fVec = vec::Vectorized; + if (size < bVec::size()) { + bVec data_bvec = bVec::loadu(data, size); + auto [data_fvec0, data_fvec1] = convert_to_float(data_bvec); + bVec data2_bvec = bVec::loadu(data2, size); + auto [data2_fvec0, data2_fvec1] = convert_to_float(data2_bvec); + if (size > fVec::size()) { + data_fvec0 = map_fun(data_fvec0, data2_fvec0); + data_fvec1 = map_fun(data_fvec1, data2_fvec1); + data_fvec0 = fVec::set( + data_fvec0, red_fun(data_fvec0, data_fvec1), size - fVec::size()); + return vec_reduce_all(red_fun, data_fvec0, fVec::size()); + } else { + data_fvec0 = map_fun(data_fvec0, data2_fvec0); + return vec_reduce_all(red_fun, data_fvec0, size); + } + } + int64_t d = bVec::size(); + bVec acc_bvec = bVec::loadu(data); + auto [acc_fvec0, acc_fvec1] = convert_to_float(acc_bvec); + bVec acc2_bvec = bVec::loadu(data2); + auto [acc2_fvec0, acc2_fvec1] = convert_to_float(acc2_bvec); + acc_fvec0 = map_fun(acc_fvec0, acc2_fvec0); + acc_fvec1 = map_fun(acc_fvec1, acc2_fvec1); + for (; d < size - (size % bVec::size()); d += bVec::size()) { + bVec data_bvec = bVec::loadu(data + d); + auto [data_fvec0, data_fvec1] = convert_to_float(data_bvec); + bVec data2_bvec = bVec::loadu(data2 + d); + auto [data2_fvec0, data2_fvec1] = convert_to_float(data2_bvec); + data_fvec0 = map_fun(data_fvec0, data2_fvec0); + data_fvec1 = map_fun(data_fvec1, data2_fvec1); + acc_fvec0 = red_fun(acc_fvec0, data_fvec0); + acc_fvec1 = red_fun(acc_fvec1, data_fvec1); + } + if (size - d > 0) { + bVec data_bvec = bVec::loadu(data + d, size - d); + auto [data_fvec0, data_fvec1] = convert_to_float(data_bvec); + bVec data2_bvec = bVec::loadu(data2 + d, size - d); + auto [data2_fvec0, data2_fvec1] = convert_to_float(data2_bvec); + if (size - d > fVec::size()) { + data_fvec0 = 
map_fun(data_fvec0, data2_fvec0); + data_fvec1 = map_fun(data_fvec1, data2_fvec1); + acc_fvec0 = red_fun(acc_fvec0, data_fvec0); + acc_fvec1 = fVec::set( + acc_fvec1, red_fun(acc_fvec1, data_fvec1), size - d - fVec::size()); + } else { + data_fvec0 = map_fun(data_fvec0, data2_fvec0); + acc_fvec0 = + fVec::set(acc_fvec0, red_fun(acc_fvec0, data_fvec0), size - d); + } + } + acc_fvec0 = red_fun(acc_fvec0, acc_fvec1); + return vec_reduce_all(red_fun, acc_fvec0); +} + +template < + typename scalar_t, + typename MapOp, + typename ReduceOp, + typename std::enable_if_t, int> = 0> +inline float map3_reduce_all( + const MapOp& map_fun, + const ReduceOp& red_fun, + const scalar_t* data, + const scalar_t* data2, + const scalar_t* data3, + int64_t size) { + using bVec = vec::Vectorized; + using fVec = vec::Vectorized; + if (size < bVec::size()) { + bVec data_bvec = bVec::loadu(data, size); + auto [data_fvec0, data_fvec1] = convert_to_float(data_bvec); + bVec data2_bvec = bVec::loadu(data2, size); + auto [data2_fvec0, data2_fvec1] = convert_to_float(data2_bvec); + bVec data3_bvec = bVec::loadu(data3, size); + auto [data3_fvec0, data3_fvec1] = convert_to_float(data3_bvec); + if (size > fVec::size()) { + data_fvec0 = map_fun(data_fvec0, data2_fvec0, data3_fvec0); + data_fvec1 = map_fun(data_fvec1, data2_fvec1, data3_fvec1); + data_fvec0 = fVec::set( + data_fvec0, red_fun(data_fvec0, data_fvec1), size - fVec::size()); + return vec_reduce_all(red_fun, data_fvec0, fVec::size()); + } else { + data_fvec0 = map_fun(data_fvec0, data2_fvec0, data3_fvec0); + return vec_reduce_all(red_fun, data_fvec0, size); + } + } + int64_t d = bVec::size(); + bVec acc_bvec = bVec::loadu(data); + auto [acc_fvec0, acc_fvec1] = convert_to_float(acc_bvec); + bVec acc2_bvec = bVec::loadu(data2); + auto [acc2_fvec0, acc2_fvec1] = convert_to_float(acc2_bvec); + bVec acc3_bvec = bVec::loadu(data3); + auto [acc3_fvec0, acc3_fvec1] = convert_to_float(acc3_bvec); + acc_fvec0 = map_fun(acc_fvec0, acc2_fvec0, 
acc3_fvec0); + acc_fvec1 = map_fun(acc_fvec1, acc2_fvec1, acc3_fvec1); + for (; d < size - (size % bVec::size()); d += bVec::size()) { + bVec data_bvec = bVec::loadu(data + d); + auto [data_fvec0, data_fvec1] = convert_to_float(data_bvec); + bVec data2_bvec = bVec::loadu(data2 + d); + auto [data2_fvec0, data2_fvec1] = convert_to_float(data2_bvec); + bVec data3_bvec = bVec::loadu(data3 + d); + auto [data3_fvec0, data3_fvec1] = convert_to_float(data3_bvec); + data_fvec0 = map_fun(data_fvec0, data2_fvec0, data3_fvec0); + data_fvec1 = map_fun(data_fvec1, data2_fvec1, data3_fvec1); + acc_fvec0 = red_fun(acc_fvec0, data_fvec0); + acc_fvec1 = red_fun(acc_fvec1, data_fvec1); + } + if (size - d > 0) { + bVec data_bvec = bVec::loadu(data + d, size - d); + auto [data_fvec0, data_fvec1] = convert_to_float(data_bvec); + bVec data2_bvec = bVec::loadu(data2 + d, size - d); + auto [data2_fvec0, data2_fvec1] = convert_to_float(data2_bvec); + bVec data3_bvec = bVec::loadu(data3 + d, size - d); + auto [data3_fvec0, data3_fvec1] = convert_to_float(data3_bvec); + if (size - d > fVec::size()) { + data_fvec0 = map_fun(data_fvec0, data2_fvec0, data3_fvec0); + data_fvec1 = map_fun(data_fvec1, data2_fvec1, data3_fvec1); + acc_fvec0 = red_fun(acc_fvec0, data_fvec0); + acc_fvec1 = fVec::set( + acc_fvec1, red_fun(acc_fvec1, data_fvec1), size - d - fVec::size()); + } else { + data_fvec0 = map_fun(data_fvec0, data2_fvec0, data3_fvec0); + acc_fvec0 = + fVec::set(acc_fvec0, red_fun(acc_fvec0, data_fvec0), size - d); + } + } + acc_fvec0 = red_fun(acc_fvec0, acc_fvec1); + return vec_reduce_all(red_fun, acc_fvec0); +} + +template < + typename scalar_t, + typename Op, + typename std::enable_if_t< + !(!detail::should_prefer_converting_through_float_v && + std::is_invocable_v>), + int> = 0> +inline void map( + const Op& vec_fun, + scalar_t* output_data, + const scalar_t* input_data, + int64_t size) { + using bVec = vec::Vectorized; + using fVec = vec::Vectorized; + int64_t d = 0; + for (; d < size - 
(size % bVec::size()); d += bVec::size()) { + bVec data_bvec = bVec::loadu(input_data + d); + auto [data_fvec0, data_fvec1] = convert_to_float(data_bvec); + fVec output_fvec0 = vec_fun(data_fvec0); + fVec output_fvec1 = vec_fun(data_fvec1); + bVec output_bvec = convert_from_float(output_fvec0, output_fvec1); + output_bvec.store(output_data + d); + } + if (size - d > 0) { + bVec data_bvec = bVec::loadu(input_data + d, size - d); + auto [data_fvec0, data_fvec1] = convert_to_float(data_bvec); + fVec output_fvec0 = vec_fun(data_fvec0); + fVec output_fvec1 = vec_fun(data_fvec1); + bVec output_bvec = convert_from_float(output_fvec0, output_fvec1); + output_bvec.store(output_data + d, size - d); + } +} + +template < + typename scalar_t, + typename Op, + typename std::enable_if_t, int> = 0> +inline void map( + const Op& vec_fun, + scalar_t* output_data, + const float* input_data, + int64_t size) { + using bVec = vec::Vectorized; + using fVec = vec::Vectorized; + int64_t d = 0; + for (; d < size - (size % bVec::size()); d += bVec::size()) { + fVec data_fvec0 = fVec::loadu(input_data + d); + fVec data_fvec1 = fVec::loadu(input_data + d + fVec::size()); + fVec output_fvec0 = vec_fun(data_fvec0); + fVec output_fvec1 = vec_fun(data_fvec1); + bVec output_bvec = convert_from_float(output_fvec0, output_fvec1); + output_bvec.store(output_data + d); + } + if (size - d > 0) { + fVec data_fvec0, data_fvec1; + if (size - d > fVec::size()) { + data_fvec0 = fVec::loadu(input_data + d); + data_fvec1 = + fVec::loadu(input_data + d + fVec::size(), size - d - fVec::size()); + } else { + // choose to align with behaviour of bVec::loadu(ptr, size), + // which leaves data_fvec1 uninitialized + data_fvec0 = fVec::loadu(input_data + d, size - d); + } + fVec output_fvec0 = vec_fun(data_fvec0); + fVec output_fvec1 = vec_fun(data_fvec1); + bVec output_bvec = convert_from_float(output_fvec0, output_fvec1); + output_bvec.store(output_data + d, size - d); + } +} + +template < + typename scalar_t, + 
typename Op, + typename std::enable_if_t< + !(!detail::should_prefer_converting_through_float_v && + std::is_invocable_v< + Op, + vec::Vectorized, + vec::Vectorized>), + int> = 0> +inline void map2( + const Op& vec_fun, + scalar_t* output_data, + const scalar_t* input_data, + const scalar_t* input_data2, + int64_t size) { + using bVec = vec::Vectorized; + using fVec = vec::Vectorized; + int64_t d = 0; + for (; d < size - (size % bVec::size()); d += bVec::size()) { + bVec data_bvec = bVec::loadu(input_data + d); + auto [data_fvec0, data_fvec1] = convert_to_float(data_bvec); + bVec data2_bvec = bVec::loadu(input_data2 + d); + auto [data2_fvec0, data2_fvec1] = convert_to_float(data2_bvec); + fVec output_fvec0 = vec_fun(data_fvec0, data2_fvec0); + fVec output_fvec1 = vec_fun(data_fvec1, data2_fvec1); + bVec output_bvec = convert_from_float(output_fvec0, output_fvec1); + output_bvec.store(output_data + d); + } + if (size - d > 0) { + bVec data_bvec = bVec::loadu(input_data + d, size - d); + auto [data_fvec0, data_fvec1] = convert_to_float(data_bvec); + bVec data2_bvec = bVec::loadu(input_data2 + d, size - d); + auto [data2_fvec0, data2_fvec1] = convert_to_float(data2_bvec); + fVec output_fvec0 = vec_fun(data_fvec0, data2_fvec0); + fVec output_fvec1 = vec_fun(data_fvec1, data2_fvec1); + bVec output_bvec = convert_from_float(output_fvec0, output_fvec1); + output_bvec.store(output_data + d, size - d); + } +} + +template < + typename scalar_t, + typename Op, + typename std::enable_if_t< + !(!detail::should_prefer_converting_through_float_v && + std::is_invocable_v< + Op, + vec::Vectorized, + vec::Vectorized, + vec::Vectorized>), + int> = 0> +inline void map3( + const Op& vec_fun, + scalar_t* output_data, + const scalar_t* input_data1, + const scalar_t* input_data2, + const scalar_t* input_data3, + int64_t size) { + using bVec = vec::Vectorized; + using fVec = vec::Vectorized; + int64_t d = 0; + for (; d < size - (size % bVec::size()); d += bVec::size()) { + bVec data1_bvec 
= bVec::loadu(input_data1 + d); + auto [data1_fvec0, data1_fvec1] = convert_to_float(data1_bvec); + bVec data2_bvec = bVec::loadu(input_data2 + d); + auto [data2_fvec0, data2_fvec1] = convert_to_float(data2_bvec); + bVec data3_bvec = bVec::loadu(input_data3 + d); + auto [data3_fvec0, data3_fvec1] = convert_to_float(data3_bvec); + fVec output_fvec0 = vec_fun(data1_fvec0, data2_fvec0, data3_fvec0); + fVec output_fvec1 = vec_fun(data1_fvec1, data2_fvec1, data3_fvec1); + bVec output_bvec = convert_from_float(output_fvec0, output_fvec1); + output_bvec.store(output_data + d); + } + if (size - d > 0) { + bVec data1_bvec = bVec::loadu(input_data1 + d, size - d); + auto [data1_fvec0, data1_fvec1] = convert_to_float(data1_bvec); + bVec data2_bvec = bVec::loadu(input_data2 + d, size - d); + auto [data2_fvec0, data2_fvec1] = convert_to_float(data2_bvec); + bVec data3_bvec = bVec::loadu(input_data3 + d, size - d); + auto [data3_fvec0, data3_fvec1] = convert_to_float(data3_bvec); + fVec output_fvec0 = vec_fun(data1_fvec0, data2_fvec0, data3_fvec0); + fVec output_fvec1 = vec_fun(data1_fvec1, data2_fvec1, data3_fvec1); + bVec output_bvec = convert_from_float(output_fvec0, output_fvec1); + output_bvec.store(output_data + d, size - d); + } +} + +template < + typename scalar_t, + typename Op, + typename std::enable_if_t< + !(!detail::should_prefer_converting_through_float_v && + std::is_invocable_v< + Op, + vec::Vectorized, + vec::Vectorized, + vec::Vectorized, + vec::Vectorized>), + int> = 0> +inline void map4( + const Op& vec_fun, + scalar_t* output_data, + const scalar_t* input_data1, + const scalar_t* input_data2, + const scalar_t* input_data3, + const scalar_t* input_data4, + int64_t size) { + using bVec = vec::Vectorized; + using fVec = vec::Vectorized; + int64_t d = 0; + for (; d < size - (size % bVec::size()); d += bVec::size()) { + bVec data1_bvec = bVec::loadu(input_data1 + d); + auto [data1_fvec0, data1_fvec1] = convert_to_float(data1_bvec); + bVec data2_bvec = 
bVec::loadu(input_data2 + d); + auto [data2_fvec0, data2_fvec1] = convert_to_float(data2_bvec); + bVec data3_bvec = bVec::loadu(input_data3 + d); + auto [data3_fvec0, data3_fvec1] = convert_to_float(data3_bvec); + bVec data4_bvec = bVec::loadu(input_data4 + d); + auto [data4_fvec0, data4_fvec1] = convert_to_float(data4_bvec); + fVec output_fvec0 = + vec_fun(data1_fvec0, data2_fvec0, data3_fvec0, data4_fvec0); + fVec output_fvec1 = + vec_fun(data1_fvec1, data2_fvec1, data3_fvec1, data4_fvec1); + bVec output_bvec = convert_from_float(output_fvec0, output_fvec1); + output_bvec.store(output_data + d); + } + if (size - d > 0) { + bVec data1_bvec = bVec::loadu(input_data1 + d, size - d); + auto [data1_fvec0, data1_fvec1] = convert_to_float(data1_bvec); + bVec data2_bvec = bVec::loadu(input_data2 + d, size - d); + auto [data2_fvec0, data2_fvec1] = convert_to_float(data2_bvec); + bVec data3_bvec = bVec::loadu(input_data3 + d, size - d); + auto [data3_fvec0, data3_fvec1] = convert_to_float(data3_bvec); + bVec data4_bvec = bVec::loadu(input_data4 + d, size - d); + auto [data4_fvec0, data4_fvec1] = convert_to_float(data4_bvec); + fVec output_fvec0 = + vec_fun(data1_fvec0, data2_fvec0, data3_fvec0, data4_fvec0); + fVec output_fvec1 = + vec_fun(data1_fvec1, data2_fvec1, data3_fvec1, data4_fvec1); + bVec output_bvec = convert_from_float(output_fvec0, output_fvec1); + output_bvec.store(output_data + d, size - d); + } +} + +} // namespace at::vec + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/intrinsics.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/intrinsics.h new file mode 100644 index 0000000000000000000000000000000000000000..fd3d3a65215450308a807f98d28b701f28e2ff22 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/intrinsics.h @@ -0,0 +1,6 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#include + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/vec.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/vec.h new file mode 100644 index 0000000000000000000000000000000000000000..c5c4fb5c289aeb2f3c54172adbc614aebf490e4c --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/vec.h @@ -0,0 +1,62 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#if defined(CPU_CAPABILITY_AVX512) +#include +#else +#include +#include +#endif + +namespace at::vec { +// See Note [CPU_CAPABILITY namespace] +inline namespace CPU_CAPABILITY { + +inline Vectorized convert_to_bool(Vectorized x) { + __at_align__ bool buffer[x.size()]; + x.ne(Vectorized(0)).store(buffer); + + Vectorized ret; + static_assert(x.size() == ret.size()); + std::memcpy(ret, buffer, ret.size() * sizeof(bool)); + return ret; +} + +template <> +inline Vectorized Vectorized::loadu(const void* ptr) { + // See NOTE [Loading boolean values] + return convert_to_bool(Vectorized::loadu(ptr)); +} + +template <> +inline Vectorized Vectorized::loadu( + const void* ptr, + 
int64_t count) { + // See NOTE [Loading boolean values] + return convert_to_bool(Vectorized::loadu(ptr, count)); +} + +template +struct VecHoldType { + using hold_type = typename VT::value_type; +}; + +template <> +struct VecHoldType> { + using hold_type = BFloat16; +}; + +template <> +struct VecHoldType> { + using hold_type = Half; +}; + +template +using vechold_type = typename VecHoldType::hold_type; + +} // namespace CPU_CAPABILITY +} // namespace at::vec + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/vec128/vec128_half_neon.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/vec128/vec128_half_neon.h new file mode 100644 index 0000000000000000000000000000000000000000..ad49d388341c6e8f470bff7fde35ea404e0b83de --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/vec128/vec128_half_neon.h @@ -0,0 +1,627 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// DO NOT DEFINE STATIC DATA IN THIS HEADER! +// See Note [Do not compile initializers with AVX] + +#include +#include +#include +#include +#include +#include +#include + +namespace at::vec { +// See Note [CPU_CAPABILITY namespace] +inline namespace CPU_CAPABILITY { + +// Right now contains only aarch64 implementation. +// Due to follow two reasons aarch32 is not currently supported. +// 1. Due to difference in ISA been aarch32 and aarch64, intrinsics +// that work for aarch64 dont work for aarch32. +// 2. Android NDK r21 has problems with compiling aarch32. +// Clang seg faults. +// https://github.com/android/ndk/issues/1248 +// https://bugs.llvm.org/show_bug.cgi?id=45824 +// Most likely we will do aarch32 support with inline asm. 
+#if !defined(C10_MOBILE) && defined(__aarch64__) + +#ifdef __BIG_ENDIAN__ +#error "Big endian is not supported." +#endif + +template +struct BlendHalfRegs { + static float16x8_t impl( + const float16x8_t& a, + const float16x8_t& b, + float16x8_t& res); +}; + +template +struct BlendHalfRegs { + static float16x8_t impl( + const float16x8_t& a, + const float16x8_t& b, + float16x8_t& res) { + return vsetq_lane_f16(vgetq_lane_f16(b, index), res, index); + } +}; + +template +struct BlendHalfRegs { + static float16x8_t impl( + const float16x8_t& a, + const float16x8_t& b, + float16x8_t& res) { + return vsetq_lane_f16(vgetq_lane_f16(a, index), res, index); + } +}; + +template <> +struct is_vec_specialized_for : std::bool_constant {}; + +// On ARM, Half type supports float16_t->Half constructor and Half->float16_t +// conversion +template <> +class Vectorized : public Vectorized16< + float16x8_t, + c10::Half, + BlendHalfRegs, + Vectorized> { + using Base = Vectorized16< + float16x8_t, + c10::Half, + BlendHalfRegs, + Vectorized>; + friend Base; + + private: + // We use these private map functions to implement various methods + Vectorized map_with_vec_float_method( + Vectorized (Vectorized::*m)() const) const { + float32x4_t v00 = vcvt_f32_f16(vget_low_f16(values)); + float32x4_t v01 = vcvt_f32_f16(vget_high_f16(values)); + Vectorized mv0 = (Vectorized(v00).*m)(); + Vectorized mv1 = (Vectorized(v01).*m)(); + float16x4_t r00 = vcvt_f16_f32(mv0); + float16x4_t r01 = vcvt_f16_f32(mv1); + return Vectorized(vcombine_f16(r00, r01)); + } + + Vectorized map2_with_vec_float_method( + const Vectorized& second, + Vectorized (Vectorized::*m)(const Vectorized&) + const) const { + float32x4_t v00 = vcvt_f32_f16(vget_low_f16(values)); + float32x4_t v01 = vcvt_f32_f16(vget_high_f16(values)); + float32x4_t second_v00 = vcvt_f32_f16(vget_low_f16(second.values)); + float32x4_t second_v01 = vcvt_f32_f16(vget_high_f16(second.values)); + Vectorized mv0 = + 
(Vectorized(v00).*m)(Vectorized(second_v00)); + Vectorized mv1 = + (Vectorized(v01).*m)(Vectorized(second_v01)); + float16x4_t r00 = vcvt_f16_f32(mv0); + float16x4_t r01 = vcvt_f16_f32(mv1); + + // Pack result into Vectorized + return Vectorized(vcombine_f16(r00, r01)); + } + + Vectorized map2_bitmask_with_vec_float_method( + const Vectorized& second, + Vectorized (Vectorized::*m)(const Vectorized&) + const) const { + float32x4_t v00 = vcvt_f32_f16(vget_low_f16(values)); + float32x4_t v01 = vcvt_f32_f16(vget_high_f16(values)); + float32x4_t second_v00 = vcvt_f32_f16(vget_low_f16(second.values)); + float32x4_t second_v01 = vcvt_f32_f16(vget_high_f16(second.values)); + Vectorized mv0 = + (Vectorized(v00).*m)(Vectorized(second_v00)); + Vectorized mv1 = + (Vectorized(v01).*m)(Vectorized(second_v01)); + // Assume the operator returns a bitmask, not "real" floats, and + // just narrow the bits. All-ones is a NaN and will get mangled by + // conversion! + float16x4_t r00 = + vreinterpret_f16_u16(vmovn_u32(vreinterpretq_u32_f32(mv0))); + float16x4_t r01 = + vreinterpret_f16_u16(vmovn_u32(vreinterpretq_u32_f32(mv1))); + + // Pack result into Vectorized + return Vectorized(vcombine_f16(r00, r01)); + } + + public: + using Vectorized16::Vectorized16; + + Vectorized() = default; + + // A ctor that accepts c10::Half is needed to fit interface with vec_base.h + // A second constructor that takes float16_t is also included + Vectorized(c10::Half val) : Vectorized((float16_t)val) {} + Vectorized(float16_t val) : Vectorized16(vdupq_n_f16(val)) {} + Vectorized( + value_type val0, + value_type val1, + value_type val2, + value_type val3, + value_type val4, + value_type val5, + value_type val6, + value_type val7) + : Vectorized16( + float16x8_t{val0, val1, val2, val3, val4, val5, val6, val7}) {} + + static Vectorized blendv( + const Vectorized& a, + const Vectorized& b, + const Vectorized& mask) { + // Note: using blendv is very awkward because 0xFFFF is one of + // many NaN's in FP16 
It's unfortunate that the mask has type Half + // (required from vec_base) + + // TODO + // NB: This requires that each value, i.e., each uint value, + // of the mask either all be zeros or all be 1s. + // We perhaps need some kind of an assert? + // But that will affect performance. + + // NOTE [vbslq_f16]: vbslq_f16 doesn't work on clang without + // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC. vbslq_u16 generates the + // same instruction anyway. see https://godbolt.org/z/cY4a55Y7P + Vectorized vec(mask.values); + vec.values = vreinterpretq_f16_u16(vbslq_u16( + vreinterpretq_u16_f16(vec.values), + vreinterpretq_u16_f16(b.values), + vreinterpretq_u16_f16(a.values))); + return vec; + } + static Vectorized set( + const Vectorized& a, + const Vectorized& b, + int64_t count = size()) { + uint16_t pre_mask[size()] = {0}; + for (int i = 0; i < count; i++) { + pre_mask[i] = 0xFFFF; + } + uint16x8_t mask = vld1q_u16(pre_mask); + + // Using blendv is awkward because 0xFFFF is one of many NaN's in FP16 + // so we directly use vbslq_u16 instead. (See NOTE [vbslq_f16] above.) 
+ Vectorized vec(vreinterpretq_f16_u16(vbslq_u16( + mask, + vreinterpretq_u16_f16(b.values), + vreinterpretq_u16_f16(a.values)))); + + return vec; + } + static Vectorized loadu(const void* ptr, int64_t count = size()) { + if (count == size()) { + return vld1q_f16(reinterpret_cast(ptr)); + } + __at_align__ float16_t tmp_values[size()]; + for (const auto i : c10::irange(size())) { + tmp_values[i] = 0; + } + std::memcpy( + tmp_values, + reinterpret_cast(ptr), + count * sizeof(float16_t)); + return vld1q_f16(reinterpret_cast(tmp_values)); + } + void store(void* ptr, int64_t count = size()) const { + if (count == size()) { + vst1q_f16(reinterpret_cast(ptr), values); + return; + } else { + float16_t tmp_values[size()]; + vst1q_f16(reinterpret_cast(tmp_values), values); + std::memcpy(ptr, tmp_values, count * sizeof(float16_t)); + } + } + int zero_mask() const { +#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + uint16x8_t is_zero_vec = vceqzq_f16(values); + const int16x8_t shift = vcombine_s16( + vcreate_s16( + 0x0 | (int64_t(0x1) << 16) | (int64_t(0x2) << 32) | + (int64_t(0x3) << 48)), + vcreate_s16( + 0x4 | (int64_t(0x5) << 16) | (int64_t(0x6) << 32) | + (int64_t(0x7) << 48))); + uint16x8_t bits_vec = + vshlq_u16(vandq_u16(is_zero_vec, vdupq_n_u16(1)), shift); + return vaddvq_u16(bits_vec); +#else // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + // use known working implementation. + __at_align__ value_type tmp[size()]; + store(tmp); + int mask = 0; + for (int i = 0; i < size(); ++i) { + if (tmp[i] == 0) { + mask |= (1 << i); + } + } + return mask; +#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + } + Vectorized isnan() const { +#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + return vreinterpretq_f16_u16(vmvnq_u16(vceqq_f16(values, values))); +#else + // NOTE: we could make this faster by doing vectorized checks of + // exponent/payload bits. 
+ __at_align__ c10::Half tmp[size()]; + __at_align__ c10::Half res[size()]; + store(tmp); + for (const auto i : c10::irange(size())) { + if (_isnan(tmp[i])) { + std::memset(static_cast(&res[i]), 0xFF, sizeof(c10::Half)); + } else { + std::memset(static_cast(&res[i]), 0, sizeof(c10::Half)); + } + } + return loadu(res); +#endif + } + bool has_inf_nan() const { + __at_align__ c10::Half tmp[size()]; + store(tmp); + for (const auto i : c10::irange(size())) { + if (_isnan(tmp[i]) || _isinf(tmp[i])) { + return true; + } + } + return false; + } + Vectorized abs() const { +#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + return Vectorized(vabsq_f16(values)); +#else + return map_with_vec_float_method(&Vectorized::abs); +#endif + } + Vectorized frac() const; + Vectorized neg() const { +#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + return Vectorized(vnegq_f16(values)); +#else + return map_with_vec_float_method(&Vectorized::neg); +#endif + } + Vectorized trunc() const { +#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + return Vectorized(vrndq_f16(values)); +#else + return map_with_vec_float_method(&Vectorized::trunc); +#endif + } + Vectorized sqrt() const { +#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + return Vectorized(vsqrtq_f16(values)); +#else + return map_with_vec_float_method(&Vectorized::sqrt); +#endif + } + Vectorized reciprocal() const { +#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + auto ones = vdupq_n_f16(1.0f); + return Vectorized(vdivq_f16(ones, values)); +#else + return map_with_vec_float_method(&Vectorized::reciprocal); +#endif + } + Vectorized operator==(const Vectorized& other) const { +#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + return Vectorized( + vreinterpretq_f16_u16(vceqq_f16(values, other.values))); +#else + return map2_bitmask_with_vec_float_method( + other, &Vectorized::operator==); +#endif + } + + Vectorized operator!=(const Vectorized& other) const { +#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + return Vectorized( + 
vreinterpretq_f16_u16(vmvnq_u16(vceqq_f16(values, other.values)))); +#else + return map2_bitmask_with_vec_float_method( + other, &Vectorized::operator!=); +#endif + } + + Vectorized operator<(const Vectorized& other) const { +#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + return Vectorized( + vreinterpretq_f16_u16(vcltq_f16(values, other.values))); +#else + return map2_bitmask_with_vec_float_method( + other, &Vectorized::operator<); +#endif + } + + Vectorized operator<=(const Vectorized& other) const { +#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + return Vectorized( + vreinterpretq_f16_u16(vcleq_f16(values, other.values))); +#else + return map2_bitmask_with_vec_float_method( + other, &Vectorized::operator<=); +#endif + } + + Vectorized operator>(const Vectorized& other) const { +#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + return Vectorized( + vreinterpretq_f16_u16(vcgtq_f16(values, other.values))); +#else + return map2_bitmask_with_vec_float_method( + other, &Vectorized::operator>); +#endif + } + + Vectorized operator>=(const Vectorized& other) const { +#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + return Vectorized( + vreinterpretq_f16_u16(vcgeq_f16(values, other.values))); +#else + return map2_bitmask_with_vec_float_method( + other, &Vectorized::operator>=); +#endif + } + + Vectorized eq(const Vectorized& other) const; + Vectorized ne(const Vectorized& other) const; + Vectorized gt(const Vectorized& other) const; + Vectorized ge(const Vectorized& other) const; + Vectorized lt(const Vectorized& other) const; + Vectorized le(const Vectorized& other) const; +}; // Vectorized + +inline std::tuple, Vectorized> convert_half_float( + const Vectorized& a) { + static_assert(Vectorized::size() == 2 * Vectorized::size()); + float16x8_t x = a; + float32x4_t x1 = vcvt_f32_f16(vget_low_f16(x)); + float32x4_t x2 = vcvt_f32_f16(vget_high_f16(x)); + return {Vectorized(x1), Vectorized(x2)}; +} +inline Vectorized convert_float_half( + const Vectorized& a, + const Vectorized& b) { 
+ static_assert(Vectorized::size() == 2 * Vectorized::size()); + float32x4_t x = a; + float32x4_t y = b; + float16x4_t x1 = vcvt_f16_f32(x); + float16x4_t x2 = vcvt_f16_f32(y); + return Vectorized(vcombine_f16(x1, x2)); +} + +template +Vectorized binary_operator_via_float( + Op op, + const Vectorized& a, + const Vectorized& b) { + const auto [a_float_low, a_float_high] = convert_half_float(a); + const auto [b_float_low, b_float_high] = convert_half_float(b); + return convert_float_half( + op(a_float_low, b_float_low), op(a_float_high, b_float_high)); +} + +template <> +Vectorized inline operator+( + const Vectorized& a, + const Vectorized& b) { +#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + return Vectorized(vaddq_f16(a, b)); +#else + return binary_operator_via_float(std::plus>(), a, b); +#endif +} + +template <> +Vectorized inline operator-( + const Vectorized& a, + const Vectorized& b) { +#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + return Vectorized(vsubq_f16(a, b)); +#else + return binary_operator_via_float(std::minus>(), a, b); +#endif +} + +template <> +Vectorized inline operator*( + const Vectorized& a, + const Vectorized& b) { +#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + return Vectorized(vmulq_f16(a, b)); +#else + return binary_operator_via_float(std::multiplies>(), a, b); +#endif +} + +template <> +Vectorized inline operator/( + const Vectorized& a, + const Vectorized& b) { +#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + return Vectorized(vdivq_f16(a, b)); +#else + return binary_operator_via_float(std::divides>(), a, b); +#endif +} + +// frac. Implement this here so we can use subtraction +inline Vectorized Vectorized::frac() const { + return *this - this->trunc(); +} + +// Implements the IEEE 754 201X `maximum` operation, which propagates NaN if +// either input is a NaN. 
+template <> +Vectorized inline maximum( + const Vectorized& a, + const Vectorized& b) { +#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + return Vectorized(vmaxq_f16(a, b)); +#else + return binary_operator_via_float( + static_cast (*)( + const Vectorized&, const Vectorized&)>(&maximum), + a, + b); +#endif +} + +// Implements the IEEE 754 201X `minimum` operation, which propagates NaN if +// either input is a NaN. +template <> +Vectorized inline minimum( + const Vectorized& a, + const Vectorized& b) { +#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + return Vectorized(vminq_f16(a, b)); +#else + return binary_operator_via_float( + static_cast (*)( + const Vectorized&, const Vectorized&)>(&minimum), + a, + b); +#endif +} + +template <> +Vectorized inline clamp( + const Vectorized& a, + const Vectorized& min, + const Vectorized& max) { + return minimum(max, maximum(min, a)); +} + +template <> +Vectorized inline clamp_max( + const Vectorized& a, + const Vectorized& max) { + return minimum(max, a); +} + +template <> +Vectorized inline clamp_min( + const Vectorized& a, + const Vectorized& min) { + return maximum(min, a); +} + +template <> +Vectorized inline operator&( + const Vectorized& a, + const Vectorized& b) { + return Vectorized(vreinterpretq_f16_u16( + vandq_u16(vreinterpretq_u16_f16(a), vreinterpretq_u16_f16(b)))); +} + +template <> +Vectorized inline operator|( + const Vectorized& a, + const Vectorized& b) { + return Vectorized(vreinterpretq_f16_u16( + vorrq_u16(vreinterpretq_u16_f16(a), vreinterpretq_u16_f16(b)))); +} + +template <> +Vectorized inline operator^( + const Vectorized& a, + const Vectorized& b) { + return Vectorized(vreinterpretq_f16_u16( + veorq_u16(vreinterpretq_u16_f16(a), vreinterpretq_u16_f16(b)))); +} + +inline Vectorized Vectorized::eq( + const Vectorized& other) const { + return (*this == other) & Vectorized(1); +} + +inline Vectorized Vectorized::ne( + const Vectorized& other) const { + return (*this != other) & Vectorized(1); +} + +inline 
Vectorized Vectorized::gt( + const Vectorized& other) const { + return (*this > other) & Vectorized(1); +} + +inline Vectorized Vectorized::ge( + const Vectorized& other) const { + return (*this >= other) & Vectorized(1); +} + +inline Vectorized Vectorized::lt( + const Vectorized& other) const { + return (*this < other) & Vectorized(1); +} + +inline Vectorized Vectorized::le( + const Vectorized& other) const { + return (*this <= other) & Vectorized(1); +} + +template <> +Vectorized inline fmadd( + const Vectorized& a, + const Vectorized& b, + const Vectorized& c) { +#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + return Vectorized(vfmaq_f16(c, a, b)); +#else + return a * b + c; +#endif +} + +template <> +Vectorized inline fnmadd( + const Vectorized& a, + const Vectorized& b, + const Vectorized& c) { +#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + return Vectorized(vfmsq_f16(c, a, b)); +#else + return -a * b + c; +#endif +} + +template <> +Vectorized inline fmsub( + const Vectorized& a, + const Vectorized& b, + const Vectorized& c) { +#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + return Vectorized(vnegq_f16(vfmsq_f16(c, a, b))); +#else + return a * b - c; +#endif +} + +template <> +Vectorized inline fnmsub( + const Vectorized& a, + const Vectorized& b, + const Vectorized& c) { +#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + return Vectorized(vnegq_f16(vfmaq_f16(c, a, b))); +#else + return -a * b - c; +#endif +} +#endif // !defined(C10_MOBILE) && defined(__aarch64__) + +} // namespace CPU_CAPABILITY +} // namespace at::vec + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/vec128/vec128_reduced_precision_common_neon.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/vec128/vec128_reduced_precision_common_neon.h new file mode 100644 index 0000000000000000000000000000000000000000..3c6e2cc667d373343de56c1dbb0bfa7c28d99f39 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/vec128/vec128_reduced_precision_common_neon.h @@ -0,0 +1,316 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// Shared code for bfloat16 and float16. + +// DO NOT DEFINE STATIC DATA IN THIS HEADER! +// See Note [Do not compile initializers with AVX] + +namespace at::vec { +inline namespace CPU_CAPABILITY { + +// Shared implementation between Vectorized and +// Vectorized. Uses CRTP to allow derived class +// customization. 
+template < + typename VecT, + typename ValueT, + template typename BlendRegs, + typename Derived> +struct Vectorized16 { + protected: + VecT values; + + public: + using value_type = ValueT; + using size_type = int; + static constexpr size_type size() { + static_assert(sizeof(VecT) == 8 * sizeof(value_type)); + return 8; + } + + protected: + Derived map2( + const Derived& second, + value_type (*const f)(value_type, value_type)) const { + __at_align__ value_type tmp_first[size()]; + __at_align__ value_type tmp_second[size()]; + static_cast(this)->store( + tmp_first); // store this to tmp_first + second.store(tmp_second); + for (const auto i : c10::irange(size())) { + tmp_first[i] = f(tmp_first[i], tmp_second[i]); + } + return Derived::loadu(tmp_first); + } + + public: + Vectorized16() = default; + Vectorized16(VecT v) : values(v) {} + + operator VecT() const { + return values; + } + + template + static Derived blend(const Derived& a, const Derived& b) { + Derived vec; + vec.values = BlendRegs < 0, + (mask & 0x01) != 0 > ::impl(a.values, b.values, vec.values); + vec.values = BlendRegs < 1, + (mask & 0x02) != 0 > ::impl(a.values, b.values, vec.values); + vec.values = BlendRegs < 2, + (mask & 0x04) != 0 > ::impl(a.values, b.values, vec.values); + vec.values = BlendRegs < 3, + (mask & 0x08) != 0 > ::impl(a.values, b.values, vec.values); + + vec.values = BlendRegs < 4, + (mask & 0x10) != 0 > ::impl(a.values, b.values, vec.values); + vec.values = BlendRegs < 5, + (mask & 0x20) != 0 > ::impl(a.values, b.values, vec.values); + vec.values = BlendRegs < 6, + (mask & 0x40) != 0 > ::impl(a.values, b.values, vec.values); + vec.values = BlendRegs < 7, + (mask & 0x80) != 0 > ::impl(a.values, b.values, vec.values); + + return vec; + } + + template + static Derived arange( + value_type base = 0, + step_t step = static_cast(1)) { + const Derived base_vec(base); + const Derived step_vec(step); + const Derived step_sizes( + value_type(0), + value_type(1), + value_type(2), + 
value_type(3), + value_type(4), + value_type(5), + value_type(6), + value_type(7)); + return fmadd(step_sizes, step_vec, base_vec); + } + + // Very slow implementation of indexing. + // Only required because vec256_qint refers to this. + // Once we specialize that implementation for ARM + // this should be removed. TODO (kimishpatel) + value_type operator[](int idx) const { + __at_align__ value_type tmp[size()]; + static_cast(this)->store(tmp); + return tmp[idx]; + } + + int zero_mask() const { + __at_align__ value_type tmp[size()]; + static_cast(this)->store(tmp); + int mask = 0; + for (int i = 0; i < size(); ++i) { + if (tmp[i] == 0) { + mask |= (1 << i); + } + } + return mask; + } + + Derived map(value_type (*const f)(value_type)) const { + __at_align__ value_type tmp[size()]; + static_cast(this)->store(tmp); + for (const auto i : c10::irange(size())) { + tmp[i] = f(tmp[i]); + } + return Derived::loadu(tmp); + } + + Derived angle() const { + auto zero = Derived(0); + auto pi = Derived(c10::pi); + auto tmp = + Derived::blendv(zero, pi, *static_cast(this) < zero); + return Derived::blendv( + tmp, + *static_cast(this), + static_cast(this)->isnan()); + } + Derived real() const { + return *this; + } + Derived imag() const { + return Derived(0); + } + Derived conj() const { + return *this; + } + + // Sleef does not support FP16/BF16, so many math functions are applied by + // converting to FP32, applying the math function, and then converting back to + // FP16/BF16. 
+ Derived acos() const { + return static_cast(this)->map_with_vec_float_method( + &Vectorized::acos); + } + Derived acosh() const { + return static_cast(this)->map_with_vec_float_method( + &Vectorized::acosh); + } + Derived asin() const { + return static_cast(this)->map_with_vec_float_method( + &Vectorized::asin); + } + Derived asinh() const { + return static_cast(this)->map_with_vec_float_method( + &Vectorized::asinh); + } + Derived atan() const { + return static_cast(this)->map_with_vec_float_method( + &Vectorized::atan); + } + Derived atanh() const { + return static_cast(this)->map_with_vec_float_method( + &Vectorized::atanh); + } + Derived atan2(const Derived& exp) const { + return static_cast(this)->map2_with_vec_float_method( + exp, &Vectorized::atan2); + } + Derived copysign(const Derived& sign) const { + return static_cast(this)->map2_with_vec_float_method( + sign, &Vectorized::copysign); + } + Derived erf() const { + return static_cast(this)->map_with_vec_float_method( + &Vectorized::erf); + } + Derived erfc() const { + return static_cast(this)->map_with_vec_float_method( + &Vectorized::erfc); + } + Derived erfinv() const { + return static_cast(this)->map_with_vec_float_method( + &Vectorized::erfinv); + } + Derived exp() const { + return static_cast(this)->map_with_vec_float_method( + &Vectorized::exp); + } + Derived exp2() const { + return static_cast(this)->map_with_vec_float_method( + &Vectorized::exp2); + } + Derived expm1() const { + return static_cast(this)->map_with_vec_float_method( + &Vectorized::expm1); + } + Derived exp_u20() const { + return static_cast(this)->map_with_vec_float_method( + &Vectorized::exp_u20); + } + Derived fexp_u20() const { + return static_cast(this)->map_with_vec_float_method( + &Vectorized::exp_u20); + } + Derived fmod(const Derived& q) const { + // This function is questionable with a conversion, so we use map2 + return map2(q, std::fmod); + } + Derived hypot(const Derived& b) const { + return 
static_cast(this)->map2_with_vec_float_method( + b, &Vectorized::hypot); + } + Derived i0() const { + return static_cast(this)->map_with_vec_float_method( + &Vectorized::i0); + } + Derived i0e() const { + return static_cast(this)->map_with_vec_float_method( + &Vectorized::i0e); + } + Derived digamma() const { + return static_cast(this)->map_with_vec_float_method( + &Vectorized::digamma); + } + Derived igamma(const Derived& x) const { + return static_cast(this)->map2_with_vec_float_method( + x, &Vectorized::igamma); + } + Derived igammac(const Derived& x) const { + return static_cast(this)->map2_with_vec_float_method( + x, &Vectorized::igammac); + } + Derived log() const { + return static_cast(this)->map_with_vec_float_method( + &Vectorized::log); + } + Derived log10() const { + return static_cast(this)->map_with_vec_float_method( + &Vectorized::log10); + } + Derived log1p() const { + return static_cast(this)->map_with_vec_float_method( + &Vectorized::log1p); + } + Derived log2() const { + return static_cast(this)->map_with_vec_float_method( + &Vectorized::log2); + } + Derived nextafter(const Derived& b) const { + // This function does not make sense with conversion, so we use map2 + return map2(b, std::nextafter); + } + Derived sin() const { + return static_cast(this)->map_with_vec_float_method( + &Vectorized::sin); + } + Derived sinh() const { + return static_cast(this)->map_with_vec_float_method( + &Vectorized::sinh); + } + Derived cos() const { + return static_cast(this)->map_with_vec_float_method( + &Vectorized::cos); + } + Derived cosh() const { + return static_cast(this)->map_with_vec_float_method( + &Vectorized::cosh); + } + Derived ceil() const { + // This function is questionable with a conversion, so we use map + return map(at::native::ceil_impl); + } + Derived floor() const { + // This function is questionable with a conversion, so we use map + return map(at::native::floor_impl); + } + Derived round() const { + // This function is questionable with a 
conversion, so we use map + return map(at::native::round_impl); + } + Derived tan() const { + return static_cast(this)->map_with_vec_float_method( + &Vectorized::tan); + } + Derived tanh() const { + return static_cast(this)->map_with_vec_float_method( + &Vectorized::tanh); + } + Derived lgamma() const { + return static_cast(this)->map_with_vec_float_method( + &Vectorized::lgamma); + } + Derived rsqrt() const { + return static_cast(this)->sqrt().reciprocal(); + } + Derived pow(const Derived& exp) const { + return static_cast(this)->map2_with_vec_float_method( + exp, &Vectorized::pow); + } +}; + +} // namespace CPU_CAPABILITY +} // namespace at::vec + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/vec_base.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/vec_base.h new file mode 100644 index 0000000000000000000000000000000000000000..3f06f3fc806c9056c0e8361a320b69c0d2003ba5 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/vec_base.h @@ -0,0 +1,1537 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +#if defined(__GNUC__) && __GNUC__ == 10 && __GNUC_MINOR__ <= 2 && \ + defined(__ARM_FEATURE_SVE) +// Workaround for https: //gcc.gnu.org/bugzilla/show_bug.cgi?id=117161 +#pragma GCC optimize("no-tree-vectorize") +#endif + +// DO NOT DEFINE STATIC DATA IN THIS HEADER! +// See Note [Do not compile initializers with AVX] +// +// Note [Do not compile initializers with AVX] +// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +// If you define a static initializer in this file, the initialization will use +// AVX instructions because these object files are compiled with AVX enabled. 
+// We need to avoid non-trivial global data in these architecture specific files +// because there's no way to guard the global initializers with CPU capability +// detection. +// +// See https://github.com/pytorch/pytorch/issues/37577 for an instance +// of this bug in the past. + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#if defined(__GNUC__) +#define __FORCE_INLINE __attribute__((always_inline)) inline +#elif defined(_MSC_VER) +#define __FORCE_INLINE __forceinline +#endif + +#if defined(_MSC_FULL_VER) +/* +https://learn.microsoft.com/en-us/cpp/overview/compiler-versions?view=msvc-170 +Use _MSC_FULL_VER to identify current compiler is msvc, +Windows llvm will not have this definition. +*/ +#define __msvc_cl__ +#endif + +// These macros helped us unify vec_base.h +#ifdef CPU_CAPABILITY_AVX512 +#if defined(__GNUC__) +#define __at_align__ __attribute__((aligned(64))) +#elif defined(_WIN32) +#define __at_align__ __declspec(align(64)) +#else +#define __at_align__ +#endif +#define VECTOR_WIDTH 64 +#define int_vector __m512i +#elif defined(__aarch64__) && \ + !defined(CPU_CAPABILITY_SVE) // CPU_CAPABILITY_AVX512 +// SVE code expects 256-vectors; leave that set for SVE? 
+#if defined(__GNUC__) +#define __at_align__ __attribute__((aligned(16))) +#elif defined(_WIN32) +#define __at_align__ __declspec(align(16)) +#else +#define __at_align__ +#endif +#define VECTOR_WIDTH 16 +#else // CPU_CAPABILITY_AVX512 +#if defined(__GNUC__) +#define __at_align__ __attribute__((aligned(32))) +#elif defined(_WIN32) +#define __at_align__ __declspec(align(32)) +#else +#define __at_align__ +#endif +#define VECTOR_WIDTH 32 +#define int_vector __m256i +#endif // CPU_CAPABILITY_AVX512 + +namespace at::vec { +// See Note [CPU_CAPABILITY namespace] +inline namespace CPU_CAPABILITY { +// at::Half and at::BFloat16 should be treated as floating point +template +struct is_floating_point + : std::integral_constant< + bool, + std::is_floating_point_v || std::is_same_v || + std::is_same_v> {}; + +template +constexpr bool is_floating_point_v = is_floating_point::value; + +template +struct is_reduced_floating_point + : std::integral_constant< + bool, + std::is_same_v || std::is_same_v> {}; + +template +constexpr bool is_reduced_floating_point_v = + is_reduced_floating_point::value; + +template +struct is_8bit_integer + : std::integral_constant< + bool, + std::is_same_v || std::is_same_v> { +}; + +template +constexpr bool is_8bit_integer_v = is_8bit_integer::value; + +template +struct int_of_size; + +#define DEFINE_INT_OF_SIZE(int_t) \ + template <> \ + struct int_of_size { \ + using type = int_t; \ + } + +DEFINE_INT_OF_SIZE(int64_t); +DEFINE_INT_OF_SIZE(int32_t); +DEFINE_INT_OF_SIZE(int16_t); +DEFINE_INT_OF_SIZE(int8_t); + +#undef DEFINE_INT_OF_SIZE + +template +using int_same_size_t = typename int_of_size::type; + +/** + * Detect at compile time whether Vectorized has an explicit + * specialization for T. (You are required to specialize this type + * whenever you specialize Vectorized). Useful for generic algorithms + * to decide whether to rely on a specialization being fast. 
For + * example, they might choose to handle reduced-precision floating + * point types directly if they're supported, or convert through float + * if not. + */ +#if defined(__s390x__) +template +#else +template +#endif +struct is_vec_specialized_for : std::bool_constant { +}; + +template +constexpr bool is_vec_specialized_for_v = is_vec_specialized_for::value; + +// NOTE: If you specialize Vectorized on a type, you must define all +// operations! You must also specialize is_vec_specialized_for for +// that type. + +// emulates Vectorized types +#if defined(__s390x__) +template +#else +template +#endif +struct Vectorized { + private: + __at_align__ T values[VECTOR_WIDTH / sizeof(T)]; + + public: + using value_type = T; + using size_type = int; + + static constexpr size_type kSize = VECTOR_WIDTH / sizeof(T); + static constexpr size_type size() { + return kSize; + } + Vectorized() : values{static_cast(0)} {} + Vectorized(T val) { + for (int i = 0; i != size(); i++) { + values[i] = val; + } + } + template < + typename... Args, + typename = std::enable_if_t<(sizeof...(Args) == size())>> + Vectorized(Args... 
vals) : values{vals...} {} + Vectorized(const T (&arr)[kSize]) { + std::memcpy(values, arr, sizeof(values)); + } + // This also implies const T& operator[](int idx) const + inline operator const T*() const { + return values; + } + // This also implies T& operator[](int idx) + inline operator T*() { + return values; + } + // Return the values as char* for type punning + auto as_bytes() const -> const char* { + return reinterpret_cast(values); + } + template + static Vectorized blend(const Vectorized& a, const Vectorized& b) { + int64_t mask = mask_; + Vectorized vector; + for (const auto i : c10::irange(size())) { + if (mask & 0x01) { + vector[i] = b[i]; + } else { + vector[i] = a[i]; + } + mask = mask >> 1; + } + return vector; + } +// Workaround for https: //gcc.gnu.org/bugzilla/show_bug.cgi?id=117001 +#if __GNUC__ <= 12 && !defined(__clang__) && defined(__ARM_FEATURE_SVE) + static Vectorized __attribute__((optimize("-fno-tree-loop-vectorize"))) + blendv( + const Vectorized& a, +#else + static Vectorized blendv( + const Vectorized& a, +#endif + const Vectorized& b, + const Vectorized& mask) { + Vectorized vector; + int_same_size_t buffer[size()]; + mask.store(buffer); + for (const auto i : c10::irange(size())) { + if (buffer[i] & 0x01) { + vector[i] = b[i]; + } else { + vector[i] = a[i]; + } + } + return vector; + } + template // step sometimes requires a higher precision type + // (e.g., T=int, step_t=double) + static Vectorized arange( + T base = static_cast(0), + step_t step = static_cast(1)) { + Vectorized vector; + for (const auto i : c10::irange(size())) { + vector.values[i] = base + i * step; + } + return vector; + } + static Vectorized set( + const Vectorized& a, + const Vectorized& b, + int64_t count = size()) { + Vectorized vector; + for (const auto i : c10::irange(size())) { + if (i < count) { + vector[i] = b[i]; + } else { + vector[i] = a[i]; + } + } + return vector; + } + static Vectorized loadu(const void* ptr) { + Vectorized vector; + 
std::memcpy(vector.values, ptr, VECTOR_WIDTH); + return vector; + } + static Vectorized loadu(const void* ptr, int64_t count) { + Vectorized vector; + std::memcpy(vector.values, ptr, count * sizeof(T)); + return vector; + } + static Vectorized loadu_one_fourth(const void* ptr) { + static_assert( + std::is_same_v || std::is_same_v, + "For byte types only"); + return Vectorized::loadu(ptr, 8); + } + + void store(void* ptr, int count = size()) const { + std::memcpy(ptr, values, count * sizeof(T)); + } + int zero_mask() const { + // returns an integer mask where all zero elements are translated to 1-bit + // and others are translated to 0-bit + int mask = 0; + for (int i = 0; i < size(); ++i) { + if (values[i] == static_cast(0)) { + mask |= (1 << i); + } + } + return mask; + } + Vectorized isnan() const { + Vectorized vector; + for (int64_t i = 0; i != size(); i++) { + if (_isnan(values[i])) { + std::memset(static_cast(vector.values + i), 0xFF, sizeof(T)); + } else { + std::memset(static_cast(vector.values + i), 0, sizeof(T)); + } + } + return vector; + } + bool has_inf_nan() const { + for (int64_t i = 0; i != size(); i++) { + if (_isnan(values[i]) || _isinf(values[i])) { + return true; + } + } + return false; + } +// MSVC versions between 14.36 and 14.42 has a loop unrolling bug on Windows +// Arm64 +// See +// https://developercommunity.visualstudio.com/t/MSVC-loop-unrolling-problem-194033813-/10720692 +#if defined(_WIN32) && defined(__aarch64__) && \ + ((_MSVC_VER >= 1936) && (_MSVC_VER <= 1942)) + Vectorized map(T (*const f)(T)) const { + Vectorized ret; + for (int64_t i = 0; i < size(); i++) { + ret[i] = f(values[i]); + if (++i < size()) + ret[i] = f(values[i]); + } + return ret; + } + T reduce(T (*const f)(T)) const { + T ret = 0; + for (int64_t i = 0; i < size(); i++) { + ret = f(ret, values[i]); + if (++i < size()) + ret = f(ret, values[i]); + } + return ret; + } +#else + Vectorized map(T (*const f)(T)) const { + Vectorized ret; + for (int64_t i = 0; i != 
size(); i++) { + ret[i] = f(values[i]); + } + return ret; + } + T reduce(T (*const f)(T)) const { + T ret = 0; + for (int64_t i = 0; i != size(); i++) { + ret = f(ret, values[i]); + } + return ret; + } +#endif + Vectorized map(T (*const f)(const T&)) const { + Vectorized ret; + for (int64_t i = 0; i != size(); i++) { + ret[i] = f(values[i]); + } + return ret; + } + T reduce(T (*const f)(const T&)) const { + T ret = 0; + for (int64_t i = 0; i != size(); i++) { + ret = f(ret, values[i]); + } + return ret; + } + template < + typename other_t_abs = T, + typename std::enable_if_t< + !is_floating_point_v && + !c10::is_complex::value, + int> = 0> + Vectorized abs() const { + // other_t_abs is for SFINAE and clarity. Make sure it is not changed. + static_assert(std::is_same_v, "other_t_abs must be T"); + return map([](T x) -> T { return x < static_cast(0) ? -x : x; }); + } + template < + typename float_t_abs = T, + typename std::enable_if_t, int> = 0> + Vectorized abs() const { + // float_t_abs is for SFINAE and clarity. Make sure it is not changed. + static_assert(std::is_same_v, "float_t_abs must be T"); + // Specifically deal with floating-point because the generic code above + // won't handle -0.0 (which should result in 0.0) properly. + return map([](T x) -> T { return std::abs(x); }); + } + template < + typename complex_t_abs = T, + typename std::enable_if_t::value, int> = 0> + Vectorized abs() const { + // complex_t_abs is for SFINAE and clarity. Make sure it is not changed. + static_assert(std::is_same_v, "complex_t_abs must be T"); + // Specifically map() does not perform the type conversion needed by abs. 
+ return map([](T x) { return static_cast(std::abs(x)); }); + } + + template < + typename other_t_sgn = T, + typename std::enable_if_t::value, int> = 0> + Vectorized sgn() const { + return map(at::native::sgn_impl); + } + + template < + typename other_t_angle = T, + typename std::enable_if_t::value, int> = + 0> + Vectorized angle() const { + // other_t_angle is for SFINAE and clarity. Make sure it is not changed. + static_assert(std::is_same_v, "other_t_angle must be T"); + return map(at::native::angle_impl); // compiler is unable to resolve the + // overload without + } + template < + typename complex_t_angle = T, + typename std::enable_if_t::value, int> = + 0> + Vectorized angle() const { + // complex_t_angle is for SFINAE and clarity. Make sure it is not changed. + static_assert( + std::is_same_v, "complex_t_angle must be T"); + return map([](T x) { return static_cast(std::arg(x)); }); + } + template < + typename other_t_real = T, + typename std::enable_if_t::value, int> = 0> + Vectorized real() const { + // other_t_real is for SFINAE and clarity. Make sure it is not changed. + static_assert(std::is_same_v, "other_t_real must be T"); + return *this; + } + template < + typename complex_t_real = T, + typename std::enable_if_t::value, int> = + 0> + Vectorized real() const { + // complex_t_real is for SFINAE and clarity. Make sure it is not changed. + static_assert( + std::is_same_v, "complex_t_real must be T"); + return map([](T x) { return static_cast(x.real()); }); + } + template < + typename other_t_imag = T, + typename std::enable_if_t::value, int> = 0> + Vectorized imag() const { + // other_t_imag is for SFINAE and clarity. Make sure it is not changed. + static_assert(std::is_same_v, "other_t_imag must be T"); + return Vectorized(0); + } + template < + typename complex_t_imag = T, + typename std::enable_if_t::value, int> = + 0> + Vectorized imag() const { + // complex_t_imag is for SFINAE and clarity. Make sure it is not changed. 
+ static_assert( + std::is_same_v, "complex_t_imag must be T"); + return map([](T x) { return static_cast(x.imag()); }); + } + template < + typename other_t_conj = T, + typename std::enable_if_t::value, int> = 0> + Vectorized conj() const { + // other_t_conj is for SFINAE and clarity. Make sure it is not changed. + static_assert(std::is_same_v, "other_t_conj must be T"); + return *this; + } + template < + typename complex_t_conj = T, + typename std::enable_if_t::value, int> = + 0> + Vectorized conj() const { + // complex_t_conj is for SFINAE and clarity. Make sure it is not changed. + static_assert( + std::is_same_v, "complex_t_conj must be T"); + return map([](T x) { return static_cast(std::conj(x)); }); + } + Vectorized acos() const { + return map(std::acos); + } + Vectorized acosh() const { + return map(std::acosh); + } + Vectorized asin() const { + return map(std::asin); + } + Vectorized asinh() const { + return map(std::asinh); + } + Vectorized atan() const { + return map(std::atan); + } + Vectorized atanh() const { + return map(std::atanh); + } + Vectorized atan2(const Vectorized& exp) const { + Vectorized ret; + for (const auto i : c10::irange(size())) { + ret[i] = std::atan2(values[i], exp[i]); + } + return ret; + } + template < + typename U = T, + typename std::enable_if_t, int> = 0> + Vectorized copysign(const Vectorized& sign) const { + Vectorized ret; + for (size_type i = 0; i < size(); i++) { + ret[i] = c10::copysign(values[i], sign[i]); + } + return ret; + } + Vectorized erf() const { + return map(std::erf); + } + Vectorized erfc() const { + return map(std::erfc); + } + Vectorized erfinv() const { + return map(calc_erfinv); + } + Vectorized exp() const { + return map(std::exp); + } + Vectorized exp2() const { + return map(exp2_impl); + } + Vectorized expm1() const { + return map(std::expm1); + } + Vectorized exp_u20() const { + return map(std::exp); + } + Vectorized fexp_u20() const { + return map(std::exp); + } + Vectorized frac() const { + return 
*this - this->trunc(); + } + template < + typename U = T, + typename std::enable_if_t, int> = 0> + Vectorized fmod(const Vectorized& q) const { + // U is for SFINAE purposes only. Make sure it is not changed. + static_assert(std::is_same_v, "U must be T"); + Vectorized ret; + for (const auto i : c10::irange(size())) { + ret[i] = std::fmod(values[i], q[i]); + } + return ret; + } + Vectorized log() const { + return map(std::log); + } + Vectorized log10() const { + return map(std::log10); + } + Vectorized log1p() const { + return map(std::log1p); + } + template < + typename other_t_log2 = T, + typename std::enable_if_t::value, int> = 0> + Vectorized log2() const { + // other_t_log2 is for SFINAE and clarity. Make sure it is not changed. + static_assert(std::is_same_v, "other_t_log2 must be T"); + return map(std::log2); + } + template < + typename complex_t_log2 = T, + typename std::enable_if_t::value, int> = + 0> + Vectorized log2() const { + // complex_t_log2 is for SFINAE and clarity. Make sure it is not changed. 
+ static_assert( + std::is_same_v, "complex_t_log2 must be T"); + const T log_2 = T(std::log(2.0)); + return Vectorized(map(std::log)) / Vectorized(log_2); + } + Vectorized ceil() const { + return map(at::native::ceil_impl); + } + Vectorized cos() const { + return map(std::cos); + } + Vectorized cosh() const { + return map(std::cosh); + } + Vectorized floor() const { + return map(at::native::floor_impl); + } + Vectorized hypot(const Vectorized& b) const { + Vectorized ret; + for (const auto i : c10::irange(size())) { + ret[i] = std::hypot(values[i], b[i]); + } + return ret; + } + Vectorized i0() const { + return map(calc_i0); + } + Vectorized i0e() const { + return map(calc_i0e); + } + Vectorized digamma() const { + return map(calc_digamma); + } + Vectorized igamma(const Vectorized& x) const { + Vectorized ret; + for (const auto i : c10::irange(size())) { + ret[i] = calc_igamma(values[i], x[i]); + } + return ret; + } + Vectorized igammac(const Vectorized& x) const { + Vectorized ret; + for (const auto i : c10::irange(size())) { + ret[i] = calc_igammac(values[i], x[i]); + } + return ret; + } + Vectorized neg() const { + // NB: the trailing return type is needed because we need to coerce the + // return value back to T in the case of unary operator- incurring a + // promotion + return map([](T x) -> T { return -x; }); + } + Vectorized nextafter(const Vectorized& b) const { + Vectorized ret; + for (const auto i : c10::irange(size())) { + ret[i] = std::nextafter(values[i], b[i]); + } + return ret; + } + Vectorized round() const { + // We do not use std::round because we would like to round midway numbers to + // the nearest even integer. 
+ return map(at::native::round_impl); + } + Vectorized sin() const { + return map(std::sin); + } + Vectorized sinh() const { + return map(std::sinh); + } + Vectorized tan() const { + return map(std::tan); + } + Vectorized tanh() const { + return map(std::tanh); + } + Vectorized trunc() const { + return map(at::native::trunc_impl); + } + Vectorized lgamma() const { + return map(std::lgamma); + } + Vectorized sqrt() const { + return map(std::sqrt); + } + Vectorized reciprocal() const { + return map([](T x) { return (T)1 / x; }); + } + Vectorized rsqrt() const { + return map([](T x) { return (T)1 / std::sqrt(x); }); + } + Vectorized pow(const Vectorized& exp) const { + Vectorized ret; + for (const auto i : c10::irange(size())) { + ret[i] = std::pow(values[i], exp[i]); + } + return ret; + } + T reduce_add() const { + return reduce([](T x, T y) -> T { return x + y; }); + } + T reduce_max() const { + return reduce(std::max); + } + + private: + template + inline Vectorized binary_pred(const Vectorized& other, Op op) const { + // All bits are set to 1 if the pred is true, otherwise 0. 
+ Vectorized vector; + for (int64_t i = 0; i != size(); i++) { + if (op(values[i], other.values[i])) { + std::memset(static_cast(vector.values + i), 0xFF, sizeof(T)); + } else { + std::memset(static_cast(vector.values + i), 0, sizeof(T)); + } + } + return vector; + } + + public: + Vectorized operator==(const Vectorized& other) const { + return binary_pred(other, std::equal_to()); + } + Vectorized operator!=(const Vectorized& other) const { + return binary_pred(other, std::not_equal_to()); + } + Vectorized operator>=(const Vectorized& other) const { + return binary_pred(other, std::greater_equal()); + } + Vectorized operator<=(const Vectorized& other) const { + return binary_pred(other, std::less_equal()); + } + Vectorized operator>(const Vectorized& other) const { + return binary_pred(other, std::greater()); + } + Vectorized operator<(const Vectorized& other) const { + return binary_pred(other, std::less()); + } + + private: + template + inline Vectorized binary_pred_bool(const Vectorized& other, Op op) + const { + // 1 if the pred is true, otherwise 0. 
+ Vectorized vector; + for (int i = 0; i != size(); ++i) { + vector[i] = static_cast(op(values[i], other.values[i])); + } + return vector; + } + + public: + Vectorized eq(const Vectorized& other) const { + return binary_pred_bool(other, std::equal_to()); + } + Vectorized ne(const Vectorized& other) const { + return binary_pred_bool(other, std::not_equal_to()); + } + Vectorized gt(const Vectorized& other) const { + return binary_pred_bool(other, std::greater()); + } + Vectorized ge(const Vectorized& other) const { + return binary_pred_bool(other, std::greater_equal()); + } + Vectorized lt(const Vectorized& other) const { + return binary_pred_bool(other, std::less()); + } + Vectorized le(const Vectorized& other) const { + return binary_pred_bool(other, std::less_equal()); + } +}; + +template +Vectorized inline operator-(const Vectorized& a) { + return a.neg(); +} + +// There is an implicit conversion that would make this work if +// these operators weren't template functions, but they are template +// functions (and can't be moved to be non-member friends defined in +// the class body as suggested in +// https://stackoverflow.com/questions/9787593/implicit-type-conversion-with-template/9788255#9788255 +// because we have a lot of disparate specializations of +// Vectorized). So, just explicitly make scalars work. 
+#define VECTORIZED_SUPPORT_SCALARS_FOR_BINARY_FUNC(name) \ + template \ + Vectorized inline name(const Vectorized& a, T b) { \ + return name(a, Vectorized(b)); \ + } \ + template \ + Vectorized inline name(T a, const Vectorized& b) { \ + return name(Vectorized(a), b); \ + } +#define VECTORIZED_SUPPORT_SCALARS_FOR_BINARY_OP(op) \ + VECTORIZED_SUPPORT_SCALARS_FOR_BINARY_FUNC(operator op) + +template +Vectorized inline operator+(const Vectorized& a, const Vectorized& b) { + Vectorized c; + for (int i = 0; i != Vectorized::size(); i++) { + c[i] = a[i] + b[i]; + } + return c; +} + +VECTORIZED_SUPPORT_SCALARS_FOR_BINARY_OP(+) + +template +Vectorized inline operator-(const Vectorized& a, const Vectorized& b) { + Vectorized c; + for (int i = 0; i != Vectorized::size(); i++) { + c[i] = a[i] - b[i]; + } + return c; +} + +VECTORIZED_SUPPORT_SCALARS_FOR_BINARY_OP(-) + +template +Vectorized inline operator*(const Vectorized& a, const Vectorized& b) { + Vectorized c; + for (int i = 0; i != Vectorized::size(); i++) { + c[i] = a[i] * b[i]; + } + return c; +} + +VECTORIZED_SUPPORT_SCALARS_FOR_BINARY_OP(*) + +template +Vectorized inline operator/(const Vectorized& a, const Vectorized& b) + __ubsan_ignore_float_divide_by_zero__ { + Vectorized c; + for (int i = 0; i != Vectorized::size(); i++) { + c[i] = a[i] / b[i]; + } + return c; +} + +VECTORIZED_SUPPORT_SCALARS_FOR_BINARY_OP(/) + +template , int> = 0> +Vectorized inline operator%(const Vectorized& a, const Vectorized& b) + __ubsan_ignore_float_divide_by_zero__ { + return a - a / b * b; +} + +VECTORIZED_SUPPORT_SCALARS_FOR_BINARY_OP(%) + +template +Vectorized inline operator||( + const Vectorized& a, + const Vectorized& b) { + Vectorized c; + for (int i = 0; i != Vectorized::size(); i++) { + c[i] = a[i] || b[i]; + } + return c; +} + +VECTORIZED_SUPPORT_SCALARS_FOR_BINARY_OP(||) + +// Implements the IEEE 754 201X `maximum` operation, which propagates NaN if +// either input is a NaN. 
+template < + class T, + typename std::enable_if_t::value, int> = 0> +Vectorized inline maximum(const Vectorized& a, const Vectorized& b) { + Vectorized c; + for (int i = 0; i != Vectorized::size(); i++) { + c[i] = (a[i] > b[i]) ? a[i] : b[i]; + if (_isnan(a[i])) { + // If either input is NaN, propagate a NaN. + // NOTE: The case where b[i] was NaN is handled correctly by the naive + // ternary operator above. + c[i] = a[i]; + } + } + return c; +} + +template < + class T, + typename std::enable_if_t::value, int> = 0> +Vectorized inline maximum(const Vectorized& a, const Vectorized& b) { + Vectorized c; + for (int i = 0; i != Vectorized::size(); i++) { + c[i] = (std::abs(a[i]) > std::abs(b[i])) ? a[i] : b[i]; + if (_isnan(a[i])) { + // If either input is NaN, propagate a NaN. + // NOTE: The case where b[i] was NaN is handled correctly by the naive + // ternary operator above. + c[i] = a[i]; + } + } + return c; +} + +VECTORIZED_SUPPORT_SCALARS_FOR_BINARY_FUNC(maximum) + +// Implements the IEEE 754 201X `minimum` operation, which propagates NaN if +// either input is a NaN. +template < + class T, + typename std::enable_if_t::value, int> = 0> +Vectorized inline minimum(const Vectorized& a, const Vectorized& b) { + Vectorized c; + for (int i = 0; i != Vectorized::size(); i++) { + c[i] = (a[i] < b[i]) ? a[i] : b[i]; + if (_isnan(a[i])) { + // If either input is NaN, propagate a NaN. + // NOTE: The case where b[i] was NaN is handled correctly by the naive + // ternary operator above. + c[i] = a[i]; + } + } + return c; +} + +template < + class T, + typename std::enable_if_t::value, int> = 0> +Vectorized inline minimum(const Vectorized& a, const Vectorized& b) { + Vectorized c; + for (int i = 0; i != Vectorized::size(); i++) { + c[i] = (std::abs(a[i]) < std::abs(b[i])) ? a[i] : b[i]; + if (_isnan(a[i])) { + // If either input is NaN, propagate a NaN. + // NOTE: The case where b[i] was NaN is handled correctly by the naive + // ternary operator above. 
+ c[i] = a[i]; + } + } + return c; +} + +VECTORIZED_SUPPORT_SCALARS_FOR_BINARY_FUNC(minimum) + +template < + class T, + typename std::enable_if_t::value, int> = 0> +Vectorized inline clamp( + const Vectorized& a, + const Vectorized& min_vec, + const Vectorized& max_vec) { + Vectorized c; + for (int i = 0; i != Vectorized::size(); i++) { + c[i] = std::min(std::max(a[i], min_vec[i]), max_vec[i]); + } + return c; +} + +#define VECTORIZED_SUPPORT_SCALARS_FOR_TERNARY_FUNC(name) \ + template \ + Vectorized inline name( \ + const Vectorized& a, const Vectorized& b, T c) { \ + return name(a, b, Vectorized(c)); \ + } \ + \ + template \ + Vectorized inline name( \ + const Vectorized& a, T b, const Vectorized& c) { \ + return name(a, Vectorized(b), c); \ + } \ + \ + template \ + Vectorized inline name(const Vectorized& a, T b, T c) { \ + return name(a, Vectorized(b), Vectorized(c)); \ + } \ + \ + template \ + Vectorized inline name( \ + T a, const Vectorized& b, const Vectorized& c) { \ + return name(Vectorized(a), b, c); \ + } \ + \ + template \ + Vectorized inline name(T a, const Vectorized& b, T c) { \ + return name(Vectorized(a), b, Vectorized(c)); \ + } \ + \ + template \ + Vectorized inline name(T a, T b, const Vectorized& c) { \ + return name(Vectorized(a), Vectorized(b), c); \ + } + +VECTORIZED_SUPPORT_SCALARS_FOR_TERNARY_FUNC(clamp) + +template < + class T, + typename std::enable_if_t::value, int> = 0> +Vectorized inline clamp_max( + const Vectorized& a, + const Vectorized& max_vec) { + Vectorized c; + for (int i = 0; i != Vectorized::size(); i++) { + c[i] = a[i] > max_vec[i] ? max_vec[i] : a[i]; + } + return c; +} + +VECTORIZED_SUPPORT_SCALARS_FOR_BINARY_FUNC(clamp_max) + +template < + class T, + typename std::enable_if_t::value, int> = 0> +Vectorized inline clamp_min( + const Vectorized& a, + const Vectorized& min_vec) { + Vectorized c; + for (int i = 0; i != Vectorized::size(); i++) { + c[i] = a[i] < min_vec[i] ? 
min_vec[i] : a[i]; + } + return c; +} + +VECTORIZED_SUPPORT_SCALARS_FOR_BINARY_FUNC(clamp_min) + +struct Vectorizedi; + +#if defined(CPU_CAPABILITY_AVX2) || defined(CPU_CAPABILITY_AVX512) +template +static inline Vectorized bitwise_binary_op( + const Vectorized& a, + const Vectorized& b, + Op op) { + int_vector buffer; +#if defined(CPU_CAPABILITY_AVX2) + int_vector a_buffer = + _mm256_load_si256(reinterpret_cast((const T*)a)); + int_vector b_buffer = + _mm256_load_si256(reinterpret_cast((const T*)b)); +#elif defined(CPU_CAPABILITY_AVX512) + int_vector a_buffer = + _mm512_load_si512(reinterpret_cast((const T*)a)); + int_vector b_buffer = + _mm512_load_si512(reinterpret_cast((const T*)b)); +#endif + buffer = op(a_buffer, b_buffer); + __at_align__ T results[Vectorized::size()]; + +#if defined(CPU_CAPABILITY_AVX2) + _mm256_store_si256(reinterpret_cast(results), buffer); +#elif defined(CPU_CAPABILITY_AVX512) + _mm512_store_si512(reinterpret_cast(results), buffer); +#endif + return Vectorized::loadu(results); +} + +template < + class T, + typename std::enable_if_t< + !std::is_base_of>::value, + int> = 0> +inline Vectorized operator&(const Vectorized& a, const Vectorized& b) { + // We enclose _mm512_and_si512 or _mm256_and_si256 with lambda because it is + // always_inline +#if defined(CPU_CAPABILITY_AVX2) + return bitwise_binary_op( + a, b, [](int_vector a, int_vector b) { return _mm256_and_si256(a, b); }); +#elif defined(CPU_CAPABILITY_AVX512) + return bitwise_binary_op( + a, b, [](int_vector a, int_vector b) { return _mm512_and_si512(a, b); }); +#endif +} +template < + class T, + typename std::enable_if_t< + !std::is_base_of>::value, + int> = 0> +inline Vectorized operator|(const Vectorized& a, const Vectorized& b) { + // We enclose _mm512_or_si512 or _mm256_or_si256 with lambda because it is + // always_inline +#if defined(CPU_CAPABILITY_AVX2) + return bitwise_binary_op( + a, b, [](int_vector a, int_vector b) { return _mm256_or_si256(a, b); }); +#elif 
defined(CPU_CAPABILITY_AVX512) + return bitwise_binary_op( + a, b, [](int_vector a, int_vector b) { return _mm512_or_si512(a, b); }); +#endif +} +template < + class T, + typename std::enable_if_t< + !std::is_base_of>::value, + int> = 0> +inline Vectorized operator^(const Vectorized& a, const Vectorized& b) { + // We enclose _mm512_xor_si512 or _mm256_xor_si256 with lambda because it is + // always_inline +#if defined(CPU_CAPABILITY_AVX2) + return bitwise_binary_op( + a, b, [](int_vector a, int_vector b) { return _mm256_xor_si256(a, b); }); +#elif defined(CPU_CAPABILITY_AVX512) + return bitwise_binary_op( + a, b, [](int_vector a, int_vector b) { return _mm512_xor_si512(a, b); }); +#endif +} + +#else + +template +auto load(char const* data) -> T { + T ret; + std::memcpy(&ret, data, sizeof(ret)); + return ret; +} + +template +static inline Vectorized bitwise_binary_op( + const Vectorized& a, + const Vectorized& b, + Op op) { + static constexpr uint32_t element_no = VECTOR_WIDTH / sizeof(intmax_t); + __at_align__ intmax_t buffer[element_no]; + static_assert( + VECTOR_WIDTH % sizeof(intmax_t) == 0, + "VECTOR_WIDTH not a multiple of sizeof(intmax_t)"); + static_assert( + sizeof(buffer) == sizeof(Vectorized), + "sizeof(buffer) must match sizeof(Vectorized)"); + // We should be using memcpy in order to respect the strict aliasing rule + // see: https://github.com/pytorch/pytorch/issues/66119 + // Using char* is defined in the C11 standard 6.5 Expression paragraph 7 + // (http://www.open-std.org/jtc1/sc22/wg14/www/docs/n1570.pdf) + const auto* a_data = a.as_bytes(); + const auto* b_data = b.as_bytes(); + // load each intmax_t chunk and process; increase pointers by sizeof(intmax_t) + for (auto& out : buffer) { + out = op(load(a_data), load(b_data)); + a_data += sizeof(intmax_t); + b_data += sizeof(intmax_t); + } + assert(a_data == a.as_bytes() + sizeof(a)); + assert(b_data == b.as_bytes() + sizeof(b)); + return Vectorized::loadu(buffer); +} + +template < + class T, + 
typename std:: + enable_if_t>, int> = 0> +inline Vectorized operator&(const Vectorized& a, const Vectorized& b) { + return bitwise_binary_op(a, b, std::bit_and()); +} +template < + class T, + typename std:: + enable_if_t>, int> = 0> +inline Vectorized operator|(const Vectorized& a, const Vectorized& b) { + return bitwise_binary_op(a, b, std::bit_or()); +} +template < + class T, + typename std:: + enable_if_t>, int> = 0> +inline Vectorized operator^(const Vectorized& a, const Vectorized& b) { + return bitwise_binary_op(a, b, std::bit_xor()); +} + +#endif // defined(CPU_CAPABILITY_AVX2) || defined(CPU_CAPABILITY_AVX512) + +VECTORIZED_SUPPORT_SCALARS_FOR_BINARY_OP(&) +VECTORIZED_SUPPORT_SCALARS_FOR_BINARY_OP(|) +VECTORIZED_SUPPORT_SCALARS_FOR_BINARY_OP(^) + +template < + class T, + typename std:: + enable_if_t>, int> = 0> +inline Vectorized operator~(const Vectorized& a) { + using int_t = int_same_size_t; + Vectorized ones(c10::bit_cast((int_t)(~(int_t)0))); // All bits are 1 + return a ^ ones; +} + +template +Vectorized inline operator<<( + const Vectorized& a, + const Vectorized& b) { + constexpr T max_shift = sizeof(T) * CHAR_BIT; + Vectorized c; + for (int i = 0; i != Vectorized::size(); i++) { + T shift = b[i]; + if ((static_cast>(shift) < 0) || + (shift >= max_shift)) { + c[i] = 0; + } else { + c[i] = static_cast>(a[i]) << shift; + } + } + return c; +} + +template +Vectorized inline operator>>( + const Vectorized& a, + const Vectorized& b) { + // right shift value to retain sign bit for signed and no bits for unsigned + constexpr T max_shift = sizeof(T) * CHAR_BIT - std::is_signed_v; + Vectorized c; + for (int i = 0; i != Vectorized::size(); i++) { + T shift = b[i]; + if ((static_cast>(shift) < 0) || + (shift >= max_shift)) { + c[i] = a[i] >> max_shift; + } else { + c[i] = a[i] >> shift; + } + } + return c; +} + +template +inline Vectorized& operator+=(Vectorized& a, const Vectorized& b) { + a = a + b; + return a; +} +template +inline Vectorized& 
operator-=(Vectorized& a, const Vectorized& b) { + a = a - b; + return a; +} +template +inline Vectorized& operator/=(Vectorized& a, const Vectorized& b) { + a = a / b; + return a; +} +template +inline Vectorized& operator%=(Vectorized& a, const Vectorized& b) { + a = a % b; + return a; +} +template +inline Vectorized& operator*=(Vectorized& a, const Vectorized& b) { + a = a * b; + return a; +} + +template +inline Vectorized& operator<<=(Vectorized& a, const Vectorized& b) { + a = a << b; + return a; +} + +template +inline Vectorized& operator>>=(Vectorized& a, const Vectorized& b) { + a = a >> b; + return a; +} + +template +inline Vectorized fmadd( + const Vectorized& a, + const Vectorized& b, + const Vectorized& c) { + return a * b + c; +} + +VECTORIZED_SUPPORT_SCALARS_FOR_TERNARY_FUNC(fmadd) + +template +inline Vectorized fnmadd( + const Vectorized& a, + const Vectorized& b, + const Vectorized& c) { + return -(a * b) + c; +} + +VECTORIZED_SUPPORT_SCALARS_FOR_TERNARY_FUNC(fnmadd) + +template +inline Vectorized fmsub( + const Vectorized& a, + const Vectorized& b, + const Vectorized& c) { + return a * b - c; +} + +VECTORIZED_SUPPORT_SCALARS_FOR_TERNARY_FUNC(fmsub) + +template +inline Vectorized fnmsub( + const Vectorized& a, + const Vectorized& b, + const Vectorized& c) { + return -(a * b) - c; +} + +VECTORIZED_SUPPORT_SCALARS_FOR_TERNARY_FUNC(fnmsub) + +template +Vectorized inline operator&&( + const Vectorized& a, + const Vectorized& b) { + Vectorized ret; + for (int i = 0; i != Vectorized::size(); i++) { + ret[i] = a[i] && b[i]; + } + return ret; +} + +VECTORIZED_SUPPORT_SCALARS_FOR_BINARY_OP(&&) + +template +std::enable_if_t< + scale == 1 || scale == 2 || scale == 4 || scale == 8, + Vectorized< + T>> inline gather(T const* base_addr, const Vectorized>& vindex) { + static constexpr int size = Vectorized::size(); + int_same_size_t index_arr[size]; + vindex.store(static_cast(index_arr)); + T buffer[size]; + for (const auto i : c10::irange(size)) { + buffer[i] = 
base_addr[index_arr[i] * scale / sizeof(T)]; + } + return Vectorized::loadu(static_cast(buffer)); +} + +template +std:: + enable_if_t> inline mask_gather( + const Vectorized& src, + T const* base_addr, + const Vectorized>& vindex, + Vectorized& mask) { + static constexpr int size = Vectorized::size(); + T src_arr[size]; + int_same_size_t mask_arr[size]; // use int type so we can logical and + int_same_size_t index_arr[size]; + src.store(static_cast(src_arr)); + mask.store(static_cast(mask_arr)); + vindex.store(static_cast(index_arr)); + T buffer[size]; + for (const auto i : c10::irange(size)) { + if (mask_arr[i] & 0x01) { // check highest bit + buffer[i] = base_addr[index_arr[i] * scale / sizeof(T)]; + } else { + buffer[i] = src_arr[i]; + } + } + mask = Vectorized(static_cast(0)); // "zero out" mask + return Vectorized::loadu(static_cast(buffer)); +} + +// Cast a given vector to another type without changing the bits representation. +// So a Vectorized of 512 bits containing all ones can be cast to a +// Vectorized of 512 bits containing all ones (i.e., eight negative +// 1s). A Vec of 256 bits containing all ones can be cast to a +// Vec of 256 bits containing all ones (i.e., four negative 1s). +// There is a struct here because we don't have static_if and I can't +// partially specialize a templated function. 
+template +struct CastImpl { + static inline Vectorized apply(const Vectorized& src) { + src_t src_arr[Vectorized::size()]; + src.store(static_cast(src_arr)); + return Vectorized::loadu(static_cast(src_arr)); + } +}; + +template +struct CastImpl { + static inline Vectorized apply(const Vectorized& src) { + return src; + } +}; + +template +inline Vectorized cast(const Vectorized& src) { + return CastImpl::apply(src); +} + +template > +inline Vectorized convert_to_int_of_same_size( + const Vectorized& src) { + static_assert(sizeof(T) == sizeof(IntType)); + static constexpr int size = Vectorized::size(); + + std::array src_arr = {}; + src.store(static_cast(src_arr.data())); + std::array buffer; + std::transform( + src_arr.cbegin(), src_arr.cend(), buffer.begin(), [](const T& x) { + return static_cast(x); + }); + return Vectorized::loadu(static_cast(buffer.data())); +} + +template > +inline Vectorized convert_to_fp_of_same_size( + const Vectorized& src) { + static_assert(sizeof(T) == sizeof(IntType)); + static constexpr int size = Vectorized::size(); + + std::array src_arr; + src.store(static_cast(src_arr.data())); + std::array buffer; + std::transform( + src_arr.cbegin(), src_arr.cend(), buffer.begin(), [](const IntType& x) { + return static_cast(x); + }); + return Vectorized::loadu(static_cast(buffer.data())); +} + +// clang-format off +// Example inputs for AVX512: +// a Vectorized = {a0, b0, a1, b1, a2, b2, a3, b3, a4, b4, a5, b5, a6, b6, a7, b7} +// b Vectorized = {a8, b8, a9, b9, a10, b10, a11, b11, a12, b12, a13, b13, a14, b14, a15, b15} +// returns: +// Vectorized = {a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11, a12, a13, a14, a15} +// Vectorized = {b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, b10, b11, b12, b13, b14, b15} +// Example inputs for AVX2: a Vectorized = {a0, b0, a1, b1, a2, b2, a3, b3} +// b Vectorized = {a4, b4, a5, b5, a6, b6, a7, b7} +// returns: Vectorized = {a0, a1, a2, a3, a4, a5, a6, a7} +// Vectorized = {b0, b1, b2, b3, b4, b5, b6, b7} +// 
clang-format on +template +inline std::enable_if_t< + Vectorized::size() % 2 == 0, + std::pair, Vectorized>> +deinterleave2(const Vectorized& a, const Vectorized& b) { + static constexpr int size = Vectorized::size(); + static constexpr int half_size = size / 2; + T a_arr[size]; + T b_arr[size]; + T buffer1[size]; + T buffer2[size]; + a.store(static_cast(a_arr)); + b.store(static_cast(b_arr)); + for (const auto i : c10::irange(half_size)) { + buffer1[i] = a_arr[i * 2]; + buffer1[half_size + i] = b_arr[i * 2]; + buffer2[i] = a_arr[i * 2 + 1]; + buffer2[half_size + i] = b_arr[i * 2 + 1]; + } + return std::make_pair( + Vectorized::loadu(static_cast(buffer1)), + Vectorized::loadu(static_cast(buffer2))); +} + +VECTORIZED_SUPPORT_SCALARS_FOR_BINARY_FUNC(deinterleave2) + +// clang-format off +// inverse operation of deinterleave2 +// Example inputs for AVX512: +// a Vectorized = {a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11, a12, a13, a14, a15} +// b Vectorized = {b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, b10, b11, b12, b13, b14, b15} +// returns, for AVX512: +// Vectorized = {a0, b0, a1, b1, a2, b2, a3, b3, a4, b4, a5, b5, a6, b6, a7, b7} +// Vectorized = {a8, b8, a9, b9, a10, b10, a11, b11, a12, b12, a13, b13, a14, b14, a15, b15} +// Example inputs for AVX2 : a Vectorized = {a0, a1, a2, a3, a4, a5, a6, a7} +// b Vectorized = {b0, b1, b2, b3, b4, b5, b6, b7} +// returns: Vectorized = {a0, b0, a1, b1, a2, b2, a3, b3} +// Vectorized = {a4, b4, a5, b5, a6, b6, a7, b7} +// clang-format on +template +inline std::enable_if_t< + Vectorized::size() % 2 == 0, + std::pair, Vectorized>> +interleave2(const Vectorized& a, const Vectorized& b) { + static constexpr int size = Vectorized::size(); + static constexpr int half_size = size / 2; + T a_arr[size]; + T b_arr[size]; + T buffer1[size]; + T buffer2[size]; + a.store(static_cast(a_arr)); + b.store(static_cast(b_arr)); + for (const auto i : c10::irange(half_size)) { + buffer1[i * 2] = a_arr[i]; + buffer1[i * 2 + 1] = b_arr[i]; + 
buffer2[i * 2] = a_arr[half_size + i]; + buffer2[i * 2 + 1] = b_arr[half_size + i]; + } + return std::make_pair( + Vectorized::loadu(static_cast(buffer1)), + Vectorized::loadu(static_cast(buffer2))); +} + +VECTORIZED_SUPPORT_SCALARS_FOR_BINARY_FUNC(interleave2) + +#undef VECTORIZED_SUPPORT_SCALARS_FOR_BINARY_FUNC +#undef VECTORIZED_SUPPORT_SCALARS_FOR_BINARY_OP +#undef VECTORIZED_SUPPORT_SCALARS_FOR_TERNARY_FUNC + +template +inline void convert(const src_T* src, dst_T* dst, int64_t n) { +#ifndef _MSC_VER +#pragma unroll +#endif + for ([[maybe_unused]] const auto i : c10::irange(n)) { + *dst = c10::convert(c10::load(src)); + src++; + dst++; + } +} + +template +inline Vectorized flip(const Vectorized& data) { + static constexpr int size = Vectorized::size(); + T output[size]; + T buffer[size]; + data.store(static_cast(buffer)); + for (const auto i : c10::irange(size)) { + output[i] = buffer[size - i - 1]; + } + return Vectorized::loadu(static_cast(output)); +} + +// Transpose the `src` buffer of type `T` and size (M,N) into the `dst` buffer. +// `ld_src` is the leading dimension of `src` and `ld_dst` is the leading +// dimension of `dst`. +template +inline void transpose_mxn( + const T* src, + int64_t ld_src, + T* dst, + int64_t ld_dst, + int M, + int N) { + for (int i = 0; i < M; i++) { + for (int j = 0; j < N; j++) { + dst[j * ld_dst + i] = src[i * ld_src + j]; + } + } +} + +template +inline void transpose_mxn( + const T* src, + int64_t ld_src, + T* dst, + int64_t ld_dst) { + transpose_mxn(src, ld_src, dst, ld_dst, M, N); +} + +} // namespace CPU_CAPABILITY +} // namespace at::vec + +// additional headers for more operations that depend on vec_base +#include +#include +#include + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/vec_convert.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/vec_convert.h new file mode 100644 index 0000000000000000000000000000000000000000..bdeeb6aae83470a41f9a238a726e74e6d68e80c3 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/vec_convert.h @@ -0,0 +1,84 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include +#include + +namespace at::vec { +inline namespace CPU_CAPABILITY { + +template < + typename dst_t, + int dst_n, + typename src_t, + int src_n, + typename Enabled = void> +struct VecConvert { + static inline VectorizedN apply( + const VectorizedN& src) { + constexpr int count = std::min( + VectorizedN::size(), VectorizedN::size()); + __at_align__ src_t src_buf[VectorizedN::size()]; + src.store(src_buf); + __at_align__ dst_t dst_buf[VectorizedN::size()]; + for (int i = 0; i < count; i++) { + dst_buf[i] = static_cast(src_buf[i]); + } + return VectorizedN::loadu(dst_buf, count); + } +}; + +template +inline std::enable_if_t, Vectorized> convert( + const Vectorized& src) { + return src; +} + +template +inline std::enable_if_t, Vectorized> +convert(const Vectorized& src) { + return VecConvert::apply(src); +} + +template < + typename dst_t, + int dst_n, + typename src_t, + int src_n, + std::enable_if_t = 0> +inline VectorizedN convert(const VectorizedN& src) { + return VecConvert::apply(src); +} + +template < + typename dst_t, + int dst_n, + typename src_t, + int src_n, + bool keep = false, + std::enable_if_t = 0> +inline std::conditional_t, Vectorized> +convert(const VectorizedN& src) { + return VecConvert::apply(src); +} + +} // namespace CPU_CAPABILITY + +template < + typename scalar_t, + typename std::enable_if_t, int> = 0> +inline std::tuple, 
Vectorized> convert_to_float( + const Vectorized&); + +template < + typename scalar_t, + typename std::enable_if_t, int> = 0> +inline Vectorized convert_from_float( + const Vectorized&, + const Vectorized&); + +} // namespace at::vec + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/vec_half.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/vec_half.h new file mode 100644 index 0000000000000000000000000000000000000000..0d5395ca15d6fbeaaf1c46b16bca2fd3382c9f8c --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/vec_half.h @@ -0,0 +1,123 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include +#include + +#include + +namespace at::vec { +// See Note [CPU_CAPABILITY namespace] +inline namespace CPU_CAPABILITY { + +// Transpose a [2, 32] matrix to [32, 2] +// Note: the output leading dimension should be 2, +// that is, the output must be contiguous +template > +static inline void transpose_pad_2x32_block( + const scalar_t* src, + scalar_t* dst, + int64_t ld_src, + int krem = 2, + int nrem = 32) { +#if defined(CPU_CAPABILITY_AVX512) + __m512i r0, r1; + __m512i d0, d1; + // load + if (nrem < 32) { + __mmask32 mask_krem_v = (1LL << nrem) - 1; + r0 = _mm512_maskz_loadu_epi16(mask_krem_v, src); + // if krem is not 2, pad with zeros + if (krem == 2) { + r1 = _mm512_maskz_loadu_epi16(mask_krem_v, src + ld_src); + } else { + r1 = _mm512_setzero_si512(); + } + } else { + r0 = _mm512_loadu_si512(reinterpret_cast(src)); + if (krem == 2) { + r1 = _mm512_loadu_si512(reinterpret_cast(src + ld_src)); + } else { + r1 = _mm512_setzero_si512(); + } + } + // transpose + d0 = _mm512_unpacklo_epi16(r0, r1); + d1 = 
_mm512_unpackhi_epi16(r0, r1); + r0 = _mm512_shuffle_i32x4(d0, d1, 0x88); + r1 = _mm512_shuffle_i32x4(d0, d1, 0xdd); + d0 = _mm512_shuffle_i32x4(r0, r1, 0x88); + d1 = _mm512_shuffle_i32x4(r0, r1, 0xdd); + + // store + if (nrem < 16) { + __mmask32 mask_rem_v = (1LL << (nrem * 2)) - 1; + _mm512_mask_storeu_epi16(dst, mask_rem_v, d0); + } else if (nrem == 16) { + _mm512_storeu_si512(reinterpret_cast<__m512i*>(dst), d0); + } else if (nrem < 32) { + __mmask32 mask_rem_v = (1LL << (nrem * 2 - 32)) - 1; + _mm512_storeu_si512(reinterpret_cast<__m512i*>(dst), d0); + _mm512_mask_storeu_epi16( + reinterpret_cast<__m512i*>(dst + 32), mask_rem_v, d1); + } else { + // normal store + _mm512_storeu_si512(reinterpret_cast<__m512i*>(dst), d0); + _mm512_storeu_si512(reinterpret_cast<__m512i*>(dst + 32), d1); + } +#else + TORCH_CHECK( + false, + "transpose_pad_2x32_block is only supported when avx512 is supported") +#endif +} + +// To use AMX to accelerate GEMM, +// reorder the memory format [K, N] -> [K/2, N, 2] +// Note: If K % 2 != 0, pad K implicitly +template > +static inline void pack_vnni2( + const scalar_t* src, + scalar_t* dst, + int64_t ld_src, + int64_t K, + int64_t N) { +#if defined(CPU_CAPABILITY_AVX512) + int64_t bk = 0; + int64_t _K = K / 2 * 2; + int64_t _N = N / 32 * 32; + for (; bk < _K; bk += 2) { + int64_t bn = 0; + for (; bn < _N; bn += 32) { + transpose_pad_2x32_block( + src + bk * ld_src + bn, dst + bk * N + bn * 2, ld_src); + } + int64_t nrem = N - bn; + if (nrem > 0) { + transpose_pad_2x32_block( + src + bk * ld_src + bn, dst + bk * N + bn * 2, ld_src, 2, nrem); + } + } + if (K % 2 == 1) { + int64_t bn = 0; + for (; bn < _N; bn += 32) { + transpose_pad_2x32_block( + src + bk * ld_src + bn, dst + bk * N + bn * 2, ld_src, 1); + } + int64_t nrem = N - bn; + if (nrem > 0) { + transpose_pad_2x32_block( + src + bk * ld_src + bn, dst + bk * N + bn * 2, ld_src, 1, nrem); + } + } +#else + TORCH_CHECK(false, "pack_vnni2 is only supported when avx512 is supported") 
+#endif +} + +} // namespace CPU_CAPABILITY +} // namespace at::vec + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/vec_mask.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/vec_mask.h new file mode 100644 index 0000000000000000000000000000000000000000..509e79cfd16c12d1f66edde31eed0f114ca40f8d --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/vec_mask.h @@ -0,0 +1,318 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include +#include +namespace at::vec { +inline namespace CPU_CAPABILITY { + +/** + * The `VecMask` class provides a convenient interface for working with + * vectorized masks in SIMD operations. It encapsulates a `Vectorized` + * mask that can be directly usable in masked vectorized operations. It provides + * various methods for manipulating and accessing the mask elements: + * 1. `from` and `to`: Conversion between a vector of boolean values and a + * vectorized mask. + * 2. `cast`: Casts the mask to a different base type. + * 3. `all_zero`: Checks if all mask elements are zero. + * 4. `is_masked`: Checks if a specific element is masked. + * 5. `loadu`: Loads data from memory using the mask. + * 6. `all_masked`: Checks if all mask elements are masked. + * + * Some helper template classes are provided to simplify the specialization of + * the `VecMask` for the specific CPU arch: + * 1. `VecMaskLoad`: Loads data from memory using the mask. + * 2. `VecMaskTo`: Converts the mask to boolean. + * 3. `VecMaskCast`: Casts the mask to a different base type. 
+ * + */ +template +class VecMask; + +template < + typename data_t, + int data_n, + typename mask_t, + int mask_n, + typename Enabled = void> +struct VecMaskLoad { + static inline VectorizedN apply( + const data_t* ptr, + const VecMask& vec_mask) { + constexpr typename VecMask::size_type size = + VecMask::size(); + static_assert(VectorizedN::size() >= size); + __at_align__ data_t data[size]; + __at_align__ mask_t mask[size]; + auto mask_ = VectorizedN(vec_mask); + mask_.store(mask); + for (int i = 0; i < size; i++) { + data[i] = mask[i] ? ptr[i] : static_cast(0); + } + return VectorizedN::loadu(data, size); + } +}; + +template < + typename dst_t, + int dst_n, + typename src_t, + int src_n, + typename Enabled = void> +struct VecMaskTo { + static inline VecMask apply( + const VecMask& vec_mask) { + auto zeros = VectorizedN(static_cast(0)); + auto ones = VectorizedN(static_cast(1)); + return VectorizedN::blendv( + zeros, ones, vec_mask.template cast()); + } +}; + +template < + typename dst_t, + int dst_n, + typename src_t, + int src_n, + typename Enabled = void> +struct VecMaskCast { + static inline VecMask apply( + const VecMask& vec_mask) { + return VecMask::from(VectorizedN(vec_mask)); + } +}; + +template +struct VecMaskCast { + static inline VecMask apply(const VecMask& vec_mask) { + return vec_mask; + } +}; + +template +struct VecMaskCheck { + static inline bool all_zero(const VectorizedN& vec_mask) { + __at_align__ T mask[VectorizedN::size()]; + vec_mask.store(mask); + return std::all_of(mask, mask + VectorizedN::size(), [](T m) { + return m == static_cast(0); + }); + } + + static inline bool all_masked(const VectorizedN& vec_mask) { + __at_align__ T mask[VectorizedN::size()]; + vec_mask.store(mask); + return std::all_of(mask, mask + VectorizedN::size(), [](T m) { + return m != static_cast(0); + }); + } + + static inline bool is_masked(const VectorizedN& vec_mask, int i) { + __at_align__ T mask[VectorizedN::size()]; + vec_mask.store(mask); + return mask[i] != 
static_cast(0); + } +}; + +template +class VecMask { + public: + using size_type = int; + static constexpr size_type size() { + return VectorizedN::size(); + } + + private: + VectorizedN mask_; + + public: + VecMask() : mask_(static_cast(0)) {} + VecMask(const VectorizedN& mask) : mask_(mask) {} + + template = 0> + VecMask(const Vectorized& mask) : mask_(mask) {} + + template + static VecMask from(const VectorizedN& b_vec) { + __at_align__ U b_buf[size()]; + if constexpr (size() >= VectorizedN::size()) { + b_vec.store(b_buf); + for (int i = VectorizedN::size(); i < size(); i++) { + b_buf[i] = static_cast(0); + } + } else { + b_vec.store(b_buf, size()); + } + return from(b_buf); + } + + template + static VecMask from(U b) { + using int_t = int_same_size_t; + T mask = b ? c10::bit_cast((int_t)(~(int_t)0)) : (T)0; + return VectorizedN(mask); + } + + template + static VecMask from(U* b) { + using int_t = int_same_size_t; + __at_align__ T mask[size()]; +#ifndef __msvc_cl__ +#pragma unroll +#endif + for (int i = 0; i < size(); i++) { + *(int_t*)(mask + i) = b[i] ? ~(int_t)0 : (int_t)0; + } + return VectorizedN(VectorizedN::loadu(mask)); + } + + template + static VecMask from(U* b, int count) { + using int_t = int_same_size_t; + __at_align__ T mask[size()]; +#ifndef __msvc_cl__ +#pragma unroll +#endif + for (int i = 0; i < count; i++) { + *(int_t*)(mask + i) = b[i] ? 
~(int_t)0 : (int_t)0; + } + return VectorizedN(VectorizedN::loadu(mask, count)); + } + + static VecMask blendv( + const VecMask& c, + const VecMask& b, + const VecMask& a) { + VectorizedN result = VectorizedN::blendv( + VectorizedN(c), VectorizedN(b), VectorizedN(a)); + return result; + } + + static VecMask set( + const VecMask& a, + const VecMask& b, + int64_t count = size()) { + VectorizedN result = VectorizedN::set( + VectorizedN(a), VectorizedN(b), count); + return result; + } + + void store(bool* b, int count = size()) { + constexpr int L = + (VectorizedN::size() + Vectorized::size() - 1) / + Vectorized::size(); + auto res = this->to(); + res.store(b, count); + return; + } + + template = 2, int> = 0> + inline VectorizedN to() const { + return VecMaskTo::apply(*this); + } + + template = 0> + inline Vectorized to() const { + return VecMaskTo::apply(*this); + } + + template + inline VecMask cast() const { + return VecMaskCast::apply(*this); + } + + inline bool all_zero() const { + return VecMaskCheck::all_zero(mask_); + } + + inline bool all_masked() const { + return VecMaskCheck::all_masked(mask_); + } + + inline bool is_masked(int i) const { + return VecMaskCheck::is_masked(mask_, i); + } + + inline operator VectorizedN() const { + return mask_; + } + + template = 0> + inline operator Vectorized() const { + return mask_[0]; + } + + inline Vectorized operator[](int i) const { + return mask_[i]; + } + + template < + typename U, + int L, + std::enable_if_t= 2 && VectorizedN::size() >= size(), int> = 0> + VectorizedN loadu(const U* ptr) const { + return VecMaskLoad::apply(ptr, *this); + } + + template < + typename U, + int L, + std::enable_if_t::size() >= size(), int> = 0> + Vectorized loadu(const U* ptr) const { + return VecMaskLoad::apply(ptr, *this); + } +}; + +#define VEC_MASK_DEFINE_UNARY_OP_GLOBAL(op) \ + template \ + inline VecMask op(const VecMask& a) { \ + return op(VectorizedN(a)); \ + } + +#define VEC_MASK_DEFINE_BINARY_OP_GLOBAL(op) \ + template < \ + 
typename T, \ + int N, \ + typename V, \ + int M, \ + std::enable_if_t::size() == VecMask::size(), int> = \ + 0> \ + inline VecMask op(const VecMask& a, const VecMask& b) { \ + return op( \ + VectorizedN(a), VectorizedN(b.template cast())); \ + } + +#define VEC_MASK_DEFINE_BINARY_OP_WITH_EXPR_GLOBAL(op, EXPR) \ + template < \ + typename T, \ + int N, \ + typename V, \ + int M, \ + std::enable_if_t::size() == VecMask::size(), int> = \ + 0> \ + inline VecMask op(const VecMask& a, const VecMask& b) { \ + return EXPR; \ + } + +VEC_MASK_DEFINE_UNARY_OP_GLOBAL(operator~) +VEC_MASK_DEFINE_BINARY_OP_GLOBAL(operator&) +VEC_MASK_DEFINE_BINARY_OP_GLOBAL(operator|) +VEC_MASK_DEFINE_BINARY_OP_GLOBAL(operator^) +VEC_MASK_DEFINE_BINARY_OP_GLOBAL(operator*) +VEC_MASK_DEFINE_BINARY_OP_WITH_EXPR_GLOBAL(operator>, a & ~b) +VEC_MASK_DEFINE_BINARY_OP_WITH_EXPR_GLOBAL(operator<, ~a& b) +VEC_MASK_DEFINE_BINARY_OP_WITH_EXPR_GLOBAL(operator==, ~(a ^ b)) +VEC_MASK_DEFINE_BINARY_OP_WITH_EXPR_GLOBAL(operator>=, (a == b) | (a > b)) +VEC_MASK_DEFINE_BINARY_OP_WITH_EXPR_GLOBAL(operator<=, (a == b) | (a < b)) +VEC_MASK_DEFINE_BINARY_OP_WITH_EXPR_GLOBAL(operator!=, (a ^ b)) + +#undef VEC_MASK_DEFINE_UNARY_OP_GLOBAL +#undef VEC_MASK_DEFINE_BINARY_OP_GLOBAL +#undef VEC_MASK_DEFINE_BINARY_OP_WITH_EXPR_GLOBAL + +} // namespace CPU_CAPABILITY +} // namespace at::vec + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/vec_n.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/vec_n.h new file mode 100644 index 0000000000000000000000000000000000000000..5e7ed2de74177d868f3c11ef36a49f79986e2bc7 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/vec_n.h @@ -0,0 +1,412 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include +#include + +namespace at::vec { +inline namespace CPU_CAPABILITY { + +/** + * @brief A class template representing a vectorized type with + * `N * Vectorized::size()` elements, aiming to support vectors of + * arbitrary size. A specific use case of it is to represent vectors + * converted from data types with different sizes but with the same + * number of vector elements, e.g., `VectorizedN` can be + * a vector converted from two `Vectorized`, `VectorizedN` + * can be a vector converted from two `Vectorized` etc. + * + * It supports most of the operations of `Vectorized` + * and the implementation delegates to `Vectorized` with loops over `N`. + * + * @tparam T The underlying type of the vectorized elements. + * @tparam N The number of underlying `Vectorized`. 
+ */ +template +class VectorizedN { + public: + using value_type = T; + using size_type = int; + + static constexpr size_type size_T = sizeof(T); + static constexpr size_type size() { + return Vectorized::size() * N; + } + + private: + std::array, N> values; + + public: + // methods not implemented yet: + // variadic constructor, operator T*, as_bytes, zero_mask + +#define VECTORIZEDN_DEFINE_UNARY_OP(op) \ + VectorizedN op() const { \ + return unary_op([](const Vectorized& a) { return a.op(); }); \ + } + +#define VECTORIZEDN_DEFINE_BINARY_OP(op) \ + VectorizedN op(const VectorizedN& other) const { \ + return binary_op( \ + other, [](const Vectorized& a, const Vectorized& b) { \ + return a.op(b); \ + }); \ + } + + template + inline VectorizedN unary_op(Op op) const { + VectorizedN result; +#ifndef _MSC_VER +#pragma unroll +#endif + for (int i = 0; i < N; ++i) { + result.values[i] = op(values[i]); + } + return result; + } + + template + inline VectorizedN binary_op(const VectorizedN& other, Op op) + const { + VectorizedN result; +#ifndef _MSC_VER +#pragma unroll +#endif + for (int i = 0; i < N; ++i) { + result.values[i] = op(values[i], other.values[i]); + } + return result; + } + + template + inline VectorizedN ternary_op( + const VectorizedN& other, + const VectorizedN& other2, + Op op) const { + VectorizedN result; +#ifndef _MSC_VER +#pragma unroll +#endif + for (int i = 0; i < N; ++i) { + result.values[i] = op(values[i], other.values[i], other2.values[i]); + } + return result; + } + + VectorizedN() = default; + + explicit VectorizedN(T val) { + for (int i = 0; i < N; ++i) { + values[i] = Vectorized(val); + } + } + + template = 0> + VectorizedN(const Vectorized& val) : values({val}) {} + + template = 0> + VectorizedN(const Vectorized& val_0, const Vectorized& val_1) + : values({val_0, val_1}) {} + + template = 0> + inline operator Vectorized() const { + return values[0]; + } + + inline const Vectorized& operator[](int i) const { + return values[i]; + } + + inline 
Vectorized& operator[](int i) { + return values[i]; + } + + template + static VectorizedN blend( + const VectorizedN& a, + const VectorizedN& b) { + VectorizedN result; + for (int i = 0; i < N; ++i) { + result.values[i] = + Vectorized::template blend(a.values[i], b.values[i]); + } + return result; + } + + static VectorizedN blendv( + const VectorizedN& a, + const VectorizedN& b, + const VectorizedN& mask) { + VectorizedN result; + for (int i = 0; i < N; ++i) { + result.values[i] = + Vectorized::blendv(a.values[i], b.values[i], mask.values[i]); + } + return result; + } + + template + static VectorizedN arange( + T base = static_cast(0), + step_t step = static_cast(1)) { + VectorizedN result; + for (int i = 0; i < N; ++i) { + result.values[i] = Vectorized::arange(base, step); + base += step * Vectorized::size(); + } + return result; + } + + static VectorizedN set( + const VectorizedN& a, + const VectorizedN& b, + int64_t count = size()) { + VectorizedN result; + for (int i = 0; i < N; ++i) { + if (count > 0) { + result.values[i] = Vectorized::set( + a.values[i], + b.values[i], + std::min(count, (int64_t)Vectorized::size())); + count -= Vectorized::size(); + } else { + result.values[i] = a.values[i]; + } + } + return result; + } + + static VectorizedN loadu(const void* ptr) { + VectorizedN result; + for (int i = 0; i < N; ++i) { + result.values[i] = Vectorized::loadu(ptr); + ptr = static_cast(ptr) + Vectorized::size(); + } + return result; + } + + static VectorizedN loadu(const void* ptr, int64_t count) { + VectorizedN result; + for (int i = 0; i < N; ++i) { + if (count > 0) { + result.values[i] = Vectorized::loadu( + ptr, std::min(count, (int64_t)Vectorized::size())); + ptr = static_cast(ptr) + Vectorized::size(); + count -= Vectorized::size(); + } else { + // No elements left to load: zero-fill this sub-vector. + result.values[i] = Vectorized((T)0); + } + } + return result; + } + + void store(void* ptr) const { + for (int i = 0; i < N; ++i) { + values[i].store(ptr); + ptr = static_cast(ptr) + Vectorized::size(); + } + } +
+ void store(void* ptr, int count) const { + for (int i = 0; i < N; ++i) { + values[i].store(ptr, std::min(count, (int)Vectorized::size())); + ptr = static_cast(ptr) + Vectorized::size(); + count -= Vectorized::size(); + if (count <= 0) { + break; + } + } + } + + bool has_inf_nan() const { + for (int i = 0; i < N; ++i) { + if (values[i].has_inf_nan()) { + return true; + } + } + return false; + } + + VectorizedN map(T (*const f)(T)) const { + VectorizedN result; + for (int i = 0; i < N; ++i) { + result.values[i] = values[i].map(f); + } + return result; + } + + VectorizedN map(T (*const f)(const T&)) const { + VectorizedN result; + for (int i = 0; i < N; ++i) { + result.values[i] = values[i].map(f); + } + return result; + } + + VECTORIZEDN_DEFINE_UNARY_OP(isnan) + VECTORIZEDN_DEFINE_UNARY_OP(abs) + VECTORIZEDN_DEFINE_UNARY_OP(sgn) + VECTORIZEDN_DEFINE_UNARY_OP(angle) + VECTORIZEDN_DEFINE_UNARY_OP(real) + VECTORIZEDN_DEFINE_UNARY_OP(imag) + VECTORIZEDN_DEFINE_UNARY_OP(conj) + VECTORIZEDN_DEFINE_UNARY_OP(acos) + VECTORIZEDN_DEFINE_UNARY_OP(acosh) + VECTORIZEDN_DEFINE_UNARY_OP(asin) + VECTORIZEDN_DEFINE_UNARY_OP(asinh) + VECTORIZEDN_DEFINE_UNARY_OP(atan) + VECTORIZEDN_DEFINE_UNARY_OP(atanh) + VECTORIZEDN_DEFINE_BINARY_OP(atan2) + VECTORIZEDN_DEFINE_BINARY_OP(copysign) + VECTORIZEDN_DEFINE_UNARY_OP(erf) + VECTORIZEDN_DEFINE_UNARY_OP(erfc) + VECTORIZEDN_DEFINE_UNARY_OP(erfinv) + VECTORIZEDN_DEFINE_UNARY_OP(exp) + VECTORIZEDN_DEFINE_UNARY_OP(exp2) + VECTORIZEDN_DEFINE_UNARY_OP(expm1) + VECTORIZEDN_DEFINE_UNARY_OP(exp_u20) + VECTORIZEDN_DEFINE_UNARY_OP(fexp_u20) + VECTORIZEDN_DEFINE_UNARY_OP(frac) + VECTORIZEDN_DEFINE_BINARY_OP(fmod) + VECTORIZEDN_DEFINE_UNARY_OP(log) + VECTORIZEDN_DEFINE_UNARY_OP(log10) + VECTORIZEDN_DEFINE_UNARY_OP(log1p) + VECTORIZEDN_DEFINE_UNARY_OP(log2) + VECTORIZEDN_DEFINE_UNARY_OP(ceil) + VECTORIZEDN_DEFINE_UNARY_OP(cos) + VECTORIZEDN_DEFINE_UNARY_OP(cosh) + VECTORIZEDN_DEFINE_UNARY_OP(floor) + VECTORIZEDN_DEFINE_BINARY_OP(hypot) + 
VECTORIZEDN_DEFINE_UNARY_OP(i0) + VECTORIZEDN_DEFINE_UNARY_OP(i0e) + VECTORIZEDN_DEFINE_UNARY_OP(digamma) + VECTORIZEDN_DEFINE_BINARY_OP(igamma) + VECTORIZEDN_DEFINE_BINARY_OP(igammac) + VECTORIZEDN_DEFINE_UNARY_OP(neg) + VECTORIZEDN_DEFINE_BINARY_OP(nextafter) + VECTORIZEDN_DEFINE_UNARY_OP(round) + VECTORIZEDN_DEFINE_UNARY_OP(sin) + VECTORIZEDN_DEFINE_UNARY_OP(sinh) + VECTORIZEDN_DEFINE_UNARY_OP(tan) + VECTORIZEDN_DEFINE_UNARY_OP(tanh) + VECTORIZEDN_DEFINE_UNARY_OP(trunc) + VECTORIZEDN_DEFINE_UNARY_OP(lgamma) + VECTORIZEDN_DEFINE_UNARY_OP(sqrt) + VECTORIZEDN_DEFINE_UNARY_OP(reciprocal) + VECTORIZEDN_DEFINE_UNARY_OP(rsqrt) + VECTORIZEDN_DEFINE_BINARY_OP(pow) + VECTORIZEDN_DEFINE_BINARY_OP(operator==) + VECTORIZEDN_DEFINE_BINARY_OP(operator!=) + VECTORIZEDN_DEFINE_BINARY_OP(operator>=) + VECTORIZEDN_DEFINE_BINARY_OP(operator<=) + VECTORIZEDN_DEFINE_BINARY_OP(operator>) + VECTORIZEDN_DEFINE_BINARY_OP(operator<) + VECTORIZEDN_DEFINE_BINARY_OP(eq) + VECTORIZEDN_DEFINE_BINARY_OP(ne) + VECTORIZEDN_DEFINE_BINARY_OP(gt) + VECTORIZEDN_DEFINE_BINARY_OP(ge) + VECTORIZEDN_DEFINE_BINARY_OP(lt) + VECTORIZEDN_DEFINE_BINARY_OP(le) + +#undef VECTORIZEDN_DEFINE_UNARY_OP +#undef VECTORIZEDN_DEFINE_BINARY_OP +}; + +#define VECTORIZEDN_DEFINE_UNARY_OP_GLOBAL(op) \ + template \ + inline VectorizedN op(const VectorizedN& a) { \ + return a.unary_op([](const Vectorized& a) { return op(a); }); \ + } + +#define VECTORIZEDN_DEFINE_BINARY_OP_GLOBAL(op) \ + template \ + inline VectorizedN op( \ + const VectorizedN& a, const VectorizedN& b) { \ + return a.binary_op(b, [](const Vectorized& a, const Vectorized& b) { \ + return op(a, b); \ + }); \ + } + +#define VECTORIZEDN_DEFINE_TERNARY_OP_GLOBAL(op) \ + template \ + inline VectorizedN op( \ + const VectorizedN& a, \ + const VectorizedN& b, \ + const VectorizedN& c) { \ + return a.ternary_op( \ + b, \ + c, \ + [](const Vectorized& a, \ + const Vectorized& b, \ + const Vectorized& c) { return op(a, b, c); }); \ + } + +#define 
VECTORIZEDN_DEFINE_BINARY_OP_INPLACE_GLOBAL(op) \ + template \ + inline VectorizedN& op( \ + VectorizedN& a, const VectorizedN& b) { \ + a = a.binary_op(b, [](const Vectorized& a, const Vectorized& b) { \ + return op(a, b); \ + }); \ + return a; \ + } + +VECTORIZEDN_DEFINE_BINARY_OP_GLOBAL(operator+) +VECTORIZEDN_DEFINE_BINARY_OP_GLOBAL(operator-) +VECTORIZEDN_DEFINE_BINARY_OP_GLOBAL(operator*) +VECTORIZEDN_DEFINE_BINARY_OP_GLOBAL(operator/) +VECTORIZEDN_DEFINE_BINARY_OP_GLOBAL(operator%) +VECTORIZEDN_DEFINE_BINARY_OP_GLOBAL(operator||) +VECTORIZEDN_DEFINE_BINARY_OP_GLOBAL(operator<<) +VECTORIZEDN_DEFINE_BINARY_OP_GLOBAL(operator>>) +VECTORIZEDN_DEFINE_BINARY_OP_GLOBAL(maximum) +VECTORIZEDN_DEFINE_BINARY_OP_GLOBAL(minimum) +VECTORIZEDN_DEFINE_TERNARY_OP_GLOBAL(fmadd) +VECTORIZEDN_DEFINE_TERNARY_OP_GLOBAL(fmsub) +VECTORIZEDN_DEFINE_TERNARY_OP_GLOBAL(clamp) +VECTORIZEDN_DEFINE_BINARY_OP_GLOBAL(clamp_max) +VECTORIZEDN_DEFINE_BINARY_OP_GLOBAL(clamp_min) +VECTORIZEDN_DEFINE_BINARY_OP_GLOBAL(operator&) +VECTORIZEDN_DEFINE_BINARY_OP_GLOBAL(operator|) +VECTORIZEDN_DEFINE_BINARY_OP_GLOBAL(operator^) +VECTORIZEDN_DEFINE_UNARY_OP_GLOBAL(operator~) + +VECTORIZEDN_DEFINE_BINARY_OP_INPLACE_GLOBAL(operator+=) +VECTORIZEDN_DEFINE_BINARY_OP_INPLACE_GLOBAL(operator-=) +VECTORIZEDN_DEFINE_BINARY_OP_INPLACE_GLOBAL(operator*=) +VECTORIZEDN_DEFINE_BINARY_OP_INPLACE_GLOBAL(operator/=) +VECTORIZEDN_DEFINE_BINARY_OP_INPLACE_GLOBAL(operator%=) +VECTORIZEDN_DEFINE_BINARY_OP_INPLACE_GLOBAL(operator<<=) +VECTORIZEDN_DEFINE_BINARY_OP_INPLACE_GLOBAL(operator>>=) + +#undef VECTORIZEDN_DEFINE_UNARY_OP_GLOBAL +#undef VECTORIZEDN_DEFINE_BINARY_OP_GLOBAL +#undef VECTORIZEDN_DEFINE_BINARY_OP_INPLACE_GLOBAL + +template +inline T vec_reduce_all(const OpVec& vec_fun, VectorizedN acc_vec) { + Vectorized vec_result = acc_vec[0]; + for (int i = 1; i < N; i++) { + vec_result = vec_fun(vec_result, acc_vec[i]); + } + return vec_reduce_all(vec_fun, vec_result); +} + +template +std::ostream& 
operator<<(std::ostream& stream, const VectorizedN& vec_n) { + stream << "vec_n["; + for (int i = 0; i < N; ++i) { + if (i != 0) { + stream << ", "; + } + stream << vec_n[i]; + } + stream << ']'; + return stream; +} +} // namespace CPU_CAPABILITY +} // namespace at::vec + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/vec_quant.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/vec_quant.h new file mode 100644 index 0000000000000000000000000000000000000000..04c81261f816eb2a1c66d7d3d3c64df2aaf43f7b --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/cpu/vec/vec_quant.h @@ -0,0 +1,258 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include +#include + +namespace at::vec { +// See Note [CPU_CAPABILITY namespace] +inline namespace CPU_CAPABILITY { + +// Transpose a [4, 64] block to [64, 4] (with contiguous output, ld=4) +template > +static inline void transpose_pad_4x64_block( + const scalar_t* src, + scalar_t* dst, + int64_t ld_src, + int krem = 4, + int nrem = 64) { +#if defined(CPU_CAPABILITY_AVX512) + __m512i r[4]; + // Load with mask if partial + if (nrem < 64) { + __mmask64 mask = (1ULL << nrem) - 1; + for (int i = 0; i < krem; ++i) { + r[i] = _mm512_maskz_loadu_epi8(mask, src + i * ld_src); + } + for (int i = krem; i < 4; ++i) { + r[i] = _mm512_setzero_si512(); + } + } else { + for (int i = 0; i < krem; ++i) { + r[i] = _mm512_loadu_si512( + reinterpret_cast(src + i * ld_src)); + } + for (int i = krem; i < 4; ++i) { + r[i] = _mm512_setzero_si512(); + } + } + + // Transpose 4x64 bytes using unpack and shuffle + __m512i t0 = _mm512_unpacklo_epi8(r[0], r[1]); + __m512i t1 = _mm512_unpackhi_epi8(r[0], r[1]); 
+ __m512i t2 = _mm512_unpacklo_epi8(r[2], r[3]); + __m512i t3 = _mm512_unpackhi_epi8(r[2], r[3]); + + __m512i u0 = _mm512_unpacklo_epi16(t0, t2); + __m512i u1 = _mm512_unpackhi_epi16(t0, t2); + __m512i u2 = _mm512_unpacklo_epi16(t1, t3); + __m512i u3 = _mm512_unpackhi_epi16(t1, t3); + + __m512i v0 = _mm512_shuffle_i32x4(u0, u1, 0x88); + __m512i v1 = _mm512_shuffle_i32x4(u0, u1, 0xdd); + __m512i v2 = _mm512_shuffle_i32x4(u2, u3, 0x88); + __m512i v3 = _mm512_shuffle_i32x4(u2, u3, 0xdd); + + __m512i r0 = _mm512_shuffle_i32x4(v0, v2, 0x88); + __m512i r1 = _mm512_shuffle_i32x4(v1, v3, 0x88); + __m512i r2 = _mm512_shuffle_i32x4(v0, v2, 0xdd); + __m512i r3 = _mm512_shuffle_i32x4(v1, v3, 0xdd); + + // Store output + if (nrem < 16) { + __mmask64 mask = (1ULL << (nrem * 4)) - 1; + _mm512_mask_storeu_epi8(dst, mask, r0); + } else if (nrem == 16) { + _mm512_storeu_si512(reinterpret_cast<__m512i*>(dst), r0); + } else if (nrem < 32) { + int n_bytes1 = 64; + int n_bytes2 = (nrem * 4) - n_bytes1; + __mmask64 mask = (1ULL << n_bytes2) - 1; + _mm512_storeu_si512(reinterpret_cast<__m512i*>(dst), r0); + _mm512_mask_storeu_epi8(reinterpret_cast<__m512i*>(dst + 64), mask, r1); + } else if (nrem == 32) { + _mm512_storeu_si512(reinterpret_cast<__m512i*>(dst), r0); + _mm512_storeu_si512(reinterpret_cast<__m512i*>(dst + 64), r1); + } else if (nrem < 48) { + int n_bytes1 = 64 * 2; + int n_bytes2 = (nrem * 4) - n_bytes1; + __mmask64 mask = (1ULL << n_bytes2) - 1; + _mm512_storeu_si512(reinterpret_cast<__m512i*>(dst), r0); + _mm512_storeu_si512(reinterpret_cast<__m512i*>(dst + 64), r1); + _mm512_mask_storeu_epi8(reinterpret_cast<__m512i*>(dst + 64 * 2), mask, r2); + } else if (nrem == 48) { + _mm512_storeu_si512(reinterpret_cast<__m512i*>(dst), r0); + _mm512_storeu_si512(reinterpret_cast<__m512i*>(dst + 64), r1); + _mm512_storeu_si512(reinterpret_cast<__m512i*>(dst + 64 * 2), r2); + } else if (nrem < 64) { + int n_bytes1 = 64 * 3; + int n_bytes2 = (nrem * 4) - n_bytes1; + __mmask64 mask = 
(1ULL << n_bytes2) - 1; + _mm512_storeu_si512(reinterpret_cast<__m512i*>(dst), r0); + _mm512_storeu_si512(reinterpret_cast<__m512i*>(dst + 64), r1); + _mm512_storeu_si512(reinterpret_cast<__m512i*>(dst + 64 * 2), r2); + _mm512_mask_storeu_epi8(reinterpret_cast<__m512i*>(dst + 64 * 3), mask, r3); + } else { + // normal case, nrem == 64 + _mm512_storeu_si512(reinterpret_cast<__m512i*>(dst), r0); + _mm512_storeu_si512(reinterpret_cast<__m512i*>(dst + 64), r1); + _mm512_storeu_si512(reinterpret_cast<__m512i*>(dst + 64 * 2), r2); + _mm512_storeu_si512(reinterpret_cast<__m512i*>(dst + 64 * 3), r3); + } +#else + TORCH_CHECK( + false, + "transpose_pad_4x64_block is only supported when AVX-512 is supported") +#endif +} + +// Reorder [K, N] → [K/4, N, 4] (VNNI4-style layout for bit8) +template > +static inline void pack_vnni4( + const scalar_t* src, + scalar_t* dst, + int64_t ld_src, + int64_t K, + int64_t N) { +#if defined(CPU_CAPABILITY_AVX512) + int64_t bk = 0; + int64_t _K = K / 4 * 4; + int64_t _N = N / 64 * 64; + for (; bk < _K; bk += 4) { + int64_t bn = 0; + for (; bn < _N; bn += 64) { + transpose_pad_4x64_block( + src + bk * ld_src + bn, dst + bk * N + bn * 4, ld_src); + } + int64_t nrem = N - bn; + if (nrem > 0) { + transpose_pad_4x64_block( + src + bk * ld_src + bn, dst + bk * N + bn * 4, ld_src, 4, nrem); + } + } + + // Handle leftover K rows (< 4) + if (K % 4 != 0) { + int krem = K - bk; + int64_t bn = 0; + for (; bn < _N; bn += 64) { + transpose_pad_4x64_block( + src + bk * ld_src + bn, dst + bk * N + bn * 4, ld_src, krem); + } + int64_t nrem = N - bn; + if (nrem > 0) { + transpose_pad_4x64_block( + src + bk * ld_src + bn, dst + bk * N + bn * 4, ld_src, krem, nrem); + } + } +#else + TORCH_CHECK(false, "pack_vnni4 is only supported when AVX-512 is supported") +#endif +} + +// This is a helper function for transpose_pack_vnni4 +// Transform a [4, 16] block (with incontiguous output) +// Src: +// a1 a2 a3 a4 a5 a6 a7 a8 a9 a10 a11 a12 a13 a14 a15 a16 +// b1 b2 b3 
b4 b5 b6 b7 b8 b9 b10 b11 b12 b13 b14 b15 b16 +// c1 c2 c3 c4 c5 c6 c7 c8 c9 c10 c11 c12 c13 c14 c15 c16 +// d1 d2 d3 d4 d5 d6 d7 d8 d9 d10 d11 d12 d13 d14 d15 d16 +// Dst: +// a1 a2 a3 a4 b1 b2 b3 b4 c1 c2 c3 c4 d1 d2 d3 d4 +// a5 a6 a7 a8 b5 b6 b7 b8 c5 c6 c7 c8 d5 d6 d7 d8 +// a9 a10 a11 a12 b9 b10 b11 b12 c9 c10 c11 c12 d9 d10 d11 d12 +// a13 a14 a15 a16 b13 b14 b15 b16 c13 c14 c15 c16 d13 d14 d15 d16 +template > +static inline void transpose_vnni4_pad_4x16_block( + const scalar_t* src, + scalar_t* dst, + int64_t ld_src, + int64_t ld_dst, + int krem = 4) { +#if defined(CPU_CAPABILITY_AVX512) + __m128i r[4]; + for (int i = 0; i < krem; ++i) { + r[i] = _mm_loadu_si128(reinterpret_cast(src + i * ld_src)); + } + for (int i = krem; i < 4; ++i) { + r[i] = _mm_setzero_si128(); + } + + // Transpose 4x16 bytes using unpack and shuffle + __m128i t0 = _mm_unpacklo_epi32(r[0], r[1]); + __m128i t1 = _mm_unpackhi_epi32(r[0], r[1]); + __m128i t2 = _mm_unpacklo_epi32(r[2], r[3]); + __m128i t3 = _mm_unpackhi_epi32(r[2], r[3]); + + __m128i r0 = _mm_unpacklo_epi64(t0, t2); + __m128i r1 = _mm_unpackhi_epi64(t0, t2); + __m128i r2 = _mm_unpacklo_epi64(t1, t3); + __m128i r3 = _mm_unpackhi_epi64(t1, t3); + + // Store output + if (krem == 4) { + // normal case + _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), r0); + _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + ld_dst), r1); + _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + ld_dst * 2), r2); + _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + ld_dst * 3), r3); + } else { + // masked case + __mmask16 mask = (1ULL << (krem * 4)) - 1; + _mm_mask_storeu_epi8(dst, mask, r0); + _mm_mask_storeu_epi8(reinterpret_cast<__m128i*>(dst + ld_dst), mask, r1); + _mm_mask_storeu_epi8( + reinterpret_cast<__m128i*>(dst + ld_dst * 2), mask, r2); + _mm_mask_storeu_epi8( + reinterpret_cast<__m128i*>(dst + ld_dst * 3), mask, r3); + } +#else + TORCH_CHECK( + false, + "transpose_vnni4_pad_4x16_block is only supported when AVX-512 is supported") +#endif 
+} + +// Do the transpose packing fusion with VNNI4 +// Reorder [K, N] → [N/4, K, 4] (VNNI4-style layout for bit8) +template > +static inline void transpose_pack_vnni4( + const scalar_t* src, + scalar_t* dst, + int64_t ld_src, + int64_t K, + int64_t N) { +#if defined(CPU_CAPABILITY_AVX512) + TORCH_CHECK( + N % 16 == 0, "N needs to be multiple of 16 for transpose_pack_vnni4"); + int64_t bk = 0; + int64_t _K = K / 4 * 4; + for (; bk < _K; bk += 4) { + int64_t bn = 0; + for (; bn < N; bn += 16) { + transpose_vnni4_pad_4x16_block( + src + bk * ld_src + bn, dst + bn * K + bk * 4, ld_src, K * 4); + } + } + + // Handle leftover K rows (< 4) + if (K % 4 != 0) { + int krem = K - bk; + int64_t bn = 0; + for (; bn < N; bn += 16) { + transpose_vnni4_pad_4x16_block( + src + bk * ld_src + bn, dst + bn * K + bk * 4, ld_src, K * 4, krem); + } + } +#else + TORCH_CHECK( + false, "transpose_pack_vnni4 is only supported when AVX-512 is supported") +#endif +} + +} // namespace CPU_CAPABILITY +} // namespace at::vec + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/functorch/ADInterpreters.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/functorch/ADInterpreters.h new file mode 100644 index 0000000000000000000000000000000000000000..6412a44107cffbc1c6793dd7f14a1c711875a358 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/functorch/ADInterpreters.h @@ -0,0 +1,43 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +#include + +namespace at::functorch { + +// These are the interpreters for our AD transforms +// (grad, vjp and jvp). +// See NOTE: [functorch interpreter stack] for more details. 
+ +struct TORCH_API GradInterpreterPtr { + explicit GradInterpreterPtr(const Interpreter* base): base_(base) { TORCH_INTERNAL_ASSERT(base->key() == TransformType::Grad); } + TransformType key() const { return base_->key(); } + int64_t level() const { return base_->level(); } + void processImpl(const c10::OperatorHandle& op, torch::jit::Stack* stack); + void sendToNextInterpreterImpl(const c10::OperatorHandle& op, torch::jit::Stack* stack, bool grad_special_case); + bool prevGradMode() const { + return std::get(base_->meta()).prevGradMode_; + } + Tensor lift(const Tensor& tensor) const; + private: + const Interpreter* base_; +}; + +struct TORCH_API JvpInterpreterPtr { + explicit JvpInterpreterPtr(const Interpreter* base): base_(base) { TORCH_INTERNAL_ASSERT(base->key() == TransformType::Jvp); } + TransformType key() const { return base_->key(); } + int64_t level() const { return base_->level(); } + void processImpl(const c10::OperatorHandle& op, torch::jit::Stack* stack); + void sendToNextInterpreterImpl(const c10::OperatorHandle& op, torch::jit::Stack* stack, bool grad_special_case); + bool prevFwdGradMode() const { + return std::get(base_->meta()).prevFwdGradMode_; + } + Tensor lift(const Tensor& tensor) const; + private: + const Interpreter* base_; +}; + +} // namespace at::functorch + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/functorch/BatchedFallback.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/functorch/BatchedFallback.h new file mode 100644 index 0000000000000000000000000000000000000000..befe73ec33e2006a69c5ebe5ce741d5f04d2114f --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/functorch/BatchedFallback.h @@ -0,0 +1,86 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +// Copyright (c) Facebook, Inc. and its affiliates. +// All rights reserved. +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. + +#pragma once +#include +#include +#include + +namespace at::functorch { + +// This file contains code for the vmap fallback (also known as the +// BatchedTensor fallback or the Batched fallback). This code runs +// when an operation doesn't have a batching rule implemented. + +// If an operator doesn't have a batching rule implemented then we fallback +// to this implementation. The fallback doesn't work on out= variants or +// view operations; that is, it works for out-of-place operations and +// in-place non-view operations. +// +// For out-of-place operations, the fallback effectively takes all of the +// BatchedTensors in `stack`, slices them, and runs `op` on all of the +// corresponding slices to produce slices of the outputs. The output slices +// then get `torch.stack`ed to create the +// final returns. +// +// The performance of the fallback is not very good because it introduces an +// extra copy from stacking the sliced outputs. Because of this, we prefer to +// write batching rules for operators whenever possible. 
+void batchedTensorForLoopFallback(const c10::OperatorHandle& op, torch::jit::Stack* stack); +void batchedNestedTensorForLoopFallback(const c10::OperatorHandle& op, torch::jit::Stack* stack); + +void vmapErrorFallback(const c10::OperatorHandle& op, torch::jit::Stack* stack); + +// The vmap fallback emits a warning by default, but it may be disabled if +// the user finds it to be too annoying. +TORCH_API bool isVmapFallbackWarningEnabled(); +TORCH_API void setVmapFallbackWarningEnabled(bool enabled); + +// Used for testing. The vmap fallback is enabled by default. When it is disabled, +// it raises an error. +TORCH_API bool isVmapFallbackEnabled(); +TORCH_API void setVmapFallbackEnabled(bool enabled); + +template A vector_to_result(const std::vector& buffer) { + return buffer[0].to(); +} +template std::tuple vector_to_result(const std::vector& buffer) { + return std::make_tuple(buffer[0].to(), buffer[1].to()); +} +template std::tuple vector_to_result(const std::vector& buffer) { + return std::make_tuple(buffer[0].to(), buffer[1].to(), buffer[2].to()); +} + +// slow_fallback is a way to call the vmap fallback inside some boxed kernel. +// There is probably some better way to metaprogram this. +template +Ret slow_fallback(const c10::OperatorHandle& op, ArrayRef args) { + std::vector stack(args.begin(), args.end()); + batchedTensorForLoopFallback(op, &stack); + return vector_to_result(stack); +} + +template +std::tuple slow_fallback(const c10::OperatorHandle& op, ArrayRef args) { + std::vector stack(args.begin(), args.end()); + batchedTensorForLoopFallback(op, &stack); + return vector_to_result(stack); +} + +template +std::tuple slow_fallback(const c10::OperatorHandle& op, ArrayRef args) { + std::vector stack(args.begin(), args.end()); + batchedTensorForLoopFallback(op, &stack); + return vector_to_result(stack); +} + + +} // namespace at::functorch + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/functorch/BatchedTensorImpl.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/functorch/BatchedTensorImpl.h new file mode 100644 index 0000000000000000000000000000000000000000..1424c85bd62b78ec0a8bc3c4bb190db50a4dc200 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/functorch/BatchedTensorImpl.h @@ -0,0 +1,181 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +// Copyright (c) Facebook, Inc. and its affiliates. +// All rights reserved. +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. + +#pragma once + +#include + +#include +#include +#include + +namespace at::functorch { + +using Tensor = at::Tensor; + +// We assume this in a few other places in the codebase, +// but there isn't a centralized definition. +constexpr int64_t kVmapMaxTensorDims = 64; + +// The valid vmap levels range from [0, 64). This effectively means that we +// support a maximum of 64 nested vmaps. +constexpr int64_t kVmapNumLevels = 64; + +// Store this number of elements of BatchDims on the stack. Most people will +// probably use <= 5 nested vmaps, but adjust this number as necessary. +constexpr int64_t kBatchDimsStackSize = 5; + +// A BatchedTensorImpl holds an underlying Tensor and a single batch dim +// NB: We use the term "BatchedTensor" to mean a Tensor that is backed with a +// BatchedTensorImpl. +// +// The batch dimensions are treated as being "private"; they are not user-visible. +// For example, in the following Tensor, +// bt = BatchedTensorImpl(ones(2, 3, 5, 7), lvl=1, dim=0) +// dimension 0 is batch dimension. 
+// +// bt.sizes() returns (5, 7); bt.sum(0) performs a reduction over the (public) +// dim 0, which is equivalent to dim 3 in the underlying ones(2, 3, 5, 7) tensor. +struct TORCH_API BatchedTensorImpl : public c10::TensorImpl { + explicit BatchedTensorImpl(at::DispatchKeySet key_set, Tensor value, int64_t dim, int64_t level); + + // Returns batch dimension of this tensor + int64_t bdim() const { return bdim_; } + + // Returns batch dimension of this tensor + int64_t level() const { return level_; } + + // BatchedTensorImpl wraps a Tensor + const Tensor& value() const { return value_; } + + // Given a public dimension index, return the dimension index in the underlying + // value() tensor. + // For example, if we have + // bt = BatchedTensorImpl(ones(2, 3, 5, 7), lvl=1, dim=0) + // bt.actualDim(0) -> 1 + // bt.actualDim(1) -> 2 + // bt.actualDim(2) -> 3 + // bt.actualDim(3) -> Error + int64_t actualDim(int64_t dim, bool wrap_dim = true) const; + + IntArrayRef sizes_custom() const override; + SymIntArrayRef sym_sizes_custom() const override; + int64_t size_custom(int64_t d) const override; + c10::SymInt sym_size_custom(int64_t d) const override; + // We have to override this because we opted into CustomStrides + IntArrayRef strides_custom() const override; + SymIntArrayRef sym_strides_custom() const override; + // Override a bunch of methods inherited from TensorImpl to return error messages. 
+ c10::SymBool sym_is_contiguous_custom(at::MemoryFormat memory_format) const override; + void set_size(int64_t dim, int64_t new_size) override; + void set_stride(int64_t dim, int64_t new_stride) override; + c10::intrusive_ptr shallow_copy_and_detach( + const c10::VariableVersion& version_counter, + bool allow_tensor_metadata_change) const override; + c10::intrusive_ptr shallow_copy_and_detach( + c10::VariableVersion&& version_counter, + bool allow_tensor_metadata_change) const override; + void shallow_copy_from(const c10::intrusive_ptr& impl) override; +#ifdef DEBUG + bool has_storage() const override; +#endif + + void refreshTensorMetadata(); + + // Used in torchdim. torchdim uses non-lexical BatchedTensor; the way it + // accomplishes this is a hack where it is able to modify the levels of + // BatchedTensor to match the level of the current vmap transform. + void _unsafe_set_level(int64_t level) { + level_ = level; + } + + // Used in batching rule for in-place view operations that can change + // the index of the bdim (think squeeze_, unsqueeze_) + void unsafe_set_bdim(int64_t bdim) { + // NB: you MUST call refreshTensorMetadata after doing this. + bdim_ = bdim; + } + private: + // see NOTE: [BatchedTensorImpl levels invariant] + void checkInvariants() const; + const char* tensorimpl_type_name() const override; + + Tensor value_; + + int64_t level_; + int64_t bdim_; +}; + +// NB: We use the term "BatchedTensor" to mean a Tensor that is backed with a +// BatchedTensorImpl. +inline bool isBatchedTensor(const Tensor& tensor) { + return tensor.unsafeGetTensorImpl()->key_set().has(DispatchKey::FuncTorchBatched) || + tensor.unsafeGetTensorImpl()->key_set().has(DispatchKey::BatchedNestedTensor); +} + +// It is unsafe to call this on a Tensor that is not backed by a +// BatchedTensorImpl. Please use `maybeGetBatchedImpl` whenever possible. 
+inline BatchedTensorImpl* unsafeGetBatchedImpl(const Tensor& tensor) { + return static_cast(tensor.unsafeGetTensorImpl()); +} + +inline BatchedTensorImpl* maybeGetBatchedImpl(const Tensor& tensor) { + if (!isBatchedTensor(tensor)) { + return nullptr; + } + return unsafeGetBatchedImpl(tensor); +} + +// Returns a bitset. If bit i is set, then that means dim i is a batchdim. +inline std::bitset createBatchDimBitset(int64_t dim) { + std::bitset is_bdim; + is_bdim.set(dim); + return is_bdim; +} + +// Creates a bitset for the given level +inline std::bitset createVmapLevelsBitset(int64_t level) { + std::bitset result; + result.set(level); + return result; +} + +// Use this to construct a BatchedTensor from a regular Tensor +TORCH_API Tensor makeBatched(Tensor tensor, int64_t dim, int64_t level); + +// Adds a batch dim to `tensor`, returning a BatchedTensor +TORCH_API Tensor addBatchDim(Tensor tensor, int64_t dim, int64_t level); + +// Certain dispatch keys must be propagated to the BatchedTensor (or, in general, +// any wrapper Tensor subclasses). This is because there are methods on Tensor +// that skip dispatch and check for the presence of a dispatch key (e.g. is_cpu()). +// TODO: should probably contain more (or all?) backend keys +constexpr DispatchKeySet kKeysToPropagateToWrapper({ + DispatchKey::Negative, + DispatchKey::Conjugate, + DispatchKey::XLA, + DispatchKey::XPU, + DispatchKey::HPU, + DispatchKey::CUDA, + DispatchKey::CPU, + DispatchKey::PrivateUse1, + DispatchKey::SparseCPU, + DispatchKey::SparseCUDA, + DispatchKey::SparseCsrCPU, + DispatchKey::SparseCsrCUDA, +}); + +inline DispatchKeySet getKeysToPropagateToWrapper(const Tensor& tensor, DispatchKeySet to_propagate=kKeysToPropagateToWrapper) { + auto key_set = tensor.unsafeGetTensorImpl()->key_set(); + return key_set & kKeysToPropagateToWrapper; +} + +} // namespace at::functorch + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/functorch/DynamicLayer.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/functorch/DynamicLayer.h new file mode 100644 index 0000000000000000000000000000000000000000..56425b673e7f325dc9ca5749b218e77e825e0de1 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/functorch/DynamicLayer.h @@ -0,0 +1,129 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +// Copyright (c) Facebook, Inc. and its affiliates. +// All rights reserved. +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. + +#pragma once +#include +#include +#include +#include +#include +#include +#include +#include +#include + +// Forward declared +namespace c10 { struct AutogradMetaInterface; } + +namespace at::functorch { + +// This file contains the implementation of functorch's interpreter stack. +// See NOTE: [functorch interpreter stack] first before reading on. +// +// NB: the functorch interpreter stack is also referred to as: +// - the "dynamic layer stack" -- an older name for "interpreter" was +// "dynamic layer". +// - the "functorch mode stack". You can think of each functorch transform as a +// "mode" (in the same sense as torch_dispatch mode or torch_function mode), +// and functorch being an implementation of a "mode stack" where the modes +// may be arbitrary composed. + +// DynamicLayer is basically the same thing as an Interpreter. +// It represents a functorch transform and it holds an Interpreter, +// which contains metadata related to the transform and instructions on +// how to perform the transform. 
+// +// TODO: we can excise DynamicLayer in favor of Interpreter, +// But I am going to leave it for now as a compatibility shim to avoid +// needing to refactor a lot of callsites... +struct TORCH_API DynamicLayer { + explicit DynamicLayer( + TransformType transform_type, + int64_t layerId, + std::optional batchSize = std::nullopt, + std::optional randomness = std::nullopt, + std::optional prev_grad_mode = std::nullopt, + std::optional pre_fwd_grad_mode = std::nullopt, + std::optional functionalize_add_back_views = std::nullopt); + + TransformType key() const; + int64_t layerId() const; + + const Interpreter& interpreter() const { return interpreter_; } + Interpreter& interpreter() { return interpreter_; } + + // Only valid for vmap + c10::SymInt batchSize() const; + RandomnessType randomness() const; + + private: + Interpreter interpreter_; +}; + +TORCH_API int64_t initAndPushDynamicLayer( + TransformType transform_type, + std::optional batch_size = std::nullopt, + std::optional randomness = std::nullopt, + std::optional prev_grad_mode = std::nullopt, + std::optional prev_fwd_grad_mode = std::nullopt, + std::optional functionalize_add_back_views = std::nullopt); +TORCH_API DynamicLayer popDynamicLayerAndDeleteMetadata(); +TORCH_API std::optional maybeCurrentDynamicLayer(); +TORCH_API const std::vector& getDynamicLayerStack(); +TORCH_API void setDynamicLayerStack(const std::vector& stack); +TORCH_API void setDynamicLayerFrontBackKeysIncluded(bool included); + +// NOTE: [Life handles and lexically scoped transforms] +// functorch transforms are lexically scoped. +// Given a level, we store a "life handle" that is a boolean that tells us if the +// transform with that level is active or not. +// +// functorch's TensorWrapper (for grad transforms) stores a life handle. +// If a TensorWrapper escapes from the scope of the transform, then somehow +// it must know it escaped; it can tell by querying the life handle. 
+TORCH_API const std::shared_ptr& getLifeHandleForLevel(int64_t level); + +// Returns if an operator is in-place. An operator is inplace if: +// 1. The first argument is a Tensor and it is being written to +// 2. The first argument is being returned +// 3. No other arguments are aliased +// Here is an example of an in-place operator: +// add_(Tensor(a!) self, Tensor other, *, Scalar alpha=1) -> Tensor(a!) +TORCH_API bool isInplaceOp(const c10::FunctionSchema& schema); + +// Given the indices of unwrapped inputs and the schema, this returns the indices of any outputs that should remain unwrapped +TORCH_API std::optional findAliasedOutput(const FunctionSchema& schema, const int64_t immutable_input); + +TORCH_API Tensor unwrapIfDead(const Tensor& tensor); +TORCH_API bool isDeadTensorWrapper(const Tensor& tensor); + +// Pretty printers +TORCH_API std::ostream& operator<<(std::ostream& os, const DynamicLayer& layer); +TORCH_API std::ostream& operator<<(std::ostream& os, const std::vector& dynamicLayerStack); + +// While a functorch transform is active, torch.autograd.function._SingleLevelFunction +// is disabled by default. The following two APIs are APIs for enabling +// it. These are not user-facing APIs. We can delete this in the future, but +// it is useful for debugging when something goes wrong with the +// autograd.Function <> functorch interaction, which uses _SingleLevelFunction, +// because it leads to loud errors if something is incorrect. +TORCH_API void setSingleLevelAutogradFunctionAllowed(bool allowed); +TORCH_API bool getSingleLevelAutogradFunctionAllowed(); + +// While a functorch grad transform is active, Tensor.requires_grad_() gets +// disabled. These two functions are the mechanism to controlling that. 
+TORCH_API void setInplaceRequiresGradAllowed(bool allowed); +TORCH_API bool getInplaceRequiresGradAllowed(); + +TORCH_API DynamicLayer popDynamicLayer(); +TORCH_API int64_t pushDynamicLayer(DynamicLayer&& layer); + +} // namespace at::functorch + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/functorch/FunctionalizeInterpreter.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/functorch/FunctionalizeInterpreter.h new file mode 100644 index 0000000000000000000000000000000000000000..7b11100e7075cdb5b0d45dd9c6c5844608c504fe --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/functorch/FunctionalizeInterpreter.h @@ -0,0 +1,27 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +#include + +namespace at::functorch { + +// This is the interpreter that handles the functionalize() transform. +// See NOTE: [functorch interpreter stack] for more details. + +struct FunctionalizeInterpreterPtr { + explicit FunctionalizeInterpreterPtr(const Interpreter* base): base_(base) { TORCH_INTERNAL_ASSERT(base->key() == TransformType::Functionalize); } + TransformType key() const { return base_->key(); } + int64_t level() const { return base_->level(); } + void processImpl(const c10::OperatorHandle& op, torch::jit::Stack* stack); + void sendToNextInterpreterImpl(const c10::OperatorHandle& op, torch::jit::Stack* stack, bool grad_special_case); + bool functionalizeAddBackViews() const { + return std::get(base_->meta()).functionalizeAddBackViews_; + } + private: + const Interpreter* base_; +}; + +} // namespace at::functorch + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/functorch/Interpreter.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/functorch/Interpreter.h new file mode 100644 index 0000000000000000000000000000000000000000..25c8e6f0dac29208608e4e2ec6582f3163a3cbe8 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/functorch/Interpreter.h @@ -0,0 +1,358 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +namespace at::functorch { + +// NOTE: [functorch interpreter stack] +// +// functorch's dispatching system uses a stack of interpreters. +// Historically we've referred to this as the "DynamicLayerStack". +// +// An interpreter is something that reads in the code it is passed +// and then executes it. We have a different interpreter per-transform: +// the "VmapInterpreter" is responsible for reading in operators (like aten::mv) +// and executing the batched version of it (the batching rule for aten::mv). +// +// Concretely, each interpreter is responsible for two things: +// +// 1) process(ophandle, stack) +// Given an operator handle and a stack of arguments, the interpreter is +// responsible for figuring out how to execute the operation under the semantics +// of the interpreter. For e.g. VmapInterpreter, this is figuring out how to call +// the batching rule. +// +// The batching rules are stored as kernels on the FuncTorchBatched key, so the way +// VmapInterpreter calls the batching rule is roughly: (A) exclude all +// dispatch keys aside from the Batched key, (B) redispatch so we get to the +// Batched key. +// +// 2) sendToNextInterpreter(ophandle, stack) +// The VmapInterpreter, when it sees aten::mv, will process it into a call to +// aten::mm. 
It then needs to send the call to aten::mm to the next interpreter +// in the interpreter stack. +// +// The VmapInterpreter just does this via a call to ophandle.callBoxed(stack) +// and most Interpreters will implement it this way. + +enum class RandomnessType { + Error, // always errors when calling a random function + Same, // randomness appears the same across batches + Different, // randomness appears different across batches + END +}; + +enum class TransformType { + Torch, // Unused + Vmap, + Grad, // reverse-mode AD, aka vjp + Jvp, // forward-mode AD + Functionalize, +}; + +std::ostream& operator<<(std::ostream& os, const TransformType& t); + +// NOTE: [Interpreter "subclassing" design] +// +// How are various Interpreters for different transforms (vmap, grad, ...) +// implemented? +// +// Accessing interpreters is in the hot-path of functorch so we have a constraint +// that this code must be as fast as possible. +// +// As a result, we stay away from virtual methods and this causes our code +// to look a little funny. +// +// `Interpreter` is the struct for Interpreters. It holds ALL of the +// relevant information (what type of interpreter it is and the metadata). +// Metadata for each interpreter is represented as a Union (std::variant) +// of all possible metadata (VmapInterpreterMeta, GradInterpreterMeta, ...). +// +// Given an Interpreter, how do I get a "VmapInterpreter"? You may wish to do this +// if you want to access the metadata fields (like batchSize and randomness). +// +// Each type of interpreter (e.g. Vmap) has a convenience struct +// (e.g. VmapInterpreterPtr) associated with it. +// +// Construct the convenience struct with VmapInterpreterPtr(Interpreter*), +// and then one can access methods on VmapInterpreterPtr like so: +// >>> VmapInterpreterPtr(&interpreter).batchSize() +// +// Finally, Interpreter::process switches on the type of the interpreter +// and calls one of {Transform}Interpreter::processImpl under the hood. 
+// Same for Interpreter::sendToNextInterpreter :) + +struct VmapInterpreterMeta { + explicit VmapInterpreterMeta(c10::SymInt batchSize, RandomnessType randomness) : + batchSize_(std::move(batchSize)), randomness_(randomness) {} + + c10::SymInt batchSize_; + RandomnessType randomness_; + + VmapInterpreterMeta() = default; + VmapInterpreterMeta(const VmapInterpreterMeta&) = default; + VmapInterpreterMeta(VmapInterpreterMeta&&) = default; + VmapInterpreterMeta& operator=(const VmapInterpreterMeta&) = default; + VmapInterpreterMeta& operator=(VmapInterpreterMeta&&) = default; + ~VmapInterpreterMeta() = default; + + template + friend void to_json(T& json_j, const VmapInterpreterMeta& json_t) { + TORCH_CHECK( + !json_t.batchSize_.is_heap_allocated(), + "Serialization for heap-allocated SymInt is not implemented yet" + ); + json_j["batchSize"] = json_t.batchSize_.as_int_unchecked(); + json_j["randomness"] = static_cast(json_t.randomness_); + } + + template + friend void from_json(const T& json_j, VmapInterpreterMeta& json_t) { + json_t.batchSize_ = c10::SymInt(SymInt::Unchecked::UNCHECKED, json_j["batchSize"]); + json_t.randomness_ = static_cast(json_j["randomness"]); + } +}; + +struct GradInterpreterMeta { + explicit GradInterpreterMeta(bool prevGradMode): prevGradMode_(prevGradMode) {} + GradInterpreterMeta() = default; + GradInterpreterMeta(const GradInterpreterMeta&) = default; + GradInterpreterMeta(GradInterpreterMeta&&) = default; + GradInterpreterMeta& operator=(const GradInterpreterMeta&) = default; + GradInterpreterMeta& operator=(GradInterpreterMeta&&) = default; + ~GradInterpreterMeta() = default; + + bool prevGradMode_; + template + friend void to_json(T& json_j, const GradInterpreterMeta& json_t) { + json_j["prevGradMode"] = json_t.prevGradMode_; + } + + template + friend void from_json(const T& json_j, GradInterpreterMeta& json_t) { + json_t.prevGradMode_ = json_j["prevGradMode"]; + } +}; + +struct JvpInterpreterMeta { + explicit JvpInterpreterMeta(bool 
prevFwdGradMode) : prevFwdGradMode_(prevFwdGradMode) {} + JvpInterpreterMeta() = default; + JvpInterpreterMeta(const JvpInterpreterMeta&) = default; + JvpInterpreterMeta(JvpInterpreterMeta&&) = default; + JvpInterpreterMeta& operator=(const JvpInterpreterMeta&) = default; + JvpInterpreterMeta& operator=(JvpInterpreterMeta&&) = default; + ~JvpInterpreterMeta() = default; + + bool prevFwdGradMode_; + template + friend void to_json(T& json_j, const JvpInterpreterMeta& json_t) { + json_j["prevFwdGradMode"] = json_t.prevFwdGradMode_; + } + + template + friend void from_json(const T& json_j, JvpInterpreterMeta& json_t) { + json_t.prevFwdGradMode_ = json_j["prevFwdGradMode"]; + } +}; + +struct FunctionalizeInterpreterMeta { + explicit FunctionalizeInterpreterMeta(bool functionalizeAddBackViews) : + functionalizeAddBackViews_(functionalizeAddBackViews) {} + FunctionalizeInterpreterMeta() = default; + FunctionalizeInterpreterMeta(const FunctionalizeInterpreterMeta&) = default; + FunctionalizeInterpreterMeta(FunctionalizeInterpreterMeta&&) = default; + FunctionalizeInterpreterMeta& operator=(const FunctionalizeInterpreterMeta&) = default; + FunctionalizeInterpreterMeta& operator=(FunctionalizeInterpreterMeta&&) = default; + ~FunctionalizeInterpreterMeta() = default; + + bool functionalizeAddBackViews_; + template + friend void to_json(T& json_j, const FunctionalizeInterpreterMeta& json_t) { + json_j["functionalizeAddBackViews"] = json_t.functionalizeAddBackViews_; + } + + template + friend void from_json(const T& json_j, FunctionalizeInterpreterMeta& json_t) { + json_t.functionalizeAddBackViews_ = json_j["functionalizeAddBackViews"]; + } +}; + +typedef std::variant< + int64_t, + GradInterpreterMeta, + JvpInterpreterMeta, + VmapInterpreterMeta, + FunctionalizeInterpreterMeta +> InterpreterMeta; + + +struct Interpreter { + // factory functions + static Interpreter Vmap(int64_t level, c10::SymInt batchSize, RandomnessType randomness) { + return Interpreter(TransformType::Vmap, 
level, VmapInterpreterMeta(std::move(batchSize), randomness)); + } + static Interpreter Grad(int64_t level, bool prevGradMode) { + return Interpreter(TransformType::Grad, level, GradInterpreterMeta(prevGradMode)); + } + static Interpreter Jvp(int64_t level, bool prevFwdGradMode) { + return Interpreter(TransformType::Jvp, level, JvpInterpreterMeta(prevFwdGradMode)); + } + static Interpreter Functionalize(int64_t level, bool functionalizeAddBackViews) { + return Interpreter(TransformType::Functionalize, level, FunctionalizeInterpreterMeta(functionalizeAddBackViews)); + } + + // methods + TransformType key() const { return type_; } + int64_t level() const { return level_; } + const InterpreterMeta& meta() const { return meta_; } + + void process(const c10::OperatorHandle& op, torch::jit::Stack* stack); + void sendToNextInterpreter(const c10::OperatorHandle& op, torch::jit::Stack* stack, bool grad_special_case); + + void saveLocalDispatchKeySet(c10::impl::LocalDispatchKeySet keyset) { + TORCH_INTERNAL_ASSERT(!savedLocalDispatchKeySet_.has_value()); + savedLocalDispatchKeySet_ = keyset; + } + void clearSavedLocalDispatchKeySet() { + TORCH_INTERNAL_ASSERT(savedLocalDispatchKeySet_.has_value()); + savedLocalDispatchKeySet_ = std::nullopt; + } + c10::impl::LocalDispatchKeySet getSavedLocalDispatchKeySet() const { + TORCH_INTERNAL_ASSERT(savedLocalDispatchKeySet_.has_value()); + return *savedLocalDispatchKeySet_; + } + + // An Interpreter is alive if we are currently inside the ongoing transform + // for the interpreter. For example, vmap(f)(x); inside of f, the vmap's + // corresponding Interpreter is alive, even when it is not on the DynamicLayerStack. 
+ bool is_alive() const { + return *is_alive_; + } + const std::shared_ptr& is_alive_ptr() const { + return is_alive_; + } + void set_is_alive(bool alive) { + *is_alive_ = alive; + } + + // Please don't use this + explicit Interpreter() = default; + + template + friend void to_json(T& json_j, const Interpreter& json_t) { + json_j["type"] = static_cast(json_t.type_); + json_j["level"] = json_t.level_; + if (json_t.savedLocalDispatchKeySet_) { + json_j["savedLocalDispatchKeySet"] = { + {"included", json_t.savedLocalDispatchKeySet_->included_.raw_repr()}, + {"excluded", json_t.savedLocalDispatchKeySet_->excluded_.raw_repr()} + }; + } else { + json_j["savedLocalDispatchKeySet"] = nlohmann::json(); + } + json_j["is_alive"] = *json_t.is_alive_; + std::visit([&](auto&& arg) { + using V = std::decay_t; + if constexpr (std::is_same_v) { + json_j["meta"] = {{"Torch", arg}}; + } else if constexpr (std::is_same_v) { + json_j["meta"] = {{"Grad", arg}}; + } else if constexpr (std::is_same_v) { + json_j["meta"] = {{"Jvp", arg}}; + } else if constexpr (std::is_same_v) { + json_j["meta"] = {{"Vmap", arg}}; + } else if constexpr (std::is_same_v) { + json_j["meta"] = {{"Functionalize", arg}}; + } else { + static_assert(false && sizeof(V), "unknown variant case"); + } + }, json_t.meta_); + } + + template + friend void from_json(const T& json_j, Interpreter& json_t) { + json_t.type_ = static_cast(json_j["type"]); + json_t.level_ = json_j["level"]; + auto savedLocalDispatchKeySet = json_j["savedLocalDispatchKeySet"]; + if (savedLocalDispatchKeySet.is_null()) { + json_t.savedLocalDispatchKeySet_ = std::nullopt; + } else { + c10::impl::PODLocalDispatchKeySet pod; + pod.set_included(DispatchKeySet::from_raw_repr(savedLocalDispatchKeySet["included"].template get())); + pod.set_excluded(DispatchKeySet::from_raw_repr(savedLocalDispatchKeySet["excluded"].template get())); + json_t.savedLocalDispatchKeySet_ = c10::impl::LocalDispatchKeySet(pod); + } + json_t.is_alive_ = 
std::make_shared(json_j["is_alive"]); + auto meta = json_j["meta"]; + if (meta.contains("Torch")) { + json_t.meta_.emplace(meta["Torch"].template get()); + } else if (meta.contains("Grad")) { + json_t.meta_.emplace(meta["Grad"].template get()); + } else if (meta.contains("Jvp")) { + json_t.meta_.emplace(meta["Jvp"].template get()); + } else if (meta.contains("Vmap")) { + json_t.meta_.emplace(meta["Vmap"].template get()); + } else if (meta.contains("Functionalize")) { + json_t.meta_.emplace(meta["Functionalize"].template get()); + } else { + TORCH_CHECK(false, "unknown interpreter metadata type"); + } + } + + std::string serialize() const { + return nlohmann::json(*this).dump(); + } + + static Interpreter deserialize(const std::string& serialized) { + return nlohmann::json::parse(serialized).get(); + } + + private: + explicit Interpreter(TransformType type, int64_t level, InterpreterMeta meta): + type_(type), level_(level), is_alive_(std::make_shared(false)), meta_(std::move(meta)) {} + + // fields + TransformType type_{}; + int64_t level_{}; + std::optional savedLocalDispatchKeySet_; + std::shared_ptr is_alive_; + InterpreterMeta meta_; +}; + +// Applies the following for-loop: +// for i in range(begin, end): +// args[i] = func(args[i]) +void foreachTensorInplace(std::vector& args, int64_t begin, int64_t end, + std::function func); + +// Applies the following for-loop: +// for i in range(begin, end): +// if use_flag_relative[i] == 1: <-- treats use_flag_relative as a bitset +// args[i] = func(args[i], i - begin, true) +// args[i] = func(args[i], i - begin) +void foreachTensorInplaceWithFlag(std::vector& args, int64_t begin, int64_t end, + const std::bitset<64> use_flag_relative, const std::function& func); + +std::vector findUnwrappedInputs(std::vector& args, int64_t begin, int64_t end); + +DispatchKeySet keysToExcludeWhenEnteringDynamicLayer(TransformType key); + +void setup_dispatch_key_tls(TransformType key, DispatchKeySet include); + +void 
sanityCheckStack(const c10::OperatorHandle& op, torch::jit::Stack* stack); + +} // namespace at::functorch + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/functorch/PlumbingHelper.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/functorch/PlumbingHelper.h new file mode 100644 index 0000000000000000000000000000000000000000..4ba1c6bf041735558f119d0921921d2b10ba0d9f --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/functorch/PlumbingHelper.h @@ -0,0 +1,68 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +// Copyright (c) Facebook, Inc. and its affiliates. +// All rights reserved. +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. +#pragma once +#include +#include +#include + +// NOTE: [vmap plumbing] +// +// Here's how "batching rules" work. +// - we register kernels to the Batched key +// - these kernels have the same signatures as the original operators. +// For example, at::sin(Tensor self) accepts a Tensor, and the batched kernel +// must also accept a Tensor +// - However, it is more natural for users to write a batching rule like the +// following: sin_batch_rule(Tensor self, std::optional self_bdim) +// - There is some codegenerated layer (the "plumbing") that wraps the user +// defined batching rule (e.g. sin_batch_rule) in a kernel that can be +// registered to the Batched key. +// +// The plumbing is responsible for wrapping a batching rule into a form that may +// be registered as the kernel for the batched key. 
+ +namespace at::functorch { + +void vmap_check_escaped(const std::optional &layer, const char* what); + +// Create a BatchedTensor given a tensor, bdim, and level +TORCH_API Tensor makeBatched(Tensor tensor, std::optional bdim, int64_t level); + +// Given a Tensor that may or may not be a BatchedTensor, unwrap it. +// If `tensor` is not a BatchedTensor, or is a BatchedTensor but the level +// doesn't match, then this returns (tensor, std::nullopt). +// Otherwise, it returns (unwrap(tensor), bdim). +TORCH_API std::tuple> unwrapTensorAtLevel(const Tensor& tensor, int64_t level); + +// Creates a vector of BatchedTensor +TORCH_API std::vector makeBatchedVector(std::vector tensors, std::optional bdim, int64_t level); + +// Returns True if ANY tensor in tensors is batched at level +TORCH_API bool isBatchedAtLevel(ITensorListRef tensors, int64_t level); +TORCH_API bool isBatchedAtLevel(const c10::List>& maybe_tensors, int64_t level); +TORCH_API bool isBatchedAtLevel(const Tensor& tensor, int64_t level); +TORCH_API bool isBatchedAtLevel(const std::optional& maybe_tensor, int64_t level); + +// Convenience helper. Returns true if any tensor is batched at level +TORCH_API bool areAnyBatchedAtLevel(ArrayRef> maybe_tensors, int64_t level); + +inline bool ivalueParticipatesInCurrentLevel(const IValue& ivalue) { + if (ivalue.isTensor()) { + auto maybe_level = maybeCurrentDynamicLayer(); + TORCH_INTERNAL_ASSERT(maybe_level.has_value()); + auto current_level = maybe_level->layerId(); + return isBatchedAtLevel(ivalue.toTensor(), current_level); + } + // TODO: should really check this + return false; +} + +} // namespace at::functorch + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/functorch/TensorWrapper.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/functorch/TensorWrapper.h new file mode 100644 index 0000000000000000000000000000000000000000..4f2d6f5c708b58f2997d40051c7bdbe9cf3f236f --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/functorch/TensorWrapper.h @@ -0,0 +1,108 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +// Copyright (c) Facebook, Inc. and its affiliates. +// All rights reserved. +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. + +#pragma once + +#include +#include +#include + +namespace at::functorch { + +// NOTE: [functorch's TensorWrapper] +// +// Taking better suggestions for a name. TensorWrapper is the wrapper Tensor +// Subclass for functorch's grad-based transforms (grad, vjp, jvp). It is +// analogous to how vmap uses BatchedTensor as the wrapper Tensor subclass. +// +// If you're familiar with the Tensor-Variable merge, TensorWrapper is effectively +// another Variable. +// +// Consider grad(grad(torch.sin))(x). This wraps `x` as TensorWrapper(TensorWrapper(x)). +// The reason why is so that each TensorWrapper can hold its own AutogradMeta and +// participate in a **separate** autograd graph. +// +// There are alternative designs we could have chosen (e.g. each grad transform +// stores a weak map of Tensor -> AutogradMeta); the benefit of the TensorWrapper +// design is that we can reuse existing VariableType kernels (i.e. Autograd kernels) +// without much modification. 
Since a TensorWrapper looks like a regular Tensor, +// the VariableType kernel can pull out the AutogradMeta struct from where it +// expects and extend the autograd graph + +struct TORCH_API TensorWrapper : public c10::TensorImpl { + explicit TensorWrapper( + c10::DispatchKeySet key_set, + Tensor value, + int64_t level, + std::shared_ptr is_alive, + bool is_immutable = false, // if true, this came from an operation that aliases an immutable tensor + bool use_value_sizes_strides = true); + + void refreshMetadata(); + + const Tensor& value() const { + return value_; + } + std::optional level() const { + if (is_alive()) { + return level_; + } + return {}; + } + bool is_immutable() const { + return is_immutable_; + } + bool is_alive() const; + + // Overrides necessary for autograd + c10::intrusive_ptr shallow_copy_and_detach( + const c10::VariableVersion& version_counter, + bool allow_tensor_metadata_change) const override; + c10::intrusive_ptr shallow_copy_and_detach( + c10::VariableVersion&& version_counter, + bool allow_tensor_metadata_change) const override; + void shallow_copy_from(const c10::intrusive_ptr& impl) override; + + private: + const char* tensorimpl_type_name() const override; + Tensor value_; + int64_t level_; + bool is_immutable_; + + // TensorWrapper receives a boolean flag on whether or not the Grad Interpreter + // that created it is still alive or not. + // If the Grad Interpreter is no longer alive then it attempts to behave like + // a regular Tensor. + // + // When we exit the level, this wrapper may be marked as "not alive". + // Wrappers that are not alive: + // 1) May still have autograd metadata on them + // 2) Forward dispatches to the underlying value() + std::shared_ptr is_alive_; +}; + +// There are two variants of makeTensorWrapper: one that accepts a level +// and one that accepts an Interpreter. +// +// The one that accepts a level tries to automatically get the life handle from the +// interpreter on the DynamicLayerStack. 
+// It needs to be used with caution: if the interpreter is not on the +// DynamicLayerStack, then we won't be able to find the life handle. +// +// In practice this isn't a problem: when we're constructing TensorWrapper in +// Python, the corresponding interpreter is on the stack. +TORCH_API Tensor makeTensorWrapper(const Tensor& tensor, int64_t level, bool is_immutable=false); +TORCH_API Tensor makeTensorWrapper(const Tensor& tensor, const Interpreter& interpreter, bool is_immutable=false); +TORCH_API TensorWrapper* maybeGetTensorWrapper(const Tensor& tensor); +TORCH_API void dumpTensor(std::ostream & ss, const Tensor& tensor); +TORCH_API void dumpTensorCout(const Tensor& tensor); + +} // namespace at::functorch + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/ao_sparse/quantized/cpu/fbgemm_utils.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/ao_sparse/quantized/cpu/fbgemm_utils.h new file mode 100644 index 0000000000000000000000000000000000000000..1a716caaaabe88e4ef2fbcb416a5badb181029c6 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/ao_sparse/quantized/cpu/fbgemm_utils.h @@ -0,0 +1,102 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include +#include + +#ifdef USE_FBGEMM +C10_DIAGNOSTIC_PUSH_AND_IGNORED_IF_DEFINED("-Wextra-semi") +#include +#include +#include +C10_DIAGNOSTIC_POP() + + +namespace ao::sparse { + +struct TORCH_API PackedLinearWeight + : public LinearPackedParamsBase { + PackedLinearWeight(std::unique_ptr> w, + std::optional bias, + std::vector col_offsets, + std::vector w_scale, + std::vector w_zp, + c10::QScheme q_scheme, + const int64_t out_features_block_size /* block 
sparsity size across output_features */, + const int64_t in_features_block_size /* block sparsity size across input_features */) + : LinearPackedParamsBase( + out_features_block_size, + in_features_block_size), + w(std::move(w)), + bias_(std::move(bias)), + col_offsets(std::move(col_offsets)), + w_scale(std::move(w_scale)), + w_zp(std::move(w_zp)), + q_scheme(q_scheme) {} + std::unique_ptr> w; + std::optional bias_; + std::vector col_offsets; + std::vector w_scale; + std::vector w_zp; + c10::QScheme q_scheme; + + at::Tensor apply( + const at::Tensor& input, + double output_scale, + int64_t output_zero_point) override; + at::Tensor apply_relu( + const at::Tensor& input, + double output_scale, + int64_t output_zero_point) override; + + at::Tensor apply_dynamic(const at::Tensor& input) override { + TORCH_INTERNAL_ASSERT( + false, + "Sparse quantized dynamic linear with fused relu is not yet " + "supported on qnnpack backend."); + return at::Tensor(); + } + at::Tensor apply_dynamic_relu(const at::Tensor& input) override { + TORCH_INTERNAL_ASSERT( + false, + "Sparse quantized dynamic linear with fused relu is not yet " + "supported on qnnpack backend."); + return at::Tensor(); + } + + LinearPackedSerializationType unpack() override; + + BCSRSerializationType serialize() override; + + static c10::intrusive_ptr deserialize( + const BCSRSerializationType& serialized); + + std::optional bias() override { + return bias_; + } + + static c10::intrusive_ptr prepack( + const at::Tensor& weight, + const std::optional& bias, + const int64_t out_features_block_size, + const int64_t in_features_block_size); + + private: + template + at::Tensor apply_impl( + const at::Tensor& input, + double output_scale, + int64_t output_zero_point); +}; + +} // namespace ao::sparse + +#endif // USE_FBGEMM + +namespace ao::sparse { +int register_linear_params(); +} // namespace ao::sparse + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is 
defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/ao_sparse/quantized/cpu/packed_params.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/ao_sparse/quantized/cpu/packed_params.h new file mode 100644 index 0000000000000000000000000000000000000000..191b1e160cb4ffdb7e24c75d45b1a82c3cc4a7b7 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/ao_sparse/quantized/cpu/packed_params.h @@ -0,0 +1,78 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include + +#include +#include + +namespace ao::sparse { + +// +using LinearPackedSerializationType = + std::tuple, std::vector>; + +#define SPARSE_LINEAR_PACKED_PARAM_SERIALIZATION_VERSION 2 + +using BCSRSerializationType = + std::tuple< + int64_t, // Serialization Version + std::optional, // Bias + int64_t, // Out Features (Row) Block Size + int64_t, // In Features (Column) Block Size + at::Tensor, // Weight Scales (single element vector if per-tensor) (float) + at::Tensor, // Wrapper for Weight Zero Points (single element vector if per-tensor) (int8_t) + bool, // Quantization Scheme (true: per tensor, false: per channel) + at::Tensor, // Wrapper for Row Block Indices (int8_t, int16_t, or int32_t) + at::Tensor, // Wrapper for Column Block Indices (int8_t, int16_t, or int32_t) + at::Tensor, // Wrapper for Non-Zero Weight Values, each +128 (uint8_t) + int64_t, // Number of Output Channels + int64_t // Number of Input Channels + >; + +using BCSR = + std::tuple< + std::vector, // Non-Zero Weight Values + std::vector, // Compressed Row Block Indices + std::vector // Column Block Indices + >; + +struct LinearPackedParamsBase : public torch::jit::CustomClassHolder { + public: + LinearPackedParamsBase( + const int64_t out_features_block_size, + const int64_t 
in_features_block_size) + : out_features_block_size_(out_features_block_size), + in_features_block_size_(in_features_block_size) {} + + virtual at::Tensor apply( + const at::Tensor& input, + double output_scale, + int64_t output_zero_point) = 0; + virtual at::Tensor apply_relu( + const at::Tensor& input, + double output_scale, + int64_t output_zero_point) = 0; + + virtual at::Tensor apply_dynamic(const at::Tensor& input) = 0; + virtual at::Tensor apply_dynamic_relu(const at::Tensor& input) = 0; + + virtual LinearPackedSerializationType unpack() = 0; + + virtual BCSRSerializationType serialize() = 0; + + virtual std::optional bias() = 0; + + virtual void set_bias(const std::optional& bias) { + TORCH_CHECK(false, "set_bias is not implemented for this packed parameter type"); + } + + protected: + const int64_t out_features_block_size_, in_features_block_size_; +}; + +} // namespace ao::sparse + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/ao_sparse/quantized/cpu/qnnpack_utils.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/ao_sparse/quantized/cpu/qnnpack_utils.h new file mode 100644 index 0000000000000000000000000000000000000000..cdd7f91cb49c918f4a1a64d174f05dd16a84e144 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/ao_sparse/quantized/cpu/qnnpack_utils.h @@ -0,0 +1,95 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include +#include + +#ifdef USE_PYTORCH_QNNPACK +// TODO: Refacto QnnpackUtils.h so as to separate code +// needed for quantized op from the generic qnnpack specific +// quantization utilities. 
+#include +#include +#include + +namespace ao::sparse { + +struct TORCH_API PackedLinearWeightQnnp : public LinearPackedParamsBase { + PackedLinearWeightQnnp(const at::Tensor& weight, const std::optional& bias, const int64_t out_features_block_size /* block sparsity size across output_features */, const int64_t in_features_block_size /* block sparsity size across input_features */); + explicit PackedLinearWeightQnnp(const BCSRSerializationType& serialized); + std::optional orig_bias_; + // Separate copy of bias exist so that we can fill in zeros when + // optional bias does not exist. This is to compy with qnnpack operator that + // expects bias to be present. + // In case bias is present bias_ is just a reference to orig_bias_ + at::Tensor bias_; + c10::QScheme q_scheme_; + double input_scale_{}; + std::unique_ptr bcsr_matrix_; + at::Tensor w_scales_; + std::vector w_zero_points_; + std::vector requantization_scales_; + std::unique_ptr + sparse_linear_op_{nullptr}; + int64_t output_channels_; + int64_t input_channels_; + // Deserialized Tensors are stored to maintain the lifetime of underlying + // BCSR data. + // These are left empty if PackedLinearWeightQnnp is created via prepacking + // rather than deserializing. 
+ at::Tensor deserialized_bcsr_row_block_indices_; + at::Tensor deserialized_bcsr_col_block_indices_; + at::Tensor deserialized_bcsr_weight_values_; + + at::Tensor apply( + const at::Tensor& input, + double output_scale, + int64_t output_zero_point) override { + TORCH_CHECK( + false, "Static quantized sparse linear unimplemented on QNNPACK"); + } + at::Tensor apply_relu( + const at::Tensor& input, + double output_scale, + int64_t output_zero_point) override { + TORCH_CHECK( + false, "Static quantized sparse linear unimplemented on QNNPACK"); + } + + at::Tensor apply_dynamic(const at::Tensor& input) override; + at::Tensor apply_dynamic_relu(const at::Tensor& input) override; + + LinearPackedSerializationType unpack() override; + + BCSRSerializationType serialize() override; + + static c10::intrusive_ptr deserialize( + const BCSRSerializationType& serialized); + + std::optional bias() override { + return orig_bias_; + } + + static c10::intrusive_ptr prepack( + const at::Tensor& weight, + const std::optional& bias, + const int64_t out_features_block_size, + const int64_t in_features_block_size); + + private: + template + at::Tensor apply_impl( + const at::Tensor& input, + double output_scale, + int64_t output_zero_point); + template + at::Tensor apply_dynamic_impl(const at::Tensor& input); +}; + +} // namespace ao::sparse + +#endif // USE_PYTORCH_QNNPACK + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/cpu/AtomicAddFloat.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/cpu/AtomicAddFloat.h new file mode 100644 index 0000000000000000000000000000000000000000..1c10579e97b97a90e215e764777c120763df1c09 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/cpu/AtomicAddFloat.h @@ -0,0 +1,42 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#ifndef ATOMIC_ADD_FLOAT +#define ATOMIC_ADD_FLOAT + +#if (defined(__x86_64__) || defined(__i386__) || defined(__aarch64__)) +#include +#else +#define _mm_pause() +#endif + +#include + +static inline void cpu_atomic_add_float(float* dst, float fvalue) +{ + typedef union { + unsigned intV; + float floatV; + } uf32_t; + + uf32_t new_value, old_value; + std::atomic* dst_intV = (std::atomic*)dst; + + old_value.floatV = *dst; + new_value.floatV = old_value.floatV + fvalue; + + unsigned* old_intV = &old_value.intV; + while (!std::atomic_compare_exchange_strong(dst_intV, old_intV, new_value.intV)) { +#ifdef __aarch64__ + __asm__ __volatile__("yield;" : : : "memory"); +#else + _mm_pause(); +#endif + old_value.floatV = *dst; + new_value.floatV = old_value.floatV + fvalue; + } +} + +#endif + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/cpu/ChannelShuffleKernel.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/cpu/ChannelShuffleKernel.h new file mode 100644 index 0000000000000000000000000000000000000000..242fdcbdd5eeaa2e893cad9dcc35a9cfe57d2183 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/cpu/ChannelShuffleKernel.h @@ -0,0 +1,19 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +#include +#include + +namespace at { +class TensorBase; +} + +namespace at::native { + +using channel_shuffle_fn = void(*)(TensorBase&, const TensorBase&, int64_t); +DECLARE_DISPATCH(channel_shuffle_fn, channel_shuffle_kernel) + +} // at::native + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/cpu/DepthwiseConvKernel.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/cpu/DepthwiseConvKernel.h new file mode 100644 index 0000000000000000000000000000000000000000..58cc1809d43f8a937d2485fd40070e723f39db48 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/cpu/DepthwiseConvKernel.h @@ -0,0 +1,26 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include +#include + +/* + Depthwise 3x3 Winograd convolution operator +*/ + +namespace at { +class Tensor; + +namespace native { + +using convolution_depthwise3x3_winograd_fn = + Tensor (*)(const Tensor &, const Tensor &, const Tensor &, IntArrayRef, IntArrayRef, int64_t); + +DECLARE_DISPATCH(convolution_depthwise3x3_winograd_fn, convolution_depthwise3x3_winograd_stub) + +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/cpu/Elu.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/cpu/Elu.h new file mode 100644 index 0000000000000000000000000000000000000000..a371690139f4b05463eb97488e3b7e655a977601 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/cpu/Elu.h @@ -0,0 +1,79 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// On Windows, math.h needs to be included with _USE_MATH_DEFINES defined to +// access constants such as M_SQRT2 and M_2_SQRTPI. 
+#ifdef _WIN32 +#define _USE_MATH_DEFINES +#include +#endif // _WIN32 + +#include +#include // For c10::is_reduced_floating_point_v. + +namespace at::native { +inline namespace CPU_CAPABILITY { +/** + * Return a function object that calculates ELU with the given + * parameters on its input element. ParamT is the type of the input + * and output to the ELU, and MathT is the type (possibly + * higher-precision, e.g. float if ParamT is reduced-precision float) + * in which to do intermediate calculations. + */ +template +auto get_scalar_elu_elementwise_func(MathT alpha, MathT scale, MathT input_scale) { + const auto negcoef = alpha * scale; + const auto poscoef = scale; + const auto negiptcoef = input_scale; + return [negcoef, negiptcoef, poscoef](ParamT a) -> ParamT { + return MathT(a) < MathT(0) + ? std::expm1(MathT(a) * negiptcoef) * negcoef + : MathT(a) * poscoef; + }; +} + +/** + * Return a function object that calculates ELU with the given + * parameters on its input element. The function object takes and + * returns Vectorized. + */ +template , bool> = true> +auto get_vectorized_elu_elementwise_func(T alpha, T scale, T input_scale) { + const vec::Vectorized negcoef_vec(alpha * scale); + const vec::Vectorized poscoef_vec(scale); + const vec::Vectorized negiptcoef_vec(input_scale); + const vec::Vectorized zero_vec(static_cast(0)); + return [negcoef_vec, poscoef_vec, negiptcoef_vec, zero_vec](vec::Vectorized a) -> vec::Vectorized { + const auto cmp = a >= zero_vec; + if (!cmp.zero_mask()) { + return a * poscoef_vec; + } else { + return vec::Vectorized::blendv((a * negiptcoef_vec).expm1() * negcoef_vec, a * poscoef_vec, cmp); + } + }; +} + +/** + * Return a function object that calculates ELU with the given + * parameters on its input element. The function object takes and + * returns Vectorized, and Vectorized is the type + * (possibly higher-precision) in which to do intermediate + * calculations. 
+ */ +template , bool> = true> +auto get_vectorized_elu_elementwise_func(float alpha, float scale, float input_scale) { + // Takes float->float. + const auto float_func = get_vectorized_elu_elementwise_func(alpha, scale, input_scale); + return [float_func](vec::Vectorized a) -> vec::Vectorized { + auto [a0, a1] = vec::convert_to_float(a); + auto res0 = float_func(a0); + auto res1 = float_func(a1); + return vec::convert_from_float(res0, res1); + }; +} +} // namespace CPU_CAPABILITY +} // namespace at::native + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/cpu/GridSamplerKernel.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/cpu/GridSamplerKernel.h new file mode 100644 index 0000000000000000000000000000000000000000..657e54f03a7255746b63f5f7447d8c6e30fd5a70 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/cpu/GridSamplerKernel.h @@ -0,0 +1,39 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include + +#include +#include + +namespace at { +class TensorBase; +} + +namespace at::native { + +using forward_2d_fn = void (*) ( + const TensorBase &output, + const TensorBase &input, + const TensorBase &grid, + int64_t interpolation_mode, + int64_t padding_mode, + bool align_corners); +using backward_2d_fn = void (*) ( + const TensorBase &grad_input, + const TensorBase &grad_grid, + const TensorBase &grad_output, + const TensorBase &input, + const TensorBase &grid, + int64_t interpolation_mode, + int64_t padding_mode, + bool align_corners, + std::array output_mask); +DECLARE_DISPATCH(forward_2d_fn, grid_sampler_2d_cpu_kernel) +DECLARE_DISPATCH(backward_2d_fn, grid_sampler_2d_backward_cpu_kernel) + +} // 
namespace at::native + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/cpu/IndexKernelUtils.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/cpu/IndexKernelUtils.h new file mode 100644 index 0000000000000000000000000000000000000000..3c4e71dc5f1e101980fcedc45e591eaff3bd10d3 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/cpu/IndexKernelUtils.h @@ -0,0 +1,90 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +#include +#include + +namespace at::native { + +inline bool is_constant_index(int ntensor, const int64_t* strides) { + AT_ASSERT(ntensor >= 3); + for (const auto arg : c10::irange(2, ntensor)) { + if (strides[arg] != 0) { + return false; + } + } + return true; +} + + +struct Indexer { + Indexer(int64_t num_indexers, char** indexers, const int64_t* indexer_strides, + IntArrayRef original_sizes, IntArrayRef original_strides) + : num_indexers(num_indexers) + , indexers(indexers) + , indexer_strides(indexer_strides) + , original_strides(original_strides.data()) + , original_sizes(original_sizes.data()) { + AT_ASSERT(static_cast(original_strides.size()) == num_indexers); + AT_ASSERT(static_cast(original_sizes.size()) == num_indexers); + } + + int64_t num_indexers; + char** indexers; + const int64_t* indexer_strides; + const int64_t* original_strides; + const int64_t* original_sizes; + + int64_t get(int64_t idx) { + int64_t offset = 0; + for (const auto j : c10::irange(num_indexers)) { + int64_t value = *(int64_t*)&indexers[j][idx * indexer_strides[j]]; + int64_t size = original_sizes[j]; + TORCH_CHECK_INDEX(value >= -size && value < size, + "index ", value, " is out of bounds for dimension ", j, " with 
size ", size); + if (value < 0) { + value += size; + } + offset += value * original_strides[j]; + } + return offset; + } +}; + +template +void cpu_index_kernel(TensorIteratorBase& iter, IntArrayRef index_size, IntArrayRef index_stride, + const func_t& f, bool serial_execution=false) +{ + int ntensor = iter.ntensors(); + // When launch the index parallel version, set a relative small grain size less than the INTERNAL::GRAIN_SIZE + // to make the whole available thread numbers get more balanced work load and a better cache location. + // The grain size here is chosen by the op benchmark to overcome the thread launch overhead + const int index_parallel_grain_size = 3000; + auto loop = [&](char** data, const int64_t* strides, int64_t n) { + auto indexer = Indexer(ntensor - 2, &data[2], &strides[2], index_size, index_stride); + char* dst = data[0]; + char* src = data[1]; + if (is_constant_index(ntensor, strides)) { + // specialization for when every element uses the same index + int64_t offset = indexer.get(0); + for (const auto i : c10::irange(n)) { + f(dst + strides[0] * i, src + strides[1] * i, offset); + } + } else { + for (const auto i : c10::irange(n)) { + int64_t offset = indexer.get(i); + f(dst + strides[0] * i, src + strides[1] * i, offset); + } + } + }; + if (serial_execution) { + iter.serial_for_each(loop, {0, iter.numel()}); + } else { + iter.for_each(loop, index_parallel_grain_size); + } +} +} // at +// native + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/cpu/Intrinsics.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/cpu/Intrinsics.h new file mode 100644 index 0000000000000000000000000000000000000000..760927f022034d5b7ae136ece8afac3cf9f1d4d3 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/cpu/Intrinsics.h @@ -0,0 +1,38 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#if defined(__clang__) && (defined(__x86_64__) || defined(__i386__)) +/* Clang-compatible compiler, targeting x86/x86-64 */ +#include +#elif defined(_MSC_VER) +/* Microsoft C/C++-compatible compiler */ +#include +#if _MSC_VER <= 1900 +#define _mm256_extract_epi64(X, Y) (((uint64_t*)&X)[Y]) +#endif +#elif defined(__GNUC__) && (defined(__x86_64__) || defined(__i386__)) +/* GCC-compatible compiler, targeting x86/x86-64 */ +#include +#elif defined(__GNUC__) && defined(__ARM_NEON__) +/* GCC-compatible compiler, targeting ARM with NEON */ +#include +#elif defined(__GNUC__) && defined(__IWMMXT__) +/* GCC-compatible compiler, targeting ARM with WMMX */ +#include +#elif (defined(__GNUC__) || defined(__xlC__)) && \ + (defined(__VEC__) || defined(__ALTIVEC__)) +/* XLC or GCC-compatible compiler, targeting PowerPC with VMX/VSX */ +#include +/* We need to undef those tokens defined by to avoid conflicts + with the C++ types. => Can still use __bool/__vector */ +#undef bool +#undef vector +#undef pixel +#elif defined(__GNUC__) && defined(__SPE__) +/* GCC-compatible compiler, targeting PowerPC with SPE */ +#include +#endif + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/cpu/IsContiguous.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/cpu/IsContiguous.h new file mode 100644 index 0000000000000000000000000000000000000000..caf687c87dfc9820600bd4c581a0125acb821e82 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/cpu/IsContiguous.h @@ -0,0 +1,69 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +namespace at::native { inline namespace CPU_CAPABILITY { + +// n: number of function arguments (arity) +// traits: function_traits (see FunctionTraits.h) +// s: index of scalar argument or -1 +template +struct IsContiguous { + static bool eval(const int64_t* strides) { + using type = typename traits::template arg::type; + return strides[stride_index] == (s == n ? 0 : sizeof(type)) && + IsContiguous::eval(strides); + } +}; + +// will be called when there is an output exists +template +struct IsContiguous<0, 0, traits, s> { + static bool eval(const int64_t* strides) { + return strides[0] == sizeof(typename traits::result_type); + } +}; + +// will be called when there is no output +template +struct IsContiguous<0, -1, traits, s> { + static bool eval(const int64_t* /*strides*/) { + return true; + } +}; + +// output and all inputs are contiguous +template < + typename traits, + std::enable_if_t>* = + nullptr> +static inline bool is_contiguous(const int64_t* strides) { + return IsContiguous::eval(strides); +} + +template >* = nullptr> +static inline bool is_contiguous(const int64_t* strides) { + return IsContiguous::eval(strides); +} + +// input at `s` is scalar (stride 0); output and other inputs are contiguous +// NB: output is typically at strides[0] so first input corresponds to s=1 +template >* = nullptr> +static inline bool 
is_contiguous_scalar(const int64_t* strides) { + static_assert(s > 0 && s <= traits::arity, "scalar argument index out of bounds"); + return IsContiguous::eval(strides); +} + +template >* = nullptr> +static inline bool is_contiguous_scalar(const int64_t* strides) { + static_assert(s > 0 && s <= traits::arity, "scalar argument index out of bounds"); + return IsContiguous::eval(strides); +} + +}} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/cpu/LogSoftmaxKernelImpl.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/cpu/LogSoftmaxKernelImpl.h new file mode 100644 index 0000000000000000000000000000000000000000..64c68e9f263bdb816f39bafdff54ea9d3b3a8ef3 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/cpu/LogSoftmaxKernelImpl.h @@ -0,0 +1,342 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +namespace at::native { +inline namespace CPU_CAPABILITY { +template +int64_t vec_log_softmax_lastdim_chunk_size(int64_t grain_size, int64_t outer_size, int64_t dim_size) { + // Coincidentally, at::internal::GRAIN_SIZE is 32768, which is equal to the + // size of L1D cache on many processors. Some processors have 48 KB L1D cache + // nowadays, so maybe in the future, we can leverage the knowledge of a + // machine's L1D cache size. 
+ int64_t MAX_CHUNK_SIZE = std::max( + 1, + grain_size / (sizeof(scalar_t) * dim_size)); + return std::min(MAX_CHUNK_SIZE, outer_size); +} + +template +void serial_vec_log_softmax_lastdim_range( + const scalar_t* input_data_base, + scalar_t* output_data_base, + int64_t dim_size, + int64_t chunk_size, + int64_t begin, + int64_t end) { + if (end <= begin) { + return; + } + using Vec = vec::Vectorized>; + // MSVC requires such a declaration of dynamic arrays + // Source: https://stackoverflow.com/a/33423538 + auto tmp_sum_scalar = std::make_unique(chunk_size); + auto max_input_arr = std::make_unique(chunk_size); + for (int64_t ii = begin; ii < end; ii += chunk_size) { + int64_t loop_end = chunk_size; + if (ii + chunk_size > end) { + loop_end = end - ii; + } + for (const auto j : c10::irange(loop_end)) { + int64_t i = ii + j; + const scalar_t* input_data = input_data_base + i * dim_size; + max_input_arr[j] = vec::reduce_all( + [](Vec& x, Vec& y) { return vec::maximum(x, y); }, + input_data, + dim_size); + } + for (const auto j : c10::irange(loop_end)) { + int64_t i = ii + j; + const scalar_t* input_data = input_data_base + i * dim_size; + scalar_t max_input = max_input_arr[j]; + tmp_sum_scalar[j] = vec::map_reduce_all( + [max_input](Vec x) { return (x - Vec(max_input)).exp(); }, + [](Vec x, Vec y) { return x + y; }, + input_data, + dim_size); + } + // See [Note AVX-SSE transitions] for why this should call the + // vectorized version (aside from perf improvements). + vec::map( + [](Vec x) { return x.log(); }, + tmp_sum_scalar.get(), + tmp_sum_scalar.get(), + loop_end); + for (const auto j : c10::irange(loop_end)) { + int64_t i = ii + j; + const scalar_t* input_data = input_data_base + i * dim_size; + scalar_t* output_data = output_data_base + i * dim_size; + scalar_t tmp_sum = tmp_sum_scalar[j]; + scalar_t max_input = max_input_arr[j]; + + // It's necessary to keep the order of the operations below. 
+ // In some cases that input is large digits and the difference + // is small, if we compute `max_input` plus `tmp_sum` before, + // there would be a numerical problem. See an example in + // https://github.com/pytorch/pytorch/issues/11752#issuecomment-422883379 + vec::map( + [tmp_sum, max_input](Vec x) { + return x - Vec(max_input) - Vec(tmp_sum); + }, + output_data, + input_data, + dim_size); + } + } +} + +// Can't include ATen/Parallel.h. +// TODO: find a way to have only one copy of divup. +inline int64_t divup(int64_t x, int64_t y) { + return (x + y - 1) / y; +} + +template +std::pair vec_logsoftmax_chunk_size_and_num_chunks(int64_t inner_size, int64_t dim_size) { + using Vec = vec::Vectorized; + int64_t MAX_CHUNK_SIZE = std::max(BLOCK_SIZE / dim_size / sizeof(scalar_t), Vec::size()); + MAX_CHUNK_SIZE = MAX_CHUNK_SIZE / Vec::size() * Vec::size(); + int64_t CHUNK_SIZE = std::min(MAX_CHUNK_SIZE, inner_size); + int64_t num_chunks = divup(inner_size, CHUNK_SIZE); + return {CHUNK_SIZE, num_chunks}; +} + +template +std::enable_if_t>, void> +serial_vec_logsoftmax_range( + const scalar_t* input_data_base, + scalar_t* output_data_base, + int64_t inner_size, + int64_t chunk_size, + int64_t num_chunks, + int64_t dim_size, + int64_t begin, + int64_t end) { + using Vec = vec::Vectorized; + // thread local temp buffer which holds vertical reduction result: max and sum. 
+ auto buffer = std::make_unique(chunk_size * 2); + scalar_t* input_max_data = buffer.get(); + scalar_t* tmp_sum_data = buffer.get() + chunk_size; + + for (int64_t i = begin; i < end; i++) { + int64_t outer_idx = i / num_chunks; + int64_t k = i % num_chunks; + int64_t inner_idx_begin = k * chunk_size; + int64_t size = std::min(chunk_size, inner_size - inner_idx_begin); + + // init + Vec zero_vec = Vec(scalar_t(0)); + Vec min_vec = Vec(-std::numeric_limits::infinity()); + int64_t d0 = 0; + for (; d0 < size - (size % Vec::size()); d0 += Vec::size()) { + min_vec.store(input_max_data + d0); + zero_vec.store(tmp_sum_data + d0); + } + for (; d0 < size; d0++) { + input_max_data[d0] = -std::numeric_limits::infinity(); + tmp_sum_data[d0] = scalar_t(0); + } + + // compute max + for (int64_t dim_idx = 0; dim_idx < dim_size; dim_idx++) { + const scalar_t* input_ptr = input_data_base + outer_idx * dim_size * inner_size + + dim_idx * inner_size + inner_idx_begin; + + int64_t d1 = 0; + for (; d1 < size - (size % Vec::size()); d1 += Vec::size()) { + Vec data_vec = Vec::loadu(input_ptr + d1); + Vec max_vec = Vec::loadu(input_max_data + d1); + max_vec = Vec::blendv(max_vec, data_vec, data_vec > max_vec); + max_vec.store(input_max_data + d1); + } + for (; d1 < size; d1++) { + scalar_t data_val = input_ptr[d1]; + scalar_t max_val = input_max_data[d1]; + input_max_data[d1] = data_val > max_val ? 
data_val : max_val; + } + } + + // compute sum of (x - max).exp() + for (int64_t dim_idx = 0; dim_idx < dim_size; dim_idx++) { + const scalar_t* input_ptr = input_data_base + outer_idx * dim_size * inner_size + + dim_idx * inner_size + inner_idx_begin; + + int64_t d2 = 0; + for (; d2 < size - (size % Vec::size()); d2 += Vec::size()) { + Vec data_vec = Vec::loadu(input_ptr + d2); + Vec sum_vec = Vec::loadu(tmp_sum_data + d2); + Vec max_vec = Vec::loadu(input_max_data + d2); + sum_vec += (data_vec - max_vec).exp(); + sum_vec.store(tmp_sum_data + d2); + } + for (; d2 < size; d2++) { + scalar_t data_val = input_ptr[d2]; + scalar_t max_val = input_max_data[d2]; + tmp_sum_data[d2] += std::exp(data_val - max_val); + } + } + + // apply log + vec::map([](Vec x) { return x.log(); }, tmp_sum_data, tmp_sum_data, size); + + // compute x - max - sum + for (int64_t dim_idx = 0; dim_idx < dim_size; dim_idx++) { + int64_t offset = outer_idx * dim_size * inner_size + dim_idx * inner_size + inner_idx_begin; + const scalar_t* input_ptr = input_data_base + offset; + scalar_t* output_ptr = output_data_base + offset; + + int64_t d3 = 0; + for (; d3 < size - (size % Vec::size()); d3 += Vec::size()) { + Vec data_vec = Vec::loadu(input_ptr + d3); + Vec max_vec = Vec::loadu(input_max_data + d3); + Vec sum_vec = Vec::loadu(tmp_sum_data + d3); + Vec out_vec = data_vec - max_vec - sum_vec; + out_vec.store(output_ptr + d3); + } + for (; d3 < size; d3++) { + output_ptr[d3] = input_ptr[d3] - input_max_data[d3] - tmp_sum_data[d3]; + } + } + } +} + +template +std::enable_if_t>, void> +serial_vec_logsoftmax_range( + const scalar_t* input_data_base, + scalar_t* output_data_base, + int64_t inner_size, + int64_t chunk_size, + int64_t num_chunks, + int64_t dim_size, + int64_t begin, + int64_t end) { + using Vec = vec::Vectorized; + using fVec = vec::Vectorized; + auto buffer = std::make_unique(chunk_size * 2); + float* input_max_data = buffer.get(); + float* tmp_sum_data = buffer.get() + chunk_size; + + 
// thread local buffer that holds input data in float32 to save next 2 dtype conversion + auto input_buffer = std::make_unique(dim_size * chunk_size); + float* input_buffer_data = input_buffer.get(); + + // init + for (int64_t i = begin; i < end; i++) { + int64_t outer_idx = i / num_chunks; + int64_t k = i % num_chunks; + int64_t inner_idx_begin = k * chunk_size; + int64_t size = std::min(chunk_size, inner_size - inner_idx_begin); + + fVec zero_fvec = fVec(float(0)); + fVec min_fvec = fVec(-std::numeric_limits::infinity()); + int64_t d0 = 0; + for (; d0 < size - (size % Vec::size()); d0 += Vec::size()) { + min_fvec.store(input_max_data + d0); + min_fvec.store(input_max_data + d0 + fVec::size()); + zero_fvec.store(tmp_sum_data + d0); + zero_fvec.store(tmp_sum_data + d0 + fVec::size()); + } + for (; d0 < size; d0++) { + input_max_data[d0] = -std::numeric_limits::infinity(); + tmp_sum_data[d0] = float(0); + } + + // compute max + for (int64_t dim_idx = 0; dim_idx < dim_size; dim_idx++) { + const scalar_t* input_ptr = input_data_base + outer_idx * dim_size * inner_size + + dim_idx * inner_size + inner_idx_begin; + float* input_buffer_ptr = input_buffer_data + dim_idx * chunk_size; + + int64_t d1 = 0; + for (; d1 < size - (size % Vec::size()); d1 += Vec::size()) { + Vec data_vec = Vec::loadu(input_ptr + d1); + auto [data_fvec0, data_fvec1] = vec::convert_to_float(data_vec); + fVec max_fvec0 = fVec::loadu(input_max_data + d1); + fVec max_fvec1 = fVec::loadu(input_max_data + d1 + fVec::size()); + max_fvec0 = fVec::blendv(max_fvec0, data_fvec0, data_fvec0 > max_fvec0); + max_fvec1 = fVec::blendv(max_fvec1, data_fvec1, data_fvec1 > max_fvec1); + max_fvec0.store(input_max_data + d1); + max_fvec1.store(input_max_data + d1 + fVec::size()); + + // cache the 'converted' float input + data_fvec0.store(input_buffer_ptr + d1); + data_fvec1.store(input_buffer_ptr + d1 + fVec::size()); + } + for (; d1 < size; d1++) { + float data_val = float(input_ptr[d1]); + float max_val = 
input_max_data[d1]; + input_max_data[d1] = data_val > max_val ? data_val : max_val; + input_buffer_ptr[d1] = data_val; + } + } + + // compute sum of (x - max).exp() + for (int64_t dim_idx = 0; dim_idx < dim_size; dim_idx++) { + float* input_buffer_ptr = input_buffer_data + dim_idx * chunk_size; + + int64_t d2 = 0; + for (; d2 < size - (size % Vec::size()); d2 += Vec::size()) { + fVec data_fvec0 = fVec::loadu(input_buffer_ptr + d2); + fVec data_fvec1 = fVec::loadu(input_buffer_ptr + d2 + fVec::size()); + fVec sum_fvec0 = fVec::loadu(tmp_sum_data + d2); + fVec sum_fvec1 = fVec::loadu(tmp_sum_data + d2 + fVec::size()); + fVec max_fvec0 = fVec::loadu(input_max_data + d2); + fVec max_fvec1 = fVec::loadu(input_max_data + d2 + fVec::size()); + sum_fvec0 += (data_fvec0 - max_fvec0).exp(); + sum_fvec1 += (data_fvec1 - max_fvec1).exp(); + sum_fvec0.store(tmp_sum_data + d2); + sum_fvec1.store(tmp_sum_data + d2 + fVec::size()); + } + for (; d2 < size; d2++) { + float data_val = input_buffer_ptr[d2]; + float max_val = input_max_data[d2]; + tmp_sum_data[d2] += std::exp(data_val - max_val); + } + } + + // apply log + vec::map([](fVec x) { return x.log(); }, tmp_sum_data, tmp_sum_data, size); + + // compute x - max - sum + for (int64_t dim_idx = 0; dim_idx < dim_size; dim_idx++) { + float* input_buffer_ptr = input_buffer_data + dim_idx * chunk_size; + scalar_t* output_ptr = output_data_base + outer_idx * dim_size * inner_size + + dim_idx * inner_size + inner_idx_begin; + + int64_t d3 = 0; + for (; d3 < size - (size % Vec::size()); d3 += Vec::size()) { + fVec data_fvec0 = fVec::loadu(input_buffer_ptr + d3); + fVec data_fvec1 = fVec::loadu(input_buffer_ptr + d3 + fVec::size()); + fVec max_fvec0 = fVec::loadu(input_max_data + d3); + fVec max_fvec1 = fVec::loadu(input_max_data + d3 + fVec::size()); + fVec sum_fvec0 = fVec::loadu(tmp_sum_data + d3); + fVec sum_fvec1 = fVec::loadu(tmp_sum_data + d3 + fVec::size()); + fVec out_fvec0 = data_fvec0 - max_fvec0 - sum_fvec0; + fVec out_fvec1 
= data_fvec1 - max_fvec1 - sum_fvec1; + Vec out_vec = vec::convert_from_float(out_fvec0, out_fvec1); + out_vec.store(output_ptr + d3); + } + for (; d3 < size; d3++) { + output_ptr[d3] = scalar_t(input_buffer_ptr[d3] - input_max_data[d3] - tmp_sum_data[d3]); + } + } + } +} // namespace CPU_CAPABILITY +}} // namespace at::native + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/cpu/Loops.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/cpu/Loops.h new file mode 100644 index 0000000000000000000000000000000000000000..b692065e32066ac9831af81ef5946ce4a9bb858d --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/cpu/Loops.h @@ -0,0 +1,400 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// This file provides two functions to help write elementwise kernels: +// +// cpu_kernel(TensorIterator iter, ) +// cpu_kernel_vec(TensorIterator iter, , ) +// +// Both functions may generate vectorized code. The cpu_kernel implementation +// relies on the compiler's auto-vectorization. The cpu_kernel_vec +// implementation uses x86 SIMD intrinsics when available. These functions +// are only intended to be used in the ATen/native/cpu subdirectory, since files +// in other directories are not compiled with AVX/AVX2 enabled. See README.md +// for more details. 
+// +// For example, to write a multiplication kernel for float: +// +// cpu_kernel(iter, [](float a, float b) { return a * b; }); +// +// Or you may write: +// +// cpu_kernel_vec(iter, +// [](float a, float b) { return a * b; }, +// [](Vectorized a, Vectorized b) { return a * b; }); +// +// See BinaryOpsKernel.cpp for the complete implementation +// +// + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +namespace at::native { inline namespace CPU_CAPABILITY { + +using namespace vec; + +template +typename traits::ArgsTuple +dereference_impl(char* C10_RESTRICT data[], const int64_t* strides, int64_t i, + std::index_sequence /*unused*/) { + return std::make_tuple( + c10::load::type>( + data[INDEX] + i * strides[INDEX])...); +} + +template +typename traits::ArgsTuple +dereference(char* C10_RESTRICT data[], const int64_t* strides, int64_t i) { + using Indices = std::make_index_sequence; + return dereference_impl(data, strides, i, Indices{}); +} + +template +typename traits::ArgsTuple +dereference_vec_impl(char* C10_RESTRICT data[], + const typename traits::result_type& opt_scalar, + size_t S, + int64_t i, + std::index_sequence /*unused*/) { + using Vec = typename traits::result_type; + using scalar_t = typename Vec::value_type; + return std::make_tuple( + S == INDEX + 1 ? 
+ opt_scalar : + Vec::loadu(data[INDEX] + i * sizeof(scalar_t))...); +} + +template +typename traits::ArgsTuple +dereference_vec(char* C10_RESTRICT data[], const typename traits::result_type& opt_scalar, size_t S, int64_t i) { + using Indices = std::make_index_sequence; + return dereference_vec_impl(data, opt_scalar, S, i, Indices{}); +} + +template ::result_type>>* = nullptr> +inline void +execute_op(char* C10_RESTRICT data[], const int64_t* strides, int64_t i, int64_t n, func_t&& op) { + using traits = function_traits; + using result_type = typename traits::result_type; + for (; i < n; i++) { + result_type* out_ptr = (result_type*)(data[0] + i * strides[0]); + *out_ptr = std::apply(op, dereference( + &data[1], + &strides[1], + i)); + } +} + +template ::result_type>>* = nullptr> +inline void +execute_op(char* C10_RESTRICT data[], const int64_t* strides, int64_t i, int64_t n, func_t&& op) { + using traits = function_traits; + for (; i < n; i++) { + std::apply(op, dereference( + &data[0], + &strides[0], + i)); + } +} + +// Basic loop operation (one output, N inputs). May be auto-vectorized +// by the compiler. Supports inputs and outputs of different types. +template +inline void +basic_loop(char* C10_RESTRICT data[], const int64_t* strides_, int64_t i, int64_t n, func_t&& op) { + using traits = function_traits; + constexpr int ntensors = traits::arity + 1; + + // Copying strides to temporary array helps auto vectorization in older GCC + // versions. 
+ int64_t strides[ntensors]; + for (const auto arg : c10::irange(ntensors)) { + strides[arg] = strides_[arg]; + } + + execute_op(data, strides, i, n, std::forward(op)); +} + +// the recursive variadic template for iterating over the returned tuple +template +struct TupleOutput { + static void handle(char *C10_RESTRICT data[], const int64_t *strides, int64_t i, + const T &tuple) { + TupleOutput::handle(data, strides, i, tuple); + + auto output = std::get(tuple); + using output_type = decltype(output); + output_type * out_ptr = (output_type *)(data[N - 1] + i * strides[N - 1]); + *out_ptr = output; + } +}; + +// Base case for the above recursive template +template +struct TupleOutput { + static void handle(char *C10_RESTRICT data[], const int64_t *strides, int64_t i, + const T &tuple) { + auto output = std::get<0>(tuple); + using output_type = decltype(output); + output_type* out_ptr = (output_type *)(data[0] + i * strides[0]); + *out_ptr = output; + } +}; + +template +void handle_tuple_outputs(char* C10_RESTRICT data[], + const int64_t* strides, + int64_t i, + const std::tuple &tuple) { + TupleOutput::handle(data, strides, i, tuple); +} + +// Loop operation for `cpu_kernel_multiple_outputs`. +// 1. Use `std::apply` to make dynamic method invocation +// for the lambda passed in `cpu_kernel_multiple_outputs`. +// 2. Iterate over the members of the returned tuple, set the corresponding +// output tensor by the tuple member in `handle_tuple_outputs` function. +template +inline void +multiple_outputs_loop(char* C10_RESTRICT data[], const int64_t* strides_, int64_t i, int64_t n, func_t&& op) { + using traits = function_traits; + + using result_type = typename traits::result_type; + constexpr int num_outputs = std::tuple_size_v; + constexpr int ntensors = traits::arity + num_outputs; + + // Copying strides to temporary array helps auto vectorization in older GCC + // versions. 
+ int64_t strides[ntensors]; + for (const auto arg : c10::irange(ntensors)) { + strides[arg] = strides_[arg]; + } + + for (; i < n; i++) { + auto output = std::apply(op, dereference( + &data[num_outputs], + &strides[num_outputs], + i)); + handle_tuple_outputs(data, strides, i, output); + } +} + +// Explicitly vectorized loop implementation. All inputs and outputs must be +// the same type and contiguous with one exception: a single input may be +// a scalar (stride 0). It's position is indicated by the argument `S`. If `S` +// is 0, then there are no scalar inputs. +template +inline void +vectorized_loop(char** C10_RESTRICT data_, int64_t n, int64_t S, func_t&& op, vec_func_t&& vop) { + using traits = function_traits; + using scalar_t = typename function_traits::result_type; + using Vec = Vectorized; + constexpr int ntensors = traits::arity + 1; + + char* C10_RESTRICT data[ntensors]; + for (const auto arg : c10::irange(ntensors)) { + data[arg] = data_[arg]; + } + + Vec opt_scalar = Vec(S > 0 ? c10::load((scalar_t*)data[S]) : scalar_t(0)); + int64_t i = 0; + for (; i <= n - 2 * Vec::size(); i += 2 * Vec::size()) { + auto args1 = dereference_vec(&data[1], opt_scalar, S, i); + auto args2 = dereference_vec(&data[1], opt_scalar, S, i + Vec::size()); + auto out1 = std::apply(vop, std::move(args1)); + auto out2 = std::apply(vop, std::move(args2)); + out1.store(data[0] + i * sizeof(scalar_t)); + out2.store(data[0] + (i + Vec::size()) * sizeof(scalar_t)); + } + if (i < n) { + int64_t strides[ntensors]; + for (const auto arg : c10::irange(ntensors)) { + strides[arg] = (S > 0 && arg == S) ? 
0 : sizeof(scalar_t); + } + basic_loop(data, strides, i, n, std::forward(op)); + } +} + + +template +inline void unroll_contiguous_scalar_checks( + const int64_t* /*strides*/, + std::index_sequence<> /*unused*/, + cb_t&& cb) { + cb(0); +} + +template +inline void unroll_contiguous_scalar_checks( + const int64_t* strides, + std::index_sequence /*unused*/, + cb_t&& cb) { + if (is_contiguous_scalar(strides)) { + cb(INDEX0 + 1); + } else { + unroll_contiguous_scalar_checks(strides, std::index_sequence{}, std::forward(cb)); + } +} + +template +struct VectorizedLoop2d { + op_t op; + vop_t vop; + + using traits = function_traits; + static constexpr int ntensors = traits::arity + 1; + using data_t = std::array; + + VectorizedLoop2d(op_t op, vop_t vop): + op(std::move(op)), vop(std::move(vop)) {} + + static void advance(data_t &data, const int64_t *outer_strides) { + for (const auto arg : c10::irange(data.size())) { + data[arg] += outer_strides[arg]; + } + } + + void operator()(char** base, const int64_t *strides, int64_t size0, int64_t size1) { + data_t data; + std::copy_n(base, ntensors, data.data()); + const int64_t *outer_strides = &strides[ntensors]; + + if (is_contiguous(strides)) { + for ([[maybe_unused]] const auto i : c10::irange(size1)) { + vectorized_loop(data.data(), size0, 0, op, vop); + advance(data, outer_strides); + } + } else { + using Indices = std::make_index_sequence; + unroll_contiguous_scalar_checks(strides, Indices{}, [&](size_t idx) { + if (idx) { + for ([[maybe_unused]] const auto i : c10::irange(size1)) { + vectorized_loop(data.data(), size0, idx, op, vop); + advance(data, outer_strides); + } + } else { + for ([[maybe_unused]] const auto i : c10::irange(size1)) { + basic_loop(data.data(), strides, 0, size0, op); + advance(data, outer_strides); + } + } + }); + } + } +}; + +template +VectorizedLoop2d make_vectorized_loop2d( + op_t &&op, vop_t &&vop) { + return VectorizedLoop2d(std::forward(op), std::forward(vop)); +} + +template +void 
cpu_kernel(TensorIteratorBase& iter, func_t&& op, int64_t grain_size = at::internal::GRAIN_SIZE) { + using traits = function_traits; + // this could be extended to work with void return types + TORCH_INTERNAL_ASSERT(iter.ninputs() == traits::arity); + TORCH_INTERNAL_ASSERT(iter.noutputs() == 1); + // dynamic casting not currently supported on CPU + TORCH_INTERNAL_ASSERT(!needs_dynamic_casting::check(iter)); + + iter.for_each([&](char** data, const int64_t* strides, int64_t n) { + // basic loop can handle 1d slices with arbitrary strides, and 1d slices is all that + // iter.for_each is ever sending to the loop lambda + basic_loop(data, strides, 0, n, op); + }, grain_size); + iter.cast_outputs(); +} + +// This function helps write elementwise kernels that requires multiple outputs. +// It follows the similar structure of cpu_kernel. +// Instead of `basic_loop` function, a new `multiple_outputs_loop` function is +// manipulated to handle multiple return values. +// For now `needs_dynamic_casting` check is not added as the passed lambda (`func_t`) +// of `multiple_outputs_loop` returns `std::tuple` instead of `scalar_t`. +// The `gpu_kernel_multiple_outputs` is also implemented without this check, +// We could extend `needs_dynamic_casting` to support both `std::tuple` and +// `thrust::tuple` in the future. 
+template +void cpu_kernel_multiple_outputs(TensorIteratorBase& iter, func_t&& op, int64_t grain_size = at::internal::GRAIN_SIZE) { + using traits = function_traits; + TORCH_INTERNAL_ASSERT(iter.ninputs() == traits::arity); + + iter.for_each([&](char** data, const int64_t* strides, int64_t n) { + multiple_outputs_loop(data, strides, 0, n, op); + }, grain_size); + iter.cast_outputs(); +} + +template +void cpu_kernel_vec(TensorIteratorBase& iter, func_t&& op, vec_func_t&& vop, int64_t grain_size = at::internal::GRAIN_SIZE) { + using traits = function_traits; + // this could be extended to work with void return types + TORCH_INTERNAL_ASSERT(iter.ninputs() == traits::arity); + TORCH_INTERNAL_ASSERT(iter.noutputs() == 1); + // dynamic casting not currently supported on CPU, but some kernels (like Fill) + // explicitly dynamic_cast, so we give the opt-out of checking. + if constexpr (check_dynamic_cast) { + TORCH_INTERNAL_ASSERT(!needs_dynamic_casting::check(iter)); + } + + iter.for_each(make_vectorized_loop2d(std::forward(op), std::forward(vop)), grain_size); + iter.cast_outputs(); +} + +template +void cpu_serial_kernel(TensorIteratorBase& iter, func_t&& op, const Range& range) { + using traits = function_traits; + constexpr bool result_void = std::is_void_v; + TORCH_INTERNAL_ASSERT(iter.ninputs() == traits::arity && + ((result_void && iter.noutputs() == 0) || (!result_void && iter.noutputs() == 1))); + // dynamic casting not currently supported on CPU + TORCH_INTERNAL_ASSERT(!needs_dynamic_casting::check(iter)); + + iter.serial_for_each([&](char** data, const int64_t* strides, int64_t n) { + basic_loop(data, strides, 0, n, op); + }, range); + iter.cast_outputs(); +} + +template +void cpu_serial_kernel(TensorIteratorBase& iter, func_t&& op) { + cpu_serial_kernel(iter, std::forward(op), {0, iter.numel()}); +} + +template +void cpu_serial_kernel_vec(TensorIteratorBase& iter, func_t&& op, vec_func_t&& vop, const Range& range) { + using traits = function_traits; + // this 
could be extended to work with void return types + TORCH_INTERNAL_ASSERT(iter.ninputs() == traits::arity); + TORCH_INTERNAL_ASSERT(iter.noutputs() == 1); + // dynamic casting not currently supported on CPU + TORCH_INTERNAL_ASSERT(!needs_dynamic_casting::check(iter)); + + iter.serial_for_each(make_vectorized_loop2d(std::forward(op), std::forward(vop)), range); + iter.cast_outputs(); +} + +template +void cpu_serial_kernel_vec(TensorIteratorBase& iter, func_t&& op, vec_func_t&& vop) { + cpu_serial_kernel_vec(iter, std::forward(op), std::forward(vop), {0, iter.numel()}); +} + +}} // namespace at::native:: + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/cpu/MaxUnpoolKernel.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/cpu/MaxUnpoolKernel.h new file mode 100644 index 0000000000000000000000000000000000000000..1f43c4ee1ee3a3e9a220b4032f5b9bd945c846ed --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/cpu/MaxUnpoolKernel.h @@ -0,0 +1,19 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +#include + +namespace at { +class Tensor; + +namespace native { + +using max_unpooling_fn = void(*)(Tensor&, const Tensor&, const Tensor&); + +DECLARE_DISPATCH(max_unpooling_fn, max_unpool2d_kernel) +DECLARE_DISPATCH(max_unpooling_fn, max_unpool3d_kernel) + +}} // at::native + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/cpu/PixelShuffleKernel.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/cpu/PixelShuffleKernel.h new file mode 100644 index 0000000000000000000000000000000000000000..760d9af1c2731eab1111faa7cfadbaed38317f17 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/cpu/PixelShuffleKernel.h @@ -0,0 +1,19 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +#include + +namespace at { +class TensorBase; +} + +namespace at::native { + +using pixel_shuffle_fn = void(*)(TensorBase&, const TensorBase&, int64_t); +DECLARE_DISPATCH(pixel_shuffle_fn, pixel_shuffle_kernel) +DECLARE_DISPATCH(pixel_shuffle_fn, pixel_unshuffle_kernel) + +} // at::native + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/cpu/ReduceUtils.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/cpu/ReduceUtils.h new file mode 100644 index 0000000000000000000000000000000000000000..ea176a6c61667c37332a4b28b1734b605bb93c56 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/cpu/ReduceUtils.h @@ -0,0 +1,242 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include + +namespace at::native { +inline namespace CPU_CAPABILITY { + +using namespace vec; + +#define AT_DISPATCH_REDUCTION_TYPES(op, ...) 
\ + [&] { \ + switch (op) { \ + case ReductionType::SUM: { \ + static constexpr auto reduce = ReductionType::SUM; \ + return __VA_ARGS__(); \ + } \ + case ReductionType::MEAN: { \ + static constexpr auto reduce = ReductionType::MEAN; \ + return __VA_ARGS__(); \ + } \ + case ReductionType::MIN: { \ + static constexpr auto reduce = ReductionType::MIN; \ + return __VA_ARGS__(); \ + } \ + case ReductionType::MAX: { \ + static constexpr auto reduce = ReductionType::MAX; \ + return __VA_ARGS__(); \ + } \ + case ReductionType::PROD: { \ + static constexpr auto reduce = ReductionType::PROD; \ + return __VA_ARGS__(); \ + } \ + } \ + }() + +template +inline vec_scalar_t init_value() { + using acc_t = vec_scalar_t; + acc_t val; + if (reduce == ReductionType::SUM || + reduce == ReductionType::MEAN) { + val = static_cast(0); + } else if (reduce == ReductionType::PROD) { + val = static_cast(1); + } else if (reduce == ReductionType::MAX) { + val = -std::numeric_limits::infinity(); + } else { + TORCH_INTERNAL_ASSERT(reduce == ReductionType::MIN); + val = std::numeric_limits::infinity(); + } + return val; +} + +template +inline vec_scalar_t init_value(const std::optional& initial) { + using acc_t = vec_scalar_t; + if (initial.has_value()) { + return initial.value().to(); + } else { + return init_value(); + } +} + +template +inline void init(scalar_t* out, int64_t size, const vec_scalar_t& val) { + using Vec = Vectorized>; + map( + [val](Vec x) { return Vec(val); }, + out, + out, + size); +} + +template +inline void init(scalar_t* out, int64_t size, const std::optional& initial) { + using acc_t = vec_scalar_t; + acc_t val = init_value(initial); + init(out, size, val); +} + +// overload with `include_self`, used by scatter_reduce +template +inline void init(scalar_t* out, int64_t size, bool include_self = false) { + using acc_t = vec_scalar_t; + if (!include_self) { + acc_t val = init_value(); + init(out, size, val); + } +} + +template +inline void _init(scalar_t* self_ptr, 
at::opmath_type* buffer_ptr, int64_t size, bool include_self) { + if (!include_self) { + init, reduce>(buffer_ptr, size, include_self); + } else { + vec::convert(self_ptr, buffer_ptr, size); + } +} + +template +inline std::enable_if_t, scalar_t> +_max(const scalar_t& x, const scalar_t& y) { + return at::_isnan(y) ? y : std::max(x, y); +} + +template +inline Vectorized _max(const Vectorized& x, const Vectorized& y) { + // vec::maximum propagates NaN + return vec::maximum(x, y); +} + +template +inline std::enable_if_t, Vec2> +_max(const vec_t& x, const vec_t& y) { + // vec::maximum propagates NaN + return maximum(x, y); +} + +template +inline std::enable_if_t, scalar_t> +_min(const scalar_t& x, const scalar_t& y) { + return at::_isnan(y) ? y : std::min(x, y); +} + +template +inline Vectorized _min(const Vectorized& x, const Vectorized& y) { + // vec::minimum propagates NaN + return vec::minimum(x, y); +} + +template +inline std::enable_if_t, Vec2> +_min(const vec_t& x, const vec_t& y) { + // vec::minimum propagates NaN + return minimum(x, y); +} + +template , int> = 0> +inline void map_acc( + const Op& vec_fun, + accumut* output_data, + const accumut* input_data, + const scalar_t* input_data2, + int64_t size) { + using Vec = vec::Vectorized; + using aVec = vec::Vectorized; + int64_t d = 0; + constexpr int64_t kVecSize = Vec::size(); + constexpr int64_t kaVecSize = aVec::size(); + for (d = 0; d < size - (size % kVecSize); d += kVecSize) { + Vec data2_vec = Vec::loadu(input_data2 + d); + auto [data2_avec0, data2_avec1] = convert_to_float(data2_vec); + aVec input_vec0 = aVec::loadu(input_data + d); + aVec input_vec1 = aVec::loadu(input_data + d + kaVecSize); + vec_fun(input_vec0, data2_avec0).store(output_data + d); + vec_fun(input_vec1, data2_avec1).store(output_data + d + kaVecSize); + } + if (size - d > 0) { + int64_t tail_size = size - d; + Vec data2_vec = Vec::loadu(input_data2 + d, tail_size); + auto [data2_avec0, data2_avec1] = convert_to_float(data2_vec); + if 
(tail_size > kaVecSize) { + aVec input_vec0 = aVec::loadu(input_data + d); + aVec input_vec1 = aVec::loadu(input_data + d + kaVecSize, tail_size - kaVecSize); + vec_fun(input_vec0, data2_avec0).store(output_data + d); + vec_fun(input_vec1, data2_avec1).store(output_data + d + kaVecSize, tail_size - kaVecSize); + } else { + aVec input_vec0 = aVec::loadu(input_data + d, tail_size); + vec_fun(input_vec0, data2_avec0).store(output_data + d, tail_size); + } + } +} + +// for Max and Min, propagate NaN: +template +inline T update(const T& x, const T& y) { + if (reduce == ReductionType::SUM || + reduce == ReductionType::MEAN) { + return x + y; + } else if (reduce == ReductionType::PROD) { + return x * y; + } else if (reduce == ReductionType::MAX) { + return _max(x, y); + } else { + TORCH_INTERNAL_ASSERT(reduce == ReductionType::MIN); + return _min(x, y); + } +} + +template +inline void update(scalar_t* out, const scalar_t* data, int64_t K) { + using Vec = vec::Vectorized>; + map2( + [](Vec x, Vec y) { return update(x, y); }, + out, + out, + data, + K); +} + +template , int> = 0> +inline void update(at::opmath_type* out, const scalar_t* data, int64_t K) { + using opmath_t = at::opmath_type; + using Vec = vec::Vectorized; + map_acc( + [](Vec x, Vec y) { return update(x, y); }, + out, + out, + data, + K); +} + +template +inline void write(scalar_t* out, int64_t count, int64_t K) { + using Vec = vec::Vectorized>; + if (reduce == ReductionType::MEAN) { + if (count > 0) { + vec::map( + [count](Vec x) { return x / Vec(count); }, + out, + out, + K); + } + } +} + +} // namespace CPU_CAPABILITY +} // namespace at::native + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/cpu/ReducedPrecisionFloatGemvFastPathKernel.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/cpu/ReducedPrecisionFloatGemvFastPathKernel.h new file mode 100644 index 0000000000000000000000000000000000000000..f34904a05d3df09a2cf7360cf047b8f775f792c1 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/cpu/ReducedPrecisionFloatGemvFastPathKernel.h @@ -0,0 +1,32 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include +#include +#include +#include + +namespace at::native { +#if !defined(C10_MOBILE) +using fp16_gemv_fn = void(*)(int, int, float, const Half*, int, const Half*, int, float, Half*, int); +DECLARE_DISPATCH(fp16_gemv_fn, fp16_gemv_trans_stub) + +using bf16_gemv_fn = void(*)(int, int, BFloat16, const BFloat16*, int, const BFloat16*, int, BFloat16, BFloat16*, int); +DECLARE_DISPATCH(bf16_gemv_fn, bf16_gemv_trans_stub) + +using fp16_dot_fn = float(*)(const int64_t, const Half*, const int64_t, const Half*, const int64_t); +DECLARE_DISPATCH(fp16_dot_fn, fp16_dot_stub) + +using bf16_dot_fn = float(*)(const int64_t, const BFloat16*, const int64_t, const BFloat16*, const int64_t); +DECLARE_DISPATCH(bf16_dot_fn, bf16_dot_stub) + +inline namespace CPU_CAPABILITY { +float fp16_dot_with_fp32_arith(const Half* vec1, const Half* vec2, int64_t len); +float bf16_dot_with_fp32_arith(const BFloat16* vec1, const BFloat16* vec2, int64_t len); +} // inline namespace CPU_CAPABILITY +#endif // !defined(C10_MOBILE) +} // namespace at::native + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/cpu/SampledAddmmKernel.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/cpu/SampledAddmmKernel.h new file mode 100644 index 0000000000000000000000000000000000000000..964f5aad7a1c4633ee8211ab809e654ec4127108 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/cpu/SampledAddmmKernel.h @@ -0,0 +1,17 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include +#include + +namespace at::native { + +using sampled_addmm_sparse_csr_fn = void(*)(const Tensor&, const Tensor&, const Scalar&, const Scalar&, const Tensor&); + +DECLARE_DISPATCH(sampled_addmm_sparse_csr_fn, sampled_addmm_sparse_csr_stub) + +} // at::native + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/cpu/SerialStackImpl.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/cpu/SerialStackImpl.h new file mode 100644 index 0000000000000000000000000000000000000000..0f401b131084661922ced32a901a33e0022d6005 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/cpu/SerialStackImpl.h @@ -0,0 +1,151 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +// Copyright 2004-present Facebook. All Rights Reserved. 
+#pragma once + +#include + +#include +#include +#include +#include +#include +#include + +namespace at::native::detail { + +struct InputMeta { + void* data_ptr; + int64_t inner_size; + + InputMeta(const Tensor& t, int64_t dim, int64_t inner) + : data_ptr(t.data_ptr()), inner_size(t.sizes()[dim] * inner) {} +}; + +// This kernel is used by two TensorList types: +// 1. stack_serial_kernel uses at::ArrayRef +// 2. Static runtime calls this kernel directly (csrc/jit/runtime/static/ops.cpp) with +// ProcessedNodeInputWrapper. +// When making changes, make sure that they are compatible with both types! +template +void stack_serial_kernel_impl(Tensor& result, TensorListType tensors, int64_t dim) { + TORCH_INTERNAL_ASSERT_DEBUG_ONLY( + dim >= 0 && dim <= result.dim(), + "dim out of range in stack_serial_kernel_impl"); + int64_t outer = + result.numel() / (result.sizes()[dim] * result.strides()[dim]); + scalar_t* result_data = result.data_ptr(); + int64_t ninputs = tensors.size(); + std::vector inputs; + inputs.reserve(ninputs); + for (const auto& tensor : tensors) { + inputs.emplace_back(tensor, dim, tensor.strides()[dim]); + } + + using Vec = vec::Vectorized; + scalar_t* result_ptr = result_data; + for (const auto i : c10::irange(outer)) { + for (const auto j : c10::irange(ninputs)) { + int64_t local_inner = inputs[j].inner_size; + scalar_t* input_ptr = (scalar_t*)(inputs[j].data_ptr) + i * local_inner; + + if (local_inner < Vec::size()) { + for (const auto k : c10::irange(local_inner)) { + result_ptr[k] = input_ptr[k]; + } + } else { + vec::map( + [](Vec x) { return x; }, result_ptr, input_ptr, local_inner); + } + result_ptr += local_inner; + } + } +} + +// Checks to see whether native stack can be invoked under these conditions: +// - result and input tensors are contiguous +// - only one thread is used +// - no type promotion has to occur +// - tensors dtype is Double or Float +template +bool can_use_native_serial_stack_impl(Tensor& result, TensorListType tensors, 
int64_t dim) { + TORCH_CHECK(!tensors.empty(), "expected a non-empty list of Tensors"); + const Tensor& first_tensor = tensors[0]; + // stack dimension should be in range [0,firstTensor.dim()) + // dim == firstTensor.dim() is a valid input, but it is handled by default code path + // that uses unsqueeze + if (dim >= first_tensor.dim()) return false; + // Native stack doesn't apply any tensor is skipped. + if (first_tensor.numel() == 0 && first_tensor.dim() == 1) return false; + // there should be no type promotion + if (result.dtype() != first_tensor.dtype()) return false; + + auto first_tensor_mem_format = first_tensor.suggest_memory_format(); + ScalarType dtype = first_tensor.scalar_type(); + + if (!result.is_contiguous(first_tensor_mem_format)) { + return false; + } + + // fast path only works for Double and Float + if (dtype != ScalarType::Double && dtype != ScalarType::Float) { + return false; + } + + // check remainder of inputs +#ifndef STRIP_ERROR_MESSAGES + auto const &first_tensor_shape = first_tensor.sizes(); +#endif + for (const auto i : c10::irange(1, tensors.size())) { + auto const &tensor = tensors[i]; + TORCH_CHECK(tensors[i].sizes() == first_tensor.sizes(), + "stack expects each tensor to be equal size, but got ", first_tensor_shape, + " at entry 0 and ", tensor.sizes(), " at entry ", i); + + // every tensor must be contiguous + // tensor sizes and strides must be the same + // there should be no type promotion + if (!tensor.is_contiguous(first_tensor_mem_format) || + tensor.strides() != first_tensor.strides() || + tensor.dtype() != dtype) { + return false; + } + } + + // fast native stack should only be used when it is not worth using multiple threads + // or there is only one thread. Note that we aren't checking result.numel() here because + // it may not have been resized and we want to defer that cost till later. 
+ int64_t numel_in_stack = first_tensor.numel() * tensors.size(); + return numel_in_stack < at::internal::GRAIN_SIZE || at::get_num_threads() == 1; +} + +template +struct CanUseNativeSerialStack; + +template +struct CanUseNativeSerialStack { + static bool call(Tensor& result, TensorListType tensors, int64_t dim) { + // Inputs cannot alias the output tensor + for (const auto i : c10::irange(tensors.size())) { + auto lap = at::get_overlap_status(result, tensors[i]); + TORCH_CHECK(lap != at::MemOverlapStatus::Partial && + lap != at::MemOverlapStatus::Full, 0, + "unsupported operation: the input tensors cannot refer to any of the " + "output memory locations. Found overlap in input tensor ", i); + } + + return can_use_native_serial_stack_impl(result, tensors, dim); + } +}; + +template +struct CanUseNativeSerialStack { + static bool call(Tensor& result, TensorListType tensors, int64_t dim) { + return can_use_native_serial_stack_impl(result, tensors, dim); + } +}; + +} // namespace at::native::detail + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/cpu/SoftmaxKernel.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/cpu/SoftmaxKernel.h new file mode 100644 index 0000000000000000000000000000000000000000..35d9d0a33c49adfbfd7b77a957552594c8b51c0f --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/cpu/SoftmaxKernel.h @@ -0,0 +1,33 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include +#include + +namespace at { +class Tensor; + +namespace native { + +using forward_fn = void (*)(const Tensor&, const Tensor&); +using backward_fn = void(*)(const Tensor &, const Tensor &, const Tensor&); + +DECLARE_DISPATCH(forward_fn, softmax_lastdim_kernel) +DECLARE_DISPATCH(forward_fn, log_softmax_lastdim_kernel) +DECLARE_DISPATCH(backward_fn, softmax_backward_lastdim_kernel) +DECLARE_DISPATCH(backward_fn, log_softmax_backward_lastdim_kernel) + +using forward_fn_with_dim = void(*)(const Tensor &, const Tensor &, const int64_t); +using backward_fn_with_dim = + void (*)(const Tensor&, const Tensor&, const Tensor&, const int64_t); + +DECLARE_DISPATCH(forward_fn_with_dim, softmax_kernel) +DECLARE_DISPATCH(forward_fn_with_dim, log_softmax_kernel) +DECLARE_DISPATCH(backward_fn_with_dim, softmax_backward_kernel) +DECLARE_DISPATCH(backward_fn_with_dim, log_softmax_backward_kernel) +} +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/cpu/SpmmReduceKernel.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/cpu/SpmmReduceKernel.h new file mode 100644 index 0000000000000000000000000000000000000000..8cca5fdca9252e05f56751278a309a68760c1bb6 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/cpu/SpmmReduceKernel.h @@ -0,0 +1,27 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include +#include +#include + +namespace at::native { + +using spmm_reduce_fn = void(*)(const Tensor&, const Tensor&, const Tensor&, const Tensor&, const Tensor&, ReductionType op); +using spmm_reduce_arg_fn = void(*)(const Tensor&, const Tensor&, const Tensor&, const Tensor&, const Tensor&, const Tensor&, ReductionType op); +using spmm_reduce_backward_input_fn = void(*)(const Tensor&, const Tensor&, const Tensor&, const Tensor&, const Tensor&, const Tensor&, ReductionType op); +using spmm_reduce_backward_input_arg_fn = void(*)(const Tensor&, const Tensor&, const Tensor&, const Tensor&, const Tensor&, ReductionType op); +using spmm_reduce_backward_other_fn = void(*)(const Tensor&, const Tensor&, const Tensor&, const Tensor&, const Tensor&, const Tensor&, const Tensor&, ReductionType op); + +DECLARE_DISPATCH(spmm_reduce_fn, spmm_reduce_stub) +DECLARE_DISPATCH(spmm_reduce_arg_fn, spmm_reduce_arg_stub) +DECLARE_DISPATCH(spmm_reduce_backward_input_fn, spmm_reduce_backward_input_stub) +DECLARE_DISPATCH(spmm_reduce_backward_input_arg_fn, spmm_reduce_backward_input_arg_stub) +DECLARE_DISPATCH(spmm_reduce_backward_other_fn, spmm_reduce_backward_other_stub) +DECLARE_DISPATCH(spmm_reduce_backward_input_arg_fn, spmm_reduce_backward_other_arg_stub) + +} // at::native + +#else +#error "This file should not be included when either 
TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/cpu/StackKernel.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/cpu/StackKernel.h new file mode 100644 index 0000000000000000000000000000000000000000..f30015d64c7f299b65160453ea684d0b45107ce4 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/cpu/StackKernel.h @@ -0,0 +1,17 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +// Copyright 2004-present Facebook. All Rights Reserved. +#pragma once + +#include +#include + +namespace at::native { + +using stack_serial_fn = void(*)(Tensor &, TensorList, int64_t); +DECLARE_DISPATCH(stack_serial_fn, stack_serial_stub) + +} // namespace at::native + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/cpu/UpSampleKernelAVXAntialias.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/cpu/UpSampleKernelAVXAntialias.h new file mode 100644 index 0000000000000000000000000000000000000000..5e30c19287a88e8e335b70798996946a6dd166ac --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/cpu/UpSampleKernelAVXAntialias.h @@ -0,0 +1,1381 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +/* +The Python Imaging Library (PIL) is + + Copyright © 1997-2011 by Secret Labs AB + Copyright © 1995-2011 by Fredrik Lundh + +Pillow is the friendly PIL fork. 
It is + + Copyright © 2010-2022 by Alex Clark and contributors + +Like PIL, Pillow is licensed under the open source HPND License +*/ + +// This code is heavily inspired from PILLOW-SIMD's implementation: +// https://github.com/uploadcare/pillow-simd/blob/simd/master/src/libImaging/Resample.c + +#pragma once +#ifdef CPU_CAPABILITY_AVX2 +// TODO: This file only supports AVX2. We could split the AVX kernels into +// smaller logical blocks in order to port them into the Vec.h logic. This would +// allow to support other vectorization architectures and perhaps also support +// the non-vectorized fallback (we'd need to make sure it's not slower than the +// current fallback). + +#include +#include +#include + +#ifndef AT_PER_OPERATOR_HEADERS +#include +#else +#include +#endif + + +namespace { + +inline __m128i mm_cvtsi32_si128(const uint8_t* C10_RESTRICT ptr, bool i32_aligned) { + int32_t v; + if (i32_aligned) { + v = *(const int32_t*)ptr; + } else { + std::memcpy(&v, ptr, 4); + } + return _mm_cvtsi32_si128(v); +} + +inline __m128i mm_cvtepu8_epi32(const uint8_t* C10_RESTRICT ptr, bool i32_aligned) { + return _mm_cvtepu8_epi32(mm_cvtsi32_si128(ptr, i32_aligned)); +} + +inline void _write_endline_rgb_as_uint32( + uint8_t* C10_RESTRICT output, + uint32_t data +) { + // data is (R G B X), output is (X1 X2 X3 | R1 B1 G1 R2 ...) + // Here we explicitly set X as R1 + uint8_t* data_ptr = reinterpret_cast(&data); + data_ptr[3] = output[3]; + std::memcpy(output, data_ptr, 4); +} + +at::Tensor unpack_rgb(const at::Tensor& packed_tensor) { + // Convert a "packed" tensor (typically RGBRGBRGB if channels_last) into + // RGBARGBARGBA format where A is hard-coded to 0. Each pixel is encoded + // into as 32 bits. This generalizes to num_channels <= 4 and also works for + // non-channels_last tensors. 
+ + const uint8_t* packed = (const uint8_t*)packed_tensor.const_data_ptr(); + auto num_pixels = packed_tensor.size(1) * packed_tensor.size(2); + auto num_channels = packed_tensor.size(0); + + constexpr int rgba_size = 4; + auto unpacked_tensor = at::empty({rgba_size, packed_tensor.size(1), packed_tensor.size(2)}, at::CPU(at::kByte)); + uint8_t* unpacked = (uint8_t*) unpacked_tensor.data_ptr(); + + auto stride_i = packed_tensor.stride(2); + auto stride_j = packed_tensor.stride(0); + + for (const auto i : c10::irange(num_pixels)) { + for (const auto j : c10::irange(rgba_size)) { + unpacked[rgba_size * i + j] = (j < num_channels) ? packed[stride_i * i + stride_j * j] : 0; + } + } + return unpacked_tensor; +} + +void pack_rgb( + const at::Tensor& unpacked_tensor, // IN + const at::Tensor& packed_tensor // OUT +) { + // Convert from unpacked channels last 3-channels or 4-channels tensor into original data layout. + + uint8_t* unpacked = (uint8_t*)unpacked_tensor.data_ptr(); + uint8_t* packed = (uint8_t*)packed_tensor.data_ptr(); + auto num_pixels = packed_tensor.size(1) * packed_tensor.size(2); + auto num_channels = packed_tensor.size(0); + + auto unpacked_increment = unpacked_tensor.size(0); + auto packed_increment = packed_tensor.stride(2); + auto packed_stride = packed_tensor.stride(0); + + TORCH_INTERNAL_ASSERT(unpacked_increment == 3 || unpacked_increment == 4); + + for ([[maybe_unused]] const auto i : c10::irange(num_pixels)) { + for (const auto j : c10::irange(num_channels)) { + packed[j * packed_stride] = unpacked[j]; + } + unpacked += unpacked_increment; + packed += packed_increment; + } +} + +void ImagingResampleHorizontalConvolution8u4x( + uint8_t* C10_RESTRICT lineOut0, + uint8_t* C10_RESTRICT lineOut1, + uint8_t* C10_RESTRICT lineOut2, + uint8_t* C10_RESTRICT lineOut3, + int64_t out_xsize, + const uint8_t* C10_RESTRICT lineIn0, + const uint8_t* C10_RESTRICT lineIn1, + const uint8_t* C10_RESTRICT lineIn2, + const uint8_t* C10_RESTRICT lineIn3, + int64_t 
in_xsize, + const int64_t* idx_ptr_xmin, + const int64_t* idx_ptr_size, + const int16_t* kk, + int kmax, + unsigned int coefs_precision, + int64_t num_channels, + bool is_last_line); + +void ImagingResampleHorizontalConvolution8u( + uint8_t* C10_RESTRICT lineOut, + int64_t out_xsize, + const uint8_t* C10_RESTRICT lineIn, + int64_t in_xsize, + const int64_t* idx_ptr_xmin, + const int64_t* idx_ptr_size, + const int16_t* kk, + int kmax, + unsigned int coefs_precision, + int64_t num_channels, + bool is_last_line); + +void ImagingResampleVerticalConvolution8u( + uint8_t* C10_RESTRICT lineOut, + const uint8_t* C10_RESTRICT lineIn, + int64_t xsize, + int64_t ids_min, + int64_t ids_size, + const int16_t* k, + unsigned int coefs_precision, + int64_t num_channels); + +template +void ImagingResampleHorizontal( + const at::Tensor & unpacked_output, + const at::Tensor & unpacked_input, + int ksize, + const std::vector& horiz_indices_weights, + unsigned int horiz_weights_precision) { + + // Interpolation horizontal pass: we compute x-axis (image width) interpolation outputs. + + // Input data is stored as + // input = [r[0], g[0], b[0], a[0], r[1], g[1], b[1], a[1], r[2], g[2], b[2], a[2], ...] + // Weights are float values computed for each output pixel and rescaled to uint16: + // weights[i] = [w[i, 0], w[i, 1], ..., w[i, K-1]] + // We want to compute the output as following: + // output = [oR[0], oG[0], oB[0], oA[0], oR[1], oG[1], oB[1], oA[1], ...] + // where + // oR[yoffset + i] = r[yoffset + xmin[i]] * w[i, 0] + ... + r[yoffset + xmin[i] + K-1] * w[i, K-1] + // oG[yoffset + i] = g[yoffset + xmin[i]] * w[i, 0] + ... + g[yoffset + xmin[i] + K-1] * w[i, K-1] + // oB[yoffset + i] = b[yoffset + xmin[i]] * w[i, 0] + ... 
+ b[yoffset + xmin[i] + K-1] * w[i, K-1] + // + + // TODO: we may want to merge that into the fallback code (currently called + // basic_loop_aa_horizontal) + // Although this may not be needed if / when we port all this code to use + // Vec.h since this would potentially give us another fall-back implem + + const int16_t* kk = (int16_t*)(horiz_indices_weights[3].const_data_ptr()); + + auto xout = unpacked_output.size(2); + auto yout = unpacked_output.size(1); + auto xin = unpacked_input.size(2); + TORCH_INTERNAL_ASSERT(num_channels == unpacked_input.size(0)); + + const int64_t* idx_ptr_xmin = horiz_indices_weights[0].const_data_ptr(); + const int64_t* idx_ptr_size = horiz_indices_weights[1].const_data_ptr(); + + uint8_t* unpacked_output_p = unpacked_output.data_ptr(); + const uint8_t* unpacked_input_p = unpacked_input.const_data_ptr(); + + int64_t yy = 0; + auto xout_stride = xout * num_channels; + auto xin_stride = xin * num_channels; + for (; yy < yout - 3; yy += 4) { + ImagingResampleHorizontalConvolution8u4x( + unpacked_output_p + yy * xout_stride, + unpacked_output_p + (yy + 1) * xout_stride, + unpacked_output_p + (yy + 2) * xout_stride, + unpacked_output_p + (yy + 3) * xout_stride, + xout, + unpacked_input_p + yy * xin_stride, + unpacked_input_p + (yy + 1) * xin_stride, + unpacked_input_p + (yy + 2) * xin_stride, + unpacked_input_p + (yy + 3) * xin_stride, + xin, + idx_ptr_xmin, + idx_ptr_size, + kk, + ksize, + horiz_weights_precision, + num_channels, + yy + 3 == yout - 1); + } + for (; yy < yout; yy++) { + ImagingResampleHorizontalConvolution8u( + unpacked_output_p + yy * xout_stride, + xout, + unpacked_input_p + yy * xin_stride, + xin, + idx_ptr_xmin, + idx_ptr_size, + kk, + ksize, + horiz_weights_precision, + num_channels, + yy == yout - 1); + } +} + +void ImagingResampleVertical( + const at::Tensor & unpacked_output, + const at::Tensor & unpacked_input, + int ksize, + const std::vector& vert_indices_weights, + unsigned int vert_weights_precision) { + + 
// Interpolation vertical pass: we compute y-axis interpolation outputs. + // Input data is stored as + // input = [r[0], g[0], b[0], a[0], r[1], g[1], b[1], a[1], r[2], g[2], b[2], a[2], ...] + // Weights are float values computed for each output pixel and rescaled to uint16: + // weights[i] = [w[i, 0], w[i, 1], ..., w[i, K-1]] + // We want to compute the output as following: + // output = [oR[0], oG[0], oB[0], oA[0], oR[1], oG[1], oB[1], oA[1], ...] + // where + // oR[xoffset + i] = r[xoffset + ymin[i]] * w[i, 0] + ... + r[xoffset + ymin[i] + (K-1) * xsize] * w[i, K-1] + // oG[xoffset + i] = g[xoffset + ymin[i]] * w[i, 0] + ... + g[xoffset + ymin[i] + (K-1) * xsize] * w[i, K-1] + // oB[xoffset + i] = b[xoffset + ymin[i]] * w[i, 0] + ... + b[xoffset + ymin[i] + (K-1) * xsize] * w[i, K-1] + + // TODO: we may want to merge that into the fallback code (currently called + // basic_loop_aa_vertical) + // Although this may not be needed if / when we port all this code to use + // Vec.h since this would potentially give us another fall-back implem + const int16_t* kk = (int16_t*)(vert_indices_weights[3].const_data_ptr()); + + const int64_t* idx_ptr_xmin = vert_indices_weights[0].const_data_ptr(); + const int64_t* idx_ptr_size = vert_indices_weights[1].const_data_ptr(); + + uint8_t* unpacked_output_p = unpacked_output.data_ptr(); + const uint8_t* unpacked_input_p = unpacked_input.const_data_ptr(); + + auto xout = unpacked_output.size(2); + auto yout = unpacked_output.size(1); + const auto num_channels = unpacked_input.size(0); + TORCH_INTERNAL_ASSERT(num_channels == unpacked_output.size(0)); + + auto xout_stride = xout * num_channels; + for (const auto yy : c10::irange(yout)) { + const auto* k = &kk[yy * ksize]; + auto ids_min = idx_ptr_xmin[yy]; + auto ids_size = idx_ptr_size[yy]; + ImagingResampleVerticalConvolution8u( + unpacked_output_p + yy * xout_stride, + unpacked_input_p, + xout, + ids_min, + ids_size, + k, + vert_weights_precision, + num_channels); + } +} + +// 
This is the only public entry point in this file. It supports bilinear or bicubic +// mode for uint8 dtype when C <= 4, with or without antialias. The +// implem is based on PIL-SIMD. +// Its equivalent implementation (fallback) for when AVX isn't supported or when +// C > 4 is separable_upsample_generic_Nd_kernel_impl() There are a bunch of +// future improvement that can be done: look for the TODOs in this file. +// For details on how the weights are computed and how the multiplications are +// run on int (instead of float weights), see +// [ Weights computation for uint8_t and multiplication trick ] +// For details on how the AVX kernels are implemented, see +// https://gist.github.com/NicolasHug/47c97d731f05eaad5694c173849b86f5 +// See also [ Support for antialias=False as a subcase of antialias=True ] to +// learn more about how the antialias=False case is computed. The same holds +// here: all these kernels are general enough to handle an arbitrary number of +// weights, but when aa=False they could be optimized further. +template +void upsample_avx_bilinear_bicubic_uint8( + const at::Tensor& input_, + const at::Tensor& output, + bool align_corners, + const scale_type& scales, + bool antialias) { + auto batch_size = input_.size(0); + auto num_channels = input_.size(1); + auto xin = input_.size(3); + auto yin = input_.size(2); + auto xout = output.size(3); + auto yout = output.size(2); + + if (xin == xout && yin == yout) { + output.copy_(input_); + return; + } + + at::Tensor input = input_; + if (!(input.is_contiguous() || input.is_contiguous(at::MemoryFormat::ChannelsLast))) { + // If input is not contiguous with memory format channels first or channels last, + // we explicitly convert the input to contiguous channels last memory format. 
+ // This simplifies the rest of the code and let us assume that the format is only contiguous channels first or channels last, + // Most tensors going through this `if` block won't need to go through unpacking, but those having C < 3 may + // have to (this means 2 copies are made). We could avoid the extra copy by handling non-contiguous input + // directly within unpack_rgb() and pack_rgb(), but initial attempts showed that this is fairly complex. + input = input.contiguous(at::MemoryFormat::ChannelsLast); + } + + auto need_horizontal = xout != xin; + auto need_vertical = yout != yin; + + int ksize_horiz, ksize_vert; + std::vector horiz_indices_weights, vert_indices_weights; + unsigned int horiz_weights_precision, vert_weights_precision; + + bool skip_unpacking = (num_channels == 3 || num_channels == 4) && input.is_contiguous(at::MemoryFormat::ChannelsLast); + bool skip_packing = (num_channels == 3 || num_channels == 4) && output.is_contiguous(at::MemoryFormat::ChannelsLast); + + if (need_horizontal) { + int interp_dim = 3; + auto stride = skip_unpacking ? num_channels : 4; + std::tie(horiz_indices_weights, ksize_horiz, horiz_weights_precision) = + F::compute_index_ranges_int16_weights( + /*input_size=*/xin, + /*output_size=*/xout, + /*stride=*/stride, + /*ndims=*/4, + /*reshape_dim=*/interp_dim, + /*align_corners=*/align_corners, + /*opt_scale=*/scales[interp_dim - 2], + /*antialias=*/antialias, + /*align_i32=*/true); + } + + if (need_vertical) { + int interp_dim = 2; + auto stride = skip_unpacking ? 
num_channels * xout : 4 * xout; + std::tie(vert_indices_weights, ksize_vert, vert_weights_precision) = + F::compute_index_ranges_int16_weights( + /*input_size=*/yin, + /*output_size=*/yout, + /*stride=*/stride, + /*ndims=*/4, + /*reshape_dim=*/interp_dim, + /*align_corners=*/align_corners, + /*opt_scale=*/scales[interp_dim - 2], + /*antialias=*/antialias, + /*align_i32=*/true); + } + + at::Tensor buffer_horiz, buffer_vert; + // Minor optimization: we can avoid allocating an extra buffer if we're performing + // horizontal-only or vertical-only interpolation, and if the tensor doesn't + // need repacking + if (need_horizontal && (need_vertical || !skip_packing)) { + auto c = skip_unpacking ? num_channels : 4; + buffer_horiz = at::empty({c, yin, xout}, input.options()); + } + if (need_vertical && !skip_packing) { + auto c = skip_unpacking ? num_channels : 4; + buffer_vert = at::empty({c, yout, xout}, input.options()); + } + + for (const auto i : c10::irange(batch_size)) { + + at::Tensor unpacked_input = skip_unpacking ? input[i] : unpack_rgb(input[i]); + at::Tensor unpacked_output; + + if (need_horizontal) { + at::Tensor unpacked_output_temp = (need_vertical || !skip_packing) ? buffer_horiz : output[i]; + + if (skip_unpacking && num_channels == 3) { + ImagingResampleHorizontal<3>( + unpacked_output_temp, + unpacked_input, + ksize_horiz, + horiz_indices_weights, + horiz_weights_precision); + } else { + ImagingResampleHorizontal<4>( + unpacked_output_temp, + unpacked_input, + ksize_horiz, + horiz_indices_weights, + horiz_weights_precision); + } + unpacked_output = unpacked_input = unpacked_output_temp; + } + if (need_vertical) { + unpacked_output = skip_packing ? 
output[i] : buffer_vert; + + ImagingResampleVertical( + unpacked_output, + unpacked_input, + ksize_vert, + vert_indices_weights, + vert_weights_precision + ); + } + + TORCH_INTERNAL_ASSERT(unpacked_output.defined()); + + if (!skip_packing) { + pack_rgb(unpacked_output, output[i]); + } + } +} + +void ImagingResampleHorizontalConvolution8u4x( + uint8_t* C10_RESTRICT lineOut0, + uint8_t* C10_RESTRICT lineOut1, + uint8_t* C10_RESTRICT lineOut2, + uint8_t* C10_RESTRICT lineOut3, + int64_t out_xsize, + const uint8_t* C10_RESTRICT lineIn0, + const uint8_t* C10_RESTRICT lineIn1, + const uint8_t* C10_RESTRICT lineIn2, + const uint8_t* C10_RESTRICT lineIn3, + int64_t in_xsize, + const int64_t* idx_ptr_xmin, + const int64_t* idx_ptr_size, + const int16_t* kk, + int kmax, + unsigned int coefs_precision, + int64_t num_channels, + bool is_last_line) { + + // Interpolation horizontal pass processing together 4 vertical lines. + // - Input data format is RGBA or RGB with R,G,B,A being uint8. In case of RGBA + // we can encode 4 values as a single uint32 value. + // - We split the size of weight vector for a given output index as a sum: + // ids_size = num_blocks_4 * 4 + num_blocks_2 * 2 + num_blocks_1. + // - We load and process 4 weights values in a loop ("block 4") then we process 2 weights values + // in another loop ("block 2") and finally we process 1 weights value in the final loop ("block 1"). + + // Define shuffling masks (low/high) for num_channels 4 and 3 + // Mask low casts lower half of each lane to epi16 and reorder RGBARGBA -> RRGGBBAA: + // [r1 g1 b1 a1 r2 g2 b2 a2 ... | R1 G1 B1 A1 R2 G2 B2 A2 ... ] -> + // [r1 0 r2 0 g1 0 g2 0 b1 0 b2 0 a1 0 a2 0 | R1 0 R2 0 G1 0 G2 0 B1 0 B2 0 A1 0 A2 0] + // Mask high casts upper half of each lane to epi16 and reorder RGBARGBA -> RRGGBBAA:: + // [ ... r3 g3 b3 a3 r4 g4 b4 a4 | ... 
R3 G3 B3 A3 R4 G4 B4 A4 ] -> + // [r3 0 r4 0 g3 0 g4 0 b3 0 b4 0 a3 0 a4 0 | R3 0 R4 0 G3 0 G4 0 B3 0 B4 0 A3 0 A4 0] + + const auto mask_low_c4 = _mm256_set_epi8( + -1, 7, -1, 3, -1, 6, -1, 2, -1, 5, -1, 1, -1, 4, -1, 0, + -1, 7, -1, 3, -1, 6, -1, 2, -1, 5, -1, 1, -1, 4, -1, 0); + const auto mask_high_c4 = _mm256_set_epi8( + -1, 15, -1, 11, -1, 14, -1, 10, -1, 13, -1, 9, -1, 12, -1, 8, + -1, 15, -1, 11, -1, 14, -1, 10, -1, 13, -1, 9, -1, 12, -1, 8); + const auto mask_low_c3 = _mm256_set_epi8( + -1, -1, -1, -1, -1, 5, -1, 2, -1, 4, -1, 1, -1, 3, -1, 0, + -1, -1, -1, -1, -1, 5, -1, 2, -1, 4, -1, 1, -1, 3, -1, 0); + const auto mask_high_c3 = _mm256_set_epi8( + -1, -1, -1, -1, -1, 11, -1, 8, -1, 10, -1, 7, -1, 9, -1, 6, + -1, -1, -1, -1, -1, 11, -1, 8, -1, 10, -1, 7, -1, 9, -1, 6); + + const auto mask_low = (num_channels == 3) ? mask_low_c3 : mask_low_c4; + const auto mask_high = (num_channels == 3) ? mask_high_c3 : mask_high_c4; + + const auto stride = num_channels * sizeof(uint8_t); + + TORCH_INTERNAL_ASSERT(stride == 3 || stride == 4); + + // out_xsize = output width, out_x = output x index + // ids_min is the input offset index corresponding to out_x + // ids_size is the interpolation size for out_x + + // Let's precompute ids_size limits for block 4 and block 2. + // + // In block 4 (4 means we process 4 weight values together), we read input data + // with _mm_loadu_si128, i.e. 16 bytes, per one line: + // lineIn0 + stride * (i + ids_min) + 16 <= lineIn0 + stride * (ids_size + ids_min) + // --> i <= ids_size - 16.0 / stride + // Strict boundary: + // --> i < ids_size + 1 - int(ceil(16.0 / stride)) = ids_size - b4_delta + // Soft boundary for reading inside the buffer except its boundaries: + // --> i < ids_size + 1 - int(16.0 / stride) = ids_size - b4_delta_soft + // RGBA: b4_delta = b4_delta_soft = 3 + // RGB : b4_delta = 5 + // RGB : b4_delta_soft = 4 + const auto b4_delta = (stride == 4) ? 3 : (is_last_line ? 
5 : 4); + + // In block 2 (2 means we process 2 weights values together), we read input data + // with _mm_loadl_epi64, i.e. 8 bytes, per one line: + // lineIn0 + stride * (i + ids_min) + 8 <= lineIn0 + stride * (ids_size + ids_min) + // --> i <= ids_size - 8.0 / stride + // Strict boundary: + // --> i < ids_size + 1 - int(ceil(8.0 / stride)) = ids_size - b2_delta + // Soft boundary for reading inside the buffer except its boundaries: + // --> i < ids_size + 1 - int(8.0 / stride) = ids_size - b2_delta_soft + // RGBA: b2_delta = b2_delta_soft = 1 + // RGB : b2_delta = 2 + // RGB : b2_delta_soft = 1 + const auto b2_delta = (stride == 4) ? 1 : (is_last_line ? 2 : 1); + + const auto max_out_x_strided = out_xsize * stride; + const auto max_in_x_strided = in_xsize * stride; + + const auto zero = _mm256_setzero_si256(); + const auto initial = _mm256_set1_epi32(1 << (coefs_precision - 1)); + + for (const auto out_x : c10::irange(out_xsize)) { + const auto ids_min = idx_ptr_xmin[out_x]; + const auto ids_size = idx_ptr_size[out_x]; + const auto * k = &kk[out_x * kmax]; + int64_t i = 0; + + auto sss0 = initial; + auto sss1 = initial; + + const auto * lineIn0_min = lineIn0 + ids_min; + const auto * lineIn1_min = lineIn1 + ids_min; + const auto * lineIn2_min = lineIn2 + ids_min; + const auto * lineIn3_min = lineIn3 + ids_min; + + // block 4 + for (; i < ids_size - b4_delta; i += 4) { + // Load 4 values from weight vector + // mmk0 = [wl_0 wh_0 wl_1 wh_1 wl_0 wh_0 wl_1 wh_1 ...] + // mmk1 = [wl_2 wh_2 wl_3 wh_3 wl_2 wh_2 wl_3 wh_3 ...] 
+ const auto mmk0 = _mm256_set1_epi32(*(int32_t*)&k[i]); + const auto mmk1 = _mm256_set1_epi32(*(int32_t*)&k[i + 2]); + + // RGBA: Load 8 pixels (4 per line) from input lines 0 and 1: + // source = [ + // r0 g0 b0 a0 r1 g1 b1 a1 r2 g2 b2 a2 r3 g3 b3 a3 + // R0 G0 B0 A0 R1 G1 B1 A1 R2 G2 B2 A2 R3 G3 B3 A3 + // ] + // RGB: Load 10 pixels (5 per line) + // source = [ + // r0 g0 b0 r1 g1 b1 r2 g2 b2 r3 g3 b3 r4 g4 b4 r5 + // R0 G0 B0 R1 G1 B1 R2 G2 B2 R3 G3 B3 R4 G4 B4 R5 + // ] + auto source = _mm256_inserti128_si256(_mm256_castsi128_si256( + _mm_loadu_si128((__m128i *) (lineIn0_min + stride * i))), + _mm_loadu_si128((__m128i *) (lineIn1_min + stride * i)), 1); + + // Apply mask_low: + // RGBA: + // [r0 0 r1 0 g0 0 g1 0 b0 0 b1 0 a0 0 a1 0 | R0 0 R1 0 G0 0 G1 0 B0 0 B1 0 A0 0 A1 0] + // RGB: + // [r0 0 r1 0 g0 0 g1 0 b0 0 b1 0 0 0 0 0 | R0 0 R1 0 G0 0 G1 0 B0 0 B1 0 0 0 0 0] + auto pix1 = _mm256_shuffle_epi8(source, mask_low); + // Compute output value as C += w0 * C0 + w1 * C1 for each channel in 32-bit precision + sss0 = _mm256_add_epi32(sss0, _mm256_madd_epi16(pix1, mmk0)); + + // Apply mask_high: + // RGBA: + // [r2 0 r3 0 g2 0 g3 0 b2 0 b3 0 a2 0 a3 0 | R2 0 R3 0 G2 0 G3 0 B2 0 B3 0 A2 0 A3 0] + // RGB: + // [r2 0 r3 0 g2 0 g3 0 b2 0 b3 0 0 0 0 0 | R2 0 R3 0 G2 0 G3 0 B2 0 B3 0 0 0 0 0] + auto pix2 = _mm256_shuffle_epi8(source, mask_high); + // Compute output value as C += w2 * C2 + w3 * C3 for each channel in 32-bit precision + sss0 = _mm256_add_epi32(sss0, _mm256_madd_epi16(pix2, mmk1)); + + // Same as above to next lines 2 and 3: + auto source2 = _mm256_inserti128_si256(_mm256_castsi128_si256( + _mm_loadu_si128((__m128i *) (lineIn2_min + stride * i))), + _mm_loadu_si128((__m128i *) (lineIn3_min + stride * i)), 1); + auto pix3 = _mm256_shuffle_epi8(source2, mask_low); + sss1 = _mm256_add_epi32(sss1, _mm256_madd_epi16(pix3, mmk0)); + auto pix4 = _mm256_shuffle_epi8(source2, mask_high); + sss1 = _mm256_add_epi32(sss1, _mm256_madd_epi16(pix4, mmk1)); + } + + // 
block 2 + for (; i < ids_size - b2_delta; i += 2) { + // Load 2 values from weight vector + // mmk = [wl_0 wh_0 wl_1 wh_1 wl_0 wh_0 wl_1 wh_1 ...] + const auto mmk = _mm256_set1_epi32(*(int32_t*)&k[i]); + + // Load 4 pixels (2 per line) from input lines 0 and 1: + // RGBA: source1 = [ + // r0 g0 b0 a0 r1 g1 b1 a1 0 0 0 0 0 0 0 0 + // R0 G0 B0 A0 R1 G1 B1 A1 0 0 0 0 0 0 0 0 + // ] + // RGB: source1 = [ + // r0 g0 b0 r1 g1 b1 r2 0 0 0 0 0 0 0 0 + // R0 G0 B0 R1 G1 B1 R2 0 0 0 0 0 0 0 0 + // ] + auto source1 = _mm256_inserti128_si256(_mm256_castsi128_si256( + _mm_loadl_epi64((__m128i *) (lineIn0_min + stride * i))), + _mm_loadl_epi64((__m128i *) (lineIn1_min + stride * i)), 1); + // Apply mask_low: + // RGBA: + // [r0 0 r1 0 g0 0 g1 0 b0 0 b1 0 a0 0 a1 0 | R0 0 R1 0 G0 0 G1 0 B0 0 B1 0 A0 0 A1 0] + // RGB: + // [r0 0 r1 0 g0 0 g1 0 b0 0 b1 0 0 0 0 0 | R0 0 R1 0 G0 0 G1 0 B0 0 B1 0 0 0 0 0] + auto pix1 = _mm256_shuffle_epi8(source1, mask_low); + // Compute output value as C += w0 * C0 + w1 * C1 for each channel in 32-bit precision + sss0 = _mm256_add_epi32(sss0, _mm256_madd_epi16(pix1, mmk)); + + // Same as above for lines 2 and 3: + auto source2 = _mm256_inserti128_si256(_mm256_castsi128_si256( + _mm_loadl_epi64((__m128i *) (lineIn2_min + stride * i))), + _mm_loadl_epi64((__m128i *) (lineIn3_min + stride * i)), 1); + auto pix2 = _mm256_shuffle_epi8(source2, mask_low); + sss1 = _mm256_add_epi32(sss1, _mm256_madd_epi16(pix2, mmk)); + } + + // block 1 + const auto i32_aligned = num_channels == 4; + for (; i < ids_size - 1; i++) { + // Load 1 value from weight vector + // mmk = [wl_0 wh_0 0 0 wl_0 wh_0 0 0 ...] 
+ const auto mmk = _mm256_set1_epi32(k[i]); + + // Load 2 pixels (one per line) from input lines 0 and 1: + // RGBA: pix1 = [ + // r0 0 0 0 g0 0 0 0 b0 0 0 0 a0 0 0 0 + // R0 0 0 0 G0 0 0 0 B0 0 0 0 A0 0 0 0 + // ] + // RGB: pix1 = [ + // r0 0 0 0 g0 0 0 0 b0 0 0 0 r1 0 0 0 + // R0 0 0 0 G0 0 0 0 B0 0 0 0 R1 0 0 0 + // ] + auto pix1 = _mm256_inserti128_si256(_mm256_castsi128_si256( + mm_cvtepu8_epi32(lineIn0_min + stride * i, i32_aligned)), + mm_cvtepu8_epi32(lineIn1_min + stride * i, i32_aligned), 1); + // Compute output value as C += w0 * C0 for each channel in 32-bit precision + sss0 = _mm256_add_epi32(sss0, _mm256_madd_epi16(pix1, mmk)); + + // Same as above for lines 2 and 3 + auto pix2 = _mm256_inserti128_si256(_mm256_castsi128_si256( + mm_cvtepu8_epi32(lineIn2_min + stride * i, i32_aligned)), + mm_cvtepu8_epi32(lineIn3_min + stride * i, i32_aligned), 1); + sss1 = _mm256_add_epi32(sss1, _mm256_madd_epi16(pix2, mmk)); + } + + if (i == ids_size - 1) { + // last element + auto mmk = _mm256_set1_epi32(k[i]); + // For num_channels == 3 (3 bytes = one pixel) we tolerate to read 4 bytes + // lines 0, 1 and 2 won't go out of allocated memory bounds + auto pix = _mm256_inserti128_si256(_mm256_castsi128_si256( + mm_cvtepu8_epi32(lineIn0_min + stride * i, i32_aligned)), + mm_cvtepu8_epi32(lineIn1_min + stride * i, i32_aligned), 1); + sss0 = _mm256_add_epi32(sss0, _mm256_madd_epi16(pix, mmk)); + + auto p0 = mm_cvtepu8_epi32(lineIn2_min + stride * i, i32_aligned); + __m128i p1; + if (num_channels == 3 && C10_UNLIKELY(is_last_line && ids_min + stride * i + 4 >= max_in_x_strided)) { + uint8_t input[4]; + std::memcpy(input, lineIn3_min + stride * i, 3); + p1 = mm_cvtepu8_epi32(input, true); + } else { + p1 = mm_cvtepu8_epi32(lineIn3_min + stride * i, i32_aligned); + } + auto pix2 = _mm256_inserti128_si256(_mm256_castsi128_si256(p0), p1, 1); + sss1 = _mm256_add_epi32(sss1, _mm256_madd_epi16(pix2, mmk)); + } + + // Convert fixed point values back to integers (truncating) + 
sss0 = _mm256_srai_epi32(sss0, coefs_precision); + sss1 = _mm256_srai_epi32(sss1, coefs_precision); + // Convert packed signed 32-bit integers to packed 16-bit integers using signed saturation + // (a a a a b b b b c c c c d d d d) -> (a a b b c c d d 0 0 0 0 0 0 0 0) + sss0 = _mm256_packs_epi32(sss0, zero); + sss1 = _mm256_packs_epi32(sss1, zero); + // Convert packed signed 16-bit integers to packed 8-bit integers using unsigned saturation + // (a a b b c c d d) -> (a b c d 0 0 0 0) + sss0 = _mm256_packus_epi16(sss0, zero); + sss1 = _mm256_packus_epi16(sss1, zero); + + // Write the output into single uint32 + // (a b c d) -> x_uint32 + auto o0 = _mm_cvtsi128_si32(_mm256_castsi256_si128(sss0)); + auto o1 = _mm_cvtsi128_si32(_mm256_extracti128_si256(sss0, 1)); + auto o2 = _mm_cvtsi128_si32(_mm256_castsi256_si128(sss1)); + auto o3 = _mm_cvtsi128_si32(_mm256_extracti128_si256(sss1, 1)); + + const auto out_x_strided = stride * out_x; + + if (num_channels == 3 && C10_UNLIKELY(out_x_strided + 4 >= max_out_x_strided)) { + // Memcpy 4-bytes is faster than 3-bytes and this is a boundary case when we want to write + // 4 bytes (R G B | X) to the output buffer (X1 X2 X3 | R1). + // The 4th byte in the register (X) has a garbage value and 4th byte in the output buffer (R1) has a correct + // value which was previously computed by another line. In other words, it means that we can not overwrite + // it by simply writing 4 bytes from the register to the output. We'll do the following: + // v----------| + // Output = [... X1 X2 X3 | R1 G1 B1 R2 ...] + // First, we write R1 value to the 4th byte of (R G B | X) -> (R G B | R1) + // Second, we write 4 bytes from the register to the output: (X1 X2 X3 | R1) -> (R G B | R1) + // Output = [... R G B | R1 G1 B1 R2 ...] 
+ + _write_endline_rgb_as_uint32(lineOut0 + out_x_strided, o0); + _write_endline_rgb_as_uint32(lineOut1 + out_x_strided, o1); + _write_endline_rgb_as_uint32(lineOut2 + out_x_strided, o2); + + if (C10_UNLIKELY(is_last_line)) { + // When we handle the last line, we can not access the next 4 bytes + // as they are out of memory bounds. + std::memcpy(lineOut3 + out_x_strided, (uint8_t *) &o3, num_channels); + } else { + _write_endline_rgb_as_uint32(lineOut3 + out_x_strided, o3); + } + } else if (num_channels == 3) { + // Memcpy 4-bytes is faster than 3-bytes and here + // we simply write 4 bytes (... R G B X 0 0 0 0 0 ...) where X is a garbage value + // that we will overwrite on the next iteration: (... R G B R G B X 0 0 ...) + std::memcpy(lineOut0 + out_x_strided, (uint8_t *) &o0, 4); + std::memcpy(lineOut1 + out_x_strided, (uint8_t *) &o1, 4); + std::memcpy(lineOut2 + out_x_strided, (uint8_t *) &o2, 4); + std::memcpy(lineOut3 + out_x_strided, (uint8_t *) &o3, 4); + } else { + // num_channels = 4 -> lineOutX + out_x_strided should be uint32 aligned + *(uint32_t *)(lineOut0 + out_x_strided) = o0; + *(uint32_t *)(lineOut1 + out_x_strided) = o1; + *(uint32_t *)(lineOut2 + out_x_strided) = o2; + *(uint32_t *)(lineOut3 + out_x_strided) = o3; + } + } +} + +void ImagingResampleHorizontalConvolution8u( + uint8_t* C10_RESTRICT lineOut, + int64_t out_xsize, + const uint8_t* C10_RESTRICT lineIn, + int64_t in_xsize, + const int64_t* idx_ptr_xmin, + const int64_t* idx_ptr_size, + const int16_t* kk, + int kmax, + unsigned int coefs_precision, + int64_t num_channels, + bool is_last_line) { + + // Interpolation horizontal pass processing only one vertical line. + // - Input data format is RGBA or RGB with R,G,B,A being uint8. In case of RGBA + // we can encode 4 values as a single uint32 value. 
+ // - We split the size of weight vector for a given output index as a sum: + // ids_size = num_blocks_8 * 8 + num_blocks_4 * 4 + num_blocks_2 * 2 + num_blocks_1 + // - We load and process 8 weights values in a loop ("block 8") then 4 weights and 2 weights values in + // in another loops ("block 4" and "block 2") and finally we process 1 weight value in the final loop ("block 1"). + + // Define various shuffling masks + const auto kmask_low = _mm256_set_epi8( + 11, 10, 9, 8, 11, 10, 9, 8, 11, 10, 9, 8, 11, 10, 9, 8, + 3, 2, 1, 0, 3, 2, 1, 0, 3, 2, 1, 0, 3, 2, 1, 0); + const auto kmask_high = _mm256_set_epi8( + 15, 14, 13, 12, 15, 14, 13, 12, 15, 14, 13, 12, 15, 14, 13, 12, + 7, 6, 5, 4, 7, 6, 5, 4, 7, 6, 5, 4, 7, 6, 5, 4); + const auto kmask_hl = _mm256_set_epi8( + 7, 6, 5, 4, 7, 6, 5, 4, 7, 6, 5, 4, 7, 6, 5, 4, + 3, 2, 1, 0, 3, 2, 1, 0, 3, 2, 1, 0, 3, 2, 1, 0); + + const auto mask_low_c4 = _mm256_set_epi8( + -1, 7, -1, 3, -1, 6, -1, 2, -1, 5, -1, 1, -1, 4, -1, 0, + -1, 7, -1, 3, -1, 6, -1, 2, -1, 5, -1, 1, -1, 4, -1, 0); + const auto mask_high_c4 = _mm256_set_epi8( + -1, 15, -1, 11, -1, 14, -1, 10, -1, 13, -1, 9, -1, 12, -1, 8, + -1, 15, -1, 11, -1, 14, -1, 10, -1, 13, -1, 9, -1, 12, -1, 8); + const auto mask_low_c3 = _mm256_set_epi8( + -1, -1, -1, -1, -1, 5, -1, 2, -1, 4, -1, 1, -1, 3, -1, 0, + -1, -1, -1, -1, -1, 5, -1, 2, -1, 4, -1, 1, -1, 3, -1, 0); + const auto mask_high_c3 = _mm256_set_epi8( + -1, -1, -1, -1, -1, 11, -1, 8, -1, 10, -1, 7, -1, 9, -1, 6, + -1, -1, -1, -1, -1, 11, -1, 8, -1, 10, -1, 7, -1, 9, -1, 6); + const auto mask_hl_c3 = _mm256_set_epi8( + -1, -1, -1, -1, -1, 11, -1, 8, -1, 10, -1, 7, -1, 9, -1, 6, + -1, -1, -1, -1, -1, 5, -1, 2, -1, 4, -1, 1, -1, 3, -1, 0); + const auto mask_hl_c4 = _mm256_set_epi8( + -1, 15, -1, 11, -1, 14, -1, 10, -1, 13, -1, 9, -1, 12, -1, 8, + -1, 7, -1, 3, -1, 6, -1, 2, -1, 5, -1, 1, -1, 4, -1, 0); + + const auto mask_low128_c3 = _mm_set_epi8( + -1, -1, -1, -1, -1, 5, -1, 2, -1, 4, -1, 1, -1, 3, -1, 0); + const auto 
mask_low128_c4 = _mm_set_epi8( + -1, 7, -1, 3, -1, 6, -1, 2, -1, 5, -1, 1, -1, 4, -1, 0); + + const auto mask_low = (num_channels == 3) ? mask_low_c3 : mask_low_c4; + const auto mask_high = (num_channels == 3) ? mask_high_c3 : mask_high_c4; + const auto mask_hl = (num_channels == 3) ? mask_hl_c3 : mask_hl_c4; + const auto mask_low128 = (num_channels == 3) ? mask_low128_c3 : mask_low128_c4; + + // out_xsize = output width, out_x = output x index + // ids_min is the input offset index corresponding to out_x + // ids_size is the interpolation size for out_x + + const auto stride = num_channels * sizeof(uint8_t); + const auto zero = _mm_setzero_si128(); + + TORCH_INTERNAL_ASSERT(stride == 3 || stride == 4); + + // Let's precompute ids_size limits for block 8, block 4 and block 2 + // + // In block 8 (8 means we process 8 weight values together), we read at + // most 32 bytes input data (16 + 16 bytes for RGBA and 12 + 16 bytes for RGB) + // lineIn + stride * (i + ids_min) + 32 <= lineIn + stride * (ids_size + ids_min) + // --> i <= ids_size - 32.0 / stride + // Strict boundary: + // --> i < ids_size + 1 - int(ceil(32.0 / stride)) = ids_size - b8_delta + // Soft boundary for reading inside the buffer except its boundaries: + // --> i < ids_size + 1 - int(32.0 / stride) = ids_size - b8_delta_soft + // RGBA: b8_delta = b8_delta_soft = 7 + // RGB : b8_delta = 10 + // RGB : b8_delta_soft = 9 + const auto b8_delta = (stride == 4) ? 7 : (is_last_line ? 10 : 9); + + // In block 4 (4 means we process 4 weight values together), we read + // 16 bytes of input data. 
+ // lineIn + stride * (i + ids_min) + 16 <= lineIn0 + stride * (ids_size + ids_min) + // --> i <= ids_size - 16.0 / stride + // Strict boundary: + // --> i < ids_size + 1 - int(ceil(16.0 / stride)) = ids_size - b4_delta + // Soft boundary for reading inside the buffer except its boundaries: + // --> i < ids_size + 1 - int(16.0 / stride) = ids_size - b4_delta_soft + // RGBA: b4_delta = b4_delta_soft = 3 + // RGB : b4_delta = 5 + // RGB : b4_delta_soft = 4 + const auto b4_delta = (stride == 4) ? 3 : (is_last_line ? 5 : 4); + + // In block 2 (2 means we process 2 weight values together), we read + // 8 bytes of input data. + // lineIn0 + stride * (i + ids_min) + 8 <= lineIn0 + stride * (ids_size + ids_min) + // --> i <= ids_size - 8.0 / stride + // Strict boundary: + // --> i < ids_size + 1 - int(ceil(8.0 / stride)) = ids_size - b2_delta + // Soft boundary for reading inside the buffer except its boundaries: + // --> i < ids_size + 1 - int(8.0 / stride) = ids_size - b2_delta_soft + // RGBA: b2_delta = b2_delta_soft = 1 + // RGB : b2_delta = 2 + // RGB : b2_delta_soft = 1 + const auto b2_delta = (stride == 4) ? 1 : (is_last_line ? 
2 : 1); + + const auto max_out_x_strided = out_xsize * stride; + const auto max_in_x_strided = in_xsize * stride; + + for (const auto out_x : c10::irange(out_xsize)) { + __m128i sss; + const auto ids_min = idx_ptr_xmin[out_x]; + const auto ids_size = idx_ptr_size[out_x]; + const auto * k = &kk[out_x * kmax]; + int64_t i = 0; + + const auto * lineIn_min = lineIn + ids_min; + + if (ids_size < 8) { + sss = _mm_set1_epi32(1 << (coefs_precision - 1)); + } else { + // Lower part will be added to higher, use only half of the error + auto sss256 = _mm256_set1_epi32(1 << (coefs_precision - 2)); + + // block 8 + for (; i < ids_size - b8_delta; i += 8) { + // Load 8 values from weight vector + auto tmp = _mm_loadu_si128((__m128i*)&k[i]); + // ksource = [ + // wl_0 wh_0 wl_1 wh_1 wl_2 wh_2 wl_3 wh_3 wl_4 wh_4 wl_5 wh_5 wl_6 wh_6 wl_7 wh_7 + // wl_0 wh_0 wl_1 wh_1 wl_2 wh_2 wl_3 wh_3 wl_4 wh_4 wl_5 wh_5 wl_6 wh_6 wl_7 wh_7 + // ] + auto ksource = _mm256_insertf128_si256(_mm256_castsi128_si256(tmp), tmp, 1); + + // RGBA: Load 8 pixels from input: + // source = [ + // r0 g0 b0 a0 r1 g1 b1 a1 r2 g2 b2 a2 r3 g3 b3 a3 + // r4 g4 b4 a4 r5 g5 b5 a5 r6 g6 b6 a6 r7 g7 b7 a7 + // ] + // RGB: Load 10 pixels from input (however we can process only 8 pixels): + // source = [ + // r0 g0 b0 r1 g1 b1 r2 g2 b2 r3 g3 b3 r4 g4 b4 r5 + // r4 g4 b4 r5 g5 b5 r6 g6 b6 r7 g7 b7 r8 g8 b8 r9 + // ] + auto source = _mm256_inserti128_si256(_mm256_castsi128_si256( + _mm_loadu_si128((__m128i *) (lineIn_min + stride * i))), + _mm_loadu_si128((__m128i *) (lineIn_min + stride * (i + 4))), 1); + + // Extract lower part of each lane, cast to epi16 and reorder RGBARGBA -> RRGGBBAA + // RGBA: pix1 = [ + // r0 0 r1 0 g0 0 g1 0 b0 0 b1 0 a0 0 a1 0 + // r4 0 r5 0 g4 0 g5 0 b4 0 b5 0 a4 0 a5 0 + // ] + // RGB: pix1 = [ + // r0 0 r1 0 g0 0 g1 0 b0 0 b1 0 0 0 0 0 + // r4 0 r5 0 g4 0 g5 0 b4 0 b5 0 0 0 0 0 + // ] + auto pix1 = _mm256_shuffle_epi8(source, mask_low); + // mmk1 = [ + // wl_0 wh_0 wl_1 wh_1 wl_0 wh_0 wl_1 
wh_1 ... ... + // wl_4 wh_4 wl_5 wh_5 wl_4 wh_4 wl_5 wh_5 ... ... + // ] + auto mmk1 = _mm256_shuffle_epi8(ksource, kmask_low); + // Compute output value as + // C += w0 * C0 + w1 * C1 + // C += w4 * C4 + w5 * C5 for each channel in 32-bit precision + sss256 = _mm256_add_epi32(sss256, _mm256_madd_epi16(pix1, mmk1)); + + // Same as above for higher part of each lane + auto pix2 = _mm256_shuffle_epi8(source, mask_high); + auto mmk2 = _mm256_shuffle_epi8(ksource, kmask_high); + // Compute output value as + // C += w2 * C2 + w3 * C3 + // C += w6 * C6 + w7 * C7 for each channel in 32-bit precision + sss256 = _mm256_add_epi32(sss256, _mm256_madd_epi16(pix2, mmk2)); + } + + // block 4 + for (; i < ids_size - b4_delta; i += 4) { + // Load 4 values from weight vector + auto tmp = _mm_loadl_epi64((__m128i *) &k[i]); + // ksource = [ + // wl_0 wh_0 wl_1 wh_1 wl_2 wh_2 wl_3 wh_3 0 0 0 0 0 0 0 0 + // wl_0 wh_0 wl_1 wh_1 wl_2 wh_2 wl_3 wh_3 0 0 0 0 0 0 0 0 + // ] + auto ksource = _mm256_insertf128_si256(_mm256_castsi128_si256(tmp), tmp, 1); + + // Load pixels from input line + tmp = _mm_loadu_si128((__m128i *) (lineIn_min + stride * i)); + // RGBA: source = [ + // r0 g0 b0 a0 r1 g1 b1 a1 r2 g2 b2 a2 r3 g3 b3 a3 + // r0 g0 b0 a0 r1 g1 b1 a1 r2 g2 b2 a2 r3 g3 b3 a3 + // ] + // RGB: source = [ + // r0 g0 b0 r1 g1 b1 r2 g2 b2 r3 g3 b3 r4 g4 b4 r5 + // r0 g0 b0 r1 g1 b1 r2 g2 b2 r3 g3 b3 r4 g4 b4 r5 + // ] + auto source = _mm256_insertf128_si256(_mm256_castsi128_si256(tmp), tmp, 1); + + // Cast source to epi16 and reorder RGBARGBA -> RRGGBBAA + // RGBA: pix = [ + // r0 0 r1 0 g0 0 g1 0 b0 0 b1 0 a0 0 a1 0 + // r2 0 r3 0 g2 0 g3 0 b2 0 b3 0 a2 0 a3 0 + // ] + // RGB: pix = [ + // r0 0 r1 0 g0 0 g1 0 b0 0 b1 0 0 0 0 0 + // r2 0 r3 0 g2 0 g3 0 b2 0 b3 0 0 0 0 0 + // ] + auto pix = _mm256_shuffle_epi8(source, mask_hl); + // mmk = [ + // wl_0 wh_0 wl_1 wh_1 wl_0 wh_0 wl_1 wh_1 ... ... + // wl_2 wh_2 wl_3 wh_3 wl_2 wh_2 wl_3 wh_3 ... ... 
+ // ] + auto mmk = _mm256_shuffle_epi8(ksource, kmask_hl); + // Compute output value as + // C += w0 * C0 + w1 * C1 + // C += w2 * C2 + w3 * C3 for each channel in 32-bit precision + sss256 = _mm256_add_epi32(sss256, _mm256_madd_epi16(pix, mmk)); + } + + // Sum results between the lanes + sss = _mm_add_epi32( + _mm256_extracti128_si256(sss256, 0), + _mm256_extracti128_si256(sss256, 1)); + } + + // block 2 + for (; i < ids_size - b2_delta; i += 2) { + // Load 2 values from weight vector + // mmk = [wl_0 wh_0 wl_1 wh_1 wl_0 wh_0 wl_1 wh_1 ...] + auto mmk = _mm_set1_epi32(*(int32_t*)&k[i]); + // Load pixels from input line + // RGBA: source = [ + // r0 g0 b0 a0 r1 g1 b1 a1 0 0 0 0 0 0 0 0 + // ] + // RGB: source = [ + // r0 g0 b0 r1 g1 b1 r2 g2 0 0 0 0 0 0 0 0 + // ] + auto source = _mm_loadl_epi64((__m128i *) (lineIn_min + stride * i)); + // Cast source to epi16 and reorder RGBARGBA -> RRGGBBAA + auto pix = _mm_shuffle_epi8(source, mask_low128); + // Compute output value as C += w0 * C0 + w1 * C1 for each channel in 32-bit precision + sss = _mm_add_epi32(sss, _mm_madd_epi16(pix, mmk)); + } + + // block 1 + const auto i32_aligned = num_channels == 4; + for (; i < ids_size - 1; i++) { + // Load 1 value from weight vector + // mmk = [wl_0 wh_0 0 0 wl_0 wh_0 0 0 ...] 
+ auto mmk = _mm_set1_epi32(k[i]); + // Load one pixel from input line + // RGBA: pix = [ + // r0 0 0 0 g0 0 0 0 b0 0 0 0 a0 0 0 0 + // ] + // RGB: pix = [ + // r0 0 0 0 g0 0 0 0 b0 0 0 0 r1 0 0 0 + // ] + auto pix = mm_cvtepu8_epi32(lineIn_min + stride * i, i32_aligned); + // Compute output value as C += w0 * C0 for each channel in 32-bit precision + sss = _mm_add_epi32(sss, _mm_madd_epi16(pix, mmk)); + } + + if (i == ids_size - 1) { + // last element + auto mmk = _mm_set1_epi32(k[i]); + __m128i pix; + auto p = lineIn_min + stride * i; + if (num_channels == 3 && C10_UNLIKELY(is_last_line && ids_min + stride * i + 4 >= max_in_x_strided)) { + uint8_t input[4]; + std::memcpy(input, p, 3); + pix = mm_cvtepu8_epi32(input, true); + } else { + pix = mm_cvtepu8_epi32(p, i32_aligned); + } + sss = _mm_add_epi32(sss, _mm_madd_epi16(pix, mmk)); + } + + // Convert fixed point values back to integers (truncating) + sss = _mm_srai_epi32(sss, coefs_precision); + // Convert packed signed 32-bit integers to packed 16-bit integers using signed saturation + // (a a a a b b b b c c c c d d d d) -> (a a b b c c d d 0 0 0 0 0 0 0 0) + sss = _mm_packs_epi32(sss, zero); + // Convert packed signed 16-bit integers to packed 8-bit integers using unsigned saturation + // (a a b b c c d d) -> (a b c d 0 0 0 0) + sss = _mm_packus_epi16(sss, zero); + // Write the output into single uint32 + // (a b c d) -> x_uint32 + auto o = _mm_cvtsi128_si32(sss); + const auto out_x_strided = stride * out_x; + if (num_channels == 3 && C10_UNLIKELY(out_x_strided + 4 >= max_out_x_strided)) { + if (C10_UNLIKELY(is_last_line)) { + // When we handle the last line, we can not access the next 4 bytes + // as they are out of memory bounds. + std::memcpy(lineOut + out_x_strided, (uint8_t *) &o, 3); + } else { + // Memcpy 4-bytes is faster than 3-bytes and this is a boundary case when we want to write + // 4 bytes (R G B | X) to the output buffer (X1 X2 X3 | R1). 
+ // The 4th byte in the register (X) has a garbage value and 4th byte in the output buffer (R1) has a correct + // value which was previously computed by another line. In other words, it means that we can not overwrite + // it by simply writing 4 bytes from the register to the output. We'll do the following: + // v----------| + // Output = [... X1 X2 X3 | R1 G1 B1 R2 ...] + // First, we write R1 value to the 4th byte of (R G B | X) -> (R G B | R1) + // Second, we write 4 bytes from the register to the output: (X1 X2 X3 | R1) -> (R G B | R1) + // Output = [... R G B | R1 G1 B1 R2 ...] + _write_endline_rgb_as_uint32(lineOut + out_x_strided, o); + } + } else if (num_channels == 3) { + // Memcpy 4-bytes is faster than 3-bytes and here + // we simply write 4 bytes (... R G B X 0 0 0 0 0 ...) where X is a garbage value + // that we will overwrite on the next iteration: (... R G B R G B X 0 0 ...) + std::memcpy(lineOut + out_x_strided, (uint8_t *) &o, 4); + } else { + // num_channels = 4 -> lineOut + out_x_strided should be uint32 aligned + *(uint32_t *)(lineOut + out_x_strided) = o; + } + } +} + +void ImagingResampleVerticalConvolution8u( + uint8_t* C10_RESTRICT lineOut, + const uint8_t* C10_RESTRICT lineIn, + int64_t xsize, + int64_t ids_min, + int64_t ids_size, + const int16_t* k, + unsigned int coefs_precision, + int64_t num_channels) { + + // Interpolation vertical pass processing one line. + // - We process x-axis data with blocks of 8, 2 and 1 + // - We split the size of weight vector for a given output index as a sum: K = n * 2 + m. 
+ + // xsize = output width, also equals to input width + // ids_size = interpolation size + // ids_min = input y start index + const auto stride = num_channels * sizeof(uint8_t); + + TORCH_INTERNAL_ASSERT(stride == 3 || stride == 4); + + const int64_t data_size = xsize * stride; + const int64_t data_stride = stride; + constexpr auto vec_size = 256 / 8; + + const auto initial = _mm_set1_epi32(1 << (coefs_precision - 1)); + const auto initial_256 = _mm256_set1_epi32(1 << (coefs_precision - 1)); + const auto zero = _mm_setzero_si128(); + const auto zero_256 = _mm256_setzero_si256(); + + int64_t j = 0; + // block 8 + const auto b8_usable_vec_stride = (vec_size / data_stride) * data_stride; + for (; j < data_size - vec_size; j += b8_usable_vec_stride) { + auto sss0 = initial_256; + auto sss1 = initial_256; + auto sss2 = initial_256; + auto sss3 = initial_256; + int64_t i = 0; + const auto * lineIn_min = lineIn + j + ids_min; + + for (; i < ids_size - 1; i += 2) { + // Load 2 values from weight vector + auto mmk = _mm256_set1_epi32(*(int32_t*)&k[i]); + + // RGBA: Load 8 pixels per line + // source1 = [ + // r0 g0 b0 a0 r1 g1 b1 a1 r2 g2 b2 a2 r3 g3 b3 a3 + // r4 g4 b4 a4 r5 g5 b5 a5 r6 g6 b6 a6 r7 g7 b7 a7 + // ] + // RGB: Load 10 pixels per line (however we can process only 8 pixels): + // source1 = [ + // r0 g0 b0 r1 g1 b1 r2 g2 b2 r3 g3 b3 r4 g4 b4 r5 + // r4 g4 b4 r5 g5 b5 r6 g6 b6 r7 g7 b7 r8 g8 b8 r9 + // ] + auto source1 = + _mm256_loadu_si256((__m256i*)(lineIn_min + data_size * i)); + auto source2 = + _mm256_loadu_si256((__m256i*)(lineIn_min + data_size * (i + 1))); + + // Interleave source1 and source2 from the low half of each 128-bit lane + // and cast the result to epi16 + // RGBA: pix1 = [ + // r0 0 R0 0 g0 0 G0 0 b0 0 B0 0 a0 0 A0 0 + // r1 0 R1 0 g1 0 G1 0 b1 0 B1 0 a1 0 A1 0 + // ] + // RGB: pix1 = [ + // r0 0 R0 0 g0 0 G0 0 b0 0 B0 0 0 0 0 0 + // r1 0 R1 0 g1 0 G1 0 b1 0 B1 0 0 0 0 0 + // ] + auto source_lo = _mm256_unpacklo_epi8(source1, source2); + 
auto pix1 = _mm256_unpacklo_epi8(source_lo, zero_256); + // Compute output value as + // C += w0 * c0 + w1 * C0 + // C += w0 * c1 + w1 * C1 for each channel in 32-bit precision + sss0 = _mm256_add_epi32(sss0, _mm256_madd_epi16(pix1, mmk)); + + // RGBA: pix2 = [ + // r2 0 R2 0 g2 0 G2 0 b2 0 B2 0 a2 0 A2 0 + // r3 0 R3 0 g3 0 G3 0 b3 0 B3 0 a3 0 A3 0 + // ] + // RGB: pix2 = [ + // r2 0 R2 0 g2 0 G2 0 b2 0 B2 0 0 0 0 0 + // r3 0 R3 0 g3 0 G3 0 b3 0 B3 0 0 0 0 0 + // ] + auto pix2 = _mm256_unpackhi_epi8(source_lo, zero_256); + // Compute output value as + // C += w0 * c2 + w1 * C2 + // C += w0 * c3 + w1 * C3 for each channel in 32-bit precision + sss1 = _mm256_add_epi32(sss1, _mm256_madd_epi16(pix2, mmk)); + + // Same as above for the high half of each 128-bit lane + auto source_hi = _mm256_unpackhi_epi8(source1, source2); + auto pix3 = _mm256_unpacklo_epi8(source_hi, zero_256); + sss2 = _mm256_add_epi32(sss2, _mm256_madd_epi16(pix3, mmk)); + auto pix4 = _mm256_unpackhi_epi8(source_hi, zero_256); + sss3 = _mm256_add_epi32(sss3, _mm256_madd_epi16(pix4, mmk)); + } + // Same processing as above but with a single weight value + for (; i < ids_size; i += 1) { + auto mmk = _mm256_set1_epi32(k[i]); + + auto source1 = _mm256_loadu_si256((__m256i*)(lineIn_min + i * data_size)); + + auto source_lo = _mm256_unpacklo_epi8(source1, zero_256); + auto pix1 = _mm256_unpacklo_epi8(source_lo, zero_256); + sss0 = _mm256_add_epi32(sss0, _mm256_madd_epi16(pix1, mmk)); + auto pix2 = _mm256_unpackhi_epi8(source_lo, zero_256); + sss1 = _mm256_add_epi32(sss1, _mm256_madd_epi16(pix2, mmk)); + + auto source_hi = _mm256_unpackhi_epi8(source1, zero_256); + auto pix3 = _mm256_unpacklo_epi8(source_hi, _mm256_setzero_si256()); + sss2 = _mm256_add_epi32(sss2, _mm256_madd_epi16(pix3, mmk)); + auto pix4 = _mm256_unpackhi_epi8(source_hi, _mm256_setzero_si256()); + sss3 = _mm256_add_epi32(sss3, _mm256_madd_epi16(pix4, mmk)); + } + // Convert fixed point values back to integers (truncating) + sss0 = 
_mm256_srai_epi32(sss0, coefs_precision); + sss1 = _mm256_srai_epi32(sss1, coefs_precision); + sss2 = _mm256_srai_epi32(sss2, coefs_precision); + sss3 = _mm256_srai_epi32(sss3, coefs_precision); + // Convert packed signed 32-bit integers to packed 16-bit integers using signed saturation + // (a a a a b b b b c c c c d d d d) -> (a a b b c c d d) + sss0 = _mm256_packs_epi32(sss0, sss1); + sss2 = _mm256_packs_epi32(sss2, sss3); + // Convert packed signed 16-bit integers to packed 8-bit integers using unsigned saturation + // (a a b b c c d d) -> (a b c d) + sss0 = _mm256_packus_epi16(sss0, sss2); + + // Stores 32 bytes + _mm256_storeu_si256((__m256i*)(lineOut + j), sss0); + } + + // TODO: Do we also need block 4 ??? + // block 2 + const auto b2_usable_vec_stride = (8 / data_stride) * data_stride; + for (; j < data_size - vec_size / 4; j += b2_usable_vec_stride) { + auto sss0 = initial; + auto sss1 = initial; + int64_t i = 0; + const auto * lineIn_min = lineIn + j + ids_min; + + for (; i < ids_size - 1; i += 2) { + // Load 2 values from weight vector + // mmk = [wl_0 wh_0 wl_1 wh_1 wl_0 wh_0 wl_1 wh_1 ... 
] + auto mmk = _mm_set1_epi32(*(int32_t*)&k[i]); + + // Load 2 pixels per line + // RGBA: source1 = [ + // r0 g0 b0 a0 r1 g1 b1 a1 0 0 0 0 0 0 0 0 + // ] + // RGB: source1 = [ + // r0 g0 b0 r1 g1 b1 r2 g2 0 0 0 0 0 0 0 0 + // ] + auto source1 = _mm_loadl_epi64((__m128i *) (lineIn_min + i * data_size)); + auto source2 = _mm_loadl_epi64((__m128i *) (lineIn_min + (i + 1) * data_size)); + // Interleave source1 and source2 and cast the result to epi16 + // RGBA: pix = [ + // r0 0 R0 0 g0 0 G0 0 b0 0 B0 0 a0 0 A0 0 + // ] + // RGB: pix = [ + // r0 0 R0 0 g0 0 G0 0 b0 0 B0 0 0 0 0 0 + // ] + auto source = _mm_unpacklo_epi8(source1, source2); + auto pix = _mm_unpacklo_epi8(source, zero); + // Compute output value as C += w0 * c0 + w1 * C0 for each channel in 32-bit precision + sss0 = _mm_add_epi32(sss0, _mm_madd_epi16(pix, mmk)); + // RGBA: pix = [ + // r1 0 R1 0 g1 0 G1 0 b1 0 B1 0 a1 0 A1 0 + // ] + // RGB: pix = [ + // r1 0 R1 0 g1 0 G1 0 b1 0 B1 0 0 0 0 0 + // ] + pix = _mm_unpackhi_epi8(source, zero); + // Compute output value as C += w0 * c1 + w1 * C1 for each channel in 32-bit precision + sss1 = _mm_add_epi32(sss1, _mm_madd_epi16(pix, mmk)); + } + // Same processing as above but with a single weight value + for (; i < ids_size; i += 1) { + auto mmk = _mm_set1_epi32(k[i]); + + auto source1 = _mm_loadl_epi64((__m128i*) (lineIn_min + i * data_size)); + + auto source = _mm_unpacklo_epi8(source1, zero); + auto pix1 = _mm_unpacklo_epi8(source, zero); + sss0 = _mm_add_epi32(sss0, _mm_madd_epi16(pix1, mmk)); + auto pix2 = _mm_unpackhi_epi8(source, zero); + sss1 = _mm_add_epi32(sss1, _mm_madd_epi16(pix2, mmk)); + } + // Convert fixed point values back to integers (truncating) + sss0 = _mm_srai_epi32(sss0, coefs_precision); + sss1 = _mm_srai_epi32(sss1, coefs_precision); + // Convert packed signed 32-bit integers to packed 16-bit integers using signed saturation + // (a a a a b b b b c c c c d d d d) -> (a a b b c c d d) + sss0 = _mm_packs_epi32(sss0, sss1); + // Convert 
packed signed 16-bit integers to packed 8-bit integers using unsigned saturation + // (a a b b c c d d) -> (a b c d) + sss0 = _mm_packus_epi16(sss0, sss0); + // Store 2 pixels to the output + _mm_storel_epi64((__m128i*)(lineOut + j), sss0); + } + + // block 1 + const auto b1_usable_vec_stride = (4 / data_stride) * data_stride; + const auto i32_aligned = num_channels == 4; + for (; j < data_size - 4; j += b1_usable_vec_stride) { + auto sss = initial; + int64_t i = 0; + const auto * lineIn_min = lineIn + j + ids_min; + + for (; i < ids_size - 1; i += 2) { + // Load 2 values from weight vector + // mmk = [wl_0 wh_0 wl_1 wh_1 wl_0 wh_0 wl_1 wh_1 ... ] + auto mmk = _mm_set1_epi32(*(int32_t*)&k[i]); + + // Load one pixel per line + // RGBA: source1 = [ + // r0 g0 b0 a0 0 0 0 0 0 0 0 0 0 0 0 0 + // ] + // RGB: source1 = [ + // r0 g0 b0 r1 0 0 0 0 0 0 0 0 0 0 0 0 + // ] + auto source1 = mm_cvtsi32_si128(lineIn_min + i * data_size, i32_aligned); + auto source2 = mm_cvtsi32_si128(lineIn_min + (i + 1) * data_size, i32_aligned); + + // Interleave source1 and source2 and cast the result to epi16 + // RGBA: pix = [ + // r0 0 R0 0 g0 0 G0 0 b0 0 B0 0 a0 0 A0 0 + // ] + // RGB: pix = [ + // r0 0 R0 0 g0 0 G0 0 b0 0 B0 0 0 0 0 0 + // ] + auto source = _mm_unpacklo_epi8(source1, source2); + auto pix = _mm_unpacklo_epi8(source, zero); + // Compute output value as C += w0 * c0 + w1 * C0 for each channel in 32-bit precision + sss = _mm_add_epi32(sss, _mm_madd_epi16(pix, mmk)); + } + + for (; i < ids_size; i++) { + auto mmk = _mm_set1_epi32(k[i]); + auto pix = mm_cvtepu8_epi32(lineIn_min + i * data_size, i32_aligned); + sss = _mm_add_epi32(sss, _mm_madd_epi16(pix, mmk)); + } + sss = _mm_srai_epi32(sss, coefs_precision); + sss = _mm_packs_epi32(sss, zero); + sss = _mm_packus_epi16(sss, zero); + + auto o = _mm_cvtsi128_si32(sss); + + // Here we write 4 bytes to the output even if num_channels < 4, e.g o = {r,g,b,X} for num_channels=3 + // It is OK to write 4th byte (e.g. 
X) as on the next step we will overwrite it with new data. + // We also won't go out of bounds of lineOut memory allocation + std::memcpy(lineOut + j, (uint8_t *) &o, 4); + } + + for (; j < data_size; j += data_stride) { + auto sss = initial; + int64_t i = 0; + const auto * lineIn_min = lineIn + j + ids_min; + // For RGBA we can use (ids_size - 1) as tighter limit but for RGB we can read outside memory boundary + // for the last remaining line + for (; i < ids_size - 2; i += 2) { + // Load two coefficients at once + auto mmk = _mm_set1_epi32(*(int32_t*)&k[i]); + + // Load 2 lines + auto source1 = mm_cvtsi32_si128(lineIn_min + i * data_size, i32_aligned); + auto source2 = mm_cvtsi32_si128(lineIn_min + (i + 1) * data_size, i32_aligned); + + auto source = _mm_unpacklo_epi8(source1, source2); + auto pix = _mm_unpacklo_epi8(source, zero); + sss = _mm_add_epi32(sss, _mm_madd_epi16(pix, mmk)); + } + + // Same processing as above but with a single weight value + for (; i < ids_size; i++) { + auto mmk = _mm_set1_epi32(k[i]); + + const uint8_t * p = lineIn_min + i * data_size; + __m128i pix; + // There is no much perf gain using more detailed condition like + // num_channels == 3 && ids_min + j + data_size * i + 4 >= in_max_size + // const int64_t in_max_size = data_size * in_ysize; + if (num_channels == 3) { + uint8_t input[4]; + std::memcpy(input, p, 3); + pix = mm_cvtepu8_epi32(input, true); + } else { + pix = mm_cvtepu8_epi32(p, true); + } + sss = _mm_add_epi32(sss, _mm_madd_epi16(pix, mmk)); + } + + // Convert fixed point values back to integers (truncating) + sss = _mm_srai_epi32(sss, coefs_precision); + // Convert packed signed 32-bit integers to packed 16-bit integers using signed saturation + // (a a a a b b b b c c c c d d d d) -> (a a b b c c d d) + sss = _mm_packs_epi32(sss, zero); + // Convert packed signed 16-bit integers to packed 8-bit integers using unsigned saturation + // (a a b b c c d d) -> (a b c d) + sss = _mm_packus_epi16(sss, zero); + // Store one 
pixel to the output + auto o = _mm_cvtsi128_si32(sss); + if (num_channels == 3 && C10_UNLIKELY(j + 4 >= data_size)) { + std::memcpy(lineOut + j, (uint8_t *) &o, 3); + } else { + std::memcpy(lineOut + j, (uint8_t *) &o, 4); + } + } +} + +} // anonymous namespace +#endif // CPU_CAPABILITY_AVX2 + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/cpu/avx_mathfun.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/cpu/avx_mathfun.h new file mode 100644 index 0000000000000000000000000000000000000000..73e827054b4830bfa4154a511aa4feca133cd391 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/cpu/avx_mathfun.h @@ -0,0 +1,527 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +/* + AVX implementation of sin, cos, sincos, exp and log + + Based on "sse_mathfun.h", by Julien Pommier + http://gruntthepeon.free.fr/ssemath/ + + Copyright (C) 2012 Giovanni Garberoglio + Interdisciplinary Laboratory for Computational Science (LISC) + Fondazione Bruno Kessler and University of Trento + via Sommarive, 18 + I-38123 Trento (Italy) + + This software is provided 'as-is', without any express or implied + warranty. In no event will the authors be held liable for any damages + arising from the use of this software. + + Permission is granted to anyone to use this software for any purpose, + including commercial applications, and to alter it and redistribute it + freely, subject to the following restrictions: + + 1. The origin of this software must not be misrepresented; you must not + claim that you wrote the original software. 
If you use this software + in a product, an acknowledgment in the product documentation would be + appreciated but is not required. + 2. Altered source versions must be plainly marked as such, and must not be + misrepresented as being the original software. + 3. This notice may not be removed or altered from any source distribution. + + (this is the zlib license) +*/ + +#include + +/* The original source of this file has been modified. */ +#if defined(CPU_CAPABILITY_AVX2) + +#if defined(__GNUC__) +# define ALIGN32_BEG __attribute__((aligned(32))) +#elif defined(_WIN32) +# define ALIGN32_BEG __declspec(align(32)) +#endif + +typedef __m256 v8sf; // vector of 8 float (avx2) +typedef __m256i v8si; // vector of 8 int (avx2) + +/* declare some AVX constants -- why can't I figure a better way to do that? */ +#define _PS256_CONST(Name, Val) \ + static const ALIGN32_BEG float _ps256_##Name[8] = { Val, Val, Val, Val, Val, Val, Val, Val } +#define _PI32_CONST256(Name, Val) \ + static const ALIGN32_BEG int _pi32_256_##Name[8] = { Val, Val, Val, Val, Val, Val, Val, Val } +#define _PS256_CONST_TYPE(Name, Type, Val) \ + static const ALIGN32_BEG Type _ps256_##Name[8] = { Val, Val, Val, Val, Val, Val, Val, Val } + +_PS256_CONST(1 , 1.0f); +_PS256_CONST(0p5, 0.5f); +/* the smallest non denormalized float number */ +_PS256_CONST_TYPE(min_norm_pos, int, 0x00800000); +_PS256_CONST_TYPE(mant_mask, int, 0x7f800000); +_PS256_CONST_TYPE(inv_mant_mask, int, ~0x7f800000); + +_PS256_CONST_TYPE(sign_mask, int, (int)0x80000000); +_PS256_CONST_TYPE(inv_sign_mask, int, ~0x80000000); + +_PI32_CONST256(0, 0); +_PI32_CONST256(1, 1); +_PI32_CONST256(inv1, ~1); +_PI32_CONST256(2, 2); +_PI32_CONST256(4, 4); +_PI32_CONST256(0x7f, 0x7f); + +_PS256_CONST(cephes_SQRTHF, 0.707106781186547524); +_PS256_CONST(cephes_log_p0, 7.0376836292E-2); +_PS256_CONST(cephes_log_p1, - 1.1514610310E-1); +_PS256_CONST(cephes_log_p2, 1.1676998740E-1); +_PS256_CONST(cephes_log_p3, - 1.2420140846E-1); 
+_PS256_CONST(cephes_log_p4, + 1.4249322787E-1); +_PS256_CONST(cephes_log_p5, - 1.6668057665E-1); +_PS256_CONST(cephes_log_p6, + 2.0000714765E-1); +_PS256_CONST(cephes_log_p7, - 2.4999993993E-1); +_PS256_CONST(cephes_log_p8, + 3.3333331174E-1); +_PS256_CONST(cephes_log_q1, -2.12194440e-4); +_PS256_CONST(cephes_log_q2, 0.693359375); + + +/* natural logarithm computed for 8 simultaneous float + return NaN for x <= 0 +*/ +inline v8sf log256_ps(v8sf x) { + v8si imm0; + v8sf one = *(v8sf*)_ps256_1; + + //v8sf invalid_mask = _mm256_cmple_ps(x, _mm256_setzero_ps()); + v8sf invalid_mask = _mm256_cmp_ps(x, _mm256_setzero_ps(), _CMP_LE_OS); + + x = _mm256_max_ps(x, *(v8sf*)_ps256_min_norm_pos); /* cut off denormalized stuff */ + + // can be done with AVX2 + imm0 = _mm256_srli_epi32(_mm256_castps_si256(x), 23); + + /* keep only the fractional part */ + x = _mm256_and_ps(x, *(v8sf*)_ps256_inv_mant_mask); + x = _mm256_or_ps(x, *(v8sf*)_ps256_0p5); + + // this is again another AVX2 instruction + imm0 = _mm256_sub_epi32(imm0, *(v8si*)_pi32_256_0x7f); + v8sf e = _mm256_cvtepi32_ps(imm0); + + e = _mm256_add_ps(e, one); + + /* part2: + if( x < SQRTHF ) { + e -= 1; + x = x + x - 1.0; + } else { x = x - 1.0; } + */ + //v8sf mask = _mm256_cmplt_ps(x, *(v8sf*)_ps256_cephes_SQRTHF); + v8sf mask = _mm256_cmp_ps(x, *(v8sf*)_ps256_cephes_SQRTHF, _CMP_LT_OS); + v8sf tmp = _mm256_and_ps(x, mask); + x = _mm256_sub_ps(x, one); + e = _mm256_sub_ps(e, _mm256_and_ps(one, mask)); + x = _mm256_add_ps(x, tmp); + + v8sf z = _mm256_mul_ps(x,x); + + v8sf y = *(v8sf*)_ps256_cephes_log_p0; + y = _mm256_mul_ps(y, x); + y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_log_p1); + y = _mm256_mul_ps(y, x); + y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_log_p2); + y = _mm256_mul_ps(y, x); + y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_log_p3); + y = _mm256_mul_ps(y, x); + y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_log_p4); + y = _mm256_mul_ps(y, x); + y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_log_p5); + y = 
_mm256_mul_ps(y, x); + y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_log_p6); + y = _mm256_mul_ps(y, x); + y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_log_p7); + y = _mm256_mul_ps(y, x); + y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_log_p8); + y = _mm256_mul_ps(y, x); + + y = _mm256_mul_ps(y, z); + + tmp = _mm256_mul_ps(e, *(v8sf*)_ps256_cephes_log_q1); + y = _mm256_add_ps(y, tmp); + + + tmp = _mm256_mul_ps(z, *(v8sf*)_ps256_0p5); + y = _mm256_sub_ps(y, tmp); + + tmp = _mm256_mul_ps(e, *(v8sf*)_ps256_cephes_log_q2); + x = _mm256_add_ps(x, y); + x = _mm256_add_ps(x, tmp); + x = _mm256_or_ps(x, invalid_mask); // negative arg will be NAN + return x; +} + +_PS256_CONST(exp_hi, 88.3762626647949f); +_PS256_CONST(exp_lo, -88.3762626647949f); + +_PS256_CONST(cephes_LOG2EF, 1.44269504088896341); +_PS256_CONST(cephes_exp_C1, 0.693359375); +_PS256_CONST(cephes_exp_C2, -2.12194440e-4); + +_PS256_CONST(cephes_exp_p0, 1.9875691500E-4); +_PS256_CONST(cephes_exp_p1, 1.3981999507E-3); +_PS256_CONST(cephes_exp_p2, 8.3334519073E-3); +_PS256_CONST(cephes_exp_p3, 4.1665795894E-2); +_PS256_CONST(cephes_exp_p4, 1.6666665459E-1); +_PS256_CONST(cephes_exp_p5, 5.0000001201E-1); + +inline v8sf exp256_ps(v8sf x) { + v8sf tmp = _mm256_setzero_ps(), fx; + v8si imm0; + v8sf one = *(v8sf*)_ps256_1; + + x = _mm256_min_ps(x, *(v8sf*)_ps256_exp_hi); + x = _mm256_max_ps(x, *(v8sf*)_ps256_exp_lo); + + /* express exp(x) as exp(g + n*log(2)) */ + fx = _mm256_mul_ps(x, *(v8sf*)_ps256_cephes_LOG2EF); + fx = _mm256_add_ps(fx, *(v8sf*)_ps256_0p5); + + /* how to perform a floorf with SSE: just below */ + //imm0 = _mm256_cvttps_epi32(fx); + //tmp = _mm256_cvtepi32_ps(imm0); + + tmp = _mm256_floor_ps(fx); + + /* if greater, subtract 1 */ + //v8sf mask = _mm256_cmpgt_ps(tmp, fx); + v8sf mask = _mm256_cmp_ps(tmp, fx, _CMP_GT_OS); + mask = _mm256_and_ps(mask, one); + fx = _mm256_sub_ps(tmp, mask); + + tmp = _mm256_mul_ps(fx, *(v8sf*)_ps256_cephes_exp_C1); + v8sf z = _mm256_mul_ps(fx, *(v8sf*)_ps256_cephes_exp_C2); + x 
= _mm256_sub_ps(x, tmp); + x = _mm256_sub_ps(x, z); + + z = _mm256_mul_ps(x,x); + + v8sf y = *(v8sf*)_ps256_cephes_exp_p0; + y = _mm256_mul_ps(y, x); + y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_exp_p1); + y = _mm256_mul_ps(y, x); + y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_exp_p2); + y = _mm256_mul_ps(y, x); + y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_exp_p3); + y = _mm256_mul_ps(y, x); + y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_exp_p4); + y = _mm256_mul_ps(y, x); + y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_exp_p5); + y = _mm256_mul_ps(y, z); + y = _mm256_add_ps(y, x); + y = _mm256_add_ps(y, one); + + /* build 2^n */ + imm0 = _mm256_cvttps_epi32(fx); + // another two AVX2 instructions + imm0 = _mm256_add_epi32(imm0, *(v8si*)_pi32_256_0x7f); + imm0 = _mm256_slli_epi32(imm0, 23); + v8sf pow2n = _mm256_castsi256_ps(imm0); + y = _mm256_mul_ps(y, pow2n); + return y; +} + +_PS256_CONST(minus_cephes_DP1, -0.78515625); +_PS256_CONST(minus_cephes_DP2, -2.4187564849853515625e-4); +_PS256_CONST(minus_cephes_DP3, -3.77489497744594108e-8); +_PS256_CONST(sincof_p0, -1.9515295891E-4); +_PS256_CONST(sincof_p1, 8.3321608736E-3); +_PS256_CONST(sincof_p2, -1.6666654611E-1); +_PS256_CONST(coscof_p0, 2.443315711809948E-005); +_PS256_CONST(coscof_p1, -1.388731625493765E-003); +_PS256_CONST(coscof_p2, 4.166664568298827E-002); +_PS256_CONST(cephes_FOPI, 1.27323954473516); // 4 / M_PI + + +/* evaluation of 8 sines at once using AVX intrinsics + + The code is the exact rewriting of the cephes sinf function. + Precision is excellent as long as x < 8192 (I did not bother to + take into account the special handling they have for greater values + -- it does not return garbage for arguments over 8192, though, but + the extra precision is missing). + + Note that it is such that sinf((float)M_PI) = 8.74e-8, which is the + surprising but correct result. 
+ +*/ +inline v8sf sin256_ps(v8sf x) { // any x + v8sf xmm1, xmm2 = _mm256_setzero_ps(), xmm3, sign_bit, y; + v8si imm0, imm2; + + sign_bit = x; + /* take the absolute value */ + x = _mm256_and_ps(x, *(v8sf*)_ps256_inv_sign_mask); + /* extract the sign bit (upper one) */ + sign_bit = _mm256_and_ps(sign_bit, *(v8sf*)_ps256_sign_mask); + + /* scale by 4/Pi */ + y = _mm256_mul_ps(x, *(v8sf*)_ps256_cephes_FOPI); + + /* + Here we start a series of integer operations, which are in the + realm of AVX2. + If we don't have AVX, let's perform them using SSE2 directives + */ + + /* store the integer part of y in mm0 */ + imm2 = _mm256_cvttps_epi32(y); + /* j=(j+1) & (~1) (see the cephes sources) */ + // another two AVX2 instruction + imm2 = _mm256_add_epi32(imm2, *(v8si*)_pi32_256_1); + imm2 = _mm256_and_si256(imm2, *(v8si*)_pi32_256_inv1); + y = _mm256_cvtepi32_ps(imm2); + + /* get the swap sign flag */ + imm0 = _mm256_and_si256(imm2, *(v8si*)_pi32_256_4); + imm0 = _mm256_slli_epi32(imm0, 29); + /* get the polynom selection mask + there is one polynom for 0 <= x <= Pi/4 + and another one for Pi/4 +#include + +namespace at::native { + +using weight_to_int4pack_fn = void (*)(const Tensor&, const Tensor&); +using int4pack_mm_fn = + void (*)(const Tensor&, const Tensor&, const Tensor&, int, const Tensor&); +using int8pack_mm_fn = + void (*)(const Tensor&, const Tensor&, const Tensor&, const Tensor&); +using dyn_quant_pack_4bit_weight_fn = void (*)( + Tensor&, + const Tensor&, + const Tensor&, + const std::optional& bias, + const int64_t, + const int64_t, + const int64_t); +using dyn_quant_matmul_4bit_fn = void (*)( + const Tensor&, + const Tensor&, + const Tensor&, + const int64_t, + const int64_t, + const int64_t, + const int64_t); + +DECLARE_DISPATCH(weight_to_int4pack_fn, weight_to_int4pack_stub) +DECLARE_DISPATCH(int4pack_mm_fn, int4pack_mm_stub) +DECLARE_DISPATCH(int8pack_mm_fn, int8pack_mm_stub) +DECLARE_DISPATCH( + dyn_quant_pack_4bit_weight_fn, + 
dyn_quant_pack_4bit_weight_stub) +DECLARE_DISPATCH(dyn_quant_matmul_4bit_fn, dyn_quant_matmul_4bit_stub) + +} // namespace at::native + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/cpu/mixed_data_type.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/cpu/mixed_data_type.h new file mode 100644 index 0000000000000000000000000000000000000000..dc915e107bfb4dd001d904952f156c410b7ddf54 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/cpu/mixed_data_type.h @@ -0,0 +1,46 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include + +namespace at::native { + +inline ScalarType first_type() { + return ScalarType::Undefined; +} + +template +inline ScalarType first_type(const Tensor& arg, const Args&... parameters) { + return arg.defined() ? arg.scalar_type() : first_type(parameters...); +} + +template +inline bool is_mixed_type(const Tensor& input, const Args&... parameters) { + const auto parameter_type = first_type(parameters...); + return ((parameter_type != ScalarType::Undefined) && + (parameter_type != input.scalar_type())); +} + +// currently on CPU, mixed data type is only supported +// when input is 'BFloat16' or 'Half' and parameters are 'Float' +inline void check_mixed_data_type(const Tensor& input) { + TORCH_CHECK(at::isReducedFloatingType(input.scalar_type()), + "mixed dtype (CPU): all inputs must share same datatype."); +} + +template +inline void check_mixed_data_type(const Tensor& input, const Tensor& parameter, const Args&... 
parameters) { + TORCH_CHECK(!parameter.defined() || parameter.scalar_type() == ScalarType::Float, + "mixed dtype (CPU): expect parameter to have scalar type of Float"); + check_mixed_data_type(input, parameters...); +} + +inline ScalarType param_scalar_type(const Tensor& t, bool is_mixed_type) { + return is_mixed_type ? ScalarType::Float : t.scalar_type(); +} + +} // namespace at::native + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/cpu/utils.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/cpu/utils.h new file mode 100644 index 0000000000000000000000000000000000000000..492d89906e3657bb247f4b888b06e3aa5eeb5fee --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/cpu/utils.h @@ -0,0 +1,225 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include +#include +#include +#include + +#ifdef USE_FBGEMM +C10_DIAGNOSTIC_PUSH_AND_IGNORED_IF_DEFINED("-Wextra-semi") +#include +C10_DIAGNOSTIC_POP() +#endif + +namespace at::native { + +template +inline void _store(T* dst, at::vec::Vectorized src) { + src.store(dst); +} + +inline void _store(at::BFloat16* dst, at::vec::Vectorized src) { + auto res = at::vec::convert_float_bfloat16(src, src); + res.store(dst, at::vec::Vectorized::size()); +} + +inline void _store(at::Half* dst, at::vec::Vectorized src) { + auto res = at::vec::convert_float_half(src, src); + res.store(dst, at::vec::Vectorized::size()); +} + +inline namespace CPU_CAPABILITY { + +template +inline T data_index_init(T offset) { + return offset; +} + +template +inline T data_index_init(T offset, T& x, const T& X, Args&&... 
args) { + offset = data_index_init(offset, std::forward(args)...); + x = offset % X; + return offset / X; +} + +inline bool data_index_step() { + return true; +} + +template +inline bool data_index_step(T& x, const T& X, Args&&... args) { + if (data_index_step(std::forward(args)...)) { + x = ((x + 1) == X) ? 0 : (x + 1); + return x == 0; + } + return false; +} + +// Helper struct for bfloat16/float16 vectorization +// Useful when you need float as immediate dtype or accumulate dtype +using namespace vec; +struct Vec2 { + Vectorized val0, val1; + Vec2(Vectorized v0, Vectorized v1) : val0(v0), val1(v1) {} + Vec2(float v) : val0(v), val1(v) {} + static Vec2 loadu(const BFloat16* ptr) { + auto [v0, v1] = convert_bfloat16_float(Vectorized::loadu(ptr)); + return {v0, v1}; + } + static Vec2 loadu(const Half* ptr) { + auto [v0, v1] = convert_half_float(Vectorized::loadu(ptr)); + return {v0, v1}; + } + static Vec2 loadu(const float* ptr) { + return {Vectorized::loadu(ptr), Vectorized::loadu(ptr + Vectorized::size())}; + } + void store(BFloat16* ptr) const { + Vectorized val = convert_float_bfloat16(val0, val1); + val.store(ptr); + } + void store(Half* ptr) const { + Vectorized val = convert_float_half(val0, val1); + val.store(ptr); + } + void store(float* ptr) const { + val0.store(ptr); + val1.store(ptr + Vectorized::size()); + } +}; +inline Vec2 operator+(const Vec2& a, const Vec2& b) { return {a.val0 + b.val0, a.val1 + b.val1}; } +inline Vec2 operator*(const Vec2& a, const Vec2& b) { return {a.val0 * b.val0, a.val1 * b.val1}; } +inline Vec2 operator-(const Vec2& a, const Vec2& b) { return {a.val0 - b.val0, a.val1 - b.val1}; } +inline Vec2 operator/(const Vec2& a, const Vec2& b) { return {a.val0 / b.val0, a.val1 / b.val1}; } +inline Vec2 maximum(const Vec2& a, const Vec2& b) { return {vec::maximum(a.val0, b.val0), vec::maximum(a.val1, b.val1)}; } +inline Vec2 minimum(const Vec2& a, const Vec2& b) { return {vec::minimum(a.val0, b.val0), vec::minimum(a.val1, b.val1)}; } + 
+template struct VectorizedType { using type = Vectorized; }; +template <> struct VectorizedType { using type = Vec2; }; +template <> struct VectorizedType { using type = Vec2; }; +template using VecType = typename VectorizedType::type; + +// Helper for mixed data type parameter Vec::load +inline std::tuple, Vectorized> load2f(const BFloat16* ptr) { + return convert_bfloat16_float(Vectorized::loadu(ptr)); +} + +inline std::tuple, Vectorized> load2f(const Half* ptr) { + return convert_half_float(Vectorized::loadu(ptr)); +} + +inline std::tuple, Vectorized> load2f(const float* ptr) { + using Vec = Vectorized; + return std::make_tuple(Vec::loadu(ptr), Vec::loadu(ptr + Vec::size())); +} + +inline std::tuple, Vectorized> load2f(const BFloat16* ptr, int64_t count) { + return convert_bfloat16_float(Vectorized::loadu(ptr, count)); +} + +inline std::tuple, Vectorized> load2f(const Half* ptr, int64_t count) { + return convert_half_float(Vectorized::loadu(ptr, count)); +} + +inline std::tuple, Vectorized> load2f(const float* ptr, int64_t count) { + using Vec = Vectorized; + if (count > Vec::size()) { + return std::make_tuple(Vec::loadu(ptr), Vec::loadu(ptr + Vec::size(), count - Vec::size())); + } else { + return std::make_tuple(Vec::loadu(ptr, count), Vec(0)); + } +} + +} // namespace + +namespace utils { + +template +T CeilLog2(const T& x) { + if (x <= 2) { + return 1; + } + // Last set bit is floor(log2(x)), floor + 1 is ceil + // except when x is an exact powers of 2, so subtract 1 first + return static_cast(llvm::findLastSet(static_cast(x) - 1)) + 1; +} + +// matrix transpose: +// src has shape of M by N, with leading dimension of ld_src +// dst has shape of N by M, with leading dimension of ld_dst +template +inline void transpose(int64_t M, int64_t N, const T* src, int64_t ld_src, T* dst, int64_t ld_dst) { + for (int64_t j = 0; j < N; j++) { + for (int64_t i = 0; i < M; i++) { + dst[j * ld_dst + i] = c10::load(&(src[i * ld_src + j])); + } + } +} + +#ifdef USE_FBGEMM 
+template <> +inline void transpose(int64_t M, int64_t N, const float* src, int64_t ld_src, float* dst, int64_t ld_dst) { + TORCH_CHECK(fbgemm::fbgemmSupportedCPU(), "Your CPU does not support FBGEMM."); + fbgemm::transpose_simd(M, N, src, ld_src, dst, ld_dst); +} + +template <> +inline void transpose(int64_t M, int64_t N, const uint16_t* src, int64_t ld_src, uint16_t* dst, int64_t ld_dst) { + TORCH_CHECK(fbgemm::fbgemmSupportedCPU(), "Your CPU does not support FBGEMM."); + fbgemm::transpose_simd(M, N, src, ld_src, dst, ld_dst); +} + +template <> +inline void transpose(int64_t M, int64_t N, const uint8_t* src, int64_t ld_src, uint8_t* dst, int64_t ld_dst) { + TORCH_CHECK(fbgemm::fbgemmSupportedCPU(), "Your CPU does not support FBGEMM."); + fbgemm::transpose_simd(M, N, src, ld_src, dst, ld_dst); +} +#endif + +template +inline void parallel_sparse_csr( + const TensorAccessor& crow_acc, + const int64_t M, + const int64_t nnz, + const F& f) { + TORCH_CHECK(crow_acc.size(0) == M + 1); + + // directly parallel on `M` may lead to load imbalance, + // statically determine thread partition here to average payload + // for each thread. + int num_threads = at::get_num_threads(); + std::vector thread_splits(num_threads + 1, M); + + int64_t thread_averge_payload = std::max((int64_t)1, divup(nnz, num_threads)); + + thread_splits[0] = 0; + int64_t sum = 0; + int64_t t = 1; + for (const auto m : c10::irange(M)) { + int64_t row_start = crow_acc[m]; + int64_t row_end = crow_acc[m + 1]; + sum += row_end - row_start; + if (sum > t * thread_averge_payload) { + thread_splits[t] = m; + t++; + } + } + // need to restore the last index, + // due to rounding error when calculating `thread_averge_payload`. 
+ thread_splits[num_threads] = M; + + at::parallel_for(0, num_threads, 1, [&](int64_t cbegin, int64_t cend) { + int tid = at::get_thread_num(); + int64_t begin = thread_splits[tid]; + int64_t end = thread_splits[tid + 1]; + f(begin, end); + }); +} + +} // namespace utils + +} // namespace at::native + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/cpu/zmath.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/cpu/zmath.h new file mode 100644 index 0000000000000000000000000000000000000000..81b7978bf77b3da919aa64f20eee91de249b21ec --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/cpu/zmath.h @@ -0,0 +1,255 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// Complex number math operations that act as no-ops for other dtypes. +#include +#include +#include + +namespace at::native { +inline namespace CPU_CAPABILITY { + +template +inline VALUE_TYPE zabs (SCALAR_TYPE z) { + return z; +} + +template<> +inline c10::complex zabs > (c10::complex z) { + return c10::complex(std::abs(z)); +} + +template<> +inline float zabs , float> (c10::complex z) { + return std::abs(z); +} + +template<> +inline c10::complex zabs > (c10::complex z) { + return c10::complex(std::abs(z)); +} + +template<> +inline double zabs , double> (c10::complex z) { + return std::abs(z); +} + +// This overload corresponds to non-complex dtypes. +// The function is consistent with its NumPy equivalent +// for non-complex dtypes where `pi` is returned for +// negative real numbers and `0` is returned for 0 or positive +// real numbers. +// Note: `nan` is propagated. 
+template +inline VALUE_TYPE angle_impl (SCALAR_TYPE z) { + if (at::_isnan(z)) { + return z; + } + return z < 0 ? c10::pi : 0; +} + +template<> +inline c10::complex angle_impl > (c10::complex z) { + return c10::complex(std::arg(z), 0.0); +} + +template<> +inline float angle_impl , float> (c10::complex z) { + return std::arg(z); +} + +template<> +inline c10::complex angle_impl > (c10::complex z) { + return c10::complex(std::arg(z), 0.0); +} + +template<> +inline double angle_impl , double> (c10::complex z) { + return std::arg(z); +} + +template +constexpr VALUE_TYPE real_impl (SCALAR_TYPE z) { + return z; //No-Op +} + +template<> +constexpr c10::complex real_impl > (c10::complex z) { + return c10::complex(z.real(), 0.0); +} + +template<> +constexpr float real_impl , float> (c10::complex z) { + return z.real(); +} + +template<> +constexpr c10::complex real_impl > (c10::complex z) { + return c10::complex(z.real(), 0.0); +} + +template<> +constexpr double real_impl , double> (c10::complex z) { + return z.real(); +} + +template +constexpr VALUE_TYPE imag_impl (SCALAR_TYPE /*z*/) { + return 0; +} + +template<> +constexpr c10::complex imag_impl > (c10::complex z) { + return c10::complex(z.imag(), 0.0); +} + +template<> +constexpr float imag_impl , float> (c10::complex z) { + return z.imag(); +} + +template<> +constexpr c10::complex imag_impl > (c10::complex z) { + return c10::complex(z.imag(), 0.0); +} + +template<> +constexpr double imag_impl , double> (c10::complex z) { + return z.imag(); +} + +template +inline TYPE conj_impl (TYPE z) { + return z; //No-Op +} + +template<> +inline c10::complex conj_impl > (c10::complex z) { + return c10::complex{z.real(), -z.imag()}; +} + +template<> +inline c10::complex conj_impl > (c10::complex z) { + return c10::complex(z.real(), -z.imag()); +} + +template<> +inline c10::complex conj_impl > (c10::complex z) { + return c10::complex(z.real(), -z.imag()); +} + +template +inline TYPE ceil_impl (TYPE z) { + return std::ceil(z); +} + 
+template <> +inline c10::complex ceil_impl (c10::complex z) { + return c10::complex(std::ceil(z.real()), std::ceil(z.imag())); +} + +template <> +inline c10::complex ceil_impl (c10::complex z) { + return c10::complex(std::ceil(z.real()), std::ceil(z.imag())); +} + +template +inline c10::complex sgn_impl (c10::complex z) { + if (z == c10::complex(0, 0)) { + return c10::complex(0, 0); + } else { + return z / zabs(z); + } +} + +template +inline TYPE floor_impl (TYPE z) { + return std::floor(z); +} + +template <> +inline c10::complex floor_impl (c10::complex z) { + return c10::complex(std::floor(z.real()), std::floor(z.imag())); +} + +template <> +inline c10::complex floor_impl (c10::complex z) { + return c10::complex(std::floor(z.real()), std::floor(z.imag())); +} + +template +inline TYPE round_impl (TYPE z) { + return std::nearbyint(z); +} + +template <> +inline c10::complex round_impl (c10::complex z) { + return c10::complex(std::nearbyint(z.real()), std::nearbyint(z.imag())); +} + +template <> +inline c10::complex round_impl (c10::complex z) { + return c10::complex(std::nearbyint(z.real()), std::nearbyint(z.imag())); +} + +template +inline TYPE trunc_impl (TYPE z) { + return std::trunc(z); +} + +template <> +inline c10::complex trunc_impl (c10::complex z) { + return c10::complex(std::trunc(z.real()), std::trunc(z.imag())); +} + +template <> +inline c10::complex trunc_impl (c10::complex z) { + return c10::complex(std::trunc(z.real()), std::trunc(z.imag())); +} + +template ::value, int> = 0> +inline TYPE max_impl (TYPE a, TYPE b) { + if (_isnan(a) || _isnan(b)) { + return std::numeric_limits::quiet_NaN(); + } else { + return std::max(a, b); + } +} + +template ::value, int> = 0> +inline TYPE max_impl (TYPE a, TYPE b) { + if (_isnan(a)) { + return a; + } else if (_isnan(b)) { + return b; + } else { + return std::abs(a) > std::abs(b) ? 
a : b; + } +} + +template ::value, int> = 0> +inline TYPE min_impl (TYPE a, TYPE b) { + if (_isnan(a) || _isnan(b)) { + return std::numeric_limits::quiet_NaN(); + } else { + return std::min(a, b); + } +} + +template ::value, int> = 0> +inline TYPE min_impl (TYPE a, TYPE b) { + if (_isnan(a)) { + return a; + } else if (_isnan(b)) { + return b; + } else { + return std::abs(a) < std::abs(b) ? a : b; + } +} + +} // end namespace +} //end at::native + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/cuda/CUDAJitLoops.cuh b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/cuda/CUDAJitLoops.cuh new file mode 100644 index 0000000000000000000000000000000000000000..5540b6143da661f7071a497610c0faa2919513c2 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/cuda/CUDAJitLoops.cuh @@ -0,0 +1,332 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +#include + +// Jiterator functions are guarded behind this macro +#if AT_USE_JITERATOR() + +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include +#include + +#include +#include +#include +#include +#include + +namespace at::native { + +template +// warning : unused parameter when tuple is empty. +constexpr auto tuple_to_array_helper(const Tuple& t [[maybe_unused]], std::index_sequence seq) { + constexpr auto size = seq.size(); + return std::array{static_cast(&std::get(t))...}; +} + +// Helper function convert tuple to std::array +// for passing the arguments to CUDA Kernel +// NOTE: We capture tuple by reference, +// so the pointers in returned array are only valid +// till tuple is alive. 
+template +constexpr auto tuple_to_array(const std::tuple& extra_args) { + constexpr auto tuple_size = sizeof...(Args); + return tuple_to_array_helper(extra_args, std::make_index_sequence{}); +} + +struct JittedVecKernelCache { + // Different kernels are compiled depending on what we're vectorizing up to (1, 2 or 4 elements) + at::cuda::jit::NvrtcFunction vec1; + at::cuda::jit::NvrtcFunction vec2; + at::cuda::jit::NvrtcFunction vec4; + at::cuda::jit::NvrtcFunction vec8; +#ifdef USE_ROCM + at::cuda::jit::NvrtcFunction vec16; +#endif + +}; + +struct JittedKernelVariantCache { + JittedVecKernelCache vec; + at::cuda::jit::NvrtcFunction noncontiguous; + at::cuda::jit::NvrtcFunction dynamic_contiguous; + at::cuda::jit::NvrtcFunction dynamic_noncontiguous; +}; + +inline c10::SmallBuffer pack_kernel_args( + std::initializer_list args, + c10::ArrayRef extra_args) { + c10::SmallBuffer ret(args.size() + extra_args.size()); + std::copy(args.begin(), args.end(), ret.data()); + std::copy(extra_args.begin(), extra_args.end(), ret.data() + args.size()); + return ret; +} + +template +void launch_jitted_unrolled_kernel( + std::mutex &jiterator_mutex, + at::cuda::jit::NvrtcFunction &fn_cache, + const at::cuda::jit::KernelDescriptor &desc, + int64_t N, + array_t data, + inp_calc_t ic, + out_calc_t oc, + loader_t l, + storer_t s, + bool contiguous, + at::cuda::jit::BinaryFuncVariant scalar_pos, + const void* scalar_val, + c10::ArrayRef extra_args) { + + TORCH_INTERNAL_ASSERT(N > 0 && N <= std::numeric_limits::max()); + + int tws = at::cuda::jit::calc_thread_work_size(desc.nInputs, desc.nOutputs, desc.f_inputs_type, desc.result_type); + int bws = tws * num_threads(); + //casting result to int is always safe, intermediate is int64 and won't overflow + const uint32_t grid = (N + bws - 1) / bws; + + if (!fn_cache.function) { + const std::lock_guard lock{jiterator_mutex}; + if (!fn_cache.function) { + constexpr bool dynamic_casting = !std::is_same() || + !std::is_same(); + auto code = 
at::cuda::jit::generate_code( + desc, contiguous, dynamic_casting, scalar_pos, tws); + fn_cache = at::cuda::jit::jit_pwise_function(code, desc.name); + } + } + + auto args = pack_kernel_args({&N, &data, &ic, &oc, &l, &s, scalar_val}, extra_args); + at::cuda::jit::launch_jitted_pwise_function(fn_cache, args.data(), {grid, 1u, 1u}, + {num_threads(), 1u, 1u}); +} + +template +void launch_jitted_vectorized_kernel( + std::mutex &jiterator_mutex, JittedVecKernelCache &fn_cache, + const at::cuda::jit::KernelDescriptor &desc, int64_t N, array_t data, + at::cuda::jit::BinaryFuncVariant scalar_pos, + const void *scalar_val, c10::ArrayRef extra_args) { + TORCH_INTERNAL_ASSERT(N > 0 && N <= std::numeric_limits::max()); + + int tws = at::cuda::jit::calc_thread_work_size(desc.nInputs, desc.nOutputs, desc.f_inputs_type, desc.result_type); + int bws = tws * num_threads(); + // N is still int64_t for the computation, but it's always safe to cast result to int + const uint32_t grid = (N + bws - 1) / bws; + + int vec_size = at::cuda::jit::can_vectorize_up_to( + desc, c10::ArrayRef(data.data(), data.size())); + +#ifndef USE_ROCM + const auto input_size = c10::scalarTypeToTypeMeta(desc.f_inputs_type).itemsize(); + const int optimal_vec_size = 16 / static_cast(input_size); + vec_size = std::min(optimal_vec_size, vec_size); + // Here we purposely omit vec8 for 1-byte data because of a bug in NVCC + // that causes some numerical mismatches with uint8 on sm80 and sm90. + // TODO: Revisit this after CUDA 12.8 update. 
+ if (input_size < 2) { + vec_size = std::min(vec_size, 4); + } +#endif + + // Different kernels are compiled depending on what we're vectorizing up to (1, 2 or 4 elements) + // fn_ptr is set to the appropriate function based on the vec size and GPU used + at::cuda::jit::NvrtcFunction* fn_ptr = nullptr; + +#ifdef USE_ROCM + if (vec_size == 16) { + fn_ptr = &fn_cache.vec16; + } else +#endif + if (vec_size == 8) { + fn_ptr = &fn_cache.vec8; + } else if (vec_size == 4) { + fn_ptr = &fn_cache.vec4; + } else if (vec_size == 2) { + fn_ptr = &fn_cache.vec2; + } else if (vec_size ==1) { + fn_ptr = &fn_cache.vec1; + } else { + TORCH_INTERNAL_ASSERT(false, "unexpected vec_size for jitter vectorized kernel"); + } + + bool vectorized = vec_size > 1; + + if (!fn_ptr->function) { + const std::lock_guard lock{jiterator_mutex}; + if (!fn_ptr->function) { // cache miss! + + // Generates program + auto code = at::cuda::jit::generate_code( + desc, /*contiguous=*/true, /*dynamic_casting=*/false, + scalar_pos, tws, vectorized, vec_size); + std::string kernel_name = vectorized ? desc.name + "_vectorized" + std::to_string(vec_size) : desc.name; + + // Acquires the program + *fn_ptr = at::cuda::jit::jit_pwise_function(code, kernel_name); + } + } + + if (vectorized) { + auto args = pack_kernel_args({&N, &data, scalar_val}, extra_args); + at::cuda::jit::launch_jitted_pwise_function( + *fn_ptr, args.data(), {grid, 1u, 1u}, {num_threads(), 1u, 1u}); + } else { +// NVCC complains about unused variables l and s. +// It should be false positive in most cases, so we suppress the warnings. 
+#pragma nv_diagnostic push +#pragma nv_diag_suppress 177 + auto ic = TrivialOffsetCalculator(); + auto oc = TrivialOffsetCalculator<1>(); + auto l = memory::LoadWithoutCast(); + auto s = memory::StoreWithoutCast(); + + auto args = pack_kernel_args( + {&N, &data, &ic, &oc, &l, &s, scalar_val}, extra_args); + at::cuda::jit::launch_jitted_pwise_function( + *fn_ptr, args.data(), {grid, 1u, 1u}, {num_threads(), 1u, 1u}); +#pragma nv_diagnostic pop + } +} + +template +void jitted_gpu_kernel_generic( + std::mutex &jiterator_mutex, + JittedKernelVariantCache &cache, + const at::cuda::jit::KernelDescriptor &desc, + at::cuda::jit::BinaryFuncVariant scalar_pos, + c10::ArrayRef extra_args, + TensorIteratorBase& iter, + const bool dynamic_casting, + const void *scalar_val) { + TORCH_INTERNAL_ASSERT(iter.can_use_32bit_indexing()); + TORCH_INTERNAL_ASSERT(iter.ninputs() == arity); + TORCH_INTERNAL_ASSERT(iter.noutputs() == 1); + + constexpr int ntensors = arity + 1; + std::array data; + for (auto i : c10::irange(ntensors)) { + data[i] = (char*)iter.data_ptr(i); + } + + int64_t numel = iter.numel(); + bool contiguous = iter.is_contiguous(); + + // Decides which of 4 kernel types to launch + // Variations are: + // - Case 1: no dynamic casting and contiguous + // - Case 2: no dynamic casting and noncontiguous + // - Case 3: dynamic casting and contiguous + // - Case 4: dynamic casting and noncontiguous + // These cases align with the non-jitted CUDALoops.cuh cases in gpu_kernel_impl + + if (!dynamic_casting) { + if (contiguous) { + // Case 1: no dynamic casting and contiguous + launch_jitted_vectorized_kernel( + jiterator_mutex, cache.vec, desc, + numel, data, scalar_pos, scalar_val, extra_args); + return; + } + + // Case 2: no dynamic casting and noncontiguous + auto input_offset_calculator = make_input_offset_calculator(iter); + auto output_offset_calculator = make_output_offset_calculator(iter); + auto loader = memory::LoadWithoutCast(); + auto storer = 
memory::StoreWithoutCast(); + launch_jitted_unrolled_kernel( + jiterator_mutex, cache.noncontiguous, desc, numel, data, + input_offset_calculator, output_offset_calculator, loader, + storer, contiguous, scalar_pos, scalar_val, extra_args); + return; + } + + // Cases 3 and 4 are handled below + // Both require construction of a storer (this asserts 1 output) and one or more loaders + + // Creates store cast to output (the zeroth tensor in TensorIterator) + auto storer = memory::StoreWithCast<1>(iter); + + // Creates load casts from inputs (note offset indexing into the iterators 1...n tensors) + auto loader = memory::LoadWithCast(iter); + + if (contiguous) { + // Case 3: dynamic casting and contiguous + auto input_offset_calculator = TrivialOffsetCalculator(); + auto output_offset_calculator = TrivialOffsetCalculator<1>(); + launch_jitted_unrolled_kernel( + jiterator_mutex, cache.dynamic_contiguous, desc, numel, data, input_offset_calculator, + output_offset_calculator, loader, storer, contiguous, scalar_pos, scalar_val, extra_args); + return; + } + + // Case 4: dynamic casting and noncontiguous + auto input_offset_calculator = make_input_offset_calculator(iter); + auto output_offset_calculator = make_output_offset_calculator(iter); + launch_jitted_unrolled_kernel( + jiterator_mutex, cache.dynamic_noncontiguous, desc, numel, data, input_offset_calculator, + output_offset_calculator, loader, storer, contiguous, scalar_pos, scalar_val, extra_args); +} + +// NOTE: static to reduce chances of name collision. +template < + char const* name, + typename result_type, + typename f_inputs_type, + int arity, + at::cuda::jit::BinaryFuncVariant scalar_pos = + at::cuda::jit::BinaryFuncVariant::NoScalar, + typename... 
ExtraArgs> +static void jitted_gpu_kernel_impl( + TensorIteratorBase& iter, + const std::string &f, + const bool dynamic_casting, + at::opmath_type scalar_val, + const std::tuple& extra_args) { + + // TODO: Memory use can probably be optimized by reusing kernels across GPUs with + // the same compute capability + static std::mutex jiterator_mutex; + static std::vector device_caches(c10::cuda::device_count()); + + constexpr int nInputs = arity; + constexpr int nOutputs = 1; // TODO: Support more than 1 output + static const auto desc = at::cuda::jit::make_kernel_descriptor< + result_type, f_inputs_type, ExtraArgs...>(name, f, nInputs, nOutputs); + + auto &cache = device_caches[iter.device().index()]; + auto extra_args_array = tuple_to_array(extra_args); + return jitted_gpu_kernel_generic( + jiterator_mutex, + cache, + desc, + scalar_pos, + extra_args_array, + iter, + dynamic_casting, + &scalar_val + ); +} + +} // at::native + +#endif // AT_USE_JITERATOR() + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/cuda/CompositeRandomAccessor.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/cuda/CompositeRandomAccessor.h new file mode 100644 index 0000000000000000000000000000000000000000..e149b2d0d0da96e7450baf4429331cf764e8629a --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/cuda/CompositeRandomAccessor.h @@ -0,0 +1,41 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include +#include +#include + +namespace at { namespace native { + +struct TupleInfoCPU { + template + using tuple = thrust::tuple; + + template + static constexpr auto tie(Types&... 
args) noexcept { + return thrust::tie(args...); + } +}; + +template +using CompositeRandomAccessorCPU = + CompositeRandomAccessor; + +template +void swap( + references_holder rh1, + references_holder rh2 +) { + return thrust::swap(rh1.data(), rh2.data()); +} + +template +auto get(references_holder rh) -> decltype(thrust::get(rh.data())) { + return thrust::get(rh.data()); +} + +}} // namespace at::native + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/cuda/Copy.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/cuda/Copy.h new file mode 100644 index 0000000000000000000000000000000000000000..c0ed130b8dfabd55d9c3ca47e98b21271bd2a888 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/cuda/Copy.h @@ -0,0 +1,16 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +namespace at { +struct TensorIteratorBase; + +namespace native { + +void direct_copy_kernel_cuda(TensorIteratorBase& iter); + +} +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/cuda/CuFFTPlanCache.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/cuda/CuFFTPlanCache.h new file mode 100644 index 0000000000000000000000000000000000000000..cfdf448927baab4a590902f06867a79cc18d52ba --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/cuda/CuFFTPlanCache.h @@ -0,0 +1,499 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include +#include +#include +#include + +namespace at::native::detail { + +// Enum representing the FFT type +enum class CuFFTTransformType : int8_t { + C2C, // Complex-to-complex + R2C, // Real-to-complex + C2R, // Complex-to-real +}; + +// This struct is used to let us easily compute hashes of the +// parameters. +// It will be the **key** to the plan cache. +struct CuFFTParams +{ + int64_t signal_ndim_; // between 1 and max_rank, i.e., 1 <= signal_ndim <= 3 + // These include additional batch dimension as well. 
+ int64_t sizes_[max_rank + 1]; + int64_t input_strides_[max_rank + 1]; + int64_t output_strides_[max_rank + 1]; + CuFFTTransformType fft_type_; + ScalarType value_type_; + + CuFFTParams() = default; + + CuFFTParams(IntArrayRef in_strides, IntArrayRef out_strides, + IntArrayRef signal_sizes, CuFFTTransformType fft_type, ScalarType value_type) { + // Padding bits must be zeroed for hashing + memset(this, 0, sizeof(*this)); + signal_ndim_ = signal_sizes.size() - 1; + fft_type_ = fft_type; + value_type_ = value_type; + + TORCH_INTERNAL_ASSERT(in_strides.size() == signal_sizes.size()); + TORCH_INTERNAL_ASSERT(out_strides.size() == signal_sizes.size()); + TORCH_INTERNAL_ASSERT(1 <= signal_ndim_ && signal_ndim_ <= max_rank); + + std::copy(signal_sizes.cbegin(), signal_sizes.cend(), sizes_); + std::copy(in_strides.cbegin(), in_strides.cend(), input_strides_); + std::copy(out_strides.cbegin(), out_strides.cend(), output_strides_); + } +}; + +static_assert(std::is_trivial_v ); + +// Returns true if the transform type has complex input +inline bool cufft_complex_input(CuFFTTransformType type) { + switch (type) { + case CuFFTTransformType::C2C: + case CuFFTTransformType::C2R: + return true; + + case CuFFTTransformType::R2C: + return false; + } + TORCH_INTERNAL_ASSERT(false); +} + +// Returns true if the transform type has complex output +inline bool cufft_complex_output(CuFFTTransformType type) { + switch (type) { + case CuFFTTransformType::C2C: + case CuFFTTransformType::R2C: + return true; + + case CuFFTTransformType::C2R: + return false; + } + TORCH_INTERNAL_ASSERT(false); +} + +// Create transform type enum from bools representing if input and output are complex +inline CuFFTTransformType GetCuFFTTransformType(bool complex_input, bool complex_output) { + if (complex_input && complex_output) { + return CuFFTTransformType::C2C; + } else if (complex_input && !complex_output) { + return CuFFTTransformType::C2R; + } else if (!complex_input && complex_output) { + return 
CuFFTTransformType::R2C; + } + TORCH_INTERNAL_ASSERT(false, "Real to real FFTs are not supported"); +} + + +class CuFFTHandle { + ::cufftHandle handle_; +public: + + CuFFTHandle() { + CUFFT_CHECK(cufftCreate(&handle_)); + } + + ::cufftHandle & get() { return handle_; } + const ::cufftHandle & get() const { return handle_; } + + ~CuFFTHandle() { +// Not using fftDestroy() for rocFFT to work around double freeing of handles +#if !defined(USE_ROCM) + cufftDestroy(handle_); +#endif + } +}; + +__forceinline__ +static bool is_pow_of_two(int64_t x) { + return (x & (x - 1)) == 0; +} + +using cufft_size_type = long long int; + +using CuFFTDimVector = c10::SmallVector; + +// Struct representing a tensor in CuFFT's data layout for planning transforms +// See NOTE [ cuFFT Embedded Strides ]. +struct CuFFTDataLayout { + CuFFTDimVector embed; + cufft_size_type stride, dist; + bool must_clone, simple; +}; + +// Returns a cufft embedding for a contiguous signal of the given size. +// e.g. if the input is cloned, this will be the resulting data layout +// See NOTE [ cuFFT Embedded Strides ]. +inline CuFFTDataLayout cufft_simple_embed(IntArrayRef sizes, bool onesided) { + CuFFTDataLayout layout; + layout.simple = true; + layout.must_clone = false; + layout.embed.assign(sizes.cbegin() + 1, sizes.cend()); + if (onesided) { + layout.embed.back() = sizes.back() / 2 + 1; + } + layout.stride = 1; + layout.dist = 1; + for (const auto& len : layout.embed) { + layout.dist *= len; + } + return layout; +} + +// Convert strides to a CuFFT embedded representation. +// If strides cannot be embedded, returns a simple layout and sets must_clone flag +// See NOTE [ cuFFT Embedded Strides ]. +inline CuFFTDataLayout as_cufft_embed(IntArrayRef strides, IntArrayRef sizes, bool onesided) { + const auto signal_ndim = strides.size() - 1; + CuFFTDataLayout layout; + auto last_stride = strides[signal_ndim]; + layout.must_clone = (last_stride <= 0); + + const auto last_dim_size = onesided ? 
+ sizes[signal_ndim] / 2 + 1 : sizes[signal_ndim]; + const auto signal_numel = c10::multiply_integers(sizes.slice(1, sizes.size() - 2)) * last_dim_size; + + // Zero stides are not allowed, even if the batch size is one. + // If that happens just set a dummy case + if (sizes[0] == 1) { + layout.dist = signal_numel; + } else if (strides[0] == 0) { + layout.must_clone = true; + } else { + layout.dist = strides[0]; + } + + // Calculate the embedding shape, or set must_clone if the strides cannot be embedded + layout.embed.resize(signal_ndim); + for (auto i = signal_ndim - 1; !layout.must_clone && i > 0; i--) { + auto stride = strides[i]; + if (sizes[i] == 1) { + layout.embed[i] = 1; + } else if (stride > 0 && stride % last_stride == 0) { + layout.embed[i] = stride / last_stride; + last_stride = stride; + } else { + layout.must_clone = true; + } + } + + if (layout.must_clone) { + // If the input needs to be cloned, assume it will be contiguous + layout = cufft_simple_embed(sizes, onesided); + layout.must_clone = true; + } else { + layout.embed[0] = sizes[1]; + layout.stride = strides[signal_ndim]; + // Determine if layout represents a simple embedding (contiguous data) + layout.simple = [&] { + for (const auto i : c10::irange(1, signal_ndim - 1)) { + if (layout.embed[i] != sizes[i + 1]) { + return false; + } + } + + return (layout.stride == 1 && layout.dist == signal_numel && + layout.embed.back() == last_dim_size); + }(); + } + return layout; +} + +// This class contains all the information needed to execute a cuFFT plan: +// 1. the plan +// 2. whether to clone input before executing the plan +// 3. the workspace size needed +// +// This class will be the **value** in the plan cache. +// It **owns** the raw plan via a unique_ptr. +class CuFFTConfig { +public: + + // Only move semantics is enough for this class. Although we already use + // unique_ptr for the plan, still remove copy constructor and assignment op so + // we don't accidentally copy and take perf hit. 
+ CuFFTConfig(const CuFFTConfig&) = delete; + CuFFTConfig& operator=(CuFFTConfig const&) = delete; + + explicit CuFFTConfig(const CuFFTParams& params): + CuFFTConfig( + IntArrayRef(params.input_strides_, params.signal_ndim_ + 1), + IntArrayRef(params.output_strides_, params.signal_ndim_ + 1), + IntArrayRef(params.sizes_, params.signal_ndim_ + 1), + params.fft_type_, + params.value_type_) {} + + // For complex types, strides are in units of 2 * element_size(dtype) + // sizes are for the full signal, including batch size and always two-sided + CuFFTConfig(IntArrayRef in_strides, IntArrayRef out_strides, + IntArrayRef sizes, CuFFTTransformType fft_type, ScalarType dtype): + fft_type_(fft_type), value_type_(dtype) { + + // signal sizes (excluding batch dim) + CuFFTDimVector signal_sizes(sizes.begin() + 1, sizes.end()); + + // input batch size + const int64_t batch = sizes[0]; + const int64_t signal_ndim = sizes.size() - 1; + + // Since cuFFT has limited non-unit stride support and various constraints, we + // use a flag to keep track throughout this function to see if we need to + // input = input.clone(); + +#if defined(USE_ROCM) + // clone input to avoid issues with hipfft clobering the input and failing tests + clone_input = true; +#else + clone_input = false; +#endif + + // For half, base strides on the real part of real-to-complex and + // complex-to-real transforms are not supported. Since our output is always + // contiguous, only need to check real-to-complex case. 
+ if (dtype == ScalarType::Half) { + // cuFFT on half requires compute capability of at least SM_53 + auto dev_prop = at::cuda::getCurrentDeviceProperties(); + TORCH_CHECK(dev_prop->major >= 5 && !(dev_prop->major == 5 && dev_prop->minor < 3), + "cuFFT doesn't support signals of half type with compute " + "capability less than SM_53, but the device containing input half " + "tensor only has SM_", dev_prop->major, dev_prop->minor); + for (const auto i : c10::irange(signal_ndim)) { + TORCH_CHECK(is_pow_of_two(sizes[i + 1]), + "cuFFT only supports dimensions whose sizes are powers of two when" + " computing in half precision, but got a signal size of", + sizes.slice(1)); + } + clone_input |= in_strides.back() != 1; + } + + CuFFTDataLayout in_layout; + if (clone_input) { + in_layout = cufft_simple_embed(sizes, fft_type == CuFFTTransformType::C2R); + } else { + in_layout = as_cufft_embed(in_strides, sizes, fft_type == CuFFTTransformType::C2R); + } + auto out_layout = as_cufft_embed(out_strides, sizes, fft_type == CuFFTTransformType::R2C); + TORCH_INTERNAL_ASSERT(!out_layout.must_clone, "Out strides cannot be represented as CuFFT embedding"); + clone_input |= in_layout.must_clone; + + // Check if we can take advantage of simple data layout. + // + // See NOTE [ cuFFT Embedded Strides ] in native/cuda/SpectralOps.cu. + + const bool simple_layout = in_layout.simple && out_layout.simple; + cudaDataType itype, otype, exec_type; + const auto complex_input = cufft_complex_input(fft_type); + const auto complex_output = cufft_complex_output(fft_type); + if (dtype == ScalarType::Float) { + itype = complex_input ? CUDA_C_32F : CUDA_R_32F; + otype = complex_output ? CUDA_C_32F : CUDA_R_32F; + exec_type = CUDA_C_32F; + } else if (dtype == ScalarType::Double) { + itype = complex_input ? CUDA_C_64F : CUDA_R_64F; + otype = complex_output ? CUDA_C_64F : CUDA_R_64F; + exec_type = CUDA_C_64F; + } else if (dtype == ScalarType::Half) { + itype = complex_input ? 
CUDA_C_16F : CUDA_R_16F; + otype = complex_output ? CUDA_C_16F : CUDA_R_16F; + exec_type = CUDA_C_16F; + } else { + TORCH_CHECK(false, "cuFFT doesn't support tensor of type: ", dtype); + } + + // disable auto allocation of workspace to use THC allocator + CUFFT_CHECK(cufftSetAutoAllocation(plan(), /* autoAllocate */ 0)); + + size_t ws_size_t; + + // make plan + if (simple_layout) { + // If with unit-stride, we tell cuFFT by setting inembed == onembed == NULL. + // In such case, cuFFT ignores istride, ostride, idist, and odist + // by assuming istride = ostride = 1. + // + // See NOTE [ cuFFT Embedded Strides ] in native/cuda/SpectralOps.cu. + CUFFT_CHECK(cufftXtMakePlanMany(plan(), signal_ndim, signal_sizes.data(), + /* inembed */ nullptr, /* base_istride */ 1, /* idist */ 1, itype, + /* onembed */ nullptr, /* base_ostride */ 1, /* odist */ 1, otype, + batch, &ws_size_t, exec_type)); + } else { + CUFFT_CHECK(cufftXtMakePlanMany(plan(), signal_ndim, signal_sizes.data(), + in_layout.embed.data(), in_layout.stride, in_layout.dist, itype, + out_layout.embed.data(), out_layout.stride, out_layout.dist, otype, + batch, &ws_size_t, exec_type)); + } + ws_size = static_cast(ws_size_t); + } + + const cufftHandle &plan() const { return plan_ptr.get(); } + + CuFFTTransformType transform_type() const { return fft_type_; } + ScalarType data_type() const { return value_type_; } + bool should_clone_input() const { return clone_input; } + int64_t workspace_size() const { return ws_size; } + +private: + CuFFTHandle plan_ptr; + bool clone_input; + int64_t ws_size; + CuFFTTransformType fft_type_; + ScalarType value_type_; +}; + +#if defined(USE_ROCM) + // Note that the max plan number for CUDA version < 10 has to be 1023 + // due to a bug that fails on the 1024th plan + constexpr int64_t CUFFT_MAX_PLAN_NUM = 1023; + constexpr int64_t CUFFT_DEFAULT_CACHE_SIZE = CUFFT_MAX_PLAN_NUM; +#else + constexpr int64_t CUFFT_MAX_PLAN_NUM = std::numeric_limits::max(); + // The default max cache size 
chosen for CUDA version > 10 is arbitrary. + // This number puts a limit on how big of a plan cache should we maintain by + // default. Users can always configure it via cufft_set_plan_cache_max_size. + constexpr int64_t CUFFT_DEFAULT_CACHE_SIZE = 4096; +#endif +static_assert(0 <= CUFFT_MAX_PLAN_NUM && CUFFT_MAX_PLAN_NUM <= std::numeric_limits::max(), + "CUFFT_MAX_PLAN_NUM not in size_t range"); +static_assert(CUFFT_DEFAULT_CACHE_SIZE >= 0 && CUFFT_DEFAULT_CACHE_SIZE <= CUFFT_MAX_PLAN_NUM, + "CUFFT_DEFAULT_CACHE_SIZE not in [0, CUFFT_MAX_PLAN_NUM] range"); + +// This cache assumes that the mapping from key to value never changes. +// This is **NOT** thread-safe. Please use a mutex when using it **AND** the +// value returned from try_emplace_value. +// The contract of using this cache is that try_emplace_value should only be +// used when the max_size is positive. +class CuFFTParamsLRUCache { +public: + using kv_t = typename std::pair; + using map_t = typename std::unordered_map, + typename std::list::iterator, + ParamsHash, + ParamsEqual>; + using map_kkv_iter_t = typename map_t::iterator; + + + CuFFTParamsLRUCache() : CuFFTParamsLRUCache(CUFFT_DEFAULT_CACHE_SIZE) {} + + CuFFTParamsLRUCache(int64_t max_size) { + _set_max_size(max_size); + } + + CuFFTParamsLRUCache(CuFFTParamsLRUCache&& other) noexcept : + _usage_list(std::move(other._usage_list)), + _cache_map(std::move(other._cache_map)), + _max_size(other._max_size) {} + + CuFFTParamsLRUCache& operator=(CuFFTParamsLRUCache&& other) noexcept { + _usage_list = std::move(other._usage_list); + _cache_map = std::move(other._cache_map); + _max_size = other._max_size; + return *this; + } + + // If key is in this cache, return the cached config. Otherwise, emplace the + // config in this cache and return it. + // Return const reference because CuFFTConfig shouldn't be tampered with once + // created. 
+ const CuFFTConfig &lookup(CuFFTParams params) { + AT_ASSERT(_max_size > 0); + + map_kkv_iter_t map_it = _cache_map.find(params); + // Hit, put to list front + if (map_it != _cache_map.end()) { + _usage_list.splice(_usage_list.begin(), _usage_list, map_it->second); + return map_it->second->second; + } + + // Miss + // remove if needed + if (_usage_list.size() >= _max_size) { + auto last = _usage_list.end(); + last--; + _cache_map.erase(last->first); + _usage_list.pop_back(); + } + + // construct new plan at list front, then insert into _cache_map + _usage_list.emplace_front(std::piecewise_construct, + std::forward_as_tuple(params), + std::forward_as_tuple(params)); + auto kv_it = _usage_list.begin(); + _cache_map.emplace(std::piecewise_construct, + std::forward_as_tuple(kv_it->first), + std::forward_as_tuple(kv_it)); + return kv_it->second; + } + + void clear() { + _cache_map.clear(); + _usage_list.clear(); + } + + void resize(int64_t new_size) { + _set_max_size(new_size); + auto cur_size = _usage_list.size(); + if (cur_size > _max_size) { + auto delete_it = _usage_list.end(); + for (size_t i = 0; i < cur_size - _max_size; i++) { + delete_it--; + _cache_map.erase(delete_it->first); + } + _usage_list.erase(delete_it, _usage_list.end()); + } + } + + size_t size() const { return _cache_map.size(); } + + size_t max_size() const noexcept { return _max_size; } + + std::mutex mutex; + +private: + // Only sets size and does value check. Does not resize the data structures. + void _set_max_size(int64_t new_size) { + // We check that 0 <= new_size <= CUFFT_MAX_PLAN_NUM here. Since + // CUFFT_MAX_PLAN_NUM is of type size_t, we need to do non-negativity check + // first. 
+ TORCH_CHECK(new_size >= 0, + "cuFFT plan cache size must be non-negative, but got ", new_size); + TORCH_CHECK(new_size <= CUFFT_MAX_PLAN_NUM, + "cuFFT plan cache size can not be larger than ", CUFFT_MAX_PLAN_NUM, ", but got ", new_size); + _max_size = static_cast(new_size); + } + + std::list _usage_list; + map_t _cache_map; + size_t _max_size; +}; + +// Since ATen is separated into CPU build and CUDA build, we need a way to call +// these functions only when CUDA is loaded. We use CUDA hooks for this purpose +// (at cuda/detail/CUDAHooks.cpp), and call the hooked functions from the actual +// native function counterparts (at native/SpectralOps.cpp), i.e., +// _cufft_get_plan_cache_max_size, _cufft_set_plan_cache_max_size +// _cufft_get_plan_cache_size, and _cufft_clear_plan_cache. +int64_t cufft_get_plan_cache_max_size_impl(DeviceIndex device_index); +void cufft_set_plan_cache_max_size_impl(DeviceIndex device_index, int64_t max_size); +int64_t cufft_get_plan_cache_size_impl(DeviceIndex device_index); +void cufft_clear_plan_cache_impl(DeviceIndex device_index); + +} // namespace at::native::detail + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/cuda/CuFFTUtils.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/cuda/CuFFTUtils.h new file mode 100644 index 0000000000000000000000000000000000000000..b93f3195de0c88f9c118b3606976530b0e3c6794 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/cuda/CuFFTUtils.h @@ -0,0 +1,80 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include + +#include +#include +#include +#include +#include + +namespace at { namespace native { + +// This means that max dim is 3 + 2 = 5 with batch dimension and possible +// complex dimension +constexpr int max_rank = 3; + +static inline std::string _cudaGetErrorEnum(cufftResult error) +{ + switch (error) + { + case CUFFT_SUCCESS: + return "CUFFT_SUCCESS"; + case CUFFT_INVALID_PLAN: + return "CUFFT_INVALID_PLAN"; + case CUFFT_ALLOC_FAILED: + return "CUFFT_ALLOC_FAILED"; + case CUFFT_INVALID_TYPE: + return "CUFFT_INVALID_TYPE"; + case CUFFT_INVALID_VALUE: + return "CUFFT_INVALID_VALUE"; + case CUFFT_INTERNAL_ERROR: + return "CUFFT_INTERNAL_ERROR"; + case CUFFT_EXEC_FAILED: + return "CUFFT_EXEC_FAILED"; + case CUFFT_SETUP_FAILED: + return "CUFFT_SETUP_FAILED"; + case CUFFT_INVALID_SIZE: + return "CUFFT_INVALID_SIZE"; + case CUFFT_UNALIGNED_DATA: + return "CUFFT_UNALIGNED_DATA"; + case CUFFT_INVALID_DEVICE: + return "CUFFT_INVALID_DEVICE"; + case CUFFT_NO_WORKSPACE: + return "CUFFT_NO_WORKSPACE"; + case CUFFT_NOT_IMPLEMENTED: + return "CUFFT_NOT_IMPLEMENTED"; +#if CUDA_VERSION <= 12090 + case CUFFT_INCOMPLETE_PARAMETER_LIST: + return "CUFFT_INCOMPLETE_PARAMETER_LIST"; + case CUFFT_PARSE_ERROR: + return "CUFFT_PARSE_ERROR"; +#endif +#if !defined(USE_ROCM) && CUDA_VERSION <= 12090 + case CUFFT_LICENSE_ERROR: + return "CUFFT_LICENSE_ERROR"; +#endif + 
case CUFFT_NOT_SUPPORTED: + return "CUFFT_NOT_SUPPORTED"; + default: + std::ostringstream ss; + ss << "unknown error " << error; + return ss.str(); + } +} + +static inline void CUFFT_CHECK(cufftResult error) +{ + if (error != CUFFT_SUCCESS) { + std::ostringstream ss; + ss << "cuFFT error: " << _cudaGetErrorEnum(error); + TORCH_CHECK(false, ss.str()); + } +} + +}} // at::native + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/cuda/DeviceSqrt.cuh b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/cuda/DeviceSqrt.cuh new file mode 100644 index 0000000000000000000000000000000000000000..396cd1e05b434a6bf50b1e559a644fb66403cc0d --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/cuda/DeviceSqrt.cuh @@ -0,0 +1,30 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +namespace at::native { +#if defined(USE_ROCM) +// take these out when ROCm implements std:: math functions +#include +template +static __forceinline__ __device__ scalar_t device_sqrt(scalar_t val); + +template <> +__forceinline__ __device__ float device_sqrt(float val) { + return ::sqrtf(val); +} + +template <> +__forceinline__ __device__ double device_sqrt(double val) { + return ::sqrt(val); +} +#else +template +__forceinline__ __device__ double device_sqrt(scalar_t val) { + return std::sqrt(val); +} +#endif +} // namespace at::native + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/cuda/DistributionTemplates.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/cuda/DistributionTemplates.h new file mode 100644 index 0000000000000000000000000000000000000000..618af6054a30989438f4d265e4ffc587420fca08 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/cuda/DistributionTemplates.h @@ -0,0 +1,702 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace at { +namespace native { +namespace { + +// launch bounds used for kernels utilizing TensorIterator +const uint32_t block_size_bound = 256; +const uint32_t grid_size_bound = 4; +// At the time of writing, there is no curand_* call that increments the offset by more than 4. +// See: https://docs.nvidia.com/cuda/archive/11.8.0/curand/group__DEVICE.html +const uint32_t max_generator_offsets_per_curand_call = 4; + +// utility function that calculates proper philox_offset +// for distributions utilizing TensorIterator. For distributions using +// TensorIterator, we are using a grid-stride loop with each +// thread yielding one element per thread. For the edge of the grid-stride +// loop, if the tensor size is large, the unroll loop will kick in and the float4 +// from curand4 will start getting utilized (for common tensor sizes, we end up +// using rand.x from each thread). 
The philox_offset calculation was changed to +// (number of elements per thread * maximum generator increment per "curand_*" call), which makes +// sure that philox offset increment is not less than the number of randoms used +// in each thread. +std::tuple calc_execution_policy(const int64_t total_elements, const uint32_t unroll_factor) { + const uint64_t numel = static_cast(total_elements); + const uint32_t block_size = block_size_bound; + dim3 dim_block(block_size); + dim3 grid((numel + block_size - 1) / block_size); + uint32_t blocks_per_sm = at::cuda::getCurrentDeviceProperties()->maxThreadsPerMultiProcessor / block_size; + grid.x = std::min( + static_cast(at::cuda::getCurrentDeviceProperties()->multiProcessorCount) * blocks_per_sm, + grid.x); + //number of times random will be generated per thread, to offset philox counter in thc random state + uint64_t counter_offset = ((numel - 1) / (block_size * grid.x * unroll_factor) + 1) * max_generator_offsets_per_curand_call; + return std::make_tuple(counter_offset, grid, dim_block); +} + +// grid stride loop kernel for distributions +template +C10_LAUNCH_BOUNDS_2(block_size_bound, grid_size_bound) +__global__ void distribution_elementwise_grid_stride_kernel(int64_t numel, + PhiloxCudaState philox_args, + const dist_t dist_func, + const transform_t transform_func) { + auto [seed, offset] = at::cuda::philox::unpack(philox_args); + int64_t idx = ((int64_t) blockIdx.x) * blockDim.x + threadIdx.x; + curandStatePhilox4_32_10_t state; + curand_init(seed, idx, offset, &state); + + int64_t rounded_size = ((numel - 1)/(blockDim.x * gridDim.x * unroll_factor)+1) * + blockDim.x * gridDim.x * unroll_factor; + for(int64_t linear_index = idx; linear_index < rounded_size; linear_index += blockDim.x * gridDim.x * unroll_factor) { + auto rand = dist_func(&state); + #pragma unroll + for (int ii = 0; ii < unroll_factor; ii++) { + int64_t li = linear_index + blockDim.x * gridDim.x * ii; + if (li < numel) { + transform_func(li, 
static_cast((&rand.x)[ii])); + } + } + __syncthreads(); + } +} + +/** + * distribution_nullary_kernel is analogous to gpu_kernel in + * ATen/native/cuda/Loops.cuh. Like gpu_kernel, it uses + * TensorIterator to launch a kernel. However, the differences are + * - it launches a grid-stride loop based kernel. The kernel is not + * generic like elementwise_kernel in Loops.cuh and is specialized + * for the distribution kernels here. + * - For big size tensors, we can launch multiple kernels recursively + * (i.e. if (!iter.can_use_32bit_indexing())) and hence, the philox + * offset calculation is done in this function. + * + * FIXME: Can we specialize elementwise_kernel and launch_kernel in Loops.cuh + * to have grid-stride loop kernel and then use that to launch our distribution + * kernels? Note that we need a grid-stride loop kernel because, we found by testing + * that it achieves peak effective bandwidth. + */ +template +void distribution_nullary_kernel(at::TensorIteratorBase& iter, + RNG gen, + const dist_t& dist_func, + const transform_t transform_func) { + const int unroll_factor = sizeof(dist_func_return_t) / sizeof(accscalar_t); + TORCH_CHECK(unroll_factor >= 1, "unroll_factor must be >= 1."); + int64_t numel = iter.numel(); + if (numel == 0) { + return; + } + + auto [counter_offset, grid, block] = calc_execution_policy(numel, unroll_factor); + PhiloxCudaState rng_engine_inputs; + { + // See Note [Acquire lock when using random generators] + std::lock_guard lock(gen->mutex_); + rng_engine_inputs = gen->philox_cuda_state(counter_offset); + } + + if (!iter.can_use_32bit_indexing()) { + for (auto& sub_iter : iter.with_32bit_indexing()) { + distribution_nullary_kernel(sub_iter, + gen, dist_func, transform_func); + } + return; + } + + char* out_data = (char*)iter.data_ptr(0); + + auto stream = at::cuda::getCurrentCUDAStream(); + if (iter.is_trivial_1d()) { + auto strides = iter.get_inner_strides(); + int stride0 = strides[0]; + 
distribution_elementwise_grid_stride_kernel<<>>( + numel, + rng_engine_inputs, + dist_func, + [=]__device__(int idx, accscalar_t rand) { + scalar_t* out = (scalar_t*)&out_data[stride0 * idx]; + *out = transform_func(rand); + } + ); + C10_CUDA_KERNEL_LAUNCH_CHECK(); + } else { + auto offset_calc = make_offset_calculator<1>(iter); + distribution_elementwise_grid_stride_kernel<<>>( + numel, + rng_engine_inputs, + dist_func, + [=]__device__(int idx, accscalar_t rand) { + auto offsets = offset_calc.get(idx); + scalar_t* out = (scalar_t*)&out_data[offsets[0]]; + *out = transform_func(rand); + } + ); + C10_CUDA_KERNEL_LAUNCH_CHECK(); + } +} + +// Binary kernel +template +__global__ void distribution_binary_elementwise_kernel( + int numel, + func_t f, + PhiloxCudaState philox_args, + typename function_traits::result_type *output_data, + const typename function_traits::template arg<1>::type *input_data_1, + const typename function_traits::template arg<2>::type *input_data_2, + inp_offset_calc_t inp_calc, + out_offset_calc_t out_calc) { + auto seeds = at::cuda::philox::unpack(philox_args); + + using input_t_1 = typename function_traits::template arg<1>::type; + using input_t_2 = typename function_traits::template arg<2>::type; + + input_t_1 inputs_1[thread_work_size()]; + input_t_2 inputs_2[thread_work_size()]; + + int base_index = block_work_size() * blockIdx.x; + int remaining = std::min(numel - base_index, block_work_size()); + + curandStatePhilox4_32_10_t state; + curand_init(std::get<0>(seeds), + blockIdx.x * blockDim.x + threadIdx.x, + std::get<1>(seeds), + &state); + + // load data into registers + int thread_idx = threadIdx.x; + #pragma unroll + for (int i = 0; i < thread_work_size(); i++) { + if (thread_idx >= remaining) { + break; + } + int input_idx = thread_idx + base_index; + auto offsets = inp_calc.get(input_idx); + inputs_1[i] = input_data_1[offsets[0]]; + inputs_2[i] = input_data_2[offsets[1]]; + + thread_idx += num_threads(); + } + + // compute and store + 
thread_idx = threadIdx.x; + #pragma unroll + for (int i = 0; i < thread_work_size(); i++) { + if (thread_idx >= remaining) { + break; + } + int input_idx = thread_idx + base_index; + auto offsets = out_calc.get(input_idx); + output_data[offsets[0]] = f(state, inputs_1[i], inputs_2[i]); + thread_idx += num_threads(); + } +} + +template +void distribution_binary_kernel(TensorIteratorBase &iter, PhiloxCudaState philox_args, const func_t &f) { + static_assert(std::is_same_v::template arg<0>::type, curandStatePhilox4_32_10_t&>, "the first argument of functor must be curandStatePhilox4_32_10_t"); + using input_t_1 = typename function_traits::template arg<1>::type; + using input_t_2 = typename function_traits::template arg<2>::type; + using output_t = typename function_traits::result_type; + + if (!iter.can_use_32bit_indexing()) { + for (auto& sub_iter : iter.with_32bit_indexing()) { + distribution_binary_kernel(sub_iter, philox_args, f); + } + return; + } + + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(iter.can_use_32bit_indexing()); + + int64_t numel = iter.numel(); + if (numel == 0) { + return; + } + + output_t *output_data = static_cast(iter.data_ptr(0)); + const input_t_1 *input_data_1 = static_cast(iter.data_ptr(1)); + const input_t_2 *input_data_2 = static_cast(iter.data_ptr(2)); + + int64_t grid = (numel + block_work_size() - 1) / block_work_size(); + auto stream = at::cuda::getCurrentCUDAStream(); + + if (iter.is_contiguous()) { + distribution_binary_elementwise_kernel<<>>( + numel, f, philox_args, output_data, input_data_1, input_data_2, + TrivialOffsetCalculator<2>(), TrivialOffsetCalculator<1>()); + C10_CUDA_KERNEL_LAUNCH_CHECK(); + } else { + distribution_binary_elementwise_kernel<<>>( + numel, f, philox_args, output_data, input_data_1, input_data_2, + make_input_offset_calculator<2>(iter), make_output_offset_calculator(iter)); + C10_CUDA_KERNEL_LAUNCH_CHECK(); + } +} + +} // namespace +}} // namespace at::native + + +namespace at { +namespace native { +namespace 
templates { +namespace cuda { + +// ==================================================== Random ======================================================== + +template +void random_from_to_kernel(TensorIteratorBase& iter, uint64_t range, int64_t base, RNG gen) { +#ifdef FBCODE_CAFFE2 + AT_DISPATCH_V2(iter.dtype(), "random_from_to_kernel_cuda", AT_WRAP([&] { + if (( + std::is_same_v || + std::is_same_v || + std::is_same_v || + std::is_same_v) && range >= 1ULL << 32) + { + // define lambda to mod with range and add base + auto random_func = [range, base] __device__ (uint64_t rand) { + return transformation::uniform_int_from_to(rand, range, base); + }; + distribution_nullary_kernel(iter, + gen, + [] __device__ (curandStatePhilox4_32_10_t* state) -> ulonglong2 { + ulonglong2 ret; + uint4 rand_val = curand4(state); + ret.x = (static_cast(rand_val.x) << 32) | rand_val.y; + ret.y = (static_cast(rand_val.z) << 32) | rand_val.w; + return ret; + }, + random_func); + } else { + auto random_func = [range, base] __device__ (uint32_t rand) { + return transformation::uniform_int_from_to(rand, range, base); + }; + distribution_nullary_kernel(iter, + gen, + [] __device__ (curandStatePhilox4_32_10_t* state) -> uint4 { + return curand4(state); + }, + random_func); + } + }), AT_EXPAND(AT_ALL_TYPES), kBool, kHalf, kBFloat16, AT_EXPAND(AT_BAREBONES_UNSIGNED_TYPES)); +#else + AT_DISPATCH_V2(iter.dtype(), "random_from_to_kernel_cuda", AT_WRAP([&] { + if (range >= 1ULL << 28) // allow approx 5% skew in uniform int generation using % + { + // define lambda to mod with range and add base + auto random_func = [range, base] __device__ (uint64_t rand) { + return transformation::uniform_int_from_to(rand, range, base); + }; + distribution_nullary_kernel(iter, + gen, + [] __device__ (curandStatePhilox4_32_10_t* state) -> ulonglong2 { + ulonglong2 ret; + uint4 rand_val = curand4(state); + ret.x = (static_cast(rand_val.x) << 32) | rand_val.y; + ret.y = (static_cast(rand_val.z) << 32) | rand_val.w; + 
return ret; + }, + random_func); + } else { + auto random_func = [range, base] __device__ (uint32_t rand) { + return transformation::uniform_int_from_to(rand, range, base); + }; + distribution_nullary_kernel(iter, + gen, + [] __device__ (curandStatePhilox4_32_10_t* state) -> uint4 { + return curand4(state); + }, + random_func); + } + }), AT_EXPAND(AT_ALL_TYPES), kBool, kHalf, kBFloat16, AT_EXPAND(AT_BAREBONES_UNSIGNED_TYPES)); +#endif +} + +// This is the special kernel to handle single specific case: +// from(inclusive) = std::numeric_limits::lowest() +// to(exclusive) = None (= std::numeric_limits::max() + 1) +template +void random_full_64_bits_range_kernel(TensorIteratorBase& iter, RNG gen) { + AT_DISPATCH_ALL_TYPES_AND(at::ScalarType::BFloat16, iter.dtype(), "random_full_64_bits_range_kernel_cuda", [&] { + if (std::is_same_v || + std::is_same_v || + std::is_same_v || + std::is_same_v) { + auto random_func = [] __device__ (uint64_t rand) { + return transformation::uniform_int_full_range(rand); + }; + distribution_nullary_kernel(iter, + gen, + [] __device__ (curandStatePhilox4_32_10_t* state) -> ulonglong2 { + ulonglong2 ret; + uint4 rand_val = curand4(state); + ret.x = (static_cast(rand_val.x) << 32) | rand_val.y; + ret.y = (static_cast(rand_val.z) << 32) | rand_val.w; + return ret; + }, + random_func); + } else { + TORCH_CHECK(false, "random_full_64_bits_range_kernel_cuda handles only int64, double, float and bfloat16"); + } + }); +} + +template +struct RandomFromToKernel { + void operator()(TensorIteratorBase& iter, uint64_t range, int64_t base, std::optional gen) { + random_from_to_kernel(iter, range, base, check_generator(gen)); + } + void operator()(TensorIteratorBase& iter, std::optional gen) { + random_full_64_bits_range_kernel(iter, check_generator(gen)); + } +}; + +template +void random_kernel(TensorIteratorBase& iter, RNG gen) { + AT_DISPATCH_ALL_TYPES_AND3(at::ScalarType::Half, at::ScalarType::BFloat16, at::ScalarType::Bool, iter.dtype(), 
"random_kernel_cuda", [&] { + if (std::is_same_v || std::is_same_v) { + auto random_func = [] __device__ (uint64_t rand) { + return transformation::uniform_int(rand); + }; + distribution_nullary_kernel(iter, gen, + [] __device__ (curandStatePhilox4_32_10_t* state) -> ulonglong2 { + ulonglong2 ret; + uint4 rand_val = curand4(state); + ret.x = (static_cast(rand_val.x) << 32) | rand_val.y; + ret.y = (static_cast(rand_val.z) << 32) | rand_val.w; + return ret; + }, + random_func); + } else { + auto random_func = [] __device__ (uint32_t rand) { + return transformation::uniform_int(rand); + }; + distribution_nullary_kernel(iter, + gen, + [] __device__ (curandStatePhilox4_32_10_t* state) -> uint4 { + return curand4(state); + }, + random_func); + } + }); +} + +template +struct RandomKernel { + void operator()(TensorIteratorBase& iter, RNG gen) { + random_kernel(iter, gen); + } +}; + +// ==================================================================================================================== + +template +void uniform_and_transform(TensorIteratorBase& iter, RNG gen, transform_t transform) { + if (std::is_same_v) { + distribution_nullary_kernel(iter, + gen, + [] __device__ (curandStatePhilox4_32_10_t* state) -> double2 { return curand_uniform2_double(state); }, + transform); + } else { + distribution_nullary_kernel(iter, + gen, + [] __device__ (curandStatePhilox4_32_10_t* state) -> float4 { return curand_uniform4(state); }, + transform); + } +} + +template +void normal_and_transform(TensorIteratorBase& iter, RNG gen, transform_t transform) { + if (std::is_same_v) { + distribution_nullary_kernel(iter, + gen, + [] __device__ (curandStatePhilox4_32_10_t* state) -> double2 { return curand_normal2_double(state); }, + transform); + } else { + distribution_nullary_kernel(iter, + gen, + [] __device__ (curandStatePhilox4_32_10_t* state) -> float4 { return curand_normal4(state); }, + transform); + } +} + +// ==================================================== Normal 
======================================================== + +template +void normal_kernel(const TensorBase &self, double mean_, double std_, RNG gen) { + auto iter = TensorIterator::borrowing_nullary_op(self); + AT_DISPATCH_FLOATING_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, iter.dtype(), "normal_kernel_cuda", [&] { + using accscalar_t = at::acc_type; + auto mean = static_cast(mean_); + auto std = static_cast(std_); + // define lambda to multiply std and add mean + auto normal_func = [mean, std] __device__ (accscalar_t rand) { + return static_cast(transformation::normal(rand, mean, std)); + }; + normal_and_transform(iter, gen, normal_func); + }); +} + +template +struct NormalKernel { + void operator()(const TensorBase &self, double mean, double std, std::optional gen) { + normal_kernel(self, mean, std, check_generator(gen)); + } +}; + +// ==================================================== Uniform ======================================================== + +template +void uniform_kernel(TensorIteratorBase& iter, double from_, double to_, RNG gen) { + AT_DISPATCH_FLOATING_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, iter.dtype(), "uniform_kernel_cuda", [&] { + auto from = static_cast(from_); + auto to = static_cast(to_); + using opmath_t = at::opmath_type; + auto range = static_cast(to-from); + // define lambda to reverse bounds, multiply 'range' and add 'from_' + auto uniform_func = [range, from, to] __device__ (opmath_t rand) { + // Compute output value before reversing the bounds + // BEFORE TOUCHING THIS CODE READ: https://github.com/pytorch/pytorch/issues/96947 + auto value = static_cast(rand * range + from); + // reverse the bounds of curand4 from (0, 1] to [0, 1) + // Note that this method is from legacy THCTensorRandom and is likely to give + // you more 0-s, since, the probability of getting 1-s is higher than 0-s and + // by reversing the bounds, we are flipping the probabilities of 1-s and 0-s. 
+ // BEFORE TOUCHING THIS CODE READ: https://github.com/pytorch/pytorch/issues/16706 + auto reverse_bound_value = value == to ? from : value; + return reverse_bound_value; + }; + uniform_and_transform(iter, gen, uniform_func); + }); +} + +template +struct UniformKernel { + void operator()(TensorIteratorBase& iter, double from, double to, std::optional gen) { + uniform_kernel(iter, from, to, check_generator(gen)); + } +}; + +// ================================================== LogNormal ======================================================= + +template +void log_normal_kernel(TensorIteratorBase& iter, double mean_, double std_, RNG gen) { + AT_DISPATCH_FLOATING_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, iter.dtype(), "log_normal_cuda", [&] { + using accscalar_t = at::acc_type; + auto mean = static_cast(mean_); + auto std = static_cast(std_); + // define lambda for log_normal transformation + auto log_normal_func = [mean, std] __device__ (accscalar_t rand) { + return static_cast(transformation::log_normal(transformation::normal(rand, mean, std))); + }; + normal_and_transform(iter, gen, log_normal_func); + }); +} + +template +struct LogNormalKernel { + void operator()(TensorIteratorBase& iter, double mean, double std, std::optional gen) { + log_normal_kernel(iter, mean, std, check_generator(gen)); + } +}; + +// =================================================== Geometric ====================================================== + +template +void geometric_kernel(TensorIteratorBase& iter, double p, RNG gen) { + AT_DISPATCH_ALL_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, iter.dtype(), "geometric_cuda", [&] { + using accscalar_t = at::DiscreteDistributionType::type; + // define lambda for geometric transformation + auto geometric_func = [p] __device__ (accscalar_t rand) { + return static_cast(transformation::geometric(rand, p)); + }; + uniform_and_transform(iter, gen, geometric_func); + }); +} + +template +struct GeometricKernel { + void 
operator()(TensorIteratorBase& iter, double p, std::optional gen) { + geometric_kernel(iter, p, check_generator(gen)); + } +}; + +// ================================================== Exponential ===================================================== + +template +void exponential_kernel(TensorIteratorBase& iter, double lambda_, RNG gen) { + TORCH_CHECK(isFloatingType(iter.dtype()), "Exponential distribution is a continuous probability distribution. dtype must be a floating point but you specified ", iter.dtype()); + AT_DISPATCH_FLOATING_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, iter.dtype(), "exponential_cuda", [&] { + using accscalar_t = at::acc_type; + auto lambda = static_cast(lambda_); + // define lambda for exponential transformation + auto exponential_func = [lambda] __device__ (accscalar_t rand) { + return static_cast(transformation::exponential(rand, lambda)); + }; + uniform_and_transform(iter, gen, exponential_func); + }); +} + +template +struct ExponentialKernel { + void operator()(TensorIteratorBase& iter, double lambda, std::optional gen) { + exponential_kernel(iter, lambda, check_generator(gen)); + } +}; + +// ==================================================== Cauchy ======================================================== + +template +void cauchy_kernel(TensorIteratorBase& iter, double median_, double sigma_, RNG gen) { + AT_DISPATCH_FLOATING_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, iter.dtype(), "cauchy_cuda", [&] { + using accscalar_t = at::acc_type; + auto median = static_cast(median_); + auto sigma = static_cast(sigma_); + // define lambda for cauchy transformation + auto cauchy_func = [median, sigma] __device__ (accscalar_t rand) { + return static_cast(transformation::cauchy(rand, median, sigma)); + }; + uniform_and_transform(iter, gen, cauchy_func); + }); +} + +template +struct CauchyKernel { + void operator()(TensorIteratorBase& iter, double median, double sigma, std::optional gen) { + cauchy_kernel(iter, 
median, sigma, check_generator(gen)); + } +}; + +// ==================================================== Bernoulli ===================================================== + +template +void bernoulli_tensor_cuda_kernel( + const TensorBase &ret, const at::TensorBase &p, + PhiloxCudaState philox_args) { + auto functor = [philox_args] __device__( + int n, scalar_t& v1, scalar_t& v2, scalar_t& v3, scalar_t& v4, + const prob_t& p1, const prob_t& p2, const prob_t& p3, const prob_t& p4) { + auto seeds = at::cuda::philox::unpack(philox_args); + curandStatePhilox4_32_10_t state; + curand_init(std::get<0>(seeds), + blockIdx.x * blockDim.x + threadIdx.x, + std::get<1>(seeds), + &state); + + // See Note [Register spilling in curand call for CUDA < 10] + float4 rand = curand_uniform4(&state); + switch (n) { + case 4: { + CUDA_KERNEL_ASSERT(0 <= p4 && p4 <= 1); + v4 = static_cast(rand.w <= p4); + [[fallthrough]]; + } + case 3: { + CUDA_KERNEL_ASSERT(0 <= p3 && p3 <= 1); + v3 = static_cast(rand.z <= p3); + [[fallthrough]]; + } + case 2: { + CUDA_KERNEL_ASSERT(0 <= p2 && p2 <= 1); + v2 = static_cast(rand.y <= p2); + [[fallthrough]]; + } + case 1: { + CUDA_KERNEL_ASSERT(0 <= p1 && p1 <= 1); + v1 = static_cast(rand.x <= p1); + } + } + }; + // The template argument `4` below indicates that we want to operate on four + // element at each time. See NOTE [ CUDA_tensor_applyN helpers ] for details. 
+ at::cuda::CUDA_tensor_apply2(ret, p, functor); +} + +template +void bernoulli_kernel(const TensorBase &self, const TensorBase &p_, RNG gen) { + PhiloxCudaState rng_engine_inputs; + { + // See Note [Acquire lock when using random generators] + std::lock_guard lock(gen->mutex_); + rng_engine_inputs = gen->philox_cuda_state(10); + } + TORCH_CHECK(at::isFloatingType(p_.scalar_type()), "expected probabilities tensor to have floating type, got ", p_.scalar_type()); + // cast probabilities tensor to double for double `self` tensor, and to `float` for everything else + const auto p_type = self.dtype() == at::kDouble ? at::kDouble : at::kFloat; + auto p_cuda = p_.to(TensorOptions().device(self.device()).dtype(p_type)); + auto p = expand_inplace(self, p_cuda); + AT_DISPATCH_ALL_TYPES_AND3( + at::ScalarType::Half, at::ScalarType::BFloat16, at::ScalarType::Bool, self.scalar_type(), "bernoulli_tensor_cuda_self_", [&] { + if (std::is_same_v) { + return bernoulli_tensor_cuda_kernel(self, *p, rng_engine_inputs); + } else { + return bernoulli_tensor_cuda_kernel(self, *p, rng_engine_inputs); + } + }); +} + +template +void bernoulli_kernel(TensorIteratorBase& iter, double p, RNG gen) { + AT_DISPATCH_ALL_TYPES_AND3( + at::ScalarType::Half, at::ScalarType::BFloat16, at::ScalarType::Bool, iter.dtype(), "bernoulli_scalar_cuda_", [&] { + using accscalar_t = at::DiscreteDistributionType::type; + // define lambda for bernoulli transformation + auto bernoulli_func = [p] __device__ (accscalar_t rand) { + return static_cast(transformation::bernoulli(rand, p)); + }; + uniform_and_transform(iter, gen, bernoulli_func); + }); +} + +template +struct BernoulliKernel { + void operator()(TensorIteratorBase& iter, double p, std::optional gen) { + bernoulli_kernel(iter, p, check_generator(gen)); + } + void operator()(const TensorBase &self, const TensorBase &p_, std::optional gen) { + bernoulli_kernel(self, p_, check_generator(gen)); + } +}; + +}}}} + +#else +#error "This file should not be included 
when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/cuda/Distributions.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/cuda/Distributions.h new file mode 100644 index 0000000000000000000000000000000000000000..11deb513c87a8beb6e44a7ff550b4c39918ecc51 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/cuda/Distributions.h @@ -0,0 +1,30 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +namespace at { +struct CUDAGeneratorImpl; +struct TensorIteratorBase; +class TensorBase; + +namespace native { + +void launch_poisson_cuda_kernel( + const TensorBase &ret, const TensorBase &lambda, CUDAGeneratorImpl *gen); + +void launch_gamma_kernel( + const TensorBase &ret, const TensorBase &alpha, CUDAGeneratorImpl *gen); + +void launch_binomial_cuda_kernel( + TensorIteratorBase &iter, CUDAGeneratorImpl *gen); + +void launch_dirichlet_kernel(TensorIteratorBase &iter); + +void launch_standard_gamma_grad_kernel(TensorIteratorBase &iter); + +void launch_dirichlet_grad_kernel(TensorIteratorBase &iter); + +}} // namespace at::native + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/cuda/EmbeddingBackwardKernel.cuh b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/cuda/EmbeddingBackwardKernel.cuh new file mode 100644 index 0000000000000000000000000000000000000000..d155eb420ebbfdff544f613ca3d7f3dc5bd58d88 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/cuda/EmbeddingBackwardKernel.cuh @@ -0,0 +1,26 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +#include +#include +#include +#include + +namespace at::native { + +Tensor embedding_backward_cuda_kernel( + const Tensor &grad, + const Tensor &orig_indices, + const Tensor &sorted_indices, + const Tensor &count, + int64_t num_weights, + int padding_idx = -1, + bool mode_mean = false, + const Tensor &offset2bag = Tensor(), + const Tensor &bag_size = Tensor(), + const Tensor &per_sample_weights = Tensor()); + +} // namespace at::native + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/cuda/ForeachFunctors.cuh b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/cuda/ForeachFunctors.cuh new file mode 100644 index 0000000000000000000000000000000000000000..461ea60b8fda7b89edaa0d04c39f4a9c046bcb87 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/cuda/ForeachFunctors.cuh @@ -0,0 +1,743 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +#include +#include +#include +#include + +namespace at::native { + +namespace { + +// TODO(crcrpar): Handle version bump in codegen. +// rel: +// https://github.com/pytorch/pytorch/blob/9cf84347767c8abb8feba18a9a1baba321eeb8b9/tools/autograd/gen_inplace_or_view_type.py#L481-L482 +inline void increment_version(TensorList tensors) { + for (const auto& t : tensors) { + t.unsafeGetTensorImpl()->bump_version(); + } +} + +// Initializes args and checks if all args are aligned +template +__device__ bool init_args( + T** args, + TensorListMetadata& tl, + const int64_t chunk_idx, + const int64_t chunk_size, + const int64_t tensor_loc) { + bool all_aligned = true; + for (int i = 0; i < depth; i++) { + args[i] = (T*)tl.addresses[i][tensor_loc]; + args[i] += chunk_idx * chunk_size; + + if (!is_aligned(args[i])) { + all_aligned = false; + } + } + return all_aligned; +} + +// Initializes args and checks if all args are aligned +template +__device__ bool init_args( + T** args, + TensorListScalarListMetadata& tl, + const int64_t chunk_idx, + const int64_t chunk_size, + const int64_t tensor_loc) { + bool all_aligned = true; + for (int i = 0; i < depth; i++) { + args[i] = (T*)tl.addresses[i][tensor_loc]; + args[i] += chunk_idx * chunk_size; + + if (!is_aligned(args[i])) { + all_aligned = false; + } + } + return all_aligned; +} + +template 
+__device__ bool init_args( + T** args, + FusedOptimizerTensorListMetadata& tl, + const int64_t chunk_idx, + const int64_t chunk_size, + const int64_t tensor_loc) { + bool all_aligned = true; + for (int i = 0; i < depth; i++) { + args[i] = (T*)tl.addresses[i][tensor_loc]; + args[i] += chunk_idx * chunk_size; + + if (!is_aligned(args[i])) { + all_aligned = false; + } + } + return all_aligned; +} + +template +__device__ void load_args( + T r_args[][kILP], + T** args, + const int64_t i_start, + const int64_t chunk_size, + const int64_t n) { +#pragma unroll + for (int ii = 0; ii < kILP; ii++) { + const auto i = i_start + threadIdx.x + ii * blockDim.x; + for (int r_index = 0; r_index < depth; r_index++) { + r_args[r_index][ii] = 0; + if (i < n && i < chunk_size) { + r_args[r_index][ii] = args[r_index][i]; + } + } + } +} + +template +__device__ void store_args( + T* dst, + T* src, + const int64_t i_start, + const int64_t chunk_size, + const int64_t n) { +#pragma unroll + for (int ii = 0; ii < kILP; ii++) { + const int64_t i = i_start + threadIdx.x + ii * blockDim.x; + if (i < n && i < chunk_size) + dst[i] = src[ii]; + } +} + +template +__device__ __forceinline__ void binary_op_scalar( + T r_args[][kILP], + T** args, + opmath_t scalar, + const int64_t n, + const int64_t chunk_size, + const bool all_aligned, + Op op) { + // to make things simple, we put aligned case in a different code path + if (n % kILP == 0 && chunk_size % kILP == 0 && all_aligned) { + for (int64_t i_start = threadIdx.x; + i_start * kILP < n && i_start * kILP < chunk_size; + i_start += blockDim.x) { + // load + load_store(r_args[0], args[0], 0, i_start); +#pragma unroll + for (int ii = 0; ii < kILP; ii++) { + r_args[0][ii] = static_cast( + op(static_cast(r_args[0][ii]), + static_cast(scalar))); + } + // store + load_store(args[res_arg_index], r_args[0], i_start, 0); + } + } else { + for (int64_t i_start = 0; i_start < n && i_start < chunk_size; + i_start += blockDim.x * kILP) { + // Regardless if depth 
is 1 (for inplace) or 2 (for out of place), r_args + // has depth 1 + load_args<1>(r_args, args, i_start, chunk_size, n); +#pragma unroll + for (int ii = 0; ii < kILP; ii++) { + r_args[0][ii] = static_cast( + op(static_cast(r_args[0][ii]), + static_cast(scalar))); + } + store_args(args[res_arg_index], r_args[0], i_start, chunk_size, n); + } + } +} + +template +__device__ __forceinline__ void pointwise_op_scalar( + T r_args[][kILP], + T** args, + opmath_t scalar, + const int64_t n, + const int64_t chunk_size, + const bool all_aligned, + Op op) { + // to make things simple, we put aligned case in a different code path + if (n % kILP == 0 && chunk_size % kILP == 0 && all_aligned) { + for (int64_t i_start = threadIdx.x; + i_start * kILP < n && i_start * kILP < chunk_size; + i_start += blockDim.x) { + // load + load_store(r_args[0], args[0], 0, i_start); + load_store(r_args[1], args[1], 0, i_start); + load_store(r_args[2], args[2], 0, i_start); +#pragma unroll + for (int ii = 0; ii < kILP; ii++) { + r_args[0][ii] = static_cast( + static_cast(r_args[0][ii]) + + scalar * + op(static_cast(r_args[1][ii]), + static_cast(r_args[2][ii]))); + } + // store + load_store(args[res_arg_index], r_args[0], i_start, 0); + } + } else { + for (int64_t i_start = 0; i_start < n && i_start < chunk_size; + i_start += blockDim.x * kILP) { + // Regardless if depth is 3 (for inplace) or 4 (for out of place), r_args + // has depth 3 + load_args<3>(r_args, args, i_start, chunk_size, n); +#pragma unroll + for (int ii = 0; ii < kILP; ii++) { + r_args[0][ii] = static_cast( + static_cast(r_args[0][ii]) + + scalar * + op(static_cast(r_args[1][ii]), + static_cast(r_args[2][ii]))); + } + store_args(args[res_arg_index], r_args[0], i_start, chunk_size, n); + } + } +} + +// +// Binary Functors +// +template +struct BinaryOpScalarFunctor { + using opmath_t = at::opmath_type; + template + __device__ __forceinline__ void operator()( + int64_t chunk_size, + TensorListMetadata& tl, + Op op, + opmath_t scalar) { 
+ const int tensor_loc = tl.block_to_tensor[blockIdx.x]; + const int chunk_idx = tl.block_to_chunk[blockIdx.x]; + auto n = tl.numel_for_tensor[tensor_loc]; + + T* args[depth]; + const bool all_aligned = + init_args(args, tl, chunk_idx, chunk_size, tensor_loc); + n -= chunk_idx * chunk_size; + T r_args[r_args_depth][kILP]; + + binary_op_scalar( + r_args, args, scalar, n, chunk_size, all_aligned, op); + } +}; + +template +struct BinaryOpScalarListFunctor { + using opmath_t = at::opmath_type; + template + __device__ __forceinline__ void operator()( + int64_t chunk_size, + TensorListScalarListMetadata& tl, + Op op) { + const auto tensor_loc = tl.block_to_tensor[blockIdx.x]; + const auto chunk_idx = tl.block_to_chunk[blockIdx.x]; + auto n = tl.numel_for_tensor[tensor_loc]; + + T* args[depth]; + const bool all_aligned = + init_args(args, tl, chunk_idx, chunk_size, tensor_loc); + opmath_t scalar = tl.scalar_vals[tensor_loc]; + n -= chunk_idx * chunk_size; + T r_args[r_args_depth][kILP]; + + binary_op_scalar( + r_args, args, scalar, n, chunk_size, all_aligned, op); + } +}; + +template +struct BinaryOpListAlphaFunctor { + using opmath_t = at::opmath_type; + template + __device__ __forceinline__ void operator()( + int64_t chunk_size, + TensorListMetadata& tl, + Op op, + opmath_t alpha) { + const auto tensor_loc = tl.block_to_tensor[blockIdx.x]; + const auto chunk_idx = tl.block_to_chunk[blockIdx.x]; + auto n = tl.numel_for_tensor[tensor_loc]; + + T* args[depth]; + const bool all_aligned = + init_args(args, tl, chunk_idx, chunk_size, tensor_loc); + n -= chunk_idx * chunk_size; + T r_args[r_args_depth][kILP]; + + // to make things simple, we put aligned case in a different code path + if (n % kILP == 0 && chunk_size % kILP == 0 && all_aligned) { + for (int64_t i_start = threadIdx.x; + i_start * kILP < n && i_start * kILP < chunk_size; + i_start += blockDim.x) { + // load + load_store(r_args[0], args[0], 0, i_start); + load_store(r_args[1], args[1], 0, i_start); +#pragma unroll 
+ for (int ii = 0; ii < kILP; ii++) { + r_args[0][ii] = static_cast( + op(static_cast(r_args[0][ii]), + alpha * static_cast(r_args[1][ii]))); + } + // store + load_store(args[res_arg_index], r_args[0], i_start, 0); + } + } else { + for (int64_t i_start = 0; i_start < n && i_start < chunk_size; + i_start += blockDim.x * kILP) { + load_args(r_args, args, i_start, chunk_size, n); +#pragma unroll + for (int ii = 0; ii < kILP; ii++) { + r_args[0][ii] = static_cast( + op(static_cast(r_args[0][ii]), + alpha * static_cast(r_args[1][ii]))); + } + store_args(args[res_arg_index], r_args[0], i_start, chunk_size, n); + } + } + } +}; + +template +struct BinaryOpScalarTensorFunctor { + using opmath_t = at::opmath_type; + template + __device__ __forceinline__ void operator()( + int64_t chunk_size, + TensorListMetadata& tl, + Op op, + T* scalar, + opmath_t alpha) { + const int tensor_loc = tl.block_to_tensor[blockIdx.x]; + const int chunk_idx = tl.block_to_chunk[blockIdx.x]; + auto n = tl.numel_for_tensor[tensor_loc]; + + T* args[depth]; + const bool all_aligned = + init_args(args, tl, chunk_idx, chunk_size, tensor_loc); + n -= chunk_idx * chunk_size; + T r_args[r_args_depth][kILP]; + + // to make things simple, we put aligned case in a different code path + if (n % kILP == 0 && chunk_size % kILP == 0 && all_aligned) { + for (int64_t i_start = threadIdx.x; + i_start * kILP < n && i_start * kILP < chunk_size; + i_start += blockDim.x) { + // load + load_store(r_args[0], args[0], 0, i_start); +#pragma unroll + for (int ii = 0; ii < kILP; ii++) { + r_args[0][ii] = static_cast(op( + static_cast(r_args[0][ii]), + static_cast(alpha) * static_cast(*scalar))); + } + // store + load_store(args[res_arg_index], r_args[0], i_start, 0); + } + } else { + for (int64_t i_start = 0; i_start < n && i_start < chunk_size; + i_start += blockDim.x * kILP) { + // Regardless if depth is 1 (for inplace) or 2 (for out of place), + // r_args has depth 1 + load_args<1>(r_args, args, i_start, chunk_size, n); 
+#pragma unroll + for (int ii = 0; ii < kILP; ii++) { + r_args[0][ii] = static_cast(op( + static_cast(r_args[0][ii]), + static_cast(alpha) * static_cast(*scalar))); + } + store_args(args[res_arg_index], r_args[0], i_start, chunk_size, n); + } + } + } +}; + +// +// Unary Functors +// + +template +struct ZeroFunctor { + __device__ __forceinline__ void operator()( + int64_t chunk_size, + TensorListMetadata<1>& tl) { + const auto tensor_loc = tl.block_to_tensor[blockIdx.x]; + const auto chunk_idx = tl.block_to_chunk[blockIdx.x]; + auto n = tl.numel_for_tensor[tensor_loc]; + + T* args[depth]; + const auto all_aligned = + init_args(args, tl, chunk_idx, chunk_size, tensor_loc); + n -= chunk_idx * chunk_size; + T r_args[r_args_depth][kILP]; + + // to make things simple, we put aligned case in a different code path + if (n % kILP == 0 && chunk_size % kILP == 0 && all_aligned) { + for (int64_t i_start = threadIdx.x; + i_start * kILP < n && i_start * kILP < chunk_size; + i_start += blockDim.x) { +#pragma unroll + for (int ii = 0; ii < kILP; ii++) { + r_args[0][ii] = 0; + } + // store + load_store(args[0], r_args[0], i_start, 0); + } + } else { + for (int64_t i_start = 0; i_start < n && i_start < chunk_size; + i_start += blockDim.x * kILP) { +#pragma unroll + for (int ii = 0; ii < kILP; ii++) { + r_args[0][ii] = 0; + } + store_args(args[res_arg_index], r_args[0], i_start, chunk_size, n); + } + } + } +}; + +template +struct UnaryOpFunctor { + using opmath_t = at::opmath_type; + template + __device__ __forceinline__ void operator()( + int64_t chunk_size, + TensorListMetadata& tl, + Op op) { + const auto tensor_loc = tl.block_to_tensor[blockIdx.x]; + const auto chunk_idx = tl.block_to_chunk[blockIdx.x]; + auto n = tl.numel_for_tensor[tensor_loc]; + + T* args[depth]; + bool all_aligned = + init_args(args, tl, chunk_idx, chunk_size, tensor_loc); + n -= chunk_idx * chunk_size; + T r_args[r_args_depth][kILP]; + + // to make things simple, we put aligned case in a different code path 
+ if (n % kILP == 0 && chunk_size % kILP == 0 && all_aligned) { + for (int64_t i_start = threadIdx.x; + i_start * kILP < n && i_start * kILP < chunk_size; + i_start += blockDim.x) { + // load + load_store(r_args[0], args[0], 0, i_start); +#pragma unroll + for (int ii = 0; ii < kILP; ii++) { + r_args[0][ii] = + static_cast(op(static_cast(r_args[0][ii]))); + } + // store + load_store(args[res_arg_index], r_args[0], i_start, 0); + } + } else { + for (int64_t i_start = 0; i_start < n && i_start < chunk_size; + i_start += blockDim.x * kILP) { + load_args(r_args, args, i_start, chunk_size, n); +#pragma unroll + for (int ii = 0; ii < kILP; ii++) { + r_args[0][ii] = + static_cast(op(static_cast(r_args[0][ii]))); + } + store_args(args[res_arg_index], r_args[0], i_start, chunk_size, n); + } + } + } +}; + +// +// Pointwise Functors +// + +template +struct PointwiseOpScalarFunctor { + using opmath_t = at::opmath_type; + template + __device__ __forceinline__ void operator()( + int64_t chunk_size, + TensorListMetadata& tl, + Op op, + opmath_t scalar) { + const auto tensor_loc = tl.block_to_tensor[blockIdx.x]; + const auto chunk_idx = tl.block_to_chunk[blockIdx.x]; + auto n = tl.numel_for_tensor[tensor_loc]; + + T* args[depth]; + const bool all_aligned = + init_args(args, tl, chunk_idx, chunk_size, tensor_loc); + n -= chunk_idx * chunk_size; + T r_args[r_args_depth][kILP]; + + pointwise_op_scalar( + r_args, args, scalar, n, chunk_size, all_aligned, op); + } +}; + +template +struct PointwiseOpScalarListFunctor { + using opmath_t = at::opmath_type; + template + __device__ __forceinline__ void operator()( + int64_t chunk_size, + TensorListScalarListMetadata& tl, + Op op) { + const auto tensor_loc = tl.block_to_tensor[blockIdx.x]; + const auto chunk_idx = tl.block_to_chunk[blockIdx.x]; + auto n = tl.numel_for_tensor[tensor_loc]; + + T* args[depth]; + const bool all_aligned = + init_args(args, tl, chunk_idx, chunk_size, tensor_loc); + opmath_t scalar = tl.scalar_vals[tensor_loc]; + n 
-= chunk_idx * chunk_size; + T r_args[r_args_depth][kILP]; + + pointwise_op_scalar( + r_args, args, scalar, n, chunk_size, all_aligned, op); + } +}; + +template +struct PointwiseOpListFunctor { + using opmath_t = at::opmath_type; + template + __device__ __forceinline__ void operator()( + int64_t chunk_size, + TensorListMetadata& tl, + Op op) { + const auto tensor_loc = tl.block_to_tensor[blockIdx.x]; + const auto chunk_idx = tl.block_to_chunk[blockIdx.x]; + auto n = tl.numel_for_tensor[tensor_loc]; + + T* args[depth]; + const bool all_aligned = + init_args(args, tl, chunk_idx, chunk_size, tensor_loc); + n -= chunk_idx * chunk_size; + T r_args[depth - 1][kILP]; + + // to make things simple, we put aligned case in a different code path + if (n % kILP == 0 && chunk_size % kILP == 0 && all_aligned) { + for (int64_t i_start = threadIdx.x; + i_start * kILP < n && i_start * kILP < chunk_size; + i_start += blockDim.x) { + // load + load_store(r_args[0], args[0], 0, i_start); + load_store(r_args[1], args[1], 0, i_start); +#pragma unroll + for (int ii = 0; ii < kILP; ii++) { + r_args[0][ii] = static_cast( + op(static_cast(r_args[0][ii]), + static_cast(r_args[1][ii]))); + } + // store + load_store(args[2], r_args[0], i_start, 0); + } + } else { + for (int64_t i_start = 0; i_start < n && i_start < chunk_size; + i_start += blockDim.x * kILP) { + load_args(r_args, args, i_start, chunk_size, n); +#pragma unroll + for (int ii = 0; ii < kILP; ii++) { + r_args[0][ii] = static_cast( + op(static_cast(r_args[0][ii]), + static_cast(r_args[1][ii]))); + } + store_args(args[2], r_args[0], i_start, chunk_size, n); + } + } + } +}; + +template +struct TernaryOpListFunctor { + using opmath_t = at::opmath_type; + template + __device__ __forceinline__ void operator()( + int64_t chunk_size, + TensorListMetadata& tl, + Op op) { + static_assert(depth == 3 || depth == 4, ""); + static_assert(depth >= r_args_depth, ""); + static_assert(res_arg_index == depth - 1 || res_arg_index == 0, ""); + const 
auto tensor_loc = tl.block_to_tensor[blockIdx.x]; + const auto chunk_idx = tl.block_to_chunk[blockIdx.x]; + auto n = tl.numel_for_tensor[tensor_loc]; + + T* args[depth]; + const bool all_aligned = + init_args(args, tl, chunk_idx, chunk_size, tensor_loc); + n -= chunk_idx * chunk_size; + T r_args[r_args_depth][kILP]; + + if (n % kILP == 0 && chunk_size % kILP == 0 && all_aligned) { + for (int64_t i_start = threadIdx.x; + i_start * kILP < n && i_start * kILP < chunk_size; + i_start += blockDim.x) { + load_store(r_args[0], args[0], 0, i_start); + load_store(r_args[1], args[1], 0, i_start); + load_store(r_args[2], args[2], 0, i_start); +#pragma unroll + for (int ii = 0; ii < kILP; ii++) { + r_args[0][ii] = + op(static_cast(r_args[0][ii]), + static_cast(r_args[1][ii]), + static_cast(r_args[2][ii])); + } + load_store(args[res_arg_index], r_args[0], i_start, 0); + } + } else { + for (int64_t i_start = 0; i_start < n && i_start < chunk_size; + i_start += blockDim.x * kILP) { + load_args(r_args, args, i_start, chunk_size, n); +#pragma unroll + for (int ii = 0; ii < kILP; ii++) { + r_args[0][ii] = + op(static_cast(r_args[0][ii]), + static_cast(r_args[1][ii]), + static_cast(r_args[2][ii])); + } + store_args(args[res_arg_index], r_args[0], i_start, chunk_size, n); + } + } + } +}; + +template +struct TernaryOpScalarFunctor { + using opmath_t = at::opmath_type; + template + __device__ __forceinline__ void operator()( + int64_t chunk_size, + TensorListMetadata& tl, + Op op, + opmath_t alpha) { + static_assert(depth == 2 || depth == 3, ""); + static_assert(depth >= r_args_depth, ""); + static_assert(res_arg_index == depth - 1 || res_arg_index == 0, ""); + const auto tensor_loc = tl.block_to_tensor[blockIdx.x]; + const auto chunk_idx = tl.block_to_chunk[blockIdx.x]; + auto n = tl.numel_for_tensor[tensor_loc]; + + T* args[depth]; + const bool all_aligned = + init_args(args, tl, chunk_idx, chunk_size, tensor_loc); + n -= chunk_idx * chunk_size; + T r_args[r_args_depth][kILP]; + + // 
to make things simple, we put aligned case in a different code path + if (n % kILP == 0 && chunk_size % kILP == 0 && all_aligned) { + for (int64_t i_start = threadIdx.x; + i_start * kILP < n && i_start * kILP < chunk_size; + i_start += blockDim.x) { + // load + load_store(r_args[0], args[0], 0, i_start); + load_store(r_args[1], args[1], 0, i_start); +#pragma unroll + for (int ii = 0; ii < kILP; ii++) { + r_args[0][ii] = + op(static_cast(r_args[0][ii]), + static_cast(r_args[1][ii]), + alpha); + } + // store + load_store(args[res_arg_index], r_args[0], i_start, 0); + } + } else { + for (int64_t i_start = 0; i_start < n && i_start < chunk_size; + i_start += blockDim.x * kILP) { + load_args(r_args, args, i_start, chunk_size, n); +#pragma unroll + for (int ii = 0; ii < kILP; ii++) { + r_args[0][ii] = + op(static_cast(r_args[0][ii]), + static_cast(r_args[1][ii]), + alpha); + } + store_args(args[res_arg_index], r_args[0], i_start, chunk_size, n); + } + } + } +}; + +template +struct TernaryOpScalarListFunctor { + using opmath_t = at::opmath_type; + template + __device__ __forceinline__ void operator()( + int64_t chunk_size, + TensorListScalarListMetadata& tl, + Op op) { + static_assert(depth == 2 || depth == 3, ""); + static_assert(depth >= r_args_depth, ""); + static_assert(res_arg_index == depth - 1 || res_arg_index == 0, ""); + const auto tensor_loc = tl.block_to_tensor[blockIdx.x]; + const auto chunk_idx = tl.block_to_chunk[blockIdx.x]; + auto n = tl.numel_for_tensor[tensor_loc]; + + T* args[depth]; + const bool all_aligned = + init_args(args, tl, chunk_idx, chunk_size, tensor_loc); + n -= chunk_idx * chunk_size; + T r_args[r_args_depth][kILP]; + const opmath_t scalar = tl.scalar_vals[tensor_loc]; + + // to make things simple, we put aligned case in a different code path + if (n % kILP == 0 && chunk_size % kILP == 0 && all_aligned) { + for (int64_t i_start = threadIdx.x; + i_start * kILP < n && i_start * kILP < chunk_size; + i_start += blockDim.x) { + // load + 
load_store(r_args[0], args[0], 0, i_start); + load_store(r_args[1], args[1], 0, i_start); +#pragma unroll + for (int ii = 0; ii < kILP; ii++) { + r_args[0][ii] = + op(static_cast(r_args[0][ii]), + static_cast(r_args[1][ii]), + scalar); + } + // store + load_store(args[res_arg_index], r_args[0], i_start, 0); + } + } else { + for (int64_t i_start = 0; i_start < n && i_start < chunk_size; + i_start += blockDim.x * kILP) { + load_args(r_args, args, i_start, chunk_size, n); +#pragma unroll + for (int ii = 0; ii < kILP; ii++) { + r_args[0][ii] = + op(static_cast(r_args[0][ii]), + static_cast(r_args[1][ii]), + scalar); + } + store_args(args[res_arg_index], r_args[0], i_start, chunk_size, n); + } + } + } +}; + +template +struct power_functor { + C10_DEVICE T operator()(const T& a, const T& b) const { + return at::native::pow_(a, b); + } +}; + +template +struct reverse_power_functor { + C10_DEVICE T operator()(const T& a, const T& b) const { + return at::native::pow_(b, a); + } +}; + +} // namespace +} // namespace at::native + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/cuda/GridSampler.cuh b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/cuda/GridSampler.cuh new file mode 100644 index 0000000000000000000000000000000000000000..b7fca1a7e66a393118ce90102a8fae071e2f7fb3 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/cuda/GridSampler.cuh @@ -0,0 +1,326 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +#include +#include + +namespace at::native { + +using detail::GridSamplerInterpolation; +using detail::GridSamplerPadding; + +// Unnormalizes a coordinate from the -1 to +1 scale to its pixel index value, +// where we view each pixel as an area between (idx - 0.5) and (idx + 0.5). +// if align_corners: -1 and +1 get sent to the centers of the corner pixels +// -1 --> 0 +// +1 --> (size - 1) +// scale_factor = (size - 1) / 2 +// if not align_corners: -1 and +1 get sent to the image edges +// -1 --> -0.5 +// +1 --> (size - 1) + 0.5 == size - 0.5 +// scale_factor = size / 2 +template +__forceinline__ __device__ +scalar_t grid_sampler_unnormalize(scalar_t coord, int size, bool align_corners) { + if (align_corners) { + // unnormalize coord from [-1, 1] to [0, size - 1] + return ((coord + 1.f) / 2) * (size - 1); + } else { + // unnormalize coord from [-1, 1] to [-0.5, size - 0.5] + return ((coord + 1.f) * size - 1) / 2; + } +} + +// grid_sampler_unnormalize_set_grad works the same as grid_sampler_unnormalize +// except that it also returns the `d output / d input` via pointer argument +// `grad_in`. +// This is useful in the backward pass of grid_sampler. 
+template +__forceinline__ __device__ +scalar_t grid_sampler_unnormalize_set_grad(scalar_t coord, int size, + bool align_corners, scalar_t *grad_in) { + if (align_corners) { + // unnormalize coord from [-1, 1] to [0, size - 1] + *grad_in = static_cast(size - 1) / 2; + return ((coord + 1.f) / 2) * (size - 1); + } else { + // unnormalize coord from [-1, 1] to [-0.5, size - 0.5] + *grad_in = static_cast(size) / 2; + return ((coord + 1.f) * size - 1) / 2; + } +} + +// Clips coordinates to between 0 and clip_limit - 1 +template +__forceinline__ __device__ +scalar_t clip_coordinates(scalar_t in, int clip_limit) { + return ::min(static_cast(clip_limit - 1), ::max(in, static_cast(0))); +} + +// clip_coordinates_set_grad works similarly to clip_coordinates except that +// it also returns the `d output / d input` via pointer argument `grad_in`. +// This is useful in the backward pass of grid_sampler. +template +__forceinline__ __device__ +scalar_t clip_coordinates_set_grad(scalar_t in, int clip_limit, scalar_t *grad_in) { + // Note that it is important for the gradient calculation that borders + // are considered out of bounds. + if (in <= static_cast(0)) { + *grad_in = static_cast(0); + return static_cast(0); + } else { + scalar_t max = static_cast(clip_limit - 1); + if (in >= max) { + *grad_in = static_cast(0); + return max; + } else { + *grad_in = static_cast(1); + return in; + } + } +} + +// Reflects coordinates until they fall between low and high (inclusive). +// The bounds are passed as twice their value so that half-integer values +// can be represented as ints. +template +__forceinline__ __device__ +scalar_t reflect_coordinates(scalar_t in, int twice_low, int twice_high) { + if (twice_low == twice_high) { + return static_cast(0); + } + scalar_t min = static_cast(twice_low) / 2; + scalar_t span = static_cast(twice_high - twice_low) / 2; + in = ::fabs(in - min); + // `fmod` returns same sign as `in`, which is positive after the `fabs` above. 
+ scalar_t extra = ::fmod(in, span); + int flips = static_cast(::floor(in / span)); + if (flips % 2 == 0) { + return extra + min; + } else { + return span - extra + min; + } +} + +// reflect_coordinates_set_grad works similarly to reflect_coordinates except +// that it also returns the `d output / d input` via pointer argument +// `grad_in`. +// This is useful in the backward pass of grid_sampler. +template +__forceinline__ __device__ +scalar_t reflect_coordinates_set_grad(scalar_t in, int twice_low, int twice_high, + scalar_t *grad_in) { + if (twice_low == twice_high) { + *grad_in = static_cast(0); + return static_cast(0); + } + int grad_in_mult_; + scalar_t min = static_cast(twice_low) / 2; + scalar_t span = static_cast(twice_high - twice_low) / 2; + in = in - min; + if (in < static_cast(0)) { + grad_in_mult_ = -1; + in = -in; + } else { + grad_in_mult_ = 1; + } + // `fmod` returns same sign as `in`, which is positive after the `if` above. + scalar_t extra = ::fmod(in, span); + int flips = static_cast(::floor(in / span)); + if (flips % 2 == 0) { + *grad_in = static_cast(grad_in_mult_); + return extra + min; + } else { + *grad_in = static_cast(-grad_in_mult_); + return span - extra + min; + } +} + +template +__forceinline__ __device__ +scalar_t safe_downgrade_to_int_range(scalar_t x){ + // -100.0 does not have special meaning. This is just to make sure + // it's not within_bounds_2d or within_bounds_3d, and does not cause + // undefined behavior. See #35506. 
+ if (x > INT_MAX-1 || x < INT_MIN || !::isfinite(static_cast(x))) + return static_cast(-100.0); + return x; +} + +template +__forceinline__ __device__ +scalar_t compute_coordinates(scalar_t coord, int size, + GridSamplerPadding padding_mode, + bool align_corners) { + if (padding_mode == GridSamplerPadding::Border) { + // clip coordinates to image borders + coord = clip_coordinates(coord, size); + } else if (padding_mode == GridSamplerPadding::Reflection) { + // reflect coordinates by image borders + if (align_corners) { + coord = reflect_coordinates(coord, 0, 2*(size - 1)); + } else { + coord = reflect_coordinates(coord, -1, 2*size - 1); + } + // clip coordinates to image borders + coord = clip_coordinates(coord, size); + } + + coord = safe_downgrade_to_int_range(coord); + return coord; +} + +// Computes the pixel source index value for a grid coordinate +template +__forceinline__ __device__ +scalar_t grid_sampler_compute_source_index( + scalar_t coord, + int size, + GridSamplerPadding padding_mode, + bool align_corners) { + coord = grid_sampler_unnormalize(coord, size, align_corners); + coord = compute_coordinates(coord, size, padding_mode, align_corners); + return coord; +} + +// grid_sampler_compute_source_index_set_grad works similarly to +// grid_sampler_compute_source_index except that it also returns the +// `d output / d input` via pointer argument `grad_in`. +// This is useful in the backward pass of grid_sampler. 
+template +__forceinline__ __device__ +scalar_t grid_sampler_compute_source_index_set_grad( + scalar_t coord, + int size, + GridSamplerPadding padding_mode, + bool align_corners, + scalar_t *grad_in) { + scalar_t grad_clip, grad_refl; + coord = grid_sampler_unnormalize_set_grad(coord, size, align_corners, grad_in); + if (padding_mode == GridSamplerPadding::Border) { + // clip coordinates to image borders + coord = clip_coordinates_set_grad(coord, size, &grad_clip); + *grad_in = (*grad_in) * grad_clip; + } else if (padding_mode == GridSamplerPadding::Reflection) { + // reflect coordinates by image borders + if (align_corners) { + coord = reflect_coordinates_set_grad(coord, 0, 2*(size - 1), &grad_refl); + } else { + coord = reflect_coordinates_set_grad(coord, -1, 2*size - 1, &grad_refl); + } + // clip coordinates to image borders + coord = clip_coordinates_set_grad(coord, size, &grad_clip); + *grad_in = (*grad_in) * grad_refl * grad_clip; + } + + coord = safe_downgrade_to_int_range(coord); + return coord; +} + +__forceinline__ __device__ +bool within_bounds_2d(int h, int w, int H, int W) { + return h >= 0 && h < H && w >= 0 && w < W; +} + +__forceinline__ __device__ +bool within_bounds_3d(int d, int h, int w, int D, int H, int W) { + return d >= 0 && d < D && h >= 0 && h < H && w >= 0 && w < W; +} + +template +__forceinline__ __device__ +scalar_t get_value_bounded( + const scalar_t *data, scalar_t x, scalar_t y, int W, int H, int sW, int sH, + GridSamplerPadding padding_mode, + bool align_corners) { + + x = compute_coordinates(x, W, padding_mode, align_corners); + y = compute_coordinates(y, H, padding_mode, align_corners); + + int ix = static_cast(x); + int iy = static_cast(y); + + if (within_bounds_2d(iy, ix, H, W)) { + return data[iy * sH + ix * sW]; + } + return static_cast(0); +} + +template +__forceinline__ __device__ +void safe_add_2d(scalar_t *data, int h, int w, + int sH, int sW, int H, int W, + scalar_t delta, + const index_t NC_offset, + const index_t 
memory_span) { + if (within_bounds_2d(h, w, H, W)) { + fastAtomicAdd(data, + NC_offset + h * sH + w * sW, + memory_span, + delta, + true); + } +} + +template +__forceinline__ __device__ +void safe_add_3d(scalar_t *data, int d, int h, int w, + int sD, int sH, int sW, int D, int H, int W, + scalar_t delta, + const index_t NC_offset, + const index_t memory_span) { + if (within_bounds_3d(d, h, w, D, H, W)) { + fastAtomicAdd(data, + NC_offset + d * sD + h * sH + w * sW, + memory_span, + delta, + true); + } +} + +template +__forceinline__ __device__ +void add_value_bounded( + scalar_t* data, scalar_t x, scalar_t y, int W, int H, int sW, int sH, + scalar_t delta, + GridSamplerPadding padding_mode, + bool align_corners, + const index_t NC_offset, + const index_t memory_span) { + + x = compute_coordinates(x, W, padding_mode, align_corners); + y = compute_coordinates(y, H, padding_mode, align_corners); + + int ix = static_cast(x); + int iy = static_cast(y); + + safe_add_2d(data, iy, ix, sH, sW, H, W, delta, NC_offset, memory_span); +} + +// Calculate the differential of the cubic convolution, i.e. `d coeff / d x` +template +__forceinline__ __device__ +void get_cubic_coefficients_grad( + scalar_t coeffs[4], + scalar_t t) { + + // Must be the same as forward calculation in + // aten/src/ATen/native/cuda/UpSample.cuh:get_cubic_upsample_coefficients + scalar_t A = -0.75; + + scalar_t x; + x = -1 - t; // 1 < x = |-1 - tx| < 2 + coeffs[0] = (-3 * A * x - 10 * A ) * x - 8 * A; + x = -t; // x = |0 - tx| <= 1 + coeffs[1] = (-3 * (A + 2) * x - 2 * (A + 3)) * x; + x = 1 - t; // x = |1 - tx| <= 1 + coeffs[2] = (3 * (A + 2) * x - 2 * (A + 3)) * x; + x = 2 - t; // 1 < x = |2 - tx| < 2 + coeffs[3] = (3 * A * x - 10 * A) * x + 8 * A; +} + + +} // namespace at::native + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/cuda/GroupMM.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/cuda/GroupMM.h new file mode 100644 index 0000000000000000000000000000000000000000..bb4431cb332fc6cddd3eb0005c6ec97ff9ec6b01 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/cuda/GroupMM.h @@ -0,0 +1,17 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +#include +#include + +namespace at::cuda::detail { +TORCH_API void bf16bf16_grouped_mm( + at::Tensor mat_a, // bf16 + at::Tensor mat_b, // bf16 + std::optional offs, + std::optional bias, // BF16 + at::Tensor& out); +} // namespace at::cuda::detail + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/cuda/IndexKernel.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/cuda/IndexKernel.h new file mode 100644 index 0000000000000000000000000000000000000000..47f808b8f8810d48dce14e3386639a118d99ad10 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/cuda/IndexKernel.h @@ -0,0 +1,20 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +#include +#include + +namespace at { +struct TensorIteratorBase; +class TensorBase; +} + +namespace at::native { +/// @param maskPrefixSum[in,out] +void launch_masked_scatter_kernel( + const TensorBase &self, const TensorBase &mask, + const TensorBase &maskPrefixSum, const TensorBase &source); +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or 
TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/cuda/IndexKernelUtils.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/cuda/IndexKernelUtils.h new file mode 100644 index 0000000000000000000000000000000000000000..d5053ac466b193ddca34470d7f859624b3668669 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/cuda/IndexKernelUtils.h @@ -0,0 +1,40 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) + +#include +#include +#include + +namespace at::native { + +template +inline bool fast_gather_kernel_eligible(const TensorIterator& iter, char * const out_ptr, char * const in_ptr, const size_t index_stride_bytes, const size_t element_size) { + using at::native::memory::get_alignment; + const auto index_element_size = iter.element_size(2); + //TensorIterator strides and sizes are ordered fastest moving to slowest moving, + //in contrast to regular sizes + // we need contiguous source and dst slices and aligned pointers and strides and slice size to do vectorized loads + // also we need idx to be expanded in the last dimension so we can copy entire slices + // and we need the src tensor to keep 0 stride from restriding + // (it could have been deleted by dimension collapse, in this case iterator would still be 2d + // but we cannot use fast path) + + return iter.ndim() == 2 && iter.strides(2)[0]==0 && iter.strides(2)[1]==index_element_size && + static_cast(iter.strides(0)[0])==element_size && + static_cast(iter.strides(1)[0])==element_size && static_cast(iter.strides(1)[1] == 0) && + get_alignment(out_ptr) == alignment && get_alignment(in_ptr) == alignment && + get_alignment(static_cast(iter.shape()[0] * element_size)) == alignment && + get_alignment(static_cast(index_stride_bytes)) == alignment && + 
get_alignment(static_cast(iter.strides(0)[1])) == alignment; +} + +template +void vectorized_gather_kernel_launch(char * out, char * inp, index_t * idx, int num_ind, + int64_t slice_size_in_bytes, int64_t ind_dim_size, int64_t inp_stride_bytes, int64_t out_stride_bytes, + bool allow_neg_indices=false); + + +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/cuda/JitLoops.cuh b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/cuda/JitLoops.cuh new file mode 100644 index 0000000000000000000000000000000000000000..efd0967b118f633bc15aba5f836bb785a34fceef --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/cuda/JitLoops.cuh @@ -0,0 +1,191 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include + +#if AT_USE_JITERATOR() + +#include + +#include +#include +#include + +#include + +#include + +namespace at::native { + +/* Note [Jiterator] +The "jiterator" simply just-in-time compiles the same kernels that +Loops.cuh (and CUDALoops.cuh) usually build. This reduces build time, +build size, and initial CUDA context size. + +By default on non-Windows systems, it also caches compiled kernels in ~/.cache/torch/kernels. +This behavior is controlled with two environment variables: + - USE_PYTORCH_KERNEL_CACHE, if set to zero then this will disable all cache use + - PYTORCH_KERNEL_CACHE_PATH, if set specifies the folder to use for cached kernels + +The jiterator currently has some limitations, however. It cannot: + - handle math on complex datatypes + - handle kernels with scalar parameters + +These improvements will likely come soon. 
+ +For examples of how to use the jiterator see the i1 and gcd kernel +implementations, which pass jittable strings implementing their +operations instead of the typical CUDA functors. + +To pass a runtime argument (similar to lambda captures in non-JIT kernels), +we need to pass to additional arguments to `jitted_gpu_kernel` by value. +Currently only primitive C++ types used for computation are valid. +The order of these extra arguments should be same as the order they appear +in kernel's function signature. (look at polygamma for example) + +NOTE: One big restriction being that these arguments should be after the +arguments provided by TensorIterator. Eg. While capturing `n`, where +`scalar_t x` and `scalar_t y` are provided by TensorIterator, +* foo(scalar_t x, scalar_t y, int n) works! +* foo(int n, scalar_t x, scalar_y) doesn't work +* foo(scalar_t x, int n, scalar_y) doesn't work + +*/ + +// Entrypoint for jitted GPU kernels. +// Only handles elementwise unary and binary kernels with a +// common dtype and a single output. +// NOTE: this assumes the op's iterator has a common_dtype. +// NOTE: We use std::tuple instead of parameter pack +// for `extra_args` due to following +// bug on older versions of clang +// https://bugs.llvm.org/show_bug.cgi?id=23029 +template < + char const* name, + typename return_type, + typename f_inputs_type, + int arity, + typename... Args> +void jitted_gpu_kernel( + TensorIteratorBase& iter, + const std::string& f, + at::cuda::jit::BinaryFuncVariant scalar_pos = + at::cuda::jit::BinaryFuncVariant::NoScalar, + at::opmath_type scalar_val = 0, + std::tuple extra_args = std::make_tuple()) { + // TODO: much of preamble is common to both jitted_gpu_kernel and gpu_kernel + // Maybe it could be refactored? 
+ for (int arg = 0; arg < iter.ntensors(); arg++) { + TORCH_INTERNAL_ASSERT( + iter.device(arg).is_cuda(), + "argument ", arg, ": expected a CUDA device but found ", iter.device(arg)); + } + + if (iter.numel() == 0) { + return; + } + + if (!iter.can_use_32bit_indexing()) { + for (auto& sub_iter : iter.with_32bit_indexing()) { + jitted_gpu_kernel( + sub_iter, f, scalar_pos, scalar_val, extra_args); + } + + return; + } + + // Computes if dynamic casting is needed + // Dynamic casting is needed if an input's dtype differs from the common dtype + // or if the result dtype differs from the output's dtype + // Note: this is intentionally divergent from calling needs_dynamic_casting, + // which is more general and inspects a lambda to determine if dynamic + // casting is needed. + bool needs_dynamic_casting = false; + + // Checks output + const ScalarType return_scalar_type = c10::CppTypeToScalarType::value; + const auto dtype0 = iter.dtype(0); + if (dtype0 != return_scalar_type) { + needs_dynamic_casting = true; + } + + // Checks input(s) + const ScalarType inputs_scalar_type = c10::CppTypeToScalarType::value; + for (auto i = decltype(arity){1}; i < (arity + 1); ++i) { + const auto dtypei = iter.dtype(i); + if (dtypei != inputs_scalar_type) { + needs_dynamic_casting = true; + break; + } + } + if (scalar_pos == at::cuda::jit::BinaryFuncVariant::NoScalar) { + // NOTE: With `scalar_pos=NoScalar`,`scalar_val` is not used + // for computation in the generated code and hence we pass a dummy + // value of `0`. 
+ jitted_gpu_kernel_impl< + /*name*/ name, + /*return_type=*/return_type, + /*f_inputs_type=*/f_inputs_type, + arity, + at::cuda::jit::BinaryFuncVariant::NoScalar>( + iter, f, needs_dynamic_casting, /*scalar_val=*/scalar_val, extra_args); + } else if (scalar_pos == at::cuda::jit::BinaryFuncVariant::RhsScalar) { + jitted_gpu_kernel_impl< + /*name*/ name, + /*return_type=*/return_type, + /*f_inputs_type=*/f_inputs_type, + arity, + at::cuda::jit::BinaryFuncVariant::RhsScalar>( + iter, + f, + needs_dynamic_casting, + scalar_val, + extra_args); + + } else { + jitted_gpu_kernel_impl< + /*name*/ name, + /*return_type=*/return_type, + /*f_inputs_type=*/f_inputs_type, + arity, + at::cuda::jit::BinaryFuncVariant::LhsScalar>( + iter, + f, + needs_dynamic_casting, + scalar_val, + extra_args); + } +} + +// TODO: support runtime state capture similar to `jitted_gpu_kernel`. +template +void opmath_jitted_gpu_kernel_with_scalars(TensorIteratorBase& iter, const std::string& f) { + TORCH_INTERNAL_ASSERT(iter.ntensors() == 3); + //currently jiterator only handles binary functions where both inputs are of the same type (f_inputs_type) + using opmath_t = at::opmath_type; + if (iter.is_cpu_scalar(1)) { + auto scalar_val = iter.scalar_value(1); + iter.remove_operand(1); + // TODO: When all kernels that use gpu_kernel_with_scalars are + // ported to structured, this device guard can be deleted. 
This + // works around incorrect device guard generation for pre-structured + // kernels device guards, but structured kernels do it right and + // we can assume the device is already set correctly + const OptionalDeviceGuard device_guard(iter.device(1)); + jitted_gpu_kernel(iter, f, at::cuda::jit::BinaryFuncVariant::LhsScalar, scalar_val); + } else if (iter.is_cpu_scalar(2)) { + auto scalar_val = iter.scalar_value(2); + iter.remove_operand(2); + jitted_gpu_kernel(iter, f, at::cuda::jit::BinaryFuncVariant::RhsScalar, scalar_val); + } else { + jitted_gpu_kernel(iter, f); + } +} + +} // namespace at::native + +#endif // AT_USE_JITERATOR() + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/cuda/KernelUtils.cuh b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/cuda/KernelUtils.cuh new file mode 100644 index 0000000000000000000000000000000000000000..73a5ba0a479bb98cbd79e7b0312a6a778287a056 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/cuda/KernelUtils.cuh @@ -0,0 +1,411 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +#include + +#if !(defined(USE_ROCM) || ((defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 800)))) +#include +#endif + +// ROCm 6.3 is planned to have these functions, but until then here they are. 
+#if defined(USE_ROCM) +#include +#include +#include + +__device__ inline __hip_bfloat162 preview_unsafeAtomicAdd(__hip_bfloat162* address, __hip_bfloat162 value) { +#if (defined(__gfx942__)) && \ + __has_builtin(__builtin_amdgcn_flat_atomic_fadd_v2bf16) + typedef unsigned short __attribute__((ext_vector_type(2))) vec_short2; + static_assert(sizeof(vec_short2) == sizeof(__hip_bfloat162_raw)); + union { + __hip_bfloat162_raw bf162_raw; + vec_short2 vs2; + } u{static_cast<__hip_bfloat162_raw>(value)}; + u.vs2 = __builtin_amdgcn_flat_atomic_fadd_v2bf16((vec_short2*)address, u.vs2); + return static_cast<__hip_bfloat162>(u.bf162_raw); +#else + static_assert(sizeof(unsigned int) == sizeof(__hip_bfloat162_raw)); + union u_hold { + __hip_bfloat162_raw h2r; + unsigned int u32; + }; + u_hold old_val, new_val; + old_val.u32 = __hip_atomic_load((unsigned int*)address, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT); + do { + new_val.h2r = __hadd2(old_val.h2r, value); + } while (!__hip_atomic_compare_exchange_strong( + (unsigned int*)address, &old_val.u32, new_val.u32, + __ATOMIC_RELAXED, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT)); + return old_val.h2r; +#endif +} + +__device__ inline __half2 preview_unsafeAtomicAdd(__half2* address, __half2 value) { +#if (defined(__gfx942__)) && \ + __has_builtin(__builtin_amdgcn_flat_atomic_fadd_v2f16) + // The api expects an ext_vector_type of half + typedef _Float16 __attribute__((ext_vector_type(2))) vec_fp162; + static_assert(sizeof(vec_fp162) == sizeof(__half2_raw)); + union { + __half2_raw h2r; + vec_fp162 fp16; + } u {static_cast<__half2_raw>(value)}; + u.fp16 = __builtin_amdgcn_flat_atomic_fadd_v2f16((vec_fp162*)address, u.fp16); + return static_cast<__half2>(u.h2r); +#else + static_assert(sizeof(__half2_raw) == sizeof(unsigned int)); + union u_hold { + __half2_raw h2r; + unsigned int u32; + }; + u_hold old_val, new_val; + old_val.u32 = __hip_atomic_load((unsigned int*)address, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT); + do { + 
new_val.h2r = __hadd2(old_val.h2r, value); + } while (!__hip_atomic_compare_exchange_strong( + (unsigned int*)address, &old_val.u32, new_val.u32, + __ATOMIC_RELAXED, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT)); + return old_val.h2r; +#endif +} +#define ATOMICADD preview_unsafeAtomicAdd +#define NATIVE_ZERO_BF16 __float2bfloat16(0.0f) +#else +#define ATOMICADD atomicAdd +#define NATIVE_ZERO_BF16 __int2bfloat16_rz(0) +#endif + +namespace at:: native { + +__device__ __forceinline__ size_t +idx(const size_t nc, + const size_t height, + const size_t width, + const size_t h, + const size_t w) { + return (nc * height + h) * width + w; +} + +// for channels-last +__device__ __forceinline__ size_t +idx_cl( + const size_t n, const size_t h, const size_t w, const size_t c, + const size_t height, const size_t width, const size_t channel +) { + return ((n * height + h) * width + w) * channel + c; +} + +// fastSpecializedAtomicAdd (and fastAtomicAdd) are an optimization +// that speed up half-precision atomics. The situation with half +// precision atomics is that we have a slow __half atomic, and +// a fast vectored __half2 atomic (this can be worth up to a 6x +// speedup, see https://github.com/pytorch/pytorch/pull/21879). +// We can convert a __half atomic into a __half2 atomic by simply +// pairing the __half with a zero entry on the left/right depending +// on alignment... but only if this wouldn't cause an out of bounds +// access! Thus, you must specify tensor and numel so we can check +// if you would be out-of-bounds and use a plain __half atomic if +// you would be. 
+template < + typename scalar_t, + typename index_t, + typename std::enable_if_t>* = + nullptr> +__device__ __forceinline__ void fastSpecializedAtomicAdd( + scalar_t* tensor, + index_t index, + const index_t numel, + scalar_t value) { +#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 700)) + gpuAtomicAddNoReturn( + reinterpret_cast(tensor) + index, + static_cast(value)); +#else + // Accounts for the chance tensor falls on an odd 16 bit alignment (ie, not 32 bit aligned) + __half* target_addr = reinterpret_cast<__half*>(tensor + index); + bool low_byte = (reinterpret_cast(target_addr) % sizeof(__half2) == 0); + + if (low_byte && index < (numel - 1)) { + __half2 value2; + value2.x = static_cast<__half>(value); + value2.y = __int2half_rz(0); + ATOMICADD(reinterpret_cast<__half2*>(target_addr), value2); + + } else if (!low_byte && index > 0) { + __half2 value2; + value2.x = __int2half_rz(0); + value2.y = static_cast<__half>(value); + ATOMICADD(reinterpret_cast<__half2*>(target_addr - 1), value2); + + } else { +#ifdef USE_ROCM + gpuAtomicAddNoReturn( + reinterpret_cast(tensor) + index, static_cast(value)); +#else + atomicAdd( + reinterpret_cast<__half*>(tensor) + index, static_cast<__half>(value)); +#endif + } +#endif +} + +template < + typename scalar_t, + typename index_t, + typename std::enable_if_t>* = + nullptr> +__device__ __forceinline__ void fastSpecializedAtomicAdd( + scalar_t* tensor, + index_t index, + const index_t numel, + scalar_t value) { +#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 800)) + gpuAtomicAddNoReturn( + reinterpret_cast(tensor) + index, + static_cast(value)); +#else + // Accounts for the chance tensor falls on an odd 16 bit alignment (ie, not 32 bit aligned) + __nv_bfloat16* target_addr = reinterpret_cast<__nv_bfloat16*>(tensor + index); + bool low_byte = (reinterpret_cast(target_addr) % sizeof(__nv_bfloat162) == 0); + + if (low_byte && index < (numel - 1)) { + __nv_bfloat162 value2; + value2.x = *reinterpret_cast<__nv_bfloat16*>(&value); + 
value2.y = NATIVE_ZERO_BF16; + ATOMICADD(reinterpret_cast<__nv_bfloat162*>(target_addr), value2); + + } else if (!low_byte && index > 0) { + __nv_bfloat162 value2; + value2.x = NATIVE_ZERO_BF16; + value2.y = *reinterpret_cast<__nv_bfloat16*>(&value); + ATOMICADD(reinterpret_cast<__nv_bfloat162*>(target_addr - 1), value2); + + } else { +#ifdef USE_ROCM + gpuAtomicAddNoReturn( + reinterpret_cast(tensor) + index, static_cast(value)); +#else + atomicAdd( + reinterpret_cast<__nv_bfloat16*>(tensor) + index, *reinterpret_cast<__nv_bfloat16*>(&value)); +#endif + } +#endif +} + + +template < + typename scalar_t, + typename index_t, + typename std::enable_if_t && !std::is_same_v>* = + nullptr> +__device__ __forceinline__ void fastSpecializedAtomicAdd( + scalar_t* tensor, + index_t index, + const index_t numel, + scalar_t value) { + gpuAtomicAddNoReturn(tensor + index, value); +} + +template +__device__ __forceinline__ void fastAtomicAdd( + scalar_t* tensor, + index_t index, + const index_t numel, + scalar_t value, + bool fast_atomics) { + if (fast_atomics) { + fastSpecializedAtomicAdd(tensor, index, numel, value); + } else { + gpuAtomicAddNoReturn(tensor + index, value); + } +} + + +#ifdef USE_ROCM +// This function implements a committed store. +// Upon returning, the store is committed to global memory. +// This is useful in avoiding the need for fences. 
+template +__device__ inline void cmtdStore(void* address, T value) { + int constexpr num_long_per_val = sizeof(value)/sizeof(long); + int constexpr num_int_per_val = sizeof(value)/sizeof(int); + int constexpr num_short_per_val = sizeof(value)/sizeof(short); + int constexpr num_char_per_val = sizeof(value)/sizeof(char); + union pnr { T v; + long l[num_long_per_val]; + int i[num_int_per_val]; + short s[num_short_per_val]; + char c[num_char_per_val]; } + _pnr = {.v = value }; + if constexpr (num_long_per_val*sizeof(long) == sizeof(value)) + for (int i=0; i(address)+i, _pnr.l[i], __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT); + else if constexpr (num_int_per_val*sizeof(int) == sizeof(value)) + for (int i=0; i(address)+i, _pnr.i[i], __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT); + else if constexpr (num_short_per_val*sizeof(short) == sizeof(value)) + for (int i=0; i(address)+i, _pnr.s[i], __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT); + else if constexpr (num_char_per_val*sizeof(char) == sizeof(value)) + for (int i=0; i(address)+i, _pnr.c[i], __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT); + __atomic_signal_fence(__ATOMIC_SEQ_CST); + asm volatile("s_waitcnt vmcnt(0)" ::: "memory"); + __atomic_signal_fence(__ATOMIC_SEQ_CST); +} +#endif + +#if (defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__) || defined(__gfx950__)) +// This function implements warp-level opportunistic fastatomics +// To reduce contention on an atomicAdd, this replaces per-thread atomicAdd with a per-warp atomicAdd. +// We identify all the threads within a warp that will perform an atomicAdd on the same destination +// address and perform the addition on the CU. Each warp elects a leader thread which does the +// atomicAdd to the destination address. 
+template +__device__ __forceinline__ void opportunistic_fastAtomicAdd( + scalar_t* self_ptr, + index_t index, + const index_t numel, + scalar_t value) { + + scalar_t* dst = self_ptr + index; + + //pack coalesced bf16 and fp16 + if constexpr (std::is_same::value || std::is_same::value) + { + typedef unsigned short __attribute__((ext_vector_type(2))) vec_short2; + union ill { unsigned int i[2]; int64_t il; }; + ill iil_, ill_oneUpDst = {}; + iil_.il = (int64_t)dst; + ill_oneUpDst.i[0] = __builtin_amdgcn_mov_dpp(iil_.i[0], 0x130, 0xf, 0xf, 0); + ill_oneUpDst.i[1] = __builtin_amdgcn_mov_dpp(iil_.i[1], 0x130, 0xf, 0xf, 0); + union bfi {scalar_t bf; short s; } bfi_ = { .bf = value }; bfi bfi_oneUpVal; + + bfi_oneUpVal.s = __builtin_amdgcn_mov_dpp(bfi_.s, 0x130, 0xf, 0xf, 0); + auto oneUpVal = bfi_oneUpVal.bf; + + __half* target_addr = reinterpret_cast<__half*>(self_ptr + index); + bool low_byte = (reinterpret_cast(target_addr) % sizeof(__half2) == 0); + bool canCombnUp = (bool)(__activemask()&(1<<(threadIdx.x+1))) && + (low_byte && index < (numel - 1)) && + (ill_oneUpDst.il - reinterpret_cast(dst) == sizeof(scalar_t)); + bool canCombnDn = (__builtin_amdgcn_mov_dpp(canCombnUp, 0x138, 0xf, 0xf, 0)); + + if (__lane_id()%2==0) + { + if (canCombnUp) { + typedef _Float16 __attribute__((ext_vector_type(2))) vec_fp162; + union bfvs { scalar_t bf[2]; vec_short2 vs2; vec_fp162 df16; }; + bfvs bfvs_ = {}; + bfvs_.bf[0] = value; + bfvs_.bf[1] = oneUpVal; + if constexpr (std::is_same::value) + __builtin_amdgcn_flat_atomic_fadd_v2bf16((vec_short2*)dst, bfvs_.vs2); + else + __builtin_amdgcn_flat_atomic_fadd_v2f16((__half2*)dst, bfvs_.df16); + return; + } + } + else + { + if (canCombnDn) + return; + } + } + + // not coalesced, so now let try to capture lane-matches... + + if (numel > 16 /*<-hueristic threshold*/ * 64 ) { + // well shucks, unlikely to capture same-dest atomics in a wave. + // fall back to direct fastAtomic... 
+ fastAtomicAdd(self_ptr, index, numel, value, true); + return; + } + + // __activemask() -- finds the set of threads in the warp that are about to perform atomicAdd + // __match_any_sync() -- returns bit mask of the threads that have same dest addr + auto mask = __match_any_sync(__activemask(), (int64_t)dst); + + // select a leader thread + int leader = __ffsll(mask) - 1; + + scalar_t crnt_val = (scalar_t)0; + auto crnt_msk = mask >> (leader); + int crnt_idx = leader; + + // __shfl is limited in the dtypes it accepts + // That's why, we need these if/else to correctly do the addition on the CU + if constexpr(sizeof(scalar_t) <= sizeof(int)) { + union punner { int l; scalar_t s; }; + punner pnr = {}; + pnr.s = value; + while (crnt_msk != 0) { + if (crnt_msk & 1) { + punner add_val = {}; + add_val.l = __shfl(pnr.l ,crnt_idx); + crnt_val += add_val.s; + } + crnt_idx++; + crnt_msk = crnt_msk >> 1; + } + } + else if constexpr(sizeof(scalar_t) <= sizeof(long)) { + union punner { long l; scalar_t s; }; + punner pnr = {}; + pnr.s = value; + while (crnt_msk != 0) { + if (crnt_msk & 1) { + punner add_val = {}; + add_val.l = __shfl(pnr.l ,crnt_idx); + crnt_val += add_val.s; + } + crnt_idx++; + crnt_msk = crnt_msk >> 1; + } + } + else if constexpr(sizeof(scalar_t) <= sizeof(long long)) { + union punner { long long l; scalar_t s; }; + punner pnr = {}; + pnr.s = value; + while (crnt_msk != 0) { + if (crnt_msk & 1) { + punner add_val = {}; + add_val.l = __shfl(pnr.l ,crnt_idx); + crnt_val += add_val.s; + } + crnt_idx++; + crnt_msk = crnt_msk >> 1; + } + } + else { + union punner { long long l[2]; scalar_t s; }; + punner pnr = {}; + pnr.s = value; + while (crnt_msk != 0) { + if (crnt_msk & 1) { + punner add_val = {}; + add_val.l[0] = __shfl(pnr.l[0] ,crnt_idx); + add_val.l[1] = __shfl(pnr.l[1] ,crnt_idx); + crnt_val += add_val.s; + } + crnt_idx++; + crnt_msk = crnt_msk >> 1; + } + } + + + //Once the correct crnt_val is determined, only the leader thread does the update to the 
dest addr + if (__lane_id() == leader) { + fastAtomicAdd(self_ptr, index, numel, crnt_val, true); + } +} +#endif + +#undef ATOMICADD +#undef NATIVE_ZERO_BF16 + +} // namespace at::native + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/cuda/Math.cuh b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/cuda/Math.cuh new file mode 100644 index 0000000000000000000000000000000000000000..76971eeb80c5606b733cc1d983f229fb10ef998f --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/cuda/Math.cuh @@ -0,0 +1,3395 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include +#include +#include +#include + +namespace at::native { +// See note [Jiterator] +// TODO: elaborate in this comment on the structure of math.cuh +#if AT_USE_JITERATOR() + +const auto ndtri_string = jiterator_stringify( + /* + * This function is derived from the implementation of the digamma function in the Cephes Math Library. + * See note [3-Clause BSD License for the Cephes Math Library]. + * + * Evaluates polynomial of degree N: + * + * 2 N + * y = C + C x + C x +...+ C x + * 0 1 2 N + * + * Coefficients are stored in reverse order: + * + * coef[0] = C , ..., coef[N] = C . + * N 0 + */ + template + T polevl(const T x, const T A[], const int len) { + // NOTE: This `polevl` is different from other `polevl` + // implementation (in PyTorch) which expect the `len` to be + // `len(A) - 1` instead of `len(A)`. + T result = 0; + for (int i = 0; i < len; ++i) { + result = result * x + A[i]; + } + return result; + } + + /* + * This function is derived from the implementation of the i1e function in the Cephes Math Library. 
+ * See note [3-Clause BSD License for the Cephes Math Library]. + * + * Computes the argument, x, for which the area under the Gaussian probability density function + * (integrated from minus infinity to x) is equal to y. + */ + template + T ndtri(T y0) { + + constexpr T zero = 0; + constexpr T one = 1; + + // Handles special cases + if (y0 == zero) { + return NEG_INFINITY; + } + if (y0 == one) { + return POS_INFINITY; + } + if (y0 < zero || y0 > one) { + return NAN; + } + + bool code = true; + T y = y0; + // Note: the constant 0.135... is equal to exp(-2) + if (y > one - T{0.13533528323661269189}) { + y = one - y; + code = false; + } + + if (y > T{0.13533528323661269189}) { + /* approximation for 0 <= |y - 0.5| <= 3/8 */ + static const T P0[5] = { + -5.99633501014107895267E1, + 9.80010754185999661536E1, + -5.66762857469070293439E1, + 1.39312609387279679503E1, + -1.23916583867381258016E0, + }; + + static const T Q0[9] = { + 1.00000000000000000000E0, + 1.95448858338141759834E0, + 4.67627912898881538453E0, + 8.63602421390890590575E1, + -2.25462687854119370527E2, + 2.00260212380060660359E2, + -8.20372256168333339912E1, + 1.59056225126211695515E1, + -1.18331621121330003142E0, + }; + + /* sqrt(2pi) */ + constexpr T s2pi = 2.50662827463100050242E0; + + y = y - T{0.5}; + const T y2 = y * y; + T x = y + y * (y2 * polevl(y2, P0, int{5}) / polevl(y2, Q0, int{9})); + return x * s2pi; + } + + T x = sqrt(T{-2.} * log(y)); + const T x0 = x - (log(x) / x); + + const T z = one / x; + T x1; + + /* y > exp(-32) = 1.2664165549e-14 */ + if (x < T{8.0}) { + /* Approximation for interval z = sqrt(-2 log y ) between 2 and 8 + * i.e., y between exp(-2) = .135 and exp(-32) = 1.27e-14. 
+ */ + static const T P1[9] = { + 4.05544892305962419923E0, + 3.15251094599893866154E1, + 5.71628192246421288162E1, + 4.40805073893200834700E1, + 1.46849561928858024014E1, + 2.18663306850790267539E0, + -1.40256079171354495875E-1, + -3.50424626827848203418E-2, + -8.57456785154685413611E-4, + }; + + static const T Q1[9] = { + 1.00000000000000000000E0, + 1.57799883256466749731E1, + 4.53907635128879210584E1, + 4.13172038254672030440E1, + 1.50425385692907503408E1, + 2.50464946208309415979E0, + -1.42182922854787788574E-1, + -3.80806407691578277194E-2, + -9.33259480895457427372E-4, + }; + + x1 = z * polevl(z, P1, int{9}) / polevl(z, Q1, int{9}); + } else { + /* Approximation for interval z = sqrt(-2 log y ) between 8 and 64 + * i.e., y between exp(-32) = 1.27e-14 and exp(-2048) = 3.67e-890. + */ + static const T P2[9] = { + 3.23774891776946035970E0, + 6.91522889068984211695E0, + 3.93881025292474443415E0, + 1.33303460815807542389E0, + 2.01485389549179081538E-1, + 1.23716634817820021358E-2, + 3.01581553508235416007E-4, + 2.65806974686737550832E-6, + 6.23974539184983293730E-9, + }; + + static const T Q2[9] = { + 1.00000000000000000000E0, + 6.02427039364742014255E0, + 3.67983563856160859403E0, + 1.37702099489081330271E0, + 2.16236993594496635890E-1, + 1.34204006088543189037E-2, + 3.28014464682127739104E-4, + 2.89247864745380683936E-6, + 6.79019408009981274425E-9, + }; + + x1 = z * polevl(z, P2, int{9}) / polevl(z, Q2, int{9}); + } + + x = x0 - x1; + return (!code) ? 
x : -x; + } +); // ndtri_string + +const auto log_ndtr_string = jiterator_stringify( + template + T log_ndtr(T x) { + constexpr T SQRT1_2{0.707106781186547524400844362104849039}; // 1/sqrt(2) + T t = x * SQRT1_2; + if (x < T{-1.0}) { + return log(erfcx(-t) / 2) - t * t; + } else { + return log1p(-erfc(t) / 2); + } + } +); // log_ndtr_string + +const auto gcd_string = jiterator_stringify( + template + T gcd(const T a_in, const T b_in) { + T a = abs(a_in); + T b = abs(b_in); + + while (a != T{0}) { + T c = a; + a = b % a; + b = c; + } + + return b; + } +); // gcd_string + +const auto lcm_string = jiterator_stringify( + template + T gcd(const T a_in, const T b_in) { + T a = abs(a_in); + T b = abs(b_in); + + while (a != T{0}) { + T c = a; + a = b % a; + b = c; + } + + return b; + } + + template + T lcm(const T a, const T b) { + T g = gcd(a, b); + return (g == T{0}) ? T{0} : abs(a / g * b); + } +); // lcm_string + +/* + * For licensing information, please refer to the cpu implementation located in "ATen/native/Math.h". + */ +// [C++ Standard Reference: Gamma Function] https://en.cppreference.com/w/cpp/numeric/math/tgamma +const auto digamma_string = jiterator_stringify( + template + T digamma(T x) { + static constexpr double PI_f64 = 3.14159265358979323846; + + // Short-circuits if x is +/- 0 and returns -/+ ∞ per the C++ standard + if (x == 0) { + return copysign(POS_INFINITY, -x); + } + + T result = 0; + if (x < 0) { + // Short-circuits if x is a negative integer and returns NaN + // per the C++ standard + const bool x_is_integer = (x == trunc(x)); + if (x_is_integer) { + return NAN; + } + + // Extracts the fractional part of x as r, since tan(pi * r) is more numerically + // accurate than tan(pi * x). While these operations are mathematically equivalent + // since both x and r are in radians and tan() has a periodicity of pi, in practice + // the computation of pi * x is a source of error (when |x| > 1). 
+ double q, r; + r = modf(static_cast(x), &q); + result = - PI_f64 / tan(PI_f64 * r); + x = 1 - x; + } + + while (x < T{10}) { + result -= T{1} / x; + x += T{1}; + } + + if (x == T{10}) { + return result + T{2.25175258906672110764}; + } + + T y = 0; + if (x < T{1.0e17}) { + const T A[] = { + 8.33333333333333333333E-2, + -2.10927960927960927961E-2, + 7.57575757575757575758E-3, + -4.16666666666666666667E-3, + 3.96825396825396825397E-3, + -8.33333333333333333333E-3, + 8.33333333333333333333E-2, + }; + + + T z = T{1} / (x * x); + + T polevl_result = 0; + for (int i = 0; i <= 6; i++) { + polevl_result = polevl_result * z + A[i]; + } + y = z * polevl_result; + } + + return log(x) - (T{0.5} / x) - y + result; + } +); // digamma_string + +/* + * This function is derived from the implementation of the zeta function in the Cephes Math Library. + * See note [3-Clause BSD License for the Cephes Math Library]. + */ +const auto zeta_string = jiterator_stringify( + template + T zeta(T x, T q) { + const T MACHEP{1.11022302462515654042E-16}; + constexpr T zero{0}; + constexpr T half{0.5}; + constexpr T one{1}; + static const T A[] = { + 12.0, + -720.0, + 30240.0, + -1209600.0, + 47900160.0, + -1.8924375803183791606e9, /*1.307674368e12/691*/ + 7.47242496e10, + -2.950130727918164224e12, /*1.067062284288e16/3617*/ + 1.1646782814350067249e14, /*5.109094217170944e18/43867*/ + -4.5979787224074726105e15, /*8.028576626982912e20/174611*/ + 1.8152105401943546773e17, /*1.5511210043330985984e23/854513*/ + -7.1661652561756670113e18 /*1.6938241367317436694528e27/236364091*/ + }; + + int i = 0; + T a, b, k, s, t, w; + + // Short-circuits x -> +infty + if (x == one) { + return POS_INFINITY; + } + + // Short-circuits x < 1 -> NaN + if (x < one) { + return NAN; + } + + // Short-circuits negative q integers map to +infty, + // negative q non-integers map to NaN + if (q <= zero) { + if (q == floor(q)) { + return POS_INFINITY; + } + if (x != floor(x)) { + return NAN; + } + } + + s = pow(q, -x); + a = 
q; + i = 0; + b = zero; + while ((i < 9) || (a <= T{9.0})) { + i += 1; + a += one; + b = pow(a, -x); + s += b; + if ((-MACHEP * s < b) && (b < MACHEP * s)) { + return s; + } + }; + + w = a; + s += b * w / (x - one); + s -= half * b; + a = one; + k = zero; + for (int i = 0; i < 12; i++) { + a *= x + k; + b /= w; + t = a * b / A[i]; + s = s + t; + t = fabs(t / s); + + if (t < MACHEP) { + return s; + } + + k += one; + a *= x + k; + b /= w; + k += one; + } + + return s; + } +); // zeta_string + +const auto trigamma_string = jiterator_stringify( + template + T trigamma(T x) { + const T PI{3.14159265358979323846}; + T sign = 1; + T result = 0; + + if (x < T{0.5}) { + sign = -1; + T sin_pi_x = sin(PI * x); + result -= (PI * PI) / (sin_pi_x * sin_pi_x); + x = 1 - x; + } + + for (int i = 0; i < 6; ++i) { + result += T{1} / (x * x); + x += 1; + } + + const T one{1}; + const T ixx = one / (x*x); + result += (one + one / (T{2}*x) + ixx * (one/T{6} - ixx * (one/T{30} - ixx * (one/T{42})))) / x; + return sign * result; +} +); // trigamma_string + +const auto lgamma_string = jiterator_stringify( + template + T lgamma_kernel(T a) { + return lgamma(a); + } +); // lgamma_string + +const auto polygamma_string = zeta_string + jiterator_stringify( + template + T polygamma(T x, int n) { + // already blocked if n <= 1 + const auto one = T{1}; + return ((n % 2) ? 
one : -one) * exp(lgamma(static_cast(n) + one)) * + zeta(static_cast(n + 1), x); + } +); // polygamma_string + +const auto exp2_string = jiterator_stringify( + template + T exp2_impl(T a) { + return exp2(a); + } + + namespace std { template class complex; } + template + std::complex exp2_impl(std::complex x) { + // There is no std::exp2 overload for complex, so instead + // use the identity 2^x = e^(ln(2) * x) + const auto ln_2 = static_cast(0.693147180559945309417232121458176); + return exp(ln_2 * x); + } + + template + T exp2_kernel(T a) { + return exp2_impl(a); + } +); // exp2_string + +const auto erfc_string = jiterator_stringify( + template + T erfc_kernel(T a) { + return erfc(a); + } +); // erfc_string + +const auto erfinv_string = jiterator_stringify( + template + T erfinv_kernel(T a) { + return erfinv(a); + } +); // erfinv_string + +const auto entr_string = jiterator_stringify( + template + T entr(T a) { + if (a != a) { + return a; + } + + if (a > 0) { + return -a * log(a); + } + + if (a == 0) { + return 0; + } + + return NEG_INFINITY; + } +); // entr_string + +// NOTE: `kaiser_window_string` depends on `i0_string` +// for its implementation. +const auto i0_string = jiterator_stringify( + template + T chbevl(T x, const T array[], const int len) { + + T b0, b1, b2; + + b0 = array[0]; + b1 = 0; + + for (int i = 1; i < len; ++i) { + b2 = b1; + b1 = b0; + b0 = x * b1 - b2 + array[i]; + } + + return T{0.5} * (b0 - b2); + } + + template + T i0(T _x) { + T x = fabs(_x); + + if (x <= T{8.0}) { + /* Chebyshev coefficients for exp(-x) I0(x) + * in the interval [0,8]. + * + * lim(x->0){ exp(-x) I0(x) } = 1. 
+ */ + static const T A[] = { + -4.41534164647933937950E-18, 3.33079451882223809783E-17, + -2.43127984654795469359E-16, 1.71539128555513303061E-15, + -1.16853328779934516808E-14, 7.67618549860493561688E-14, + -4.85644678311192946090E-13, 2.95505266312963983461E-12, + -1.72682629144155570723E-11, 9.67580903537323691224E-11, + -5.18979560163526290666E-10, 2.65982372468238665035E-9, + -1.30002500998624804212E-8, 6.04699502254191894932E-8, + -2.67079385394061173391E-7, 1.11738753912010371815E-6, + -4.41673835845875056359E-6, 1.64484480707288970893E-5, + -5.75419501008210370398E-5, 1.88502885095841655729E-4, + -5.76375574538582365885E-4, 1.63947561694133579842E-3, + -4.32430999505057594430E-3, 1.05464603945949983183E-2, + -2.37374148058994688156E-2, 4.93052842396707084878E-2, + -9.49010970480476444210E-2, 1.71620901522208775349E-1, + -3.04682672343198398683E-1, 6.76795274409476084995E-1}; + + T y = (x / T{2.0}) - T{2.0}; + return exp(x) * chbevl(y, A, int{30}); + } + + // Handles x > 8 case + /* Chebyshev coefficients for exp(-x) sqrt(x) I0(x) + * in the inverted interval [8,infinity]. + * + * lim(x->inf){ exp(-x) sqrt(x) I0(x) } = 1/sqrt(2pi). 
+ */ + const T B[] = { + -7.23318048787475395456E-18, -4.83050448594418207126E-18, + 4.46562142029675999901E-17, 3.46122286769746109310E-17, + -2.82762398051658348494E-16, -3.42548561967721913462E-16, + 1.77256013305652638360E-15, 3.81168066935262242075E-15, + -9.55484669882830764870E-15, -4.15056934728722208663E-14, + 1.54008621752140982691E-14, 3.85277838274214270114E-13, + 7.18012445138366623367E-13, -1.79417853150680611778E-12, + -1.32158118404477131188E-11, -3.14991652796324136454E-11, + 1.18891471078464383424E-11, 4.94060238822496958910E-10, + 3.39623202570838634515E-9, 2.26666899049817806459E-8, + 2.04891858946906374183E-7, 2.89137052083475648297E-6, + 6.88975834691682398426E-5, 3.36911647825569408990E-3, + 8.04490411014108831608E-1}; + + return (exp(x) * chbevl(T{32.0} / x - T{2.0}, B, int{25})) / sqrt(x); + } +); // i0_string + +const auto i1_string = jiterator_stringify( + template + T chbevl(const T x, const T array[], const int len) { + T b0, b1, b2; + + b0 = array[0]; + b1 = 0; + + for (int i = 1; i < len; ++i) { + b2 = b1; + b1 = b0; + b0 = x * b1 - b2 + array[i]; + } + + return T{0.5} * (b0 - b2); + } + + template + T i1(T _x) { + const T x = fabs(_x); + + if (x <= T{8.0}) { + // Chebyshev coefficients for exp(-x) i1(x) in the internal [0, 8] + // lim(x->0){ exp(-x) i1(x) / x } = 1/2 + static const T coefficients[] = { + 2.77791411276104639959E-18, -2.11142121435816608115E-17, + 1.55363195773620046921E-16, -1.10559694773538630805E-15, + 7.60068429473540693410E-15, -5.04218550472791168711E-14, + 3.22379336594557470981E-13, -1.98397439776494371520E-12, + 1.17361862988909016308E-11, -6.66348972350202774223E-11, + 3.62559028155211703701E-10, -1.88724975172282928790E-9, + 9.38153738649577178388E-9, -4.44505912879632808065E-8, + 2.00329475355213526229E-7, -8.56872026469545474066E-7, + 3.47025130813767847674E-6, -1.32731636560394358279E-5, + 4.78156510755005422638E-5, -1.61760815825896745588E-4, + 5.12285956168575772895E-4, -1.51357245063125314899E-3, + 
4.15642294431288815669E-3, -1.05640848946261981558E-2, + 2.47264490306265168283E-2, -5.29459812080949914269E-2, + 1.02643658689847095384E-1, -1.76416518357834055153E-1, + 2.52587186443633654823E-1}; + const T y = x / T{2.0} - T{2.0}; + const T out = exp(x) * x * chbevl(y, coefficients, int{29}); + return (_x < T{0.0}) ? -out : out; + } + + // Chebyshev coefficients for exp(-x) sqrt(x) i1(x) + // in the inverted interval [8, infinity] + // lim(x->inf){ exp(-x) sqrt(x) i1(x) } = 1/sqrt(2pi) + static const T coefficients[] = { + 7.51729631084210481353E-18, 4.41434832307170791151E-18, + -4.65030536848935832153E-17, -3.20952592199342395980E-17, + 2.96262899764595013876E-16, 3.30820231092092828324E-16, + -1.88035477551078244854E-15, -3.81440307243700780478E-15, + 1.04202769841288027642E-14, 4.27244001671195135429E-14, + -2.10154184277266431302E-14, -4.08355111109219731823E-13, + -7.19855177624590851209E-13, 2.03562854414708950722E-12, + 1.41258074366137813316E-11, 3.25260358301548823856E-11, + -1.89749581235054123450E-11, -5.58974346219658380687E-10, + -3.83538038596423702205E-9, -2.63146884688951950684E-8, + -2.51223623787020892529E-7, -3.88256480887769039346E-6, + -1.10588938762623716291E-4, -9.76109749136146840777E-3, + 7.78576235018280120474E-1}; + const T out = (exp(x) * chbevl(T{32.} / x - T{2.}, coefficients, int{25})) / sqrt(x); + return (_x < T{0.}) ? 
-out : out; + } +); // i1_string + +const auto i1e_string = jiterator_stringify( + template + T chbevl(const T x, const T array[], const int len) { + T b0, b1, b2; + + b0 = array[0]; + b1 = 0; + + for (int i = 1; i < len; ++i) { + b2 = b1; + b1 = b0; + b0 = x * b1 - b2 + array[i]; + } + + return T{0.5} * (b0 - b2); + } + + // See double and float instantiations below + template + T i1e(T _x) { } + + // Double specialization (uses different coefficients than the float version) + template<> + double i1e(double _x) { + const double x = fabs(_x); + if (x <= double{8.}) { + // Chebyshev double coefficients for exp(-x) i1(x) in the interval [0,8]. + // Note: lim(x->0){ exp(-x) i1(x) / x } = 1/2. + static const double coefficients[] = { + 2.77791411276104639959E-18, -2.11142121435816608115E-17, + 1.55363195773620046921E-16, -1.10559694773538630805E-15, + 7.60068429473540693410E-15, -5.04218550472791168711E-14, + 3.22379336594557470981E-13, -1.98397439776494371520E-12, + 1.17361862988909016308E-11, -6.66348972350202774223E-11, + 3.62559028155211703701E-10, -1.88724975172282928790E-9, + 9.38153738649577178388E-9, -4.44505912879632808065E-8, + 2.00329475355213526229E-7, -8.56872026469545474066E-7, + 3.47025130813767847674E-6, -1.32731636560394358279E-5, + 4.78156510755005422638E-5, -1.61760815825896745588E-4, + 5.12285956168575772895E-4, -1.51357245063125314899E-3, + 4.15642294431288815669E-3, -1.05640848946261981558E-2, + 2.47264490306265168283E-2, -5.29459812080949914269E-2, + 1.02643658689847095384E-1, -1.76416518357834055153E-1, + 2.52587186443633654823E-1}; + const double y = x / double{2.} - double{2.}; + const double out = chbevl(y, coefficients, int{29}) * x; + return (_x < 0.) ? -out : out; + } + + // Chebyshev coefficients for exp(-x) sqrt(x) i1(x) + // in the inverted interval (8, infinity]. + // Note: lim(x->inf){ exp(-x) sqrt(x) i1(x) } = 1/sqrt(2pi). + // TODO: what's an "inverted interval"? Open on the left + // and closed on the right? 
+ static const double coefficients[] = { + 7.51729631084210481353E-18, 4.41434832307170791151E-18, + -4.65030536848935832153E-17, -3.20952592199342395980E-17, + 2.96262899764595013876E-16, 3.30820231092092828324E-16, + -1.88035477551078244854E-15, -3.81440307243700780478E-15, + 1.04202769841288027642E-14, 4.27244001671195135429E-14, + -2.10154184277266431302E-14, -4.08355111109219731823E-13, + -7.19855177624590851209E-13, 2.03562854414708950722E-12, + 1.41258074366137813316E-11, 3.25260358301548823856E-11, + -1.89749581235054123450E-11, -5.58974346219658380687E-10, + -3.83538038596423702205E-9, -2.63146884688951950684E-8, + -2.51223623787020892529E-7, -3.88256480887769039346E-6, + -1.10588938762623716291E-4, -9.76109749136146840777E-3, + 7.78576235018280120474E-1}; + + const double out = chbevl(double{32.} / x - double{2.}, coefficients, int{25}) / sqrt(x); + return (_x < double{0.}) ? -out : out; + } + + // Float specialization (uses different coefficients than the double version) + template<> + float i1e(float _x) { + const float x = fabsf(_x); + if (x <= float{8.}) { + // Chebyshev double coefficients for exp(-x) i1(x) in the interval [0,8]. + // Note: lim(x->0){ exp(-x) i1(x) / x } = 1/2. + static const float coefficients[] = { + 9.38153738649577178388E-9f, + -4.44505912879632808065E-8f, + 2.00329475355213526229E-7f, + -8.56872026469545474066E-7f, + 3.47025130813767847674E-6f, + -1.32731636560394358279E-5f, + 4.78156510755005422638E-5f, + -1.61760815825896745588E-4f, + 5.12285956168575772895E-4f, + -1.51357245063125314899E-3f, + 4.15642294431288815669E-3f, + -1.05640848946261981558E-2f, + 2.47264490306265168283E-2f, + -5.29459812080949914269E-2f, + 1.02643658689847095384E-1f, + -1.76416518357834055153E-1f, + 2.52587186443633654823E-1f}; + const float y = x / float{2.} - float{2.}; + const float out = chbevl(y, coefficients, int{17}) * x; + return (_x < 0.) ? 
-out : out; + } + + // Chebyshev coefficients for exp(-x) sqrt(x) i1(x) + // in the inverted interval (8, infinity]. + // Note: lim(x->inf){ exp(-x) sqrt(x) i1(x) } = 1/sqrt(2pi). + // TODO: what's an "inverted interval"? Open on the left + // and closed on the right? + static const float coefficients[] = { + -3.83538038596423702205E-9f, + -2.63146884688951950684E-8f, + -2.51223623787020892529E-7f, + -3.88256480887769039346E-6f, + -1.10588938762623716291E-4f, + -9.76109749136146840777E-3f, + 7.78576235018280120474E-1f}; + + const float out = chbevl(float{32.} / x - float{2.}, coefficients, int{7}) / sqrt(x); + return (_x < float{0.}) ? -out : out; + } +); // i1e_string + +const auto kaiser_window_string = i0_string + jiterator_stringify( + template + T kaiser_window(T a, T inv_alpha, T beta, T inv_i0_beta) { + T x = a * inv_alpha - T{1}; + T y = max(T{0}, T{1} - x * x); + return i0(beta * sqrt(y)) * inv_i0_beta; + } +); // kaiser_window_string + +const auto sinc_string = jiterator_stringify( + template + T sinc(T a) { + if (a == T(0)) { + return T(1); + } + constexpr T pi = T(3.14159265358979323846L); + T product = pi * a; + return std::sin(product) / product; + } +); // sinc_string + +const auto erfcx_string = jiterator_stringify( + /* The next function is taken from http://ab-initio.mit.edu/faddeeva */ + + /* Copyright (c) 2012 Massachusetts Institute of Technology + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE + * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + + /* erfcx(x) = exp(x^2) erfc(x) function, for real x, written by + Steven G. Johnson, October 2012. + + This function combines a few different ideas. + + First, for x > 50, it uses a continued-fraction expansion (same as + for the Faddeeva function, but with algebraic simplifications for z=i*x). + + Second, for 0 <= x <= 50, it uses Chebyshev polynomial approximations, + but with two twists: + + a) It maps x to y = 4 / (4+x) in [0,1]. This simple transformation, + inspired by a similar transformation in the octave-forge/specfun + erfcx by Soren Hauberg, results in much faster Chebyshev convergence + than other simple transformations I have examined. + + b) Instead of using a single Chebyshev polynomial for the entire + [0,1] y interval, we break the interval up into 100 equal + subintervals, with a switch/lookup table, and use much lower + degree Chebyshev polynomials in each subinterval. This greatly + improves performance in my tests. + + For x < 0, we use the relationship erfcx(-x) = 2 exp(x^2) - erfc(x), + with the usual checks for overflow etcetera. + + Performance-wise, it seems to be substantially faster than either + the SLATEC DERFC function [or an erfcx function derived there from] + or Cody's CALERF function (from netlib.org/specfun), while + retaining near machine precision in accuracy. + */ + + /* Given y100 = 100 * y, where y = 4 / (4 + x) for x >= 0, compute erfc(x). 
+ + Uses a look-up table of 100 different Chebyshev polynomials + for y intervals [0,0.01], [0.01,0.02], ...., [0.99,1], generated + with the help of Maple and a little shell script. This allows + the Chebyshev polynomials to be of significantly lower degree (about 1/4) + compared to fitting the whole [0,1] interval with a single polynomial. + */ + + // TODO: review if this is computing in double when given a float input + template + T erfcx_y100(T y100) { + switch (static_cast(y100)) { + case 0: { + T t = 2*y100 - 1; + return 0.70878032454106438663e-3 + (0.71234091047026302958e-3 + (0.35779077297597742384e-5 + (0.17403143962587937815e-7 + (0.81710660047307788845e-10 + (0.36885022360434957634e-12 + 0.15917038551111111111e-14 * t) * t) * t) * t) * t) * t; + } + case 1: { + T t = 2*y100 - 3; + return 0.21479143208285144230e-2 + (0.72686402367379996033e-3 + (0.36843175430938995552e-5 + (0.18071841272149201685e-7 + (0.85496449296040325555e-10 + (0.38852037518534291510e-12 + 0.16868473576888888889e-14 * t) * t) * t) * t) * t) * t; + } + case 2: { + T t = 2*y100 - 5; + return 0.36165255935630175090e-2 + (0.74182092323555510862e-3 + (0.37948319957528242260e-5 + (0.18771627021793087350e-7 + (0.89484715122415089123e-10 + (0.40935858517772440862e-12 + 0.17872061464888888889e-14 * t) * t) * t) * t) * t) * t; + } + case 3: { + T t = 2*y100 - 7; + return 0.51154983860031979264e-2 + (0.75722840734791660540e-3 + (0.39096425726735703941e-5 + (0.19504168704300468210e-7 + (0.93687503063178993915e-10 + (0.43143925959079664747e-12 + 0.18939926435555555556e-14 * t) * t) * t) * t) * t) * t; + } + case 4: { + T t = 2*y100 - 9; + return 0.66457513172673049824e-2 + (0.77310406054447454920e-3 + (0.40289510589399439385e-5 + (0.20271233238288381092e-7 + (0.98117631321709100264e-10 + (0.45484207406017752971e-12 + 0.20076352213333333333e-14 * t) * t) * t) * t) * t) * t; + } + case 5: { + T t = 2*y100 - 11; + return 0.82082389970241207883e-2 + (0.78946629611881710721e-3 + 
(0.41529701552622656574e-5 + (0.21074693344544655714e-7 + (0.10278874108587317989e-9 + (0.47965201390613339638e-12 + 0.21285907413333333333e-14 * t) * t) * t) * t) * t) * t; + } + case 6: { + T t = 2*y100 - 13; + return 0.98039537275352193165e-2 + (0.80633440108342840956e-3 + (0.42819241329736982942e-5 + (0.21916534346907168612e-7 + (0.10771535136565470914e-9 + (0.50595972623692822410e-12 + 0.22573462684444444444e-14 * t) * t) * t) * t) * t) * t; + } + case 7: { + T t = 2*y100 - 15; + return 0.11433927298290302370e-1 + (0.82372858383196561209e-3 + (0.44160495311765438816e-5 + (0.22798861426211986056e-7 + (0.11291291745879239736e-9 + (0.53386189365816880454e-12 + 0.23944209546666666667e-14 * t) * t) * t) * t) * t) * t; + } + case 8: { + T t = 2*y100 - 17; + return 0.13099232878814653979e-1 + (0.84167002467906968214e-3 + (0.45555958988457506002e-5 + (0.23723907357214175198e-7 + (0.11839789326602695603e-9 + (0.56346163067550237877e-12 + 0.25403679644444444444e-14 * t) * t) * t) * t) * t) * t; + } + case 9: { + T t = 2*y100 - 19; + return 0.14800987015587535621e-1 + (0.86018092946345943214e-3 + (0.47008265848816866105e-5 + (0.24694040760197315333e-7 + (0.12418779768752299093e-9 + (0.59486890370320261949e-12 + 0.26957764568888888889e-14 * t) * t) * t) * t) * t) * t; + } + case 10: { + T t = 2*y100 - 21; + return 0.16540351739394069380e-1 + (0.87928458641241463952e-3 + (0.48520195793001753903e-5 + (0.25711774900881709176e-7 + (0.13030128534230822419e-9 + (0.62820097586874779402e-12 + 0.28612737351111111111e-14 * t) * t) * t) * t) * t) * t; + } + case 11: { + T t = 2*y100 - 23; + return 0.18318536789842392647e-1 + (0.89900542647891721692e-3 + (0.50094684089553365810e-5 + (0.26779777074218070482e-7 + (0.13675822186304615566e-9 + (0.66358287745352705725e-12 + 0.30375273884444444444e-14 * t) * t) * t) * t) * t) * t; + } + case 12: { + T t = 2*y100 - 25; + return 0.20136801964214276775e-1 + (0.91936908737673676012e-3 + (0.51734830914104276820e-5 + (0.27900878609710432673e-7 + 
(0.14357976402809042257e-9 + (0.70114790311043728387e-12 + 0.32252476000000000000e-14 * t) * t) * t) * t) * t) * t; + } + case 13: { + T t = 2*y100 - 27; + return 0.21996459598282740954e-1 + (0.94040248155366777784e-3 + (0.53443911508041164739e-5 + (0.29078085538049374673e-7 + (0.15078844500329731137e-9 + (0.74103813647499204269e-12 + 0.34251892320000000000e-14 * t) * t) * t) * t) * t) * t; + } + case 14: { + T t = 2*y100 - 29; + return 0.23898877187226319502e-1 + (0.96213386835900177540e-3 + (0.55225386998049012752e-5 + (0.30314589961047687059e-7 + (0.15840826497296335264e-9 + (0.78340500472414454395e-12 + 0.36381553564444444445e-14 * t) * t) * t) * t) * t) * t; + } + case 15: { + T t = 2*y100 - 31; + return 0.25845480155298518485e-1 + (0.98459293067820123389e-3 + (0.57082915920051843672e-5 + (0.31613782169164830118e-7 + (0.16646478745529630813e-9 + (0.82840985928785407942e-12 + 0.38649975768888888890e-14 * t) * t) * t) * t) * t) * t; + } + case 16: { + T t = 2*y100 - 33; + return 0.27837754783474696598e-1 + (0.10078108563256892757e-2 + (0.59020366493792212221e-5 + (0.32979263553246520417e-7 + (0.17498524159268458073e-9 + (0.87622459124842525110e-12 + 0.41066206488888888890e-14 * t) * t) * t) * t) * t) * t; + } + case 17: { + T t = 2*y100 - 35; + return 0.29877251304899307550e-1 + (0.10318204245057349310e-2 + (0.61041829697162055093e-5 + (0.34414860359542720579e-7 + (0.18399863072934089607e-9 + (0.92703227366365046533e-12 + 0.43639844053333333334e-14 * t) * t) * t) * t) * t) * t; + } + case 18: { + T t = 2*y100 - 37; + return 0.31965587178596443475e-1 + (0.10566560976716574401e-2 + (0.63151633192414586770e-5 + (0.35924638339521924242e-7 + (0.19353584758781174038e-9 + (0.98102783859889264382e-12 + 0.46381060817777777779e-14 * t) * t) * t) * t) * t) * t; + } + case 19: { + T t = 2*y100 - 39; + return 0.34104450552588334840e-1 + (0.10823541191350532574e-2 + (0.65354356159553934436e-5 + (0.37512918348533521149e-7 + (0.20362979635817883229e-9 + 
(0.10384187833037282363e-11 + 0.49300625262222222221e-14 * t) * t) * t) * t) * t) * t; + } + case 20: { + T t = 2*y100 - 41; + return 0.36295603928292425716e-1 + (0.11089526167995268200e-2 + (0.67654845095518363577e-5 + (0.39184292949913591646e-7 + (0.21431552202133775150e-9 + (0.10994259106646731797e-11 + 0.52409949102222222221e-14 * t) * t) * t) * t) * t) * t; + } + case 21: { + T t = 2*y100 - 43; + return 0.38540888038840509795e-1 + (0.11364917134175420009e-2 + (0.70058230641246312003e-5 + (0.40943644083718586939e-7 + (0.22563034723692881631e-9 + (0.11642841011361992885e-11 + 0.55721092871111111110e-14 * t) * t) * t) * t) * t) * t; + } + case 22: { + T t = 2*y100 - 45; + return 0.40842225954785960651e-1 + (0.11650136437945673891e-2 + (0.72569945502343006619e-5 + (0.42796161861855042273e-7 + (0.23761401711005024162e-9 + (0.12332431172381557035e-11 + 0.59246802364444444445e-14 * t) * t) * t) * t) * t) * t; + } + case 23: { + T t = 2*y100 - 47; + return 0.43201627431540222422e-1 + (0.11945628793917272199e-2 + (0.75195743532849206263e-5 + (0.44747364553960993492e-7 + (0.25030885216472953674e-9 + (0.13065684400300476484e-11 + 0.63000532853333333334e-14 * t) * t) * t) * t) * t) * t; + } + case 24: { + T t = 2*y100 - 49; + return 0.45621193513810471438e-1 + (0.12251862608067529503e-2 + (0.77941720055551920319e-5 + (0.46803119830954460212e-7 + (0.26375990983978426273e-9 + (0.13845421370977119765e-11 + 0.66996477404444444445e-14 * t) * t) * t) * t) * t) * t; + } + case 25: { + T t = 2*y100 - 51; + return 0.48103121413299865517e-1 + (0.12569331386432195113e-2 + (0.80814333496367673980e-5 + (0.48969667335682018324e-7 + (0.27801515481905748484e-9 + (0.14674637611609884208e-11 + 0.71249589351111111110e-14 * t) * t) * t) * t) * t) * t; + } + case 26: { + T t = 2*y100 - 53; + return 0.50649709676983338501e-1 + (0.12898555233099055810e-2 + (0.83820428414568799654e-5 + (0.51253642652551838659e-7 + (0.29312563849675507232e-9 + (0.15556512782814827846e-11 + 
0.75775607822222222221e-14 * t) * t) * t) * t) * t) * t; + } + case 27: { + T t = 2*y100 - 55; + return 0.53263363664388864181e-1 + (0.13240082443256975769e-2 + (0.86967260015007658418e-5 + (0.53662102750396795566e-7 + (0.30914568786634796807e-9 + (0.16494420240828493176e-11 + 0.80591079644444444445e-14 * t) * t) * t) * t) * t) * t; + } + case 28: { + T t = 2*y100 - 57; + return 0.55946601353500013794e-1 + (0.13594491197408190706e-2 + (0.90262520233016380987e-5 + (0.56202552975056695376e-7 + (0.32613310410503135996e-9 + (0.17491936862246367398e-11 + 0.85713381688888888890e-14 * t) * t) * t) * t) * t) * t; + } + case 29: { + T t = 2*y100 - 59; + return 0.58702059496154081813e-1 + (0.13962391363223647892e-2 + (0.93714365487312784270e-5 + (0.58882975670265286526e-7 + (0.34414937110591753387e-9 + (0.18552853109751857859e-11 + 0.91160736711111111110e-14 * t) * t) * t) * t) * t) * t; + } + case 30: { + T t = 2*y100 - 61; + return 0.61532500145144778048e-1 + (0.14344426411912015247e-2 + (0.97331446201016809696e-5 + (0.61711860507347175097e-7 + (0.36325987418295300221e-9 + (0.19681183310134518232e-11 + 0.96952238400000000000e-14 * t) * t) * t) * t) * t) * t; + } + case 31: { + T t = 2*y100 - 63; + return 0.64440817576653297993e-1 + (0.14741275456383131151e-2 + (0.10112293819576437838e-4 + (0.64698236605933246196e-7 + (0.38353412915303665586e-9 + (0.20881176114385120186e-11 + 0.10310784480000000000e-13 * t) * t) * t) * t) * t) * t; + } + case 32: { + T t = 2*y100 - 65; + return 0.67430045633130393282e-1 + (0.15153655418916540370e-2 + (0.10509857606888328667e-4 + (0.67851706529363332855e-7 + (0.40504602194811140006e-9 + (0.22157325110542534469e-11 + 0.10964842115555555556e-13 * t) * t) * t) * t) * t) * t; + } + case 33: { + T t = 2*y100 - 67; + return 0.70503365513338850709e-1 + (0.15582323336495709827e-2 + (0.10926868866865231089e-4 + (0.71182482239613507542e-7 + (0.42787405890153386710e-9 + (0.23514379522274416437e-11 + 0.11659571751111111111e-13 * t) * t) * t) * t) * t) * 
t; + } + case 34: { + T t = 2*y100 - 69; + return 0.73664114037944596353e-1 + (0.16028078812438820413e-2 + (0.11364423678778207991e-4 + (0.74701423097423182009e-7 + (0.45210162777476488324e-9 + (0.24957355004088569134e-11 + 0.12397238257777777778e-13 * t) * t) * t) * t) * t) * t; + } + case 35: { + T t = 2*y100 - 71; + return 0.76915792420819562379e-1 + (0.16491766623447889354e-2 + (0.11823685320041302169e-4 + (0.78420075993781544386e-7 + (0.47781726956916478925e-9 + (0.26491544403815724749e-11 + 0.13180196462222222222e-13 * t) * t) * t) * t) * t) * t; + } + case 36: { + T t = 2*y100 - 73; + return 0.80262075578094612819e-1 + (0.16974279491709504117e-2 + (0.12305888517309891674e-4 + (0.82350717698979042290e-7 + (0.50511496109857113929e-9 + (0.28122528497626897696e-11 + 0.14010889635555555556e-13 * t) * t) * t) * t) * t) * t; + } + case 37: { + T t = 2*y100 - 75; + return 0.83706822008980357446e-1 + (0.17476561032212656962e-2 + (0.12812343958540763368e-4 + (0.86506399515036435592e-7 + (0.53409440823869467453e-9 + (0.29856186620887555043e-11 + 0.14891851591111111111e-13 * t) * t) * t) * t) * t) * t; + } + case 38: { + T t = 2*y100 - 77; + return 0.87254084284461718231e-1 + (0.17999608886001962327e-2 + (0.13344443080089492218e-4 + (0.90900994316429008631e-7 + (0.56486134972616465316e-9 + (0.31698707080033956934e-11 + 0.15825697795555555556e-13 * t) * t) * t) * t) * t) * t; + } + case 39: { + T t = 2*y100 - 79; + return 0.90908120182172748487e-1 + (0.18544478050657699758e-2 + (0.13903663143426120077e-4 + (0.95549246062549906177e-7 + (0.59752787125242054315e-9 + (0.33656597366099099413e-11 + 0.16815130613333333333e-13 * t) * t) * t) * t) * t) * t; + } + case 40: { + T t = 2*y100 - 81; + return 0.94673404508075481121e-1 + (0.19112284419887303347e-2 + (0.14491572616545004930e-4 + (0.10046682186333613697e-6 + (0.63221272959791000515e-9 + (0.35736693975589130818e-11 + 0.17862931591111111111e-13 * t) * t) * t) * t) * t) * t; + } + case 41: { + T t = 2*y100 - 83; + return 
0.98554641648004456555e-1 + (0.19704208544725622126e-2 + (0.15109836875625443935e-4 + (0.10567036667675984067e-6 + (0.66904168640019354565e-9 + (0.37946171850824333014e-11 + 0.18971959040000000000e-13 * t) * t) * t) * t) * t) * t; + } + case 42: { + T t = 2*y100 - 85; + return 0.10255677889470089531e0 + (0.20321499629472857418e-2 + (0.15760224242962179564e-4 + (0.11117756071353507391e-6 + (0.70814785110097658502e-9 + (0.40292553276632563925e-11 + 0.20145143075555555556e-13 * t) * t) * t) * t) * t) * t; + } + case 43: { + T t = 2*y100 - 87; + return 0.10668502059865093318e0 + (0.20965479776148731610e-2 + (0.16444612377624983565e-4 + (0.11700717962026152749e-6 + (0.74967203250938418991e-9 + (0.42783716186085922176e-11 + 0.21385479360000000000e-13 * t) * t) * t) * t) * t) * t; + } + case 44: { + T t = 2*y100 - 89; + return 0.11094484319386444474e0 + (0.21637548491908170841e-2 + (0.17164995035719657111e-4 + (0.12317915750735938089e-6 + (0.79376309831499633734e-9 + (0.45427901763106353914e-11 + 0.22696025653333333333e-13 * t) * t) * t) * t) * t) * t; + } + case 45: { + T t = 2*y100 - 91; + return 0.11534201115268804714e0 + (0.22339187474546420375e-2 + (0.17923489217504226813e-4 + (0.12971465288245997681e-6 + (0.84057834180389073587e-9 + (0.48233721206418027227e-11 + 0.24079890062222222222e-13 * t) * t) * t) * t) * t) * t; + } + case 46: { + T t = 2*y100 - 93; + return 0.11988259392684094740e0 + (0.23071965691918689601e-2 + (0.18722342718958935446e-4 + (0.13663611754337957520e-6 + (0.89028385488493287005e-9 + (0.51210161569225846701e-11 + 0.25540227111111111111e-13 * t) * t) * t) * t) * t) * t; + } + case 47: { + T t = 2*y100 - 95; + return 0.12457298393509812907e0 + (0.23837544771809575380e-2 + (0.19563942105711612475e-4 + (0.14396736847739470782e-6 + (0.94305490646459247016e-9 + (0.54366590583134218096e-11 + 0.27080225920000000000e-13 * t) * t) * t) * t) * t) * t; + } + case 48: { + T t = 2*y100 - 97; + return 0.12941991566142438816e0 + (0.24637684719508859484e-2 + 
(0.20450821127475879816e-4 + (0.15173366280523906622e-6 + (0.99907632506389027739e-9 + (0.57712760311351625221e-11 + 0.28703099555555555556e-13 * t) * t) * t) * t) * t) * t; + } + case 49: { + T t = 2*y100 - 99; + return 0.13443048593088696613e0 + (0.25474249981080823877e-2 + (0.21385669591362915223e-4 + (0.15996177579900443030e-6 + (0.10585428844575134013e-8 + (0.61258809536787882989e-11 + 0.30412080142222222222e-13 * t) * t) * t) * t) * t) * t; + } + case 50: { + T t = 2*y100 - 101; + return 0.13961217543434561353e0 + (0.26349215871051761416e-2 + (0.22371342712572567744e-4 + (0.16868008199296822247e-6 + (0.11216596910444996246e-8 + (0.65015264753090890662e-11 + 0.32210394506666666666e-13 * t) * t) * t) * t) * t) * t; + } + case 51: { + T t = 2*y100 - 103; + return 0.14497287157673800690e0 + (0.27264675383982439814e-2 + (0.23410870961050950197e-4 + (0.17791863939526376477e-6 + (0.11886425714330958106e-8 + (0.68993039665054288034e-11 + 0.34101266222222222221e-13 * t) * t) * t) * t) * t) * t; + } + case 52: { + T t = 2*y100 - 105; + return 0.15052089272774618151e0 + (0.28222846410136238008e-2 + (0.24507470422713397006e-4 + (0.18770927679626136909e-6 + (0.12597184587583370712e-8 + (0.73203433049229821618e-11 + 0.36087889048888888890e-13 * t) * t) * t) * t) * t) * t; + } + case 53: { + T t = 2*y100 - 107; + return 0.15626501395774612325e0 + (0.29226079376196624949e-2 + (0.25664553693768450545e-4 + (0.19808568415654461964e-6 + (0.13351257759815557897e-8 + (0.77658124891046760667e-11 + 0.38173420035555555555e-13 * t) * t) * t) * t) * t) * t; + } + case 54: { + T t = 2*y100 - 109; + return 0.16221449434620737567e0 + (0.30276865332726475672e-2 + (0.26885741326534564336e-4 + (0.20908350604346384143e-6 + (0.14151148144240728728e-8 + (0.82369170665974313027e-11 + 0.40360957457777777779e-13 * t) * t) * t) * t) * t) * t; + } + case 55: { + T t = 2*y100 - 111; + return 0.16837910595412130659e0 + (0.31377844510793082301e-2 + (0.28174873844911175026e-4 + 
(0.22074043807045782387e-6 + (0.14999481055996090039e-8 + (0.87348993661930809254e-11 + 0.42653528977777777779e-13 * t) * t) * t) * t) * t) * t; + } + case 56: { + T t = 2*y100 - 113; + return 0.17476916455659369953e0 + (0.32531815370903068316e-2 + (0.29536024347344364074e-4 + (0.23309632627767074202e-6 + (0.15899007843582444846e-8 + (0.92610375235427359475e-11 + 0.45054073102222222221e-13 * t) * t) * t) * t) * t) * t; + } + case 57: { + T t = 2*y100 - 115; + return 0.18139556223643701364e0 + (0.33741744168096996041e-2 + (0.30973511714709500836e-4 + (0.24619326937592290996e-6 + (0.16852609412267750744e-8 + (0.98166442942854895573e-11 + 0.47565418097777777779e-13 * t) * t) * t) * t) * t) * t; + } + case 58: { + T t = 2*y100 - 117; + return 0.18826980194443664549e0 + (0.35010775057740317997e-2 + (0.32491914440014267480e-4 + (0.26007572375886319028e-6 + (0.17863299617388376116e-8 + (0.10403065638343878679e-10 + 0.50190265831111111110e-13 * t) * t) * t) * t) * t) * t; + } + case 59: { + T t = 2*y100 - 119; + return 0.19540403413693967350e0 + (0.36342240767211326315e-2 + (0.34096085096200907289e-4 + (0.27479061117017637474e-6 + (0.18934228504790032826e-8 + (0.11021679075323598664e-10 + 0.52931171733333333334e-13 * t) * t) * t) * t) * t) * t; + } + case 60: { + T t = 2*y100 - 121; + return 0.20281109560651886959e0 + (0.37739673859323597060e-2 + (0.35791165457592409054e-4 + (0.29038742889416172404e-6 + (0.20068685374849001770e-8 + (0.11673891799578381999e-10 + 0.55790523093333333334e-13 * t) * t) * t) * t) * t) * t; + } + case 61: { + T t = 2*y100 - 123; + return 0.21050455062669334978e0 + (0.39206818613925652425e-2 + (0.37582602289680101704e-4 + (0.30691836231886877385e-6 + (0.21270101645763677824e-8 + (0.12361138551062899455e-10 + 0.58770520160000000000e-13 * t) * t) * t) * t) * t) * t; + } + case 62: { + T t = 2*y100 - 125; + return 0.21849873453703332479e0 + (0.40747643554689586041e-2 + (0.39476163820986711501e-4 + (0.32443839970139918836e-6 + 
(0.22542053491518680200e-8 + (0.13084879235290858490e-10 + 0.61873153262222222221e-13 * t) * t) * t) * t) * t) * t; + } + case 63: { + T t = 2*y100 - 127; + return 0.22680879990043229327e0 + (0.42366354648628516935e-2 + (0.41477956909656896779e-4 + (0.34300544894502810002e-6 + (0.23888264229264067658e-8 + (0.13846596292818514601e-10 + 0.65100183751111111110e-13 * t) * t) * t) * t) * t) * t; + } + case 64: { + T t = 2*y100 - 129; + return 0.23545076536988703937e0 + (0.44067409206365170888e-2 + (0.43594444916224700881e-4 + (0.36268045617760415178e-6 + (0.25312606430853202748e-8 + (0.14647791812837903061e-10 + 0.68453122631111111110e-13 * t) * t) * t) * t) * t) * t; + } + case 65: { + T t = 2*y100 - 131; + return 0.24444156740777432838e0 + (0.45855530511605787178e-2 + (0.45832466292683085475e-4 + (0.38352752590033030472e-6 + (0.26819103733055603460e-8 + (0.15489984390884756993e-10 + 0.71933206364444444445e-13 * t) * t) * t) * t) * t) * t; + } + case 66: { + T t = 2*y100 - 133; + return 0.25379911500634264643e0 + (0.47735723208650032167e-2 + (0.48199253896534185372e-4 + (0.40561404245564732314e-6 + (0.28411932320871165585e-8 + (0.16374705736458320149e-10 + 0.75541379822222222221e-13 * t) * t) * t) * t) * t) * t; + } + case 67: { + T t = 2*y100 - 135; + return 0.26354234756393613032e0 + (0.49713289477083781266e-2 + (0.50702455036930367504e-4 + (0.42901079254268185722e-6 + (0.30095422058900481753e-8 + (0.17303497025347342498e-10 + 0.79278273368888888890e-13 * t) * t) * t) * t) * t) * t; + } + case 68: { + T t = 2*y100 - 137; + return 0.27369129607732343398e0 + (0.51793846023052643767e-2 + (0.53350152258326602629e-4 + (0.45379208848865015485e-6 + (0.31874057245814381257e-8 + (0.18277905010245111046e-10 + 0.83144182364444444445e-13 * t) * t) * t) * t) * t) * t; + } + case 69: { + T t = 2*y100 - 139; + return 0.28426714781640316172e0 + (0.53983341916695141966e-2 + (0.56150884865255810638e-4 + (0.48003589196494734238e-6 + (0.33752476967570796349e-8 + 
(0.19299477888083469086e-10 + 0.87139049137777777779e-13 * t) * t) * t) * t) * t) * t; + } + case 70: { + T t = 2*y100 - 141; + return 0.29529231465348519920e0 + (0.56288077305420795663e-2 + (0.59113671189913307427e-4 + (0.50782393781744840482e-6 + (0.35735475025851713168e-8 + (0.20369760937017070382e-10 + 0.91262442613333333334e-13 * t) * t) * t) * t) * t) * t; + } + case 71: { + T t = 2*y100 - 143; + return 0.30679050522528838613e0 + (0.58714723032745403331e-2 + (0.62248031602197686791e-4 + (0.53724185766200945789e-6 + (0.37827999418960232678e-8 + (0.21490291930444538307e-10 + 0.95513539182222222221e-13 * t) * t) * t) * t) * t) * t; + } + case 72: { + T t = 2*y100 - 145; + return 0.31878680111173319425e0 + (0.61270341192339103514e-2 + (0.65564012259707640976e-4 + (0.56837930287837738996e-6 + (0.40035151353392378882e-8 + (0.22662596341239294792e-10 + 0.99891109760000000000e-13 * t) * t) * t) * t) * t) * t; + } + case 73: { + T t = 2*y100 - 147; + return 0.33130773722152622027e0 + (0.63962406646798080903e-2 + (0.69072209592942396666e-4 + (0.60133006661885941812e-6 + (0.42362183765883466691e-8 + (0.23888182347073698382e-10 + 0.10439349811555555556e-12 * t) * t) * t) * t) * t) * t; + } + case 74: { + T t = 2*y100 - 149; + return 0.34438138658041336523e0 + (0.66798829540414007258e-2 + (0.72783795518603561144e-4 + (0.63619220443228800680e-6 + (0.44814499336514453364e-8 + (0.25168535651285475274e-10 + 0.10901861383111111111e-12 * t) * t) * t) * t) * t) * t; + } + case 75: { + T t = 2*y100 - 151; + return 0.35803744972380175583e0 + (0.69787978834882685031e-2 + (0.76710543371454822497e-4 + (0.67306815308917386747e-6 + (0.47397647975845228205e-8 + (0.26505114141143050509e-10 + 0.11376390933333333333e-12 * t) * t) * t) * t) * t) * t; + } + case 76: { + T t = 2*y100 - 153; + return 0.37230734890119724188e0 + (0.72938706896461381003e-2 + (0.80864854542670714092e-4 + (0.71206484718062688779e-6 + (0.50117323769745883805e-8 + (0.27899342394100074165e-10 + 
0.11862637614222222222e-12 * t) * t) * t) * t) * t) * t; + } + case 77: { + T t = 2*y100 - 155; + return 0.38722432730555448223e0 + (0.76260375162549802745e-2 + (0.85259785810004603848e-4 + (0.75329383305171327677e-6 + (0.52979361368388119355e-8 + (0.29352606054164086709e-10 + 0.12360253370666666667e-12 * t) * t) * t) * t) * t) * t; + } + case 78: { + T t = 2*y100 - 157; + return 0.40282355354616940667e0 + (0.79762880915029728079e-2 + (0.89909077342438246452e-4 + (0.79687137961956194579e-6 + (0.55989731807360403195e-8 + (0.30866246101464869050e-10 + 0.12868841946666666667e-12 * t) * t) * t) * t) * t) * t; + } + case 79: { + T t = 2*y100 - 159; + return 0.41914223158913787649e0 + (0.83456685186950463538e-2 + (0.94827181359250161335e-4 + (0.84291858561783141014e-6 + (0.59154537751083485684e-8 + (0.32441553034347469291e-10 + 0.13387957943111111111e-12 * t) * t) * t) * t) * t) * t; + } + case 80: { + T t = 2*y100 - 161; + return 0.43621971639463786896e0 + (0.87352841828289495773e-2 + (0.10002929142066799966e-3 + (0.89156148280219880024e-6 + (0.62480008150788597147e-8 + (0.34079760983458878910e-10 + 0.13917107176888888889e-12 * t) * t) * t) * t) * t) * t; + } + case 81: { + T t = 2*y100 - 163; + return 0.45409763548534330981e0 + (0.91463027755548240654e-2 + (0.10553137232446167258e-3 + (0.94293113464638623798e-6 + (0.65972492312219959885e-8 + (0.35782041795476563662e-10 + 0.14455745872000000000e-12 * t) * t) * t) * t) * t) * t; + } + case 82: { + T t = 2*y100 - 165; + return 0.47282001668512331468e0 + (0.95799574408860463394e-2 + (0.11135019058000067469e-3 + (0.99716373005509038080e-6 + (0.69638453369956970347e-8 + (0.37549499088161345850e-10 + 0.15003280712888888889e-12 * t) * t) * t) * t) * t) * t; + } + case 83: { + T t = 2*y100 - 167; + return 0.49243342227179841649e0 + (0.10037550043909497071e-1 + (0.11750334542845234952e-3 + (0.10544006716188967172e-5 + (0.73484461168242224872e-8 + (0.39383162326435752965e-10 + 0.15559069118222222222e-12 * t) * t) * t) * t) * t) * 
t; + } + case 84: { + T t = 2*y100 - 169; + return 0.51298708979209258326e0 + (0.10520454564612427224e-1 + (0.12400930037494996655e-3 + (0.11147886579371265246e-5 + (0.77517184550568711454e-8 + (0.41283980931872622611e-10 + 0.16122419680000000000e-12 * t) * t) * t) * t) * t) * t; + } + case 85: { + T t = 2*y100 - 171; + return 0.53453307979101369843e0 + (0.11030120618800726938e-1 + (0.13088741519572269581e-3 + (0.11784797595374515432e-5 + (0.81743383063044825400e-8 + (0.43252818449517081051e-10 + 0.16692592640000000000e-12 * t) * t) * t) * t) * t) * t; + } + case 86: { + T t = 2*y100 - 173; + return 0.55712643071169299478e0 + (0.11568077107929735233e-1 + (0.13815797838036651289e-3 + (0.12456314879260904558e-5 + (0.86169898078969313597e-8 + (0.45290446811539652525e-10 + 0.17268801084444444444e-12 * t) * t) * t) * t) * t) * t; + } + case 87: { + T t = 2*y100 - 175; + return 0.58082532122519320968e0 + (0.12135935999503877077e-1 + (0.14584223996665838559e-3 + (0.13164068573095710742e-5 + (0.90803643355106020163e-8 + (0.47397540713124619155e-10 + 0.17850211608888888889e-12 * t) * t) * t) * t) * t) * t; + } + case 88: { + T t = 2*y100 - 177; + return 0.60569124025293375554e0 + (0.12735396239525550361e-1 + (0.15396244472258863344e-3 + (0.13909744385382818253e-5 + (0.95651595032306228245e-8 + (0.49574672127669041550e-10 + 0.18435945564444444444e-12 * t) * t) * t) * t) * t) * t; + } + case 89: { + T t = 2*y100 - 179; + return 0.63178916494715716894e0 + (0.13368247798287030927e-1 + (0.16254186562762076141e-3 + (0.14695084048334056083e-5 + (0.10072078109604152350e-7 + (0.51822304995680707483e-10 + 0.19025081422222222222e-12 * t) * t) * t) * t) * t) * t; + } + case 90: { + T t = 2*y100 - 181; + return 0.65918774689725319200e0 + (0.14036375850601992063e-1 + (0.17160483760259706354e-3 + (0.15521885688723188371e-5 + (0.10601827031535280590e-7 + (0.54140790105837520499e-10 + 0.19616655146666666667e-12 * t) * t) * t) * t) * t) * t; + } + case 91: { + T t = 2*y100 - 183; + return 
0.68795950683174433822e0 + (0.14741765091365869084e-1 + (0.18117679143520433835e-3 + (0.16392004108230585213e-5 + (0.11155116068018043001e-7 + (0.56530360194925690374e-10 + 0.20209663662222222222e-12 * t) * t) * t) * t) * t) * t; + } + case 92: { + T t = 2*y100 - 185; + return 0.71818103808729967036e0 + (0.15486504187117112279e-1 + (0.19128428784550923217e-3 + (0.17307350969359975848e-5 + (0.11732656736113607751e-7 + (0.58991125287563833603e-10 + 0.20803065333333333333e-12 * t) * t) * t) * t) * t) * t; + } + case 93: { + T t = 2*y100 - 187; + return 0.74993321911726254661e0 + (0.16272790364044783382e-1 + (0.20195505163377912645e-3 + (0.18269894883203346953e-5 + (0.12335161021630225535e-7 + (0.61523068312169087227e-10 + 0.21395783431111111111e-12 * t) * t) * t) * t) * t) * t; + } + case 94: { + T t = 2*y100 - 189; + return 0.78330143531283492729e0 + (0.17102934132652429240e-1 + (0.21321800585063327041e-3 + (0.19281661395543913713e-5 + (0.12963340087354341574e-7 + (0.64126040998066348872e-10 + 0.21986708942222222222e-12 * t) * t) * t) * t) * t) * t; + } + case 95: { + T t = 2*y100 - 191; + return 0.81837581041023811832e0 + (0.17979364149044223802e-1 + (0.22510330592753129006e-3 + (0.20344732868018175389e-5 + (0.13617902941839949718e-7 + (0.66799760083972474642e-10 + 0.22574701262222222222e-12 * t) * t) * t) * t) * t) * t; + } + case 96: { + T t = 2*y100 - 193; + return 0.85525144775685126237e0 + (0.18904632212547561026e-1 + (0.23764237370371255638e-3 + (0.21461248251306387979e-5 + (0.14299555071870523786e-7 + (0.69543803864694171934e-10 + 0.23158593688888888889e-12 * t) * t) * t) * t) * t) * t; + } + case 97: { + T t = 2*y100 - 195; + return 0.89402868170849933734e0 + (0.19881418399127202569e-1 + (0.25086793128395995798e-3 + (0.22633402747585233180e-5 + (0.15008997042116532283e-7 + (0.72357609075043941261e-10 + 0.23737194737777777778e-12 * t) * t) * t) * t) * t) * t; + } + case 98: { + T t = 2*y100 - 197; + return 0.93481333942870796363e0 + (0.20912536329780368893e-1 
+ (0.26481403465998477969e-3 + (0.23863447359754921676e-5 + (0.15746923065472184451e-7 + (0.75240468141720143653e-10 + 0.24309291271111111111e-12 * t) * t) * t) * t) * t) * t; + } + case 99: { + T t = 2*y100 - 199; + return 0.97771701335885035464e0 + (0.22000938572830479551e-1 + (0.27951610702682383001e-3 + (0.25153688325245314530e-5 + (0.16514019547822821453e-7 + (0.78191526829368231251e-10 + 0.24873652355555555556e-12 * t) * t) * t) * t) * t) * t; + } + } + + // we only get here if y = 1, i.e. |x| < 4*eps, in which case + // erfcx is within 1e-15 of 1.. + return 1.; + } + + template + T erfcx(T x) { + // Short-circuits on NaN (returning NaN) + if (x != x) { + return x; + } + + if (x >= 0) { + if (x > T{50}) { // continued-fraction expansion is faster + const T ispi = 0.56418958354775628694807945156; // 1 / sqrt(pi) + + if (x > T{5e7}) { // 1-term expansion, important to avoid overflow + return ispi / x; + } + + /* 5-term expansion (rely on compiler for CSE), simplified from: + ispi / (x+0.5/(x+1/(x+1.5/(x+2/x)))) */ + return ispi * ((x*x) * (x*x+T{4.5}) + T{2}) / (x * ((x*x) * (x*x+T{5}) + T{3.75})); + } + + // x >= 0 x <= 50 + return erfcx_y100(T{400} / (T{4} + x)); + } + + // x < 0 + if (x < T{-26.7}) { + return POS_INFINITY; + } else if (x < T{-6.1}) { + return T{2} * exp(x * x); + } + + // x < 0 and x >= -6.1 + return T{2} * exp(x * x) - erfcx_y100(T{400} / (T{4} - x)); + } +); // erfcx_string + +const auto airy_ai_string = jiterator_stringify( + template + T airy_ai_forward(T x) { + static const T AN[] = { + +3.46538101525629032477e-01, + +1.20075952739645805542e+01, + +7.62796053615234516538e+01, + +1.68089224934630576269e+02, + +1.59756391350164413639e+02, + +7.05360906840444183113e+01, + +1.40264691163389668864e+01, + +9.99999999999999995305e-01, + }; + + static const T AD[] = { + +5.67594532638770212846e-01, + +1.47562562584847203173e+01, + +8.45138970141474626562e+01, + +1.77318088145400459522e+02, + +1.64234692871529701831e+02, + 
+7.14778400825575695274e+01, + +1.40959135607834029598e+01, + +1.00000000000000000470e+00, + }; + + static const T AFN[] = { + -1.31696323418331795333e-01, + -6.26456544431912369773e-01, + -6.93158036036933542233e-01, + -2.79779981545119124951e-01, + -4.91900132609500318020e-02, + -4.06265923594885404393e-03, + -1.59276496239262096340e-04, + -2.77649108155232920844e-06, + -1.67787698489114633780e-08, + }; + + static const T AFD[] = { + +1.33560420706553243746e+01, + +3.26825032795224613948e+01, + +2.67367040941499554804e+01, + +9.18707402907259625840e+00, + +1.47529146771666414581e+00, + +1.15687173795188044134e-01, + +4.40291641615211203805e-03, + +7.54720348287414296618e-05, + +4.51850092970580378464e-07, + }; + + static const T AGN[] = { + +1.97339932091685679179e-02, + +3.91103029615688277255e-01, + +1.06579897599595591108e+00, + +9.39169229816650230044e-01, + +3.51465656105547619242e-01, + +6.33888919628925490927e-02, + +5.85804113048388458567e-03, + +2.82851600836737019778e-04, + +6.98793669997260967291e-06, + +8.11789239554389293311e-08, + +3.41551784765923618484e-10, + }; + + static const T AGD[] = { + +9.30892908077441974853e+00, + +1.98352928718312140417e+01, + +1.55646628932864612953e+01, + +5.47686069422975497931e+00, + +9.54293611618961883998e-01, + +8.64580826352392193095e-02, + +4.12656523824222607191e-03, + +1.01259085116509135510e-04, + +1.17166733214413521882e-06, + +4.91834570062930015649e-09, + }; + + int domain_flag = 0; + + T ai; + + if (isinf(x)) { + return NAN; + } + + if (x > T(103.892)) { + return T(0.0); + } + + T f; + T g; + T k; + + if (x < T(-2.09)) { + T z = T(1.0) / (T(-2.0) * x * sqrt(-x) / T(3.0)); + + T afn = 0.0; + + for (uint8_t index = 0; index <= 8; index++) { + afn = afn * (z * z) + AFN[index]; + } + + T afd = 0.0; + + for (uint8_t index = 0; index <= 8; index++) { + afd = afd * (z * z) + AFD[index]; + } + + T agn = 0.0; + + for (uint8_t index = 0; index <= 10 + 0; index++) { + agn = agn * (z * z) + AGN[index]; + } + + T agd 
= 0.0; + + for (uint8_t index = 0; index <= 10 - 1; index++) { + agd = agd * (z * z) + AGD[index]; + } + + T t = T(-2.0) * x * sqrt(-x) / T(3.0) + T(0.25) * T(3.14159265358979323846); + + return T(5.64189583547756286948e-01) / sqrt(sqrt(-x)) * (sin(t) * (T(1.0) + z * z * afn / afd) - cos(t) * (z * agn / agd)); + } + + if (x >= T(2.09)) { + domain_flag = 5; + + T zeta = T(2.0) * x * sqrt(x) / T(3.0); + + T an = 0.0; + + for (uint8_t index = 0; index <= 7; index++) { + an = an * (T(1.0) / zeta) + AN[index]; + } + + T ad = 0.0; + + for (uint8_t index = 0; index <= 7; index++) { + ad = ad * (T(1.0) / zeta) + AD[index]; + } + + ai = T(5.64189583547756286948e-01) * (an / ad) / (T(2.0) * sqrt(sqrt(x)) * exp(zeta)); + + if (x > T(8.3203353)) { + return ai; + } + } + + f = 1.0; + g = x; + k = 1.0; + + T m = 1.0; + T n = x; + T t = 1.0; + T z = x * x * x; + + while (t > T(1.11022302462515654042e-16)) { + m *= z; + k += T(1.0); + m /= k; + n *= z; + k += T(1.0); + n /= k; + m /= k; + f += m; + k += T(1.0); + n /= k; + g += n; + + t = abs(m / f); + } + + if ((domain_flag & 1) == 0) { + return T(0.355028053887817239260) * f - T(0.258819403792806798405) * g; + } + + return ai; + } // T airy_ai(T x) +); // airy_ai_string + +const auto bessel_j0_string = jiterator_stringify( + template + T bessel_j0_forward(T x) { + static const T PP[] = { + +7.96936729297347051624e-04, + +8.28352392107440799803e-02, + +1.23953371646414299388e+00, + +5.44725003058768775090e+00, + +8.74716500199817011941e+00, + +5.30324038235394892183e+00, + +9.99999999999999997821e-01, + }; + + static const T PQ[] = { + +9.24408810558863637013e-04, + +8.56288474354474431428e-02, + +1.25352743901058953537e+00, + +5.47097740330417105182e+00, + +8.76190883237069594232e+00, + +5.30605288235394617618e+00, + +1.00000000000000000218e+00, + }; + + static const T QP[] = { + -1.13663838898469149931e-02, + -1.28252718670509318512e+00, + -1.95539544257735972385e+01, + -9.32060152123768231369e+01, + 
-1.77681167980488050595e+02, + -1.47077505154951170175e+02, + -5.14105326766599330220e+01, + -6.05014350600728481186e+00, + }; + + static const T QQ[] = { + +6.43178256118178023184e+01, + +8.56430025976980587198e+02, + +3.88240183605401609683e+03, + +7.24046774195652478189e+03, + +5.93072701187316984827e+03, + +2.06209331660327847417e+03, + +2.42005740240291393179e+02, + }; + + static const T RP[] = { + -4.79443220978201773821e+09, + +1.95617491946556577543e+12, + -2.49248344360967716204e+14, + +9.70862251047306323952e+15, + }; + + static const T RQ[] = { + +4.99563147152651017219e+02, + +1.73785401676374683123e+05, + +4.84409658339962045305e+07, + +1.11855537045356834862e+10, + +2.11277520115489217587e+12, + +3.10518229857422583814e+14, + +3.18121955943204943306e+16, + +1.71086294081043136091e+18, + }; + + if (x < T(0)) { + x = -x; + } + + if (x <= T(5.0)) { + if (x < T(0.00001)) { + return T(1.0) - x * x / T(4.0); + } + + T rp = 0.0; + + for (uint8_t index = 0; index <= 3; index++) { + rp = rp * (x * x) + RP[index]; + } + + T rq = 0.0; + + for (uint8_t index = 0; index <= 7; index++) { + rq = rq * (x * x) + RQ[index]; + } + + return (x * x - T(5.78318596294678452118e+00)) * (x * x - T(3.04712623436620863991e+01)) * rp / rq; + } + + T pp = 0.0; + + for (uint8_t index = 0; index <= 6; index++) { + pp = pp * (T(25.0) / (x * x)) + PP[index]; + } + + T pq = 0.0; + + for (uint8_t index = 0; index <= 6; index++) { + pq = pq * (T(25.0) / (x * x)) + PQ[index]; + } + + T qp = 0.0; + + for (uint8_t index = 0; index <= 7; index++) { + qp = qp * (T(25.0) / (x * x)) + QP[index]; + } + + T qq = 0.0; + + for (uint8_t index = 0; index <= 6; index++) { + qq = qq * (T(25.0) / (x * x)) + QQ[index]; + } + + return (pp / pq * cos(x - T(0.785398163397448309615660845819875721)) - T(5.0) / x * (qp / qq) * sin(x - T(0.785398163397448309615660845819875721))) * T(0.797884560802865355879892119868763737) / sqrt(x); + } // bessel_j0_forward(T x) +); // bessel_j0_string + +const auto 
bessel_y0_string = bessel_j0_string + jiterator_stringify( + template + T bessel_y0_forward(T x) { + static const T PP[] = { + +7.96936729297347051624e-04, + +8.28352392107440799803e-02, + +1.23953371646414299388e+00, + +5.44725003058768775090e+00, + +8.74716500199817011941e+00, + +5.30324038235394892183e+00, + +9.99999999999999997821e-01, + }; + + static const T PQ[] = { + +9.24408810558863637013e-04, + +8.56288474354474431428e-02, + +1.25352743901058953537e+00, + +5.47097740330417105182e+00, + +8.76190883237069594232e+00, + +5.30605288235394617618e+00, + +1.00000000000000000218e+00, + }; + + static const T QP[] = { + -1.13663838898469149931e-02, + -1.28252718670509318512e+00, + -1.95539544257735972385e+01, + -9.32060152123768231369e+01, + -1.77681167980488050595e+02, + -1.47077505154951170175e+02, + -5.14105326766599330220e+01, + -6.05014350600728481186e+00, + }; + + static const T QQ[] = { + +6.43178256118178023184e+01, + +8.56430025976980587198e+02, + +3.88240183605401609683e+03, + +7.24046774195652478189e+03, + +5.93072701187316984827e+03, + +2.06209331660327847417e+03, + +2.42005740240291393179e+02, + }; + + static const T YP[] = { + +1.55924367855235737965e+04, + -1.46639295903971606143e+07, + +5.43526477051876500413e+09, + -9.82136065717911466409e+11, + +8.75906394395366999549e+13, + -3.46628303384729719441e+15, + +4.42733268572569800351e+16, + -1.84950800436986690637e+16, + }; + + static const T YQ[] = { + +1.04128353664259848412e+03, + +6.26107330137134956842e+05, + +2.68919633393814121987e+08, + +8.64002487103935000337e+10, + +2.02979612750105546709e+13, + +3.17157752842975028269e+15, + +2.50596256172653059228e+17, + }; + + if (x <= T(5.0)) { + if (x == T(0.0)) { + return NEG_INFINITY; + } + + if (x < T(0.0)) { + NAN; + } + + T yp = 0.0; + + for (uint8_t index = 0; index <= 7; index++) { + yp = yp * (x * x) + YP[index]; + } + + T yq = 0.0; + + for (uint8_t index = 0; index <= 6; index++) { + yq = yq * (x * x) + YQ[index]; + } + + return yp / yq + 
(T(0.636619772367581343075535053490057448) * log(x) * bessel_j0_forward(x)); + } + + T pp = 0.0; + + for (uint8_t index = 0; index <= 6; index++) { + pp = pp * (T(25.0) / (x * x)) + PP[index]; + } + + T pq = 0.0; + + for (uint8_t index = 0; index <= 6; index++) { + pq = pq * (T(25.0) / (x * x)) + PQ[index]; + } + + T qp = 0.0; + + for (uint8_t index = 0; index <= 7; index++) { + qp = qp * (T(25.0) / (x * x)) + QP[index]; + } + + T qq = 0.0; + + for (uint8_t index = 0; index <= 6; index++) { + qq = qq * (T(25.0) / (x * x)) + QQ[index]; + } + + return (pp / pq * sin(x - T(0.785398163397448309615660845819875721)) + T(5.0) / x * (qp / qq) * cos(x - T(0.785398163397448309615660845819875721))) * T(0.797884560802865355879892119868763737) / sqrt(x); + } // bessel_y0_forward(T x) +); // bessel_y0_string + +const auto bessel_j1_string = jiterator_stringify( + template + T bessel_j1_forward(T x) { + static const T PP[] = { + +7.62125616208173112003e-04, + +7.31397056940917570436e-02, + +1.12719608129684925192e+00, + +5.11207951146807644818e+00, + +8.42404590141772420927e+00, + +5.21451598682361504063e+00, + +1.00000000000000000254e+00, + }; + + static const T PQ[] = { + +5.71323128072548699714e-04, + +6.88455908754495404082e-02, + +1.10514232634061696926e+00, + +5.07386386128601488557e+00, + +8.39985554327604159757e+00, + +5.20982848682361821619e+00, + +9.99999999999999997461e-01, + }; + + static const T QP[] = { + +5.10862594750176621635e-02, + +4.98213872951233449420e+00, + +7.58238284132545283818e+01, + +3.66779609360150777800e+02, + +7.10856304998926107277e+02, + +5.97489612400613639965e+02, + +2.11688757100572135698e+02, + +2.52070205858023719784e+01, + }; + + static const T QQ[] = { + +7.42373277035675149943e+01, + +1.05644886038262816351e+03, + +4.98641058337653607651e+03, + +9.56231892404756170795e+03, + +7.99704160447350683650e+03, + +2.82619278517639096600e+03, + +3.36093607810698293419e+02, + }; + + static const T RP[] = { + -8.99971225705559398224e+08, + 
+4.52228297998194034323e+11, + -7.27494245221818276015e+13, + +3.68295732863852883286e+15, + }; + + static const T RQ[] = { + +6.20836478118054335476e+02, + +2.56987256757748830383e+05, + +8.35146791431949253037e+07, + +2.21511595479792499675e+10, + +4.74914122079991414898e+12, + +7.84369607876235854894e+14, + +8.95222336184627338078e+16, + +5.32278620332680085395e+18, + }; + + if (x < T(0.0)) { + return -bessel_j1_forward(-x); + } + + if (x <= T(5.0)) { + T rp = 0.0; + + for (uint8_t index = 0; index <= 3; index++) { + rp = rp * (x * x) + RP[index]; + } + + T rq = 0.0; + + for (uint8_t index = 0; index <= 7; index++) { + rq = rq * (x * x) + RQ[index]; + } + + return rp / rq * x * (x * x - T(1.46819706421238932572e+01)) * (x * x - T(4.92184563216946036703e+01)); + } + + T pp = 0.0; + + for (uint8_t index = 0; index <= 6; index++) { + pp = pp * (T(5.0) / x * (T(5.0) / x)) + PP[index]; + } + + T pq = 0.0; + + for (uint8_t index = 0; index <= 6; index++) { + pq = pq * (T(5.0) / x * (T(5.0) / x)) + PQ[index]; + } + + T qp = 0.0; + + for (uint8_t index = 0; index <= 7; index++) { + qp = qp * (T(5.0) / x * (T(5.0) / x)) + QP[index]; + } + + T qq = 0.0; + + for (uint8_t index = 0; index <= 6; index++) { + qq = qq * (T(5.0) / x * (T(5.0) / x)) + QQ[index]; + } + + return (pp / pq * cos(x - T(2.356194490192344928846982537459627163)) - T(5.0) / x * (qp / qq) * sin(x - T(2.356194490192344928846982537459627163))) * T(0.797884560802865355879892119868763737) / sqrt(x); + } // bessel_j1_forward(T x) +); // bessel_j1_string + +const auto bessel_y1_string = bessel_j1_string + jiterator_stringify( + template + T bessel_y1_forward(T x) { + static const T PP[] = { + +7.62125616208173112003e-04, + +7.31397056940917570436e-02, + +1.12719608129684925192e+00, + +5.11207951146807644818e+00, + +8.42404590141772420927e+00, + +5.21451598682361504063e+00, + +1.00000000000000000254e+00, + }; + + static const T PQ[] = { + +5.71323128072548699714e-04, + +6.88455908754495404082e-02, + 
+1.10514232634061696926e+00, + +5.07386386128601488557e+00, + +8.39985554327604159757e+00, + +5.20982848682361821619e+00, + +9.99999999999999997461e-01, + }; + + static const T QP[] = { + +5.10862594750176621635e-02, + +4.98213872951233449420e+00, + +7.58238284132545283818e+01, + +3.66779609360150777800e+02, + +7.10856304998926107277e+02, + +5.97489612400613639965e+02, + +2.11688757100572135698e+02, + +2.52070205858023719784e+01, + }; + + static const T QQ[] = { + +7.42373277035675149943e+01, + +1.05644886038262816351e+03, + +4.98641058337653607651e+03, + +9.56231892404756170795e+03, + +7.99704160447350683650e+03, + +2.82619278517639096600e+03, + +3.36093607810698293419e+02, + }; + + static const T YP[] = { + +1.26320474790178026440e+09, + -6.47355876379160291031e+11, + +1.14509511541823727583e+14, + -8.12770255501325109621e+15, + +2.02439475713594898196e+17, + -7.78877196265950026825e+17, + }; + + static const T YQ[] = { + +5.94301592346128195359e+02, + +2.35564092943068577943e+05, + +7.34811944459721705660e+07, + +1.87601316108706159478e+10, + +3.88231277496238566008e+12, + +6.20557727146953693363e+14, + +6.87141087355300489866e+16, + +3.97270608116560655612e+18, + }; + + if (x <= T(5.0)) { + if (x == T(0.0)) { + return NEG_INFINITY; + } + + if (x <= T(0.0)) { + return NAN; + } + + T yp = 0.0; + + for (uint8_t index = 0; index <= 5; index++) { + yp = yp * (x * x) + YP[index]; + } + + T yq = 0.0; + + for (uint8_t index = 0; index <= 7; index++) { + yq = yq * (x * x) + YQ[index]; + } + + return x * (yp / yq) + (T(0.636619772367581343075535053490057448) * (bessel_j1_forward(x) * log(x) - T(1.0) / x)); + } + + T pp = 0.0; + + for (uint8_t index = 0; index <= 6; index++) { + pp = pp * (T(5.0) / x * (T(5.0) / x)) + PP[index]; + } + + T pq = 0.0; + + for (uint8_t index = 0; index <= 6; index++) { + pq = pq * (T(5.0) / x * (T(5.0) / x)) + PQ[index]; + } + + T qp = 0.0; + + for (uint8_t index = 0; index <= 7; index++) { + qp = qp * (T(5.0) / x * (T(5.0) / x)) + QP[index]; 
+ } + + T qq = 0.0; + + for (uint8_t index = 0; index <= 6; index++) { + qq = qq * (T(5.0) / x * (T(5.0) / x)) + QQ[index]; + } + + return (pp / pq * sin(x - T(2.356194490192344928846982537459627163)) + T(5.0) / x * (qp / qq) * cos(x - T(2.356194490192344928846982537459627163))) * T(0.797884560802865355879892119868763737) / sqrt(x); + } // bessel_y1_forward(T x) +); // bessel_y1_string + +const auto chebyshev_polynomial_t_string = jiterator_stringify( + template + T chebyshev_polynomial_t_forward(T x, int64_t n) { + if (n < 0) { + return T(0.0); + } + + if (abs(x) == T(1.0)) { + if (x > T(0.0) || n % 2 == 0) { + return T(1.0); + } + + return T(-1.0); + } + + if ((n > 6) && (abs(x) < T(1.0))) { + return cos(n * acos(x)); + } + + if (n == 0) { + return T(1.0); + } + + if (n == 1) { + return x; + } + + T p = T(1.0); + T q = x; + T r; + + for (int64_t k = 2; (k <= n) && !isnan(q); k++) { + r = (x + x) * q - p; + p = q; + q = r; + } + + return r; + } // chebyshev_polynomial_t_forward(T x, int64_t n) + + template + T chebyshev_polynomial_t_forward(T x, T n) { + return chebyshev_polynomial_t_forward(x, static_cast(n)); + } // chebyshev_polynomial_t_forward(T x, T n) +); // chebyshev_polynomial_t_string + +const auto chebyshev_polynomial_u_string = jiterator_stringify( + template + T chebyshev_polynomial_u_forward(T x, int64_t n) { + if (n < 0) { + return T(0.0); + } + + if (abs(x) == T(1.0)) { + if (x > T(0.0) || n % 2 == 0) { + return n + 1; + } + + return -(n + 1); + } + + if ((n > 8) && (abs(x) < T(1.0))) { + if (sin(acos(x)) != T(0.0)) { + return sin((n + 1) * acos(x)) / sin(acos(x)); + } + + return (n + 1) * cos((n + 1) * acos(x)) / x; + } + + if (n == 0) { + return T(1.0); + } + + if (n == 1) { + return x + x; + } + + T p = T(1.0); + T q = x + x; + T r; + + for (int64_t k = 2; (k <= n) && !isnan(q); k++) { + r = (x + x) * q - p; + p = q; + q = r; + } + + return r; + } // chebyshev_polynomial_u_forward(T x, int64_t n) + + template + T chebyshev_polynomial_u_forward(T 
x, T n) { + return chebyshev_polynomial_u_forward(x, static_cast(n)); + } // chebyshev_polynomial_u_forward(T x, T n) +); // chebyshev_polynomial_u_string + +const auto chebyshev_polynomial_v_string = jiterator_stringify( + template + T chebyshev_polynomial_v_forward(T x, int64_t n) { + if (n < 0) { + return T(0.0); + } + + if (abs(x) == T(1.0)) { + if (x > T(0.0)) { + return T(1.0); + } + + if (n % 2 == 0) { + return n + n + 1; + } + + return -(n + n + 1); + } + + if ((n > 8) && (abs(x) < T(1.0))) { + if (sin(acos(x) / T(2.0)) != T(1.0)) { + return cos((n + T(0.5)) * acos(x)) / cos(acos(x) / T(2.0)); + } + + if (n % 2 == 0) { + return n + n + 1; + } + + return -(n + n + 1); + } + + if (n == 0) { + return T(1.0); + } + + if (n == 1) { + return x + x - T(1.0); + } + + T p = T(1.0); + T q = x + x - T(1.0); + T r; + + for (int64_t k = 2; (k <= n) && !isnan(q); k++) { + r = (x + x) * q - p; + p = q; + q = r; + } + + return r; + } // chebyshev_polynomial_v_forward(T x, int64_t n) + + template + T chebyshev_polynomial_v_forward(T x, T n) { + return chebyshev_polynomial_v_forward(x, static_cast(n)); + } // chebyshev_polynomial_v_forward(T x, T n) +); // chebyshev_polynomial_v_string + +const auto chebyshev_polynomial_w_string = jiterator_stringify( + template + T chebyshev_polynomial_w_forward(T x, int64_t n) { + if (n < 0) { + return T(0.0); + } + + if (abs(x) == T(1.0)) { + if (x > T(0.0)) { + return n + n + 1; + } + + if (n % 2 == 0) { + return T(1.0); + } + + return T(-1.0); + } + + if ((n > 8) && (abs(x) < T(1.0))) { + if (cos(acos(x) / T(2.0)) != T(1.0)) { + return sin((n + T(0.5)) * acos(x)) / sin(acos(x) / T(2.0)); + } + + if (x > T(0.0)) { + return n + n + 1; + } + + if (n % 2 == 0) { + return T(1.0); + } + + return T(-1.0); + } + + if (n == 0) { + return T(1.0); + } + + if (n == 1) { + return x + x + T(1.0); + } + + T p = T(1.0); + T q = x + x + T(1.0); + T r; + + for (int64_t k = 2; (k <= n) && !isnan(q); k++) { + r = (x + x) * q - p; + p = q; + q = r; + } + + 
return r; + } // chebyshev_polynomial_w_forward(T x, int64_t n) + + template + T chebyshev_polynomial_w_forward(T x, T n) { + return chebyshev_polynomial_w_forward(x, static_cast(n)); + } // chebyshev_polynomial_w_forward(T x, T n) +); // chebyshev_polynomial_w_string + +const auto hermite_polynomial_h_string = jiterator_stringify( + template + unsigned short getHermitianLimit() { + if (sizeof(T) <= sizeof(float)) { + return 128; + } else if (sizeof(T) <= sizeof(double)) { + return 512; + } else { + return 1024; + } + } + + template + T hermite_polynomial_h_forward(T x, int64_t n) { + if (n < 0) { + return T(0.0); + } + + if (n == 0) { + return T(1.0); + } + + if (n == 1) { + return x + x; + } + + if (n > getHermitianLimit()) { + return NAN; + } + + T p = T(1.0); + T q = x + x; + T r = T(0.0); + + for (int64_t k = 2; k < n + n; k += 2) { + r = (x + x) * q - k * p; + p = q; + q = r; + } + + return r; + } // hermite_polynomial_h_forward(T x, int64_t n) + + template + T hermite_polynomial_h_forward(T x, T n) { + return hermite_polynomial_h_forward(x, static_cast(n)); + } // hermite_polynomial_h_forward(T x, T n) +); // hermite_polynomial_h_string + +const auto hermite_polynomial_he_string = jiterator_stringify( + template + unsigned short getHermitianLimit() { + if (sizeof(T) <= sizeof(float)) { + return 128; + } else if (sizeof(T) <= sizeof(double)) { + return 512; + } else { + return 1024; + } + } + + template + T hermite_polynomial_he_forward(T x, int64_t n) { + if (n < 0) { + return T(0.0); + } + + if (n == 0) { + return T(1.0); + } + + if (n == 1) { + return x; + } + + if (n > getHermitianLimit()) { + return NAN; + } + + T p = T(1.0); + T q = x; + T r; + + for (int64_t k = 1; k < n; k++) { + r = x * q - k * p; + p = q; + q = r; + } + + return r; + } // hermite_polynomial_he_forward(T x, int64_t n) + + template + T hermite_polynomial_he_forward(T x, T n) { + return hermite_polynomial_he_forward(x, static_cast(n)); + } // hermite_polynomial_he_forward(T x, T n) +); 
// hermite_polynomial_he_string + +const auto laguerre_polynomial_l_string = jiterator_stringify( + template + T laguerre_polynomial_l_forward(T x, int64_t n) { + if (n < 0) { + return T(0.0); + } + + if (abs(x) == T(0.0)) { + return T(1.0); + } + + if (n == 0) { + return T(1.0); + } + + if (n == 1) { + return T(1.0) - x; + } + + T p = T(1.0); + T q = T(1.0) - x; + T r; + + for (int64_t k = 1; (k < n) && !isnan(q); k++) { + r = (((k + k) + (T(1.0) - x)) * q - k * p) / (k + 1); + p = q; + q = r; + } + + return r; + } // laguerre_polynomial_l_forward(T x, int64_t n) + + template + T laguerre_polynomial_l_forward(T x, T n) { + return laguerre_polynomial_l_forward(x, static_cast(n)); + } // laguerre_polynomial_l_forward(T x, T n) +); // laguerre_polynomial_l_string + +const auto legendre_polynomial_p_string = jiterator_stringify( + template + T legendre_polynomial_p_forward(T x, int64_t n) { + if (n < 0) { + return T(0.0); + } + + if (abs(x) == T(1.0)) { + if (x > T(0.0) || n % 2 == 0) { + return T(1.0); + } + + return T(-1.0); + } + + if (n == 0) { + return T(1.0); + } + + if (n == 1) { + return x; + } + + T p = T(1.0); + T q = x; + T r; + + for (int64_t k = 1; (k < n) && !isnan(q); k++) { + r = ((k + k + 1) * x * q - k * p) / (k + 1); + p = q; + q = r; + } + + return r; + } // legendre_polynomial_p_forward(T x, int64_t n) + + template + T legendre_polynomial_p_forward(T x, T n) { + return legendre_polynomial_p_forward(x, static_cast(n)); + } // legendre_polynomial_p_forward(T x, T n) +); // legendre_polynomial_p_string + +const auto modified_bessel_i0_string = jiterator_stringify( + template + T modified_bessel_i0_forward(T x) { + static const T A[] = { + -4.41534164647933937950e-18, + +3.33079451882223809783e-17, + -2.43127984654795469359e-16, + +1.71539128555513303061e-15, + -1.16853328779934516808e-14, + +7.67618549860493561688e-14, + -4.85644678311192946090e-13, + +2.95505266312963983461e-12, + -1.72682629144155570723e-11, + +9.67580903537323691224e-11, + 
-5.18979560163526290666e-10, + +2.65982372468238665035e-09, + -1.30002500998624804212e-08, + +6.04699502254191894932e-08, + -2.67079385394061173391e-07, + +1.11738753912010371815e-06, + -4.41673835845875056359e-06, + +1.64484480707288970893e-05, + -5.75419501008210370398e-05, + +1.88502885095841655729e-04, + -5.76375574538582365885e-04, + +1.63947561694133579842e-03, + -4.32430999505057594430e-03, + +1.05464603945949983183e-02, + -2.37374148058994688156e-02, + +4.93052842396707084878e-02, + -9.49010970480476444210e-02, + +1.71620901522208775349e-01, + -3.04682672343198398683e-01, + +6.76795274409476084995e-01, + }; + + static const T B[] = { + -7.23318048787475395456e-18, + -4.83050448594418207126e-18, + +4.46562142029675999901e-17, + +3.46122286769746109310e-17, + -2.82762398051658348494e-16, + -3.42548561967721913462e-16, + +1.77256013305652638360e-15, + +3.81168066935262242075e-15, + -9.55484669882830764870e-15, + -4.15056934728722208663e-14, + +1.54008621752140982691e-14, + +3.85277838274214270114e-13, + +7.18012445138366623367e-13, + -1.79417853150680611778e-12, + -1.32158118404477131188e-11, + -3.14991652796324136454e-11, + +1.18891471078464383424e-11, + +4.94060238822496958910e-10, + +3.39623202570838634515e-09, + +2.26666899049817806459e-08, + +2.04891858946906374183e-07, + +2.89137052083475648297e-06, + +6.88975834691682398426e-05, + +3.36911647825569408990e-03, + +8.04490411014108831608e-01, + }; + + T p; + T q = 0.0; + + if (abs(x) <= T(8.0)) { + T a = A[0]; + + for (uint8_t index = 1; index < 30; index++) { + p = q; + q = a; + a = ((abs(x) / T(2.0)) - T(2.0)) * q - p + A[index]; + } + + return exp(abs(x)) * (T(0.5) * (a - p)); + } + + T b = B[0]; + + for (uint8_t index = 1; index < 25; index++) { + p = q; + q = b; + b = (T(32.0) / abs(x) - T(2.0)) * q - p + B[index]; + } + + return exp(abs(x)) * (T(0.5) * (b - p)) / sqrt(abs(x)); + } // modified_bessel_i0_forward(T x) +); // modified_bessel_i0_string + +const auto modified_bessel_i1_string = 
jiterator_stringify( + template + T modified_bessel_i1_forward(T x) { + static const T A[] = { + +2.77791411276104639959e-18, + -2.11142121435816608115e-17, + +1.55363195773620046921e-16, + -1.10559694773538630805e-15, + +7.60068429473540693410e-15, + -5.04218550472791168711e-14, + +3.22379336594557470981e-13, + -1.98397439776494371520e-12, + +1.17361862988909016308e-11, + -6.66348972350202774223e-11, + +3.62559028155211703701e-10, + -1.88724975172282928790e-09, + +9.38153738649577178388e-09, + -4.44505912879632808065e-08, + +2.00329475355213526229e-07, + -8.56872026469545474066e-07, + +3.47025130813767847674e-06, + -1.32731636560394358279e-05, + +4.78156510755005422638e-05, + -1.61760815825896745588e-04, + +5.12285956168575772895e-04, + -1.51357245063125314899e-03, + +4.15642294431288815669e-03, + -1.05640848946261981558e-02, + +2.47264490306265168283e-02, + -5.29459812080949914269e-02, + +1.02643658689847095384e-01, + -1.76416518357834055153e-01, + +2.52587186443633654823e-01, + }; + + static const T B[] = { + +7.51729631084210481353e-18, + +4.41434832307170791151e-18, + -4.65030536848935832153e-17, + -3.20952592199342395980e-17, + +2.96262899764595013876e-16, + +3.30820231092092828324e-16, + -1.88035477551078244854e-15, + -3.81440307243700780478e-15, + +1.04202769841288027642e-14, + +4.27244001671195135429e-14, + -2.10154184277266431302e-14, + -4.08355111109219731823e-13, + -7.19855177624590851209e-13, + +2.03562854414708950722e-12, + +1.41258074366137813316e-11, + +3.25260358301548823856e-11, + -1.89749581235054123450e-11, + -5.58974346219658380687e-10, + -3.83538038596423702205e-09, + -2.63146884688951950684e-08, + -2.51223623787020892529e-07, + -3.88256480887769039346e-06, + -1.10588938762623716291e-04, + -9.76109749136146840777e-03, + +7.78576235018280120474e-01, + }; + + T p; + T q = 0.0; + + if (abs(x) <= T(8.0)) { + T a = A[0]; + + for (uint8_t index = 1; index < 29; index++) { + p = q; + q = a; + a = ((abs(x) / T(2.0)) - T(2.0)) * q - p + A[index]; + } + 
+ if (x < T(0.0)) { + return -(T(0.5) * (a - p) * abs(x) * exp(abs(x))); + } + + return T(0.5) * (a - p) * abs(x) * exp(abs(x)); + } + + T b = B[0]; + + for (uint8_t index = 1; index < 25; index++) { + p = q; + q = b; + b = (T(32.0) / abs(x) - T(2.0)) * q - p + B[index]; + } + + if (x < T(0.0)) { + return -(exp(abs(x)) * (T(0.5) * (b - p)) / sqrt(abs(x))); + } + + return exp(abs(x)) * (T(0.5) * (b - p)) / sqrt(abs(x)); + } // modified_bessel_i1_forward(T x) +); // modified_bessel_i1_string + +const auto modified_bessel_k0_string = modified_bessel_i0_string + jiterator_stringify( + template + T modified_bessel_k0_forward(T x) { + static const T A[] = { + +1.37446543561352307156e-16, + +4.25981614279661018399e-14, + +1.03496952576338420167e-11, + +1.90451637722020886025e-09, + +2.53479107902614945675e-07, + +2.28621210311945178607e-05, + +1.26461541144692592338e-03, + +3.59799365153615016266e-02, + +3.44289899924628486886e-01, + -5.35327393233902768720e-01, + }; + + static const T B[] = { + +5.30043377268626276149e-18, + -1.64758043015242134646e-17, + +5.21039150503902756861e-17, + -1.67823109680541210385e-16, + +5.51205597852431940784e-16, + -1.84859337734377901440e-15, + +6.34007647740507060557e-15, + -2.22751332699166985548e-14, + +8.03289077536357521100e-14, + -2.98009692317273043925e-13, + +1.14034058820847496303e-12, + -4.51459788337394416547e-12, + +1.85594911495471785253e-11, + -7.95748924447710747776e-11, + +3.57739728140030116597e-10, + -1.69753450938905987466e-09, + +8.57403401741422608519e-09, + -4.66048989768794782956e-08, + +2.76681363944501510342e-07, + -1.83175552271911948767e-06, + +1.39498137188764993662e-05, + -1.28495495816278026384e-04, + +1.56988388573005337491e-03, + -3.14481013119645005427e-02, + +2.44030308206595545468e+00, + }; + + if (x == T(0.0)) { + return INFINITY; + } + + if (x < T(0.0)) { + return NAN; + } + + T p; + T q = 0.0; + + if (x <= T(2.0)) { + T a = A[0]; + + for (uint8_t index = 1; index < 10; index++) { + p = q; + q = a; + a 
= (x * x - T(2.0)) * q - p + A[index]; + } + + return T(0.5) * (a - p) - log(0.5 * x) * modified_bessel_i0_forward(x); + } + + T b = B[0]; + + for (uint8_t index = 1; index < 25; index++) { + p = q; + q = b; + b = (T(8.0) / x - T(2.0)) * q - p + B[index]; + } + + return exp(-x) * (T(0.5) * (b - p)) / sqrt(x); + } // modified_bessel_k0_forward(T x) +); // modified_bessel_k0_string + +const auto scaled_modified_bessel_k0_string = modified_bessel_i0_string + jiterator_stringify( + template + T scaled_modified_bessel_k0_forward(T x) { + static const T A[] = { + +1.37446543561352307156e-16, + +4.25981614279661018399e-14, + +1.03496952576338420167e-11, + +1.90451637722020886025e-09, + +2.53479107902614945675e-07, + +2.28621210311945178607e-05, + +1.26461541144692592338e-03, + +3.59799365153615016266e-02, + +3.44289899924628486886e-01, + -5.35327393233902768720e-01, + }; + + static const T B[] = { + +5.30043377268626276149e-18, + -1.64758043015242134646e-17, + +5.21039150503902756861e-17, + -1.67823109680541210385e-16, + +5.51205597852431940784e-16, + -1.84859337734377901440e-15, + +6.34007647740507060557e-15, + -2.22751332699166985548e-14, + +8.03289077536357521100e-14, + -2.98009692317273043925e-13, + +1.14034058820847496303e-12, + -4.51459788337394416547e-12, + +1.85594911495471785253e-11, + -7.95748924447710747776e-11, + +3.57739728140030116597e-10, + -1.69753450938905987466e-09, + +8.57403401741422608519e-09, + -4.66048989768794782956e-08, + +2.76681363944501510342e-07, + -1.83175552271911948767e-06, + +1.39498137188764993662e-05, + -1.28495495816278026384e-04, + +1.56988388573005337491e-03, + -3.14481013119645005427e-02, + +2.44030308206595545468e+00, + }; + + if (x == T(0.0)) { + return INFINITY; + } + + if (x < T(0.0)) { + return NAN; + } + + T p; + T q = 0.0; + + if (x <= T(2.0)) { + T a = A[0]; + + for (uint8_t index = 1; index < 10; index++) { + p = q; + q = a; + a = (x * x - T(2.0)) * q - p + A[index]; + } + + return (T(0.5) * (a - p) - log(T(0.5) * x) * 
modified_bessel_i0_forward(x)) * exp(x); + } + + T b = B[0]; + + for (uint8_t index = 1; index < 25; index++) { + p = q; + q = b; + b = (T(8.0) / x - T(2.0)) * q - p + B[index]; + } + + return T(0.5) * (b - p) / sqrt(x); + } // T scaled_modified_bessel_k0_forward(T x) +); // scaled_modified_bessel_k0_string + +const auto modified_bessel_k1_string = modified_bessel_i1_string + jiterator_stringify( + template + T modified_bessel_k1_forward(T x) { + static const T A[] = { + -7.02386347938628759343e-18, + -2.42744985051936593393e-15, + -6.66690169419932900609e-13, + -1.41148839263352776110e-10, + -2.21338763073472585583e-08, + -2.43340614156596823496e-06, + -1.73028895751305206302e-04, + -6.97572385963986435018e-03, + -1.22611180822657148235e-01, + -3.53155960776544875667e-01, + +1.52530022733894777053e+00, + }; + + static const T B[] = { + -5.75674448366501715755e-18, + +1.79405087314755922667e-17, + -5.68946255844285935196e-17, + +1.83809354436663880070e-16, + -6.05704724837331885336e-16, + +2.03870316562433424052e-15, + -7.01983709041831346144e-15, + +2.47715442448130437068e-14, + -8.97670518232499435011e-14, + +3.34841966607842919884e-13, + -1.28917396095102890680e-12, + +5.13963967348173025100e-12, + -2.12996783842756842877e-11, + +9.21831518760500529508e-11, + -4.19035475934189648750e-10, + +2.01504975519703286596e-09, + -1.03457624656780970260e-08, + +5.74108412545004946722e-08, + -3.50196060308781257119e-07, + +2.40648494783721712015e-06, + -1.93619797416608296024e-05, + +1.95215518471351631108e-04, + -2.85781685962277938680e-03, + +1.03923736576817238437e-01, + +2.72062619048444266945e+00, + }; + + if (x == T(0.0)) { + return INFINITY; + } + + if (x < T(0.0)) { + return NAN; + } + + T p; + T q = 0.0; + + if (x <= T(2.0)) { + T a = A[0]; + + for (uint8_t index = 1; index < 11; index++) { + p = q; + q = a; + a = (x * x - T(2.0)) * q - p + A[index]; + } + + return log(T(0.5) * x) * modified_bessel_i1_forward(x) + T(0.5) * (a - p) / x; + } + + T b = B[0]; + + for 
(uint8_t index = 1; index < 25; index++) { + p = q; + q = b; + b = (T(8.0) / x - T(2.0)) * q - p + B[index]; + } + + return exp(-x) * (T(0.5) * (b - p)) / sqrt(x); + } // modified_bessel_k1_forward(T x) +); // modified_bessel_k1_string + +const auto scaled_modified_bessel_k1_string = modified_bessel_i1_string + jiterator_stringify( + template + T scaled_modified_bessel_k1_forward(T x) { + static const T A[] = { + -7.02386347938628759343e-18, + -2.42744985051936593393e-15, + -6.66690169419932900609e-13, + -1.41148839263352776110e-10, + -2.21338763073472585583e-08, + -2.43340614156596823496e-06, + -1.73028895751305206302e-04, + -6.97572385963986435018e-03, + -1.22611180822657148235e-01, + -3.53155960776544875667e-01, + +1.52530022733894777053e+00, + }; + + static const T B[] = { + -5.75674448366501715755e-18, + +1.79405087314755922667e-17, + -5.68946255844285935196e-17, + +1.83809354436663880070e-16, + -6.05704724837331885336e-16, + +2.03870316562433424052e-15, + -7.01983709041831346144e-15, + +2.47715442448130437068e-14, + -8.97670518232499435011e-14, + +3.34841966607842919884e-13, + -1.28917396095102890680e-12, + +5.13963967348173025100e-12, + -2.12996783842756842877e-11, + +9.21831518760500529508e-11, + -4.19035475934189648750e-10, + +2.01504975519703286596e-09, + -1.03457624656780970260e-08, + +5.74108412545004946722e-08, + -3.50196060308781257119e-07, + +2.40648494783721712015e-06, + -1.93619797416608296024e-05, + +1.95215518471351631108e-04, + -2.85781685962277938680e-03, + +1.03923736576817238437e-01, + +2.72062619048444266945e+00, + }; + + if (x == T(0.0)) { + return INFINITY; + } + + if (x < T(0.0)) { + return NAN; + } + + T p; + T q = 0.0; + + if (x <= T(2.0)) { + T a = A[0]; + + for (uint8_t index = 1; index < 11; index++) { + p = q; + q = a; + a = (x * x - T(2.0)) * q - p + A[index]; + } + + return (log(T(0.5) * x) * modified_bessel_i1_forward(x) + T(0.5) * (a - p) / x) * exp(x); + } + + T b = B[0]; + + for (uint8_t index = 1; index < 25; index++) { + p = 
q; + q = b; + b = (T(8.0) / x - T(2.0)) * q - p + B[index]; + } + + return (T(0.5) * (b - p) / sqrt(x)); + } // T scaled_modified_bessel_k1_forward(T x) +); // scaled_modified_bessel_k1_string + +const auto shifted_chebyshev_polynomial_t_string = jiterator_stringify( + template + T shifted_chebyshev_polynomial_t_forward(T x, int64_t n) { + if (n < 0) { + return T(0.0); + } + + if (x == T(1.0)) { + return T(1.0); + } + + if (x == T(0.0)) { + if (n % 2 == 0) { + return T(1.0); + } + + return T(-1.0); + } + + if ((n > 6) && (abs(x + x - T(1.0)) < T(1.0))) { + return cos(n * acos(x + x - T(1.0))); + } + + if (n == 0) { + return T(1.0); + } + + if (n == 1) { + return x + x - T(1.0); + } + + T p = T(1.0); + T q = x + x - T(1.0); + T r; + + for (int64_t k = 2; (k <= n) && !isnan(q); k++) { + r = (x + x - T(1.0) + (x + x - T(1.0))) * q - p; + p = q; + q = r; + } + + return r; + } // shifted_chebyshev_polynomial_t_forward(T x, int64_t n) + + template + T shifted_chebyshev_polynomial_t_forward(T x, T n) { + return shifted_chebyshev_polynomial_t_forward(x, static_cast(n)); + } // shifted_chebyshev_polynomial_t_forward(T x, T n) +); // shifted_chebyshev_polynomial_t_string + +const auto shifted_chebyshev_polynomial_u_string = jiterator_stringify( + template + T shifted_chebyshev_polynomial_u_forward(T x, int64_t n) { + if (n < 0) { + return T(0.0); + } + + if (x == T(1.0)) { + return n + 1; + } + + if (x == T(0.0)) { + if (n % 2 == 0) { + return n + 1; + } + + return -(n + 1); + } + + if ((n > 6) && (abs(x + x - T(1.0)) < T(1.0))) { + if (sin(acos(x + x - T(1.0))) != T(0.0)) { + return sin((n + 1) * acos(x + x - T(1.0))) / sin(acos(x + x - T(1.0))); + } + + return (n + 1) * cos((n + 1) * acos(x + x - T(1.0))) / (x + x - T(1.0)); + } + + if (n == 0) { + return T(1.0); + } + + if (n == 1) { + return x + x - T(1.0) + (x + x - T(1.0)); + } + + T p = T(1.0); + T q = x + x - T(1.0) + (x + x - T(1.0)); + T r; + + for (int64_t k = 2; (k <= n) && !isnan(q); k++) { + r = (x + x - T(1.0) 
+ (x + x - T(1.0))) * q - p; + p = q; + q = r; + } + + return r; + } // shifted_chebyshev_polynomial_u_forward(T x, int64_t n) + + template + T shifted_chebyshev_polynomial_u_forward(T x, T n) { + return shifted_chebyshev_polynomial_u_forward(x, static_cast(n)); + } // shifted_chebyshev_polynomial_u_forward(T x, T n) +); // shifted_chebyshev_polynomial_u_string + +const auto shifted_chebyshev_polynomial_v_string = jiterator_stringify( + template + T shifted_chebyshev_polynomial_v_forward(T x, int64_t n) { + if (n < 0) { + return T(0.0); + } + + if (x == T(1.0)) { + return T(1.0); + } + + if (x == T(0.0)) { + if (n % 2 == 0) { + return (n + n + 1); + } + + return -(n + n + 1); + } + + if ((n > 6) && (abs(x + x - T(1.0)) < T(1.0))) { + if (sin(acos(x + x - T(1.0)) / T(2.0)) != T(1.0)) { + return cos(((n) + T(0.5)) * acos(x + x - T(1.0))) / cos(acos(x + x - T(1.0)) / T(2.0)); + } + + if (n % 2 == 0) { + return n + n + 1; + } + + return -(n + n + 1); + } + + if (n == 0) { + return T(1.0); + } + + if (n == 1) { + return x + x - T(1.0) + (x + x - T(1.0)) - T(1.0); + } + + T p = T(1.0); + T q = x + x - T(1.0) + (x + x - T(1.0)) - T(1.0); + T r; + + for (int64_t k = 2; (k <= n) && !isnan(q); k++) { + r = (x + x - T(1.0) + (x + x - T(1.0))) * q - p; + p = q; + q = r; + } + + return r; + } // shifted_chebyshev_polynomial_v_forward(T x, int64_t n) + + template + T shifted_chebyshev_polynomial_v_forward(T x, T n) { + return shifted_chebyshev_polynomial_v_forward(x, static_cast(n)); + } // shifted_chebyshev_polynomial_v_forward(T x, T n) +); // shifted_chebyshev_polynomial_v_string + +const auto shifted_chebyshev_polynomial_w_string = jiterator_stringify( + template + T shifted_chebyshev_polynomial_w_forward(T x, int64_t n) { + if (n < 0) { + return T(0.0); + } + + if (x == T(1.0)) { + return n + n + 1; + } + + if (x == T(0.0)) { + if (n % 2 == 0) { + return T(1.0); + } + + return T(-1.0); + } + + if ((n > 4) && (abs(x + x - T(1.0)) < T(1.0))) { + if (cos(acos(x + x - T(1.0)) / 
T(2.0)) != T(1.0)) { + return sin((n + T(0.5)) * acos(x + x - T(1.0))) / sin(acos(x + x - T(1.0)) / T(2.0)); + } + + if (n % 2 == 0) { + return T(1.0); + } + + return T(-1.0); + } + + if (n == 0) { + return T(1.0); + } + + if (n == 1) { + return x + x - T(1.0) + (x + x - T(1.0)) + T(1.0); + } + + T p = T(1.0); + T q = x + x - T(1.0) + (x + x - T(1.0)) + T(1.0); + T r; + + for (int64_t k = 2; (k <= n) && !isnan(q); k++) { + r = (x + x - T(1.0) + (x + x - T(1.0))) * q - p; + p = q; + q = r; + } + + return r; + } // shifted_chebyshev_polynomial_w_forward(T x, int64_t n) + + template + T shifted_chebyshev_polynomial_w_forward(T x, T n) { + return shifted_chebyshev_polynomial_w_forward(x, static_cast(n)); + } // shifted_chebyshev_polynomial_w_forward(T x, T n) +); // shifted_chebyshev_polynomial_w_string + +const auto spherical_bessel_j0_string = jiterator_stringify( + template + T spherical_bessel_j0_forward(T x) { + if (isinf(x)) { + return T(0.0); + } + + if (abs(x) < T(0.5)) { + return T(1.0) + x * x * (T(-1.0) / T(6.0) + x * x * (T(1.0) / T(120.0) + x * x * (T(-1.0) / T(5040.0) + x * x * (T(1.0) / T(362880.0) + x * x * (T(-1.0) / T(39916800.0) + x * x * (T(1.0) / T(6227020800.0))))))); + } + + return sin(x) / x; + } // T spherical_bessel_j0_forward(T x) +); // spherical_bessel_j0_string + +#else // !AT_USE_JITERATOR() -- kernels must be precompiled + +template +static inline C10_HOST_DEVICE scalar_t calc_gcd(scalar_t a_in, scalar_t b_in) { + scalar_t a = ::abs(a_in); + scalar_t b = ::abs(b_in); + while (a != 0) { + scalar_t c = a; + a = b % a; + b = c; + } + return b; +} + +/* + * For licensing information, please refer to the cpu implementation located in "ATen/native/Math.h". 
+ */ +template +static inline C10_HOST_DEVICE scalar_t calc_digamma(scalar_t in) { + // [C++ Standard Reference: Gamma Function] https://en.cppreference.com/w/cpp/numeric/math/tgamma + using accscalar_t = at::acc_type; + static constexpr double PI_f64 = 3.14159265358979323846; + constexpr accscalar_t PSI_10 = 2.25175258906672110764; + constexpr accscalar_t A[] = { + 8.33333333333333333333E-2, + -2.10927960927960927961E-2, + 7.57575757575757575758E-3, + -4.16666666666666666667E-3, + 3.96825396825396825397E-3, + -8.33333333333333333333E-3, + 8.33333333333333333333E-2, + }; + + accscalar_t x = static_cast(in); + if (x == 0) { + // As per C++ standard for gamma related functions and SciPy, + // If the argument is ±0, ±∞ is returned + return std::copysign(static_cast(INFINITY), -x); + } + + bool x_is_integer = x == ::trunc(x); + accscalar_t result = 0; + if (x < 0) { + if (x_is_integer) { + // As per C++ standard for gamma related functions and SciPy, + // If the argument is a negative integer, NaN is returned + return static_cast(NAN); + } + // Extracts the fractional part of x as r, since tan(pi * r) is more numerically + // accurate than tan(pi * x). While these operations are mathematically equivalent + // since both x and r are in radians and tan() has a periodicity of pi, in practice + // the computation of pi * x is a source of error (when |x| > 1). 
+ double q, r; + r = ::modf(static_cast(x), &q); + result = static_cast(- PI_f64 / ::tan(PI_f64 * r)); + x = 1 - x; + } + + while (x < 10) { + result -= 1 / x; + x += 1; + } + if (x == 10) { + return static_cast(result + PSI_10); + } + + accscalar_t y = 0; + if (x < 1.0e17) { + accscalar_t z = 1 / (x * x); + + accscalar_t polevl_result = 0; + for (int i = 0; i <= 6; i++) { + polevl_result = polevl_result * z + A[i]; + } + y = z * polevl_result; + } + + return static_cast(::log(x) - (static_cast(0.5) / x) - y + result); +} + +template +static inline C10_HOST_DEVICE scalar_t calc_trigamma(scalar_t in) { + using accscalar_t = at::acc_type; + const accscalar_t PI = 3.14159265358979323846; + accscalar_t x = static_cast(in); + accscalar_t sign = +1; + accscalar_t result = 0; + if (x < 0.5f) { + sign = -1; + accscalar_t sin_pi_x = ::sin(PI * x); + result -= (PI * PI) / (sin_pi_x * sin_pi_x); + x = 1 - x; + } + for (int i = 0; i < 6; ++i) { + result += 1 / (x * x); + x += 1; + } + const accscalar_t one = static_cast(1); + const accscalar_t ixx = 1 / (x*x); + result += (1 + 1 / (2*x) + ixx * (one/6 - ixx * (one/30 - ixx * (one/42)))) / x; + return static_cast(sign * result); +} + +/* + * For licensing information and documentation, please refer to the cpu implementation located in "ATen/native/Math.h". + */ +template +static inline C10_HOST_DEVICE scalar_t +chbevl(scalar_t _x, const scalar_t array[], size_t len) { + static_assert(!std::is_same() && !std::is_same(), "don't instantiate with low precision type"); + + scalar_t b0, b1, b2; + + b0 = array[0]; + b1 = 0; + + for (size_t i = 1; i < len; ++i) { + b2 = b1; + b1 = b0; + b0 = _x * b1 - b2 + array[i]; + } + + return (0.5 * (b0 - b2)); +} + +/* + * For licensing information and documentation, please refer to the cpu implementation located in "ATen/native/Math.h". + */ +template +C10_HOST_DEVICE inline std::tuple chebyshev_coefficients_i0e_A() { + /* Chebyshev coefficients for exp(-x) I0(x) + * in the interval [0,8]. 
+ * + * lim(x->0){ exp(-x) I0(x) } = 1. + */ + static const T coefficients[] = { + -4.41534164647933937950E-18, 3.33079451882223809783E-17, + -2.43127984654795469359E-16, 1.71539128555513303061E-15, + -1.16853328779934516808E-14, 7.67618549860493561688E-14, + -4.85644678311192946090E-13, 2.95505266312963983461E-12, + -1.72682629144155570723E-11, 9.67580903537323691224E-11, + -5.18979560163526290666E-10, 2.65982372468238665035E-9, + -1.30002500998624804212E-8, 6.04699502254191894932E-8, + -2.67079385394061173391E-7, 1.11738753912010371815E-6, + -4.41673835845875056359E-6, 1.64484480707288970893E-5, + -5.75419501008210370398E-5, 1.88502885095841655729E-4, + -5.76375574538582365885E-4, 1.63947561694133579842E-3, + -4.32430999505057594430E-3, 1.05464603945949983183E-2, + -2.37374148058994688156E-2, 4.93052842396707084878E-2, + -9.49010970480476444210E-2, 1.71620901522208775349E-1, + -3.04682672343198398683E-1, 6.76795274409476084995E-1}; + + return std::make_tuple(coefficients, 30); +} + +template +C10_HOST_DEVICE inline std::tuple chebyshev_coefficients_i0e_B() { + /* Chebyshev coefficients for exp(-x) sqrt(x) I0(x) + * in the inverted interval [8,infinity]. + * + * lim(x->inf){ exp(-x) sqrt(x) I0(x) } = 1/sqrt(2pi). 
+ */ + static const T coefficients[] = { + -7.23318048787475395456E-18, -4.83050448594418207126E-18, + 4.46562142029675999901E-17, 3.46122286769746109310E-17, + -2.82762398051658348494E-16, -3.42548561967721913462E-16, + 1.77256013305652638360E-15, 3.81168066935262242075E-15, + -9.55484669882830764870E-15, -4.15056934728722208663E-14, + 1.54008621752140982691E-14, 3.85277838274214270114E-13, + 7.18012445138366623367E-13, -1.79417853150680611778E-12, + -1.32158118404477131188E-11, -3.14991652796324136454E-11, + 1.18891471078464383424E-11, 4.94060238822496958910E-10, + 3.39623202570838634515E-9, 2.26666899049817806459E-8, + 2.04891858946906374183E-7, 2.89137052083475648297E-6, + 6.88975834691682398426E-5, 3.36911647825569408990E-3, + 8.04490411014108831608E-1}; + + return std::make_tuple(coefficients, 25); +} + +template +static inline C10_HOST_DEVICE scalar_t calc_i0(scalar_t _x) { + static_assert(!std::is_same() && !std::is_same(), "don't instantiate with low precision type"); + // Upcast input for numerical accuracy purposes + // Needed for accurate results if input is bfloat16 or float16 + scalar_t x = ::abs(_x); + + if (x <= scalar_t{8.0}) { + auto [A, len] = chebyshev_coefficients_i0e_A(); + scalar_t y = (x / scalar_t{2.0}) - scalar_t{2.0}; + return (::exp(x) * chbevl(y, A, len)); + } + + auto [B, len] = chebyshev_coefficients_i0e_B(); + return (::exp(x) * chbevl(scalar_t{32.0} / x - scalar_t{2.0}, B, len) / ::sqrt(x)); +} + +template +C10_HOST_DEVICE inline + typename std::enable_if_t, std::tuple> + chebyshev_coefficients_i1e_A() { + /* Chebyshev coefficients for exp(-x) I1(x) + * in the interval [0,8]. + * + * lim(x->0){ exp(-x) I1(x) / x } = 1/2. 
+ */ + static const T coefficients[] = { + 2.77791411276104639959E-18, -2.11142121435816608115E-17, + 1.55363195773620046921E-16, -1.10559694773538630805E-15, + 7.60068429473540693410E-15, -5.04218550472791168711E-14, + 3.22379336594557470981E-13, -1.98397439776494371520E-12, + 1.17361862988909016308E-11, -6.66348972350202774223E-11, + 3.62559028155211703701E-10, -1.88724975172282928790E-9, + 9.38153738649577178388E-9, -4.44505912879632808065E-8, + 2.00329475355213526229E-7, -8.56872026469545474066E-7, + 3.47025130813767847674E-6, -1.32731636560394358279E-5, + 4.78156510755005422638E-5, -1.61760815825896745588E-4, + 5.12285956168575772895E-4, -1.51357245063125314899E-3, + 4.15642294431288815669E-3, -1.05640848946261981558E-2, + 2.47264490306265168283E-2, -5.29459812080949914269E-2, + 1.02643658689847095384E-1, -1.76416518357834055153E-1, + 2.52587186443633654823E-1}; + + return std::make_tuple(coefficients, 29); +} + +template +C10_HOST_DEVICE inline + typename std::enable_if_t, std::tuple> + chebyshev_coefficients_i1e_A() { + /* Chebyshev coefficients for exp(-x) I1(x) + * in the interval [0,8]. + * + * lim(x->0){ exp(-x) I1(x) / x } = 1/2. + */ + static const T coeff[] = { + 9.38153738649577178388E-9f, + -4.44505912879632808065E-8f, + 2.00329475355213526229E-7f, + -8.56872026469545474066E-7f, + 3.47025130813767847674E-6f, + -1.32731636560394358279E-5f, + 4.78156510755005422638E-5f, + -1.61760815825896745588E-4f, + 5.12285956168575772895E-4f, + -1.51357245063125314899E-3f, + 4.15642294431288815669E-3f, + -1.05640848946261981558E-2f, + 2.47264490306265168283E-2f, + -5.29459812080949914269E-2f, + 1.02643658689847095384E-1f, + -1.76416518357834055153E-1f, + 2.52587186443633654823E-1f}; + return std::make_tuple(coeff, 17); +}; + +template +C10_HOST_DEVICE inline + typename std::enable_if_t, std::tuple> + chebyshev_coefficients_i1e_B() { + /* Chebyshev coefficients for exp(-x) sqrt(x) I1(x) + * in the inverted interval [8,infinity]. 
+ * + * lim(x->inf){ exp(-x) sqrt(x) I1(x) } = 1/sqrt(2pi). + */ + static const T coefficients[] = { + 7.51729631084210481353E-18, 4.41434832307170791151E-18, + -4.65030536848935832153E-17, -3.20952592199342395980E-17, + 2.96262899764595013876E-16, 3.30820231092092828324E-16, + -1.88035477551078244854E-15, -3.81440307243700780478E-15, + 1.04202769841288027642E-14, 4.27244001671195135429E-14, + -2.10154184277266431302E-14, -4.08355111109219731823E-13, + -7.19855177624590851209E-13, 2.03562854414708950722E-12, + 1.41258074366137813316E-11, 3.25260358301548823856E-11, + -1.89749581235054123450E-11, -5.58974346219658380687E-10, + -3.83538038596423702205E-9, -2.63146884688951950684E-8, + -2.51223623787020892529E-7, -3.88256480887769039346E-6, + -1.10588938762623716291E-4, -9.76109749136146840777E-3, + 7.78576235018280120474E-1}; + + return std::make_tuple(coefficients, 25); +} + +template +C10_HOST_DEVICE inline + typename std::enable_if_t, std::tuple> + chebyshev_coefficients_i1e_B() { + /* Chebyshev coefficients for exp(-x) sqrt(x) I1(x) + * in the inverted interval [8,infinity]. + * + * lim(x->inf){ exp(-x) sqrt(x) I1(x) } = 1/sqrt(2pi). + */ + static const T coeff[] = { + -3.83538038596423702205E-9f, + -2.63146884688951950684E-8f, + -2.51223623787020892529E-7f, + -3.88256480887769039346E-6f, + -1.10588938762623716291E-4f, + -9.76109749136146840777E-3f, + 7.78576235018280120474E-1f}; + + return std::make_tuple(coeff, 7); +}; + +template +static inline C10_HOST_DEVICE scalar_t calc_i1(scalar_t _x) { + const auto x = ::abs(_x); + if (x <= scalar_t{8.0}) { + auto [A, len] = chebyshev_coefficients_i1e_A(); + scalar_t y = x / scalar_t{2.0} - scalar_t{2.0}; + const scalar_t out = ::exp(x) * x * chbevl(y, A, len); + return (_x < scalar_t{0.0}) ? -out : out; + } + + auto [B, len] = chebyshev_coefficients_i1e_B(); + const scalar_t out = (::exp(x) * chbevl(scalar_t{32.0} / x - scalar_t{2.0}, B, len)) / ::sqrt(x); + return (_x < scalar_t{0.0}) ? 
-out : out; +} + +template +static inline C10_HOST_DEVICE scalar_t calc_i1e(scalar_t _x) { + const auto x = ::abs(_x); + if (x <= scalar_t{8.0}) { + auto [A, len] = chebyshev_coefficients_i1e_A(); + const scalar_t y = x / scalar_t{2.0} - scalar_t{2.0}; + const scalar_t out = chbevl(y, A, len) * x; + return (_x < scalar_t{0.0}) ? -out : out; + } + + auto [B, len] = chebyshev_coefficients_i1e_B(); + const scalar_t out = chbevl(scalar_t{32.0} / x - scalar_t{2.0}, B, len) / ::sqrt(x); + return (_x < scalar_t{0.0}) ? -out : out; +} + +#endif // AT_USE_JITERATOR() (this closes the "else" branch of a if/else preprocessor directive) + +} // namespace at::native + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/cuda/MemoryAccess.cuh b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/cuda/MemoryAccess.cuh new file mode 100644 index 0000000000000000000000000000000000000000..46b93efea99c145881a29ce2e70df76b06eaf8ef --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/cuda/MemoryAccess.cuh @@ -0,0 +1,687 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +// References: +// https://devblogs.nvidia.com/cuda-pro-tip-increase-performance-with-vectorized-memory-access/ + +namespace at::native::memory { + +namespace detail { + +// What does the `static_unroll` do? +// +// We want to do something like: +// +// using args_t = typename traits::ArgsTuple; +// args_t args; +// #pragma unroll +// for (int i = 0; i < traits::arity; i++) { +// std::get(args) = .... 
+// } +// +// but unfortunately the above code does not work because +// the template argument has to be a compile time constant +// so `static_unroll` is created to simulate `#pragma unroll` +// using template metaprogramming. + +template typename func, int end, int current=0> +struct static_unroll { + template + static inline C10_HOST_DEVICE void with_args(Args&&... args) { + func::apply(std::forward(args)...); + static_unroll::with_args(args...); + } +}; + +template typename func, int end> +struct static_unroll { + template + static inline C10_HOST_DEVICE void with_args(Args... /*args*/) {} +}; + +// helper structs to be used with static_unroll to load arguments +// one by one + +template +struct vectorized_load_helper { + template + static __device__ void apply(policy_t &self, args_t *args, int idx, int block_work_size) { + using arg_t = std::tuple_element_t; + // `data` hold the data_ptr for tensors [output, input0, input1, ...], so we + // need a +1 offset to get the input + auto ptr = reinterpret_cast(self.data[arg_index + 1]) + block_work_size * idx; + auto args_accessor = [&args] __device__ (int thread_unroll_idx) -> arg_t & { return std::get(args[thread_unroll_idx]); }; + self.load_single_arg(args_accessor, ptr); + } +}; + +#ifdef USE_ROCM +// Templated version of vectorized load helper. +// It can be used on heterogeneous input tensor element types. +template +struct vectorized_templated_load_helper { + template + static __device__ void apply(policy_t& self, args_t* args, int idx) { + using arg_t = std::tuple_element_t; + // `data` hold the data_ptr for tensors [output, input0, input1, ...], so we + // need a +1 offset to get the input + + // Delay pointer arithmetic to the policy loader where we know the actual + // type of the current argument. 
+ char* ptr = (self.data[arg_index + 1]); + auto args_accessor = [&args] __device__(int thread_unroll_idx) -> arg_t& { + return std::get(args[thread_unroll_idx]); + }; + self.template load_single_arg(args_accessor, ptr, idx); + } +}; +#endif + +template +struct unroll_load_helper { + template + static __device__ void apply(policy_t &self, args_t *args, offset_t offset, loader_t loader, int j, int num_outputs) { + using arg_t = std::tuple_element_t; + // `data` hold the data_ptr for tensors [output, input0, input1, ...], so we + // need a +1 offset to get the input + std::get(args[j]) = loader.template load(self.data[arg_index + num_outputs], offset[arg_index], arg_index); + } +}; + +template +struct multi_outputs_store_helper { + template + C10_HOST_DEVICE static void apply( + const data_t& data, + const offsets_t& offsets, + thrust::tuple ret) { + using T = typename thrust::tuple_element>::type; + T *to = reinterpret_cast(data[current]) + offsets[current]; + *to = thrust::get(ret); + } +}; + +} // namespace detail + +struct LoadWithoutCast { + template + __device__ scalar_t load(char *base_ptr, uint32_t offset, int arg) { + return c10::load(reinterpret_cast(base_ptr) + offset); + } +}; + +template +struct LoadWithCast { + using array_t = std::array(N, 1)>; + using size_array_t = std::array(N, 1)>; + + array_t dtypes; + size_array_t element_sizes; + + LoadWithCast(const TensorIteratorBase& iter) { + CUDA_KERNEL_ASSERT(iter.ninputs() == N); + #pragma unroll + for (auto i = 0; i < N; ++i) { + this->dtypes[i] = iter.dtype(i + iter.noutputs()); + element_sizes[i] = c10::elementSize(iter.dtype(i + iter.noutputs())); + } + } + + template + __device__ scalar_t load(char *base_ptr, uint32_t offset, int arg) { + void *ptr = base_ptr + element_sizes[arg] * offset; + return c10::fetch_and_cast(dtypes[arg], ptr); + } +}; + +struct StoreWithoutCast { + template + __device__ void store(scalar_t value, char *base_ptr, uint32_t offset, int arg = 0) { + *(reinterpret_cast(base_ptr) 
+ offset) = value; + } +}; + +template +struct StoreWithCast { + using array_t = std::array(N, 1)>; + using size_array_t = std::array(N, 1)>; + + array_t dtypes; + size_array_t element_sizes; + + StoreWithCast(const TensorIteratorBase& iter) { + CUDA_KERNEL_ASSERT(iter.noutputs() == N); + #pragma unroll + for (auto i = 0; i < N; ++i) { + this->dtypes[i] = iter.dtype(i); + element_sizes[i] = c10::elementSize(iter.dtype(i)); + } + } + + template + __device__ void store(scalar_t value, char *base_ptr, uint32_t offset, int arg = 0) { + void *ptr = base_ptr + element_sizes[arg] * offset; + c10::cast_and_store(dtypes[arg], ptr, value); + } +}; + +// aligned vector generates vectorized load/store on CUDA +template +struct alignas(sizeof(scalar_t) * vec_size) aligned_vector { + scalar_t val[vec_size]; +}; + +template +__device__ aligned_vector load_vector(const scalar_t *base_ptr, uint32_t offset) { + using vec_t = aligned_vector; + auto *from = reinterpret_cast(base_ptr); +#if defined(USE_ROCM) && defined(__gfx942__) + using longx2 = __attribute__((__vector_size__(4*sizeof(int)))) int; + if constexpr (sizeof(vec_t) == sizeof(int)) { + union { + vec_t v; + int i; + } tmpt = { .i = __builtin_nontemporal_load(reinterpret_cast(&(from[offset]))) }; + return tmpt.v; + } + else if constexpr (sizeof(vec_t) == sizeof(long)) { + union { + vec_t v; + long i; + } tmpt = { .i = __builtin_nontemporal_load(reinterpret_cast(&(from[offset]))) }; + return tmpt.v; + } + else if constexpr (sizeof(vec_t) == sizeof(longx2)) { + union { + vec_t v; + longx2 i; + } tmpt = { .i = __builtin_nontemporal_load(reinterpret_cast(&(from[offset]))) }; + return tmpt.v; + } +#endif + return from[offset]; +} + +template +__device__ aligned_vector load_vector(const bool *base_ptr, uint32_t offset) { + // See NOTE [Loading boolean values] + auto tmp = load_vector(reinterpret_cast(base_ptr), offset); + aligned_vector ret; + for (int i = 0; i < vec_size; ++i) { + ret.val[i] = bool(tmp.val[i]); + } + return ret; 
+} + +namespace policies { + +template < + int num_threads, + typename data_t, + typename inp_calc_t, + typename out_calc_t, + typename loader_t, + typename storer_t, + int elems_per_thread, + int num_outputs = 1> +struct unroll_base { + data_t data; + int remaining; + inp_calc_t input_offset_calculator; + out_calc_t output_offset_calculator; + loader_t loader; + storer_t storer; + static constexpr int tws = elems_per_thread; + static constexpr int block_work_size = elems_per_thread * num_threads; + + __device__ unroll_base( + data_t data, + int remaining, + inp_calc_t ic, + out_calc_t oc, + loader_t l, + storer_t s) + : data(data), + remaining(remaining), + input_offset_calculator(ic), + output_offset_calculator(oc), + loader(l), + storer(s) {} + + __device__ inline bool check_inbounds(int thread_work_elem) { + return ((int)(threadIdx.x + thread_work_elem * num_threads) < remaining); + } + + template + __device__ inline void load(args_t *args, int idx) { + constexpr int arity = std::tuple_size_v; + int thread_idx = threadIdx.x; + #pragma unroll + for (int i = 0; i < elems_per_thread; i++) { + if (thread_idx < remaining) { + int linear_idx = thread_idx + block_work_size * idx; + auto offset = input_offset_calculator.get(linear_idx); + detail::static_unroll::with_args( + *this, args, offset, loader, i, num_outputs); + thread_idx += num_threads; + } + } + } + + template + __device__ inline void store(scalar_t *from, int idx) { + int thread_idx = threadIdx.x; + #pragma unroll + for (int i = 0; i < elems_per_thread; i++) { + if (thread_idx < remaining) { + int linear_idx = thread_idx + block_work_size * idx; + int offset = output_offset_calculator.get(linear_idx)[0]; + storer.store(from[i], data[0], offset); + thread_idx += num_threads; + } + } + } +}; + +// Utility type for all users of unroll that extract the num_threads value from +// the caller scope. 
+template < + typename data_t, + typename inp_calc_t, + typename out_calc_t, + typename loader_t, + typename storer_t, + int elems_per_thread, + int num_outputs = 1> +using unroll = unroll_base< + num_threads(), + data_t, + inp_calc_t, + out_calc_t, + loader_t, + storer_t, + elems_per_thread, + num_outputs>; + +template // vec_size: number of scalars, can be 1, 2, or 4. +struct vectorized { + + static_assert(elems_per_thread % vec_size == 0, "The workload per thread must be a multiple of vec_size"); + static constexpr int loop_size = elems_per_thread / vec_size; + static constexpr int tws = elems_per_thread; + + data_t data; + + __device__ vectorized(data_t data) : data(data) {} + + __device__ inline constexpr bool check_inbounds(int thread_work_elem) { + return true; + } + + template + __device__ inline void load_single_arg(accessor_t to, scalar_t *from) { + int thread_idx = threadIdx.x; + #pragma unroll + for (int i = 0; i < loop_size; i++) { + int index = thread_idx + i * num_threads(); + auto v = load_vector(from, index); + #pragma unroll + for (int j = 0; j < vec_size; j++) { + to(vec_size * i + j) = v.val[j]; + } + } + } + + template + __device__ inline void load(args_t *args, int idx) { + constexpr int arity = std::tuple_size_v; + detail::static_unroll::with_args(*this, args, idx, elems_per_thread * num_threads()); + } + + template + __device__ inline void store(scalar_t *from, int idx) { + using vec_t = aligned_vector; + scalar_t *to = reinterpret_cast(data[0]) + elems_per_thread * num_threads() * idx; + vec_t *to_ = reinterpret_cast(to); + int thread_idx = threadIdx.x; + #pragma unroll + for (int i = 0; i < loop_size; i++) { + int index = thread_idx + i * num_threads(); + vec_t v; + for (int j = 0; j < vec_size; j++) { + v.val[j] = from[vec_size * i + j]; + } + to_[index] = v; + } + } +}; + +#ifdef USE_ROCM +// This is similar to vectorized policy above, but this one supports +// heterogeneous input tensor types as templated parameters. 
+// Its use should be limited to frequently used heterogeneous data types +// as each instantiation will generate a separate kernel, leading to code +// bloating if applied to all combinations supported in PyTorch. Assumption: all +// tensors are contiguous, that is: stride == sizeof(type) for all tensors. +template < + int vec_size, + typename data_t, + int elems_per_thread, + int num_threads, + typename CastToT, + typename... CastFromTs> // vec_size: number of scalars, can be 1, 2, or 4. +struct vectorized_templated { + static_assert( + elems_per_thread % vec_size == 0, + "The workload per thread must be a multiple of vec_size"); + static constexpr int loop_size = elems_per_thread / vec_size; + static constexpr int tws = elems_per_thread; + static constexpr int block_work_size = elems_per_thread * num_threads; + data_t data; + + __device__ vectorized_templated(data_t data) : data(data) {} + + __device__ inline constexpr bool check_inbounds(int thread_work_elem) { + return true; + } + + template + __device__ inline void load_single_arg(accessor_t to, char* ptr, int idx) { + // extract the arg_index-th input tensor element type from the + // variadic template argument. + using CastFromT = + std::tuple_element_t>; + // Delayed pointer arithmetic from the caller: this is the place + // where we know the type of the argument. 
+ CastFromT* block_ptr = + reinterpret_cast(ptr) + block_work_size * idx; + int thread_idx = threadIdx.x; +#pragma unroll + for (int i = 0; i < loop_size; i++) { + int index = thread_idx + i * num_threads; + auto v = load_vector(block_ptr, index); +#pragma unroll + for (int j = 0; j < vec_size; j++) { + to(vec_size * i + j) = c10::convert(v.val[j]); + } + } + } + + template + __device__ inline void load(args_t* args, int idx) { + constexpr int arity = std::tuple_size::value; + detail::static_unroll:: + with_args(*this, args, idx); + } + + // Assume for now that from (temporary array per thread) is of the same + // type as to (destination tensor), which is the case for + // float(float,bfloat16) and functor add on float(float,float). + template + __device__ inline void store(scalar_t* from, int idx) { + using vec_t = aligned_vector; + CastToT* to = reinterpret_cast(data[0]) + block_work_size * idx; + vec_t* to_ = reinterpret_cast(to); + int thread_idx = threadIdx.x; +#pragma unroll + for (int i = 0; i < loop_size; i++) { + int index = thread_idx + i * num_threads; + vec_t v; + for (int j = 0; j < vec_size; j++) { + v.val[j] = from[vec_size * i + j]; + } + to_[index] = v; + } + } +}; +#endif + +template +struct multi_outputs_unroll { + //multi_outputs_unroll struct members and check_inbounds and load methods are copypasted from unroll struct + //we don't use inheritance because of compiler bug in cuda 10.2+ + data_t data; + int remaining; + inp_calc_t input_offset_calculator; + out_calc_t output_offset_calculator; + LoadWithoutCast loader; + StoreWithoutCast storer; + static constexpr int tws = thread_work_size(); + + __device__ multi_outputs_unroll(data_t data, int remaining, inp_calc_t ic, out_calc_t oc): + data(data), remaining(remaining), input_offset_calculator(ic), output_offset_calculator(oc) {} + + __device__ inline bool check_inbounds(int thread_work_elem) { + return ((int)(threadIdx.x + thread_work_elem*num_threads()) < remaining); + } + + template + 
__device__ inline void load(args_t *args, int idx) { + constexpr int arity = std::tuple_size_v; + int thread_idx = threadIdx.x; + #pragma unroll + for (int i = 0; i < thread_work_size(); i++) { + if (thread_idx >= remaining) { + return; + } + int linear_idx = thread_idx + block_work_size() * idx; + auto offset = input_offset_calculator.get(linear_idx); + detail::static_unroll::with_args(*this, args, offset, loader, i, num_outputs); + thread_idx += num_threads(); + } + } + + + template + __device__ inline void store(return_t *from, int idx) { + int thread_idx = threadIdx.x; + #pragma unroll + for (int i = 0; i < thread_work_size(); i++) { + if (thread_idx >= this->remaining) { + return; + } + int linear_idx = thread_idx + block_work_size() * idx; + auto offsets = this->output_offset_calculator.get(linear_idx); + memory::detail::static_unroll::with_args(this->data, offsets, from[i]); + thread_idx += num_threads(); + } + } +}; + +} // namespace policies + +// This is only used in host, but we will wrap this into some templates +// which is C10_HOST_DEVICE, so we have to make this C10_HOST_DEVICE +// in order to compile +template +inline C10_HOST_DEVICE int can_vectorize_up_to(const char *pointer) { + uint64_t address = reinterpret_cast(pointer); + constexpr int vec2_alignment = std::alignment_of_v>; + constexpr int vec4_alignment = std::alignment_of_v>; + constexpr int vec8_alignment = std::alignment_of_v>; +#ifdef USE_ROCM + constexpr int vec16_alignment = std::alignment_of_v>; + constexpr int type_size = sizeof(scalar_t); + if (type_size == 1 && (address % vec16_alignment == 0)) { + return 16; + } else if (type_size <= 2 && (address % vec8_alignment == 0)) { + return 8; + } else +#else + if (address % vec8_alignment == 0) { + return 8; + } else +#endif + if (address % vec4_alignment == 0) { + return 4; + } else if (address % vec2_alignment == 0) { + return 2; + } + return 1; +} + +template +inline C10_HOST_DEVICE int can_vectorize_up_to(char *pointer) { + return 
can_vectorize_up_to(static_cast(pointer)); +} + +template +struct can_vectorize_up_to_helper { + template + static C10_HOST_DEVICE void apply(int &result, array_t pointers, traits /*_*/) { + using arg_t = typename traits::template arg::type; + // `pointers` hold the data_ptr for tensors [output, input0, input1, ...], so we + // need a +1 offset to get the input + result = std::min(result, can_vectorize_up_to(pointers[i + 1])); + } +}; + +template +inline int can_vectorize_up_to(array_t pointers) { + using traits = function_traits; + using return_t = typename traits::result_type; + constexpr int arity = traits::arity; + int result = can_vectorize_up_to(pointers[0]); + // We need to get the type for each argument of `func_t`, this can only + // be done at compile time. + detail::static_unroll::with_args(result, pointers, traits()); + return result; +} + + + +template +__inline__ size_t get_alignment(T ptr_or_size) { + auto val = reinterpret_cast(ptr_or_size); + if (val % 16 == 0) { + return 16; + } else if (val % 8 == 0) { + return 8; + } else if (val % 4 == 0) { + return 4; + } else if (val % 2 == 0) { + return 2; + } else { + return 1; + } +} + +template <> +__inline__ size_t get_alignment(size_t size) { + return get_alignment(reinterpret_cast(size)); +} + +template +inline constexpr bool dependent_bool_value = Value; + +template +inline constexpr bool dependent_false = dependent_bool_value; + +template +union Vec; + +template <> +union Vec<4> { + uint16_t u16[2]; + uint32_t u32, as_scalar; + float f32; +}; + +template <> +union Vec<8> { + uint16_t u16[4]; + uint32_t u32[2]; + uint64_t u64, as_scalar; + float f32[2]; +}; + +template <> +union alignas(16) Vec<16> { + uint16_t u16[8]; + uint32_t u32[4]; + uint64_t u64[2]; + uint4 u128, as_scalar; + float f32[4]; +}; + +template +__device__ __inline__ Vec ld_vec(const T* addr) { + Vec vec; + if constexpr (Alignment == 16) { +#if defined(USE_ROCM) + vec.u128 = *reinterpret_cast(addr); + } else if constexpr (Alignment 
== 8) { + vec.u64 = *reinterpret_cast(addr); + } else if constexpr (Alignment == 4) { + vec.u32 = *reinterpret_cast(addr); +#else + asm("ld.global.v4.u32 {%0,%1,%2,%3}, [%4];" + : "=r"(vec.u32[0]), "=r"(vec.u32[1]), "=r"(vec.u32[2]), "=r"(vec.u32[3]) + : "l"(addr) + : "memory"); + } else if constexpr (Alignment == 8) { + asm("ld.global.v2.u32 {%0,%1}, [%2];" + : "=r"(vec.u32[0]), "=r"(vec.u32[1]) + : "l"(addr) + : "memory"); + } else if constexpr (Alignment == 4) { + asm("ld.global.u32 %0, [%1];" : "=r"(vec.u32) : "l"(addr) : "memory"); +#endif + } else { + static_assert(dependent_false); + } + return vec; +} + +template +__device__ __inline__ void st_vec(T* addr, const Vec& vec) { + if constexpr (Alignment == 16) { +#if defined(USE_ROCM) + reinterpret_cast(addr)[0] = vec.u64[0]; + reinterpret_cast(addr)[1] = vec.u64[1]; + } else if constexpr (Alignment == 8) { + *reinterpret_cast(addr) = vec.u64; + } else if constexpr (Alignment == 4) { + *reinterpret_cast(addr) = vec.u32; +#else + asm("st.global.v4.u32 [%0], {%1,%2,%3,%4};" + : + : "l"(addr), + "r"(vec.u32[0]), + "r"(vec.u32[1]), + "r"(vec.u32[2]), + "r"(vec.u32[3]) + : "memory"); + } else if constexpr (Alignment == 8) { + asm("st.global.v2.u32 [%0], {%1,%2};" + : + : "l"(addr), "r"(vec.u32[0]), "r"(vec.u32[1]) + : "memory"); + } else if constexpr (Alignment == 4) { + asm("st.global.u32 [%0], %1;" : : "l"(addr), "r"(vec.u32) : "memory"); +#endif + } else { + static_assert(dependent_false); + } +} + + + +} // namespace at::native::memory + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/cuda/MiscUtils.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/cuda/MiscUtils.h new file mode 100644 index 0000000000000000000000000000000000000000..e76505cedc08bfdc3e5ed83625a9b5187af3895a --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/cuda/MiscUtils.h @@ -0,0 +1,36 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +#include +#include +#include +#include + + +namespace at::native { + +static inline int cuda_int_cast(int64_t value, const char* varname) { + auto result = static_cast(value); + TORCH_CHECK(static_cast(result) == value, + "cuda_int_cast: The value of ", varname, "(", (long long)value, + ") is too large to fit into a int (", sizeof(int), " bytes)"); + return result; +} + +// Creates an array of size elements of type T, backed by pinned memory +// wrapped in a Storage +template +static inline Storage pin_memory(int64_t size) { + auto* allocator = cuda::getPinnedMemoryAllocator(); + int64_t adjusted_size = size * sizeof(T); + return Storage( + Storage::use_byte_size_t(), + adjusted_size, + allocator, + /*resizable=*/false); +} + +} // namespace at::native + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/cuda/MultiTensorApply.cuh b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/cuda/MultiTensorApply.cuh new file mode 100644 index 0000000000000000000000000000000000000000..37b4b11a6d9607139b18dfe2f6ef51a9fc5b4685 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/cuda/MultiTensorApply.cuh @@ -0,0 +1,387 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +#include +#include +#include +#include +#include +#include + +namespace at::native { + +namespace { + +static constexpr int64_t kILP = 4; +static constexpr int64_t kChunkSize = 65536; +static constexpr int64_t kBlockSize = 512; + +// TODO(crcrpar): Add `n>5` for `low prec params & their higher prec copy` +// TensorListMetadata has to be < 4KB - the limit for kernel launch argument +static constexpr int depth_to_max_tensors[5] = {110, 64, 48, 36, 30}; +static constexpr int depth_to_max_blocks[5] = {320, 320, 320, 320, 320}; +static constexpr int depth_to_max_tensors_scalarlist[5] = {96, 64, 48, 36, 30}; +static constexpr int depth_to_max_tensors_scalarlist_of_complex_double[2] = { + 72, + 60}; + +template +__device__ __forceinline__ bool is_aligned(T* p) { + return ((uint64_t)p) % (kILP * sizeof(T)) == 0; +} + +template +__device__ __forceinline__ void load_store( + T* dst, + T* src, + int64_t dst_offset, + int64_t src_offset) { + using LT = at::native::memory::aligned_vector; + ((LT*)dst)[dst_offset] = ((LT*)src)[src_offset]; +} + +template +struct TensorListMetadata { + const void* addresses[n][depth_to_max_tensors[n - 1]]; + int64_t numel_for_tensor[depth_to_max_tensors[n - 1]]; + unsigned char block_to_tensor[depth_to_max_blocks[n - 1]]; + int block_to_chunk[depth_to_max_blocks[n - 1]]; + int start_tensor_this_launch; +}; + 
+template +struct TensorListScalarListMetadata { + const void* addresses[n][depth_to_max_tensors_scalarlist[n - 1]]; + int64_t numel_for_tensor[depth_to_max_tensors_scalarlist[n - 1]]; + scalar_vals_t scalar_vals[depth_to_max_tensors_scalarlist[n - 1]]; + unsigned char block_to_tensor[depth_to_max_blocks[n - 1]]; + int block_to_chunk[depth_to_max_blocks[n - 1]]; +}; + +// note(mkozuki): `n` of 1&2 violate the limit of cuda kernel argument size of +// 4kb with `c10::complex` +template <> +struct TensorListScalarListMetadata, 1> { + const void* addresses[1] + [depth_to_max_tensors_scalarlist_of_complex_double[0]]; + int64_t + numel_for_tensor[depth_to_max_tensors_scalarlist_of_complex_double[0]]; + c10::complex + scalar_vals[depth_to_max_tensors_scalarlist_of_complex_double[0]]; + unsigned char block_to_tensor[depth_to_max_blocks[1 - 1]]; + int block_to_chunk[depth_to_max_blocks[1 - 1]]; +}; + +template <> +struct TensorListScalarListMetadata, 2> { + const void* addresses[2] + [depth_to_max_tensors_scalarlist_of_complex_double[1]]; + int64_t + numel_for_tensor[depth_to_max_tensors_scalarlist_of_complex_double[1]]; + c10::complex + scalar_vals[depth_to_max_tensors_scalarlist_of_complex_double[1]]; + unsigned char block_to_tensor[depth_to_max_blocks[2 - 1]]; + int block_to_chunk[depth_to_max_blocks[2 - 1]]; +}; + +// NOTE(crcrpar): This is a conservative resolution to handle `state_steps` +// whose each element is `at::Tensor` of 1 element representing the number of +// `step`s called so far. 
+template +struct FusedOptimizerTensorListMetadata { /* Launch metadata used by multi_tensor_apply_for_fused_optimizer: per-tensor data pointers, element counts and the block mapping, plus one state_steps data pointer per packed tensor. */ + const void* addresses[n][depth_to_max_tensors[n - 1]]; + int64_t numel_for_tensor[depth_to_max_tensors[n - 1]]; + const void* state_steps_addresses[depth_to_max_tensors_scalarlist[n - 1]]; /* NOTE(review): sized with depth_to_max_tensors_scalarlist while the arrays above and the tensor_full limit in multi_tensor_apply_for_fused_optimizer use depth_to_max_tensors; confirm both tables agree for every depth used, otherwise this array can be indexed past its end. */ + unsigned char block_to_tensor[depth_to_max_blocks[n - 1]]; + int block_to_chunk[depth_to_max_blocks[n - 1]]; + int start_tensor_this_launch; /* NOTE(review): never assigned in multi_tensor_apply_for_fused_optimizer, so the kernel receives an indeterminate value; confirm no functor reads it. */ +}; + +template +C10_LAUNCH_BOUNDS_1(kBlockSize) +__global__ void multi_tensor_apply_kernel( + T tensorListMeta, + U callable, + ArgTypes... args) { /* tensorListMeta is received by value as a kernel argument; the body is a single call to the functor. */ + // Hand the chunk information to the user-supplied functor to process however + // it likes. + callable(kChunkSize, tensorListMeta, args...); +} + +} // namespace + +// multi_tensor_apply enables horizontal fusion across lists of tensors. +// For example, whereas you once had a for-loop of a + b = c, where a, b, +// and c are individual tensors in lists as, bs, and cs, you can now with +// fewer kernel launches compute as + bs = cs. +// +// You can also imagine bs to be a scalar list vs a tensor list. +// +// The function below takes in tensor lists, scalars, and a callable and +// chunks up the computation to launch as few kernels as possible by iterating +// through every "chunk" in every tensor (thus the nested for loops). In the +// simplest case, everything gets bundled into just one kernel launch, but +// due to blocksize constraints, we may need to launch multiple kernels. +// Each kernel launch is defined by one tensorListMeta construct, which we +// use to track and reset the necessary metadata for each launch. +template +void multi_tensor_apply( /* Scalar-list overload: scalars[t] is converted to T::opmath_t and packed next to tensor t. NOTE(review): only tensor_lists.size() is checked; scalars[t] and tensor_lists[d][t] are indexed without a length check, presumably validated by callers. */ + std::vector>& tensor_lists, + at::ArrayRef scalars, + T callable, + ArgTypes... 
args) { + TORCH_CHECK( + tensor_lists.size() == depth, + "Number of tensor lists has to match the depth."); + const size_t n_tensors = tensor_lists[0].size(); + using scalar_vals_t = typename T::opmath_t; + TensorListScalarListMetadata tensorListMeta; + + int loc_block_info = 0; + int loc_tensor_info = 0; + for (size_t t = 0; t < n_tensors; t++) { + // short-circuit to avoid adding empty tensors to tensorListMeta + if (tensor_lists[0][t].numel() == 0) { + continue; + } + tensorListMeta.scalar_vals[loc_tensor_info] = scalars[t].to(); + tensorListMeta.numel_for_tensor[loc_tensor_info] = + tensor_lists[0][t].numel(); + for (int d = 0; d < depth; d++) { + tensorListMeta.addresses[d][loc_tensor_info] = + tensor_lists[d][t].const_data_ptr(); + } + loc_tensor_info++; + + // now we enter [chunking territory]. + // we will launch a kernel when EITHER the blocks get filled up OR + // the tensors get filled up. There will always be at least one block + // per tensor since the zero-sized ones will not enter the loop, so + // the nested forloop within represents iterating through the chunks + // of a single tensor. + const auto numel = tensor_lists[0][t].numel(); + const auto chunks = numel / kChunkSize + (numel % kChunkSize != 0); /* ceil(numel / kChunkSize) */ + for (auto chunk = 0; chunk < chunks; chunk++) { + tensorListMeta.block_to_tensor[loc_block_info] = loc_tensor_info - 1; /* slot of the tensor packed just above */ + tensorListMeta.block_to_chunk[loc_block_info] = chunk; + loc_block_info++; + + // a tensor is not considered full unless all its chunks have been + // processed + const bool tensors_full = + (loc_tensor_info == depth_to_max_tensors_scalarlist[depth - 1] && + chunk == chunks - 1); + const bool blocks_full = + (loc_block_info == depth_to_max_blocks[depth - 1]); + + if (tensors_full || blocks_full) { + multi_tensor_apply_kernel<<< + loc_block_info, /* grid size: one block per packed chunk */ + kBlockSize, + 0, + at::cuda::getCurrentCUDAStream()>>>( + tensorListMeta, callable, args...); + C10_CUDA_KERNEL_LAUNCH_CHECK(); + + // Reset. 
+ loc_block_info = 0; + // all chunks have already been handled in the kernel + if (chunk == chunks - 1) { + loc_tensor_info = 0; + } else { // blocks were full and tensor chunks remain /* move the unfinished tensor to slot 0 so the next launch continues with its remaining chunks */ + tensorListMeta.numel_for_tensor[0] = + tensorListMeta.numel_for_tensor[loc_tensor_info - 1]; + tensorListMeta.scalar_vals[0] = + tensorListMeta.scalar_vals[loc_tensor_info - 1]; + for (int d = 0; d < depth; d++) { + tensorListMeta.addresses[d][0] = + tensorListMeta.addresses[d][loc_tensor_info - 1]; + } + loc_tensor_info = 1; + } + } + } + } + + // note: [finishing what we started] + // if there's remaining work to be done but the tensors/blocks aren't full + // yet we are at the end, submit the kernel to do the work! + if (loc_block_info != 0) { + multi_tensor_apply_kernel<<< + loc_block_info, + kBlockSize, + 0, + at::cuda::getCurrentCUDAStream()>>>(tensorListMeta, callable, args...); + C10_CUDA_KERNEL_LAUNCH_CHECK(); + } +} + +template +void multi_tensor_apply( /* Tensor-list-only overload: same packing and launch scheme without scalars; additionally maintains start_tensor_this_launch in the metadata. */ + std::vector>& tensor_lists, + T callable, + ArgTypes... args) { + TORCH_CHECK( + tensor_lists.size() == depth, + "Number of tensor lists has to match the depth."); + const size_t n_tensors = tensor_lists[0].size(); + TensorListMetadata tensorListMeta; + tensorListMeta.start_tensor_this_launch = 0; + + int loc_block_info = 0; + int loc_tensor_info = 0; + int processed = 0; /* number of non-empty tensors packed so far across all launches */ + + for (size_t t = 0; t < n_tensors; t++) { + // short-circuit to avoid adding empty tensors to tensorListMeta + if (tensor_lists[0][t].numel() == 0) { + continue; + } + processed++; + tensorListMeta.numel_for_tensor[loc_tensor_info] = + tensor_lists[0][t].numel(); + for (int d = 0; d < depth; d++) { + tensorListMeta.addresses[d][loc_tensor_info] = + tensor_lists[d][t].const_data_ptr(); + } + loc_tensor_info++; + + // see note: [chunking territory]. 
+ const auto numel = tensor_lists[0][t].numel(); + const auto chunks = numel / kChunkSize + (numel % kChunkSize != 0); /* ceil(numel / kChunkSize) */ + for (auto chunk = 0; chunk < chunks; chunk++) { + tensorListMeta.block_to_tensor[loc_block_info] = loc_tensor_info - 1; + tensorListMeta.block_to_chunk[loc_block_info] = chunk; + loc_block_info++; + + const bool tensors_full = + (loc_tensor_info == depth_to_max_tensors[depth - 1] && + chunk == chunks - 1); + const bool blocks_full = + (loc_block_info == depth_to_max_blocks[depth - 1]); + + if (tensors_full || blocks_full) { + multi_tensor_apply_kernel<<< + loc_block_info, + kBlockSize, + 0, + at::cuda::getCurrentCUDAStream()>>>( + tensorListMeta, callable, args...); + C10_CUDA_KERNEL_LAUNCH_CHECK(); + + // Reset. + loc_block_info = 0; + if (chunk == chunks - 1) { + loc_tensor_info = 0; + tensorListMeta.start_tensor_this_launch = processed; /* next launch begins with the next non-empty tensor */ + } else { + tensorListMeta.numel_for_tensor[0] = + tensorListMeta.numel_for_tensor[loc_tensor_info - 1]; + for (int d = 0; d < depth; d++) { + tensorListMeta.addresses[d][0] = + tensorListMeta.addresses[d][loc_tensor_info - 1]; + } + loc_tensor_info = 1; + tensorListMeta.start_tensor_this_launch = processed - 1; /* next launch resumes the current tensor (0-based index among non-empty tensors) */ + } + } + } + } + + // see note: [finishing what we started] + if (loc_block_info != 0) { + multi_tensor_apply_kernel<<< + loc_block_info, + kBlockSize, + 0, + at::cuda::getCurrentCUDAStream()>>>(tensorListMeta, callable, args...); + C10_CUDA_KERNEL_LAUNCH_CHECK(); + } +} + +template +void multi_tensor_apply_for_fused_optimizer( /* Same packing and launch scheme as multi_tensor_apply, with state_steps[tensor_index].const_data_ptr() recorded per packed tensor. NOTE(review): state_steps is indexed by tensor_index without a length check, presumably validated by callers. */ + std::vector>& tensor_lists, + at::TensorList state_steps, + T callable, + ArgTypes... 
args) { + TORCH_CHECK( + tensor_lists.size() == depth, + "Number of tensor lists has to match the depth"); + const auto num_tensors = tensor_lists[0].size(); + FusedOptimizerTensorListMetadata tensorListMeta; + + int loc_block_info = 0; + int loc_tensor_info = 0; + for (const auto& tensor_index : c10::irange(num_tensors)) { + // short-circuit to avoid adding empty tensors to tensorListMeta + if (tensor_lists[0][tensor_index].numel() == 0) { + continue; + } + tensorListMeta.state_steps_addresses[loc_tensor_info] = + state_steps[tensor_index].const_data_ptr(); + tensorListMeta.numel_for_tensor[loc_tensor_info] = + tensor_lists[0][tensor_index].numel(); + for (const auto& d : c10::irange(depth)) { + tensorListMeta.addresses[d][loc_tensor_info] = + tensor_lists[d][tensor_index].const_data_ptr(); + } + loc_tensor_info++; + + // see above note: [chunking territory] + const auto numel = tensor_lists[0][tensor_index].numel(); + const auto chunks = numel / kChunkSize + (numel % kChunkSize != 0); + TORCH_CHECK(chunks > -1); /* presumably guards the irange below against a negative chunk count */ + for (const auto& chunk : c10::irange(chunks)) { + tensorListMeta.block_to_tensor[loc_block_info] = loc_tensor_info - 1; + tensorListMeta.block_to_chunk[loc_block_info] = chunk; + loc_block_info++; + + const auto tensor_full = + (loc_tensor_info == depth_to_max_tensors[depth - 1] && + chunk == chunks - 1); + const auto blocks_full = loc_block_info == depth_to_max_blocks[depth - 1]; + + if (tensor_full || blocks_full) { + multi_tensor_apply_kernel<<< + loc_block_info, + kBlockSize, + 0, + at::cuda::getCurrentCUDAStream()>>>( + tensorListMeta, callable, args...); + C10_CUDA_KERNEL_LAUNCH_CHECK(); + + // Reset. 
+ loc_block_info = 0; + if (chunk == chunks - 1) { + loc_tensor_info = 0; + } else { /* carry the unfinished tensor, including its state_steps pointer, over to slot 0 for the next launch */ + tensorListMeta.numel_for_tensor[0] = + tensorListMeta.numel_for_tensor[loc_tensor_info - 1]; + tensorListMeta.state_steps_addresses[0] = + tensorListMeta.state_steps_addresses[loc_tensor_info - 1]; + for (const auto& d : c10::irange(depth)) { + tensorListMeta.addresses[d][0] = + tensorListMeta.addresses[d][loc_tensor_info - 1]; + } + loc_tensor_info = 1; + } + } + } + } + + // see above note: [finishing what we started] + if (loc_block_info != 0) { + multi_tensor_apply_kernel<<< + loc_block_info, + kBlockSize, + 0, + at::cuda::getCurrentCUDAStream()>>>(tensorListMeta, callable, args...); + C10_CUDA_KERNEL_LAUNCH_CHECK(); + } +} + +} // namespace at::native + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/cuda/Normalization.cuh b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/cuda/Normalization.cuh new file mode 100644 index 0000000000000000000000000000000000000000..bfd2d30570b6d2f4b96f88aa4ca70a4cd9f4bae5 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/cuda/Normalization.cuh @@ -0,0 +1,1778 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifndef AT_PER_OPERATOR_HEADERS +#include +#else +#include +#include +#include +#endif + +namespace at::native { + +// The maximum number of threads in a block +#if defined(USE_ROCM) +constexpr int MAX_BLOCK_SIZE = 1024; +#else +constexpr int MAX_BLOCK_SIZE = 512; +#endif + +constexpr unsigned MAX_GRID_SIZE = 65535u; + +// Number of threads in a block given an input size up to 
MAX_BLOCK_SIZE +static int getNumThreads(int nElem) { /* returns the first entry of threadSizes that is >= nElem, or MAX_BLOCK_SIZE when nElem is larger than every entry */ +#if defined(USE_ROCM) + int threadSizes[5] = { 64, 128, 256, 512, MAX_BLOCK_SIZE }; +#else + int threadSizes[5] = { 32, 64, 128, 256, MAX_BLOCK_SIZE }; +#endif + for (int i = 0; i != 5; ++i) { + if (nElem <= threadSizes[i]) { + return threadSizes[i]; + } + } + return MAX_BLOCK_SIZE; +} + +// Returns the index of the most significant 1 bit in `val`. +__device__ __forceinline__ int getMSB(int val) { + return 31 - __clz(val); /* NOTE(review): for val == 0 the result depends on __clz(0); presumably 32, which would make this return -1 - confirm */ +} + +template +struct Float2 { /* Pair of accscalar_t accumulators that are added component-wise. */ + accscalar_t v1, v2; + __device__ Float2() {} /* leaves v1 and v2 uninitialized */ + __device__ Float2(scalar_t v1, scalar_t v2) : v1(static_cast(v1)), v2(static_cast(v2)) {} + __device__ Float2(int v) : v1(static_cast(v)), v2(static_cast(v)) {} /* sets both components to v */ + __device__ Float2& operator+=(const Float2& a) { + v1 += a.v1; + v2 += a.v2; + return *this; + } + __device__ friend Float2 operator+(Float2 a, const Float2& b) { + a += b; + return a; + } +}; + +template +struct GradOp { /* Per-element functor: returns the pair (g, g * (x - mean)) where g and x are grad_output and input at [batch][plane][n]. */ + __device__ GradOp(accscalar_t m, const PTA& i, const PTA& g) + : mean(m), input(i), grad_output(g) {} + __device__ __forceinline__ Float2 operator()(int batch, int plane, int n) { + accscalar_t g = grad_output[batch][plane][n]; + accscalar_t c = static_cast(input[batch][plane][n]) - mean; + return Float2(g, g * c); + } + const accscalar_t mean; + const PTA& input; + const PTA& grad_output; /* input and grad_output are stored as references, so the accessors given to the constructor must outlive this object */ +}; + +template +struct SumReduceOp { /* Sum policy: combine adds two values, warp_shfl_down forwards to WARP_SHFL_DOWN. */ + __device__ __forceinline__ acc_t combine(acc_t a, acc_t b) const { return a + b; } + + __device__ __forceinline__ acc_t warp_shfl_down(acc_t data, int offset) const { + return WARP_SHFL_DOWN(data, offset); + } +}; + +template +struct SumReduceOp> { /* Float2 variant: v1 and v2 are shuffled separately and repacked into a new pair. */ + using acc_t = Float2; + + __device__ __forceinline__ acc_t combine(acc_t a, acc_t b) const { return a + b; } + + __device__ __forceinline__ acc_t warp_shfl_down(acc_t data, int offset) const { + return {WARP_SHFL_DOWN(data.v1, offset), WARP_SHFL_DOWN(data.v2, offset)}; + } +}; + +// Sum across (batch, x/y/z) applying Op() pointwise +// this works by first having 
each thread sum it's part +// of the data. Then there is a double-shuffling reduction. +// First each warp (of C10_WARP_SIZE threads) uses warpSum to reduce its +// data to the "warp leader", who writes its value into shared memory. +// Then a single warp reads the remaining (at most C10_WARP_SIZE) items +// and reduces them using another warpSum. +// The implicit assumption is that there are no more +// than C10_WARP_SIZE**2 threads. +template +__device__ scalar_t reduce(Op op, PTA tensor, int plane) { + // first the reductions each thread does separately + scalar_t sum = static_cast(0); + for (int batch = threadIdx.y; batch < tensor.size(0); batch += blockDim.y) { +#if defined(USE_ROCM) + constexpr int UNRL = 4; // load deserilize factor + scalar_t tmp[UNRL]; + for (int x = threadIdx.x; x < tensor.size(2); x += blockDim.x*UNRL) { +#pragma unroll + for (int u = 0; u < UNRL; u++) + tmp[u] = op(batch, plane, std::min((int)tensor.size(2)-1, (int)(x+u*blockDim.x))); +#pragma unroll + for (int u = 0; u < UNRL; u++) + if (x+u*blockDim.x < tensor.size(2)) + sum += tmp[u]; + } +#else + for (int x = threadIdx.x; x < tensor.size(2); x += blockDim.x) { + sum += op(batch, plane, x); + } +#endif + } + __shared__ scalar_t shared[C10_WARP_SIZE]; + SumReduceOp reduce_op; + sum = cuda_utils::BlockReduce, cuda_utils::Block2D>(sum, reduce_op, 0, shared); + if (threadIdx.x == 0 && threadIdx.y == 0) { + shared[0] = sum; + } + __syncthreads(); + // Everyone picks it up, should be broadcast into the whole grad_input + return shared[0]; +} + +constexpr int ELEMENTS_PER_ITER = 4; // enables concurrency within each thread to hide latency +constexpr int ELEMENTS_PER_THREAD = 16; +constexpr int OPTIMAL_TILE_W = 32; +constexpr int MAX_H_BLOCK = 128; + +__host__ void flexible_launch_configs( + const int reduction, + const int stride, + dim3 &block, + dim3 &grid, + const bool coop_flag = false) { + int block_x = std::min(lastPow2(stride), OPTIMAL_TILE_W); + int block_y = 
std::min(lastPow2(at::ceil_div(reduction , ELEMENTS_PER_THREAD)), + MAX_BLOCK_SIZE / block_x); + if (block_x * block_y != MAX_BLOCK_SIZE) { + block_x = std::min(lastPow2(stride), MAX_BLOCK_SIZE / block_y); + } + + int grid_x = at::ceil_div(stride, block_x); + int grid_y = std::min(at::ceil_div(reduction, block_y * ELEMENTS_PER_THREAD), MAX_H_BLOCK); + if (coop_flag) { + // it's not worth having a grid reduction if the reduction dimension is not big enough + grid_y = grid_y < 8 ? 1 : grid_y; + } + + block.x = block_x; + block.y = block_y; + block.z = 1; + grid.x = grid_x; + grid.y = grid_y; + grid.z = 1; +} + +template +__device__ __forceinline__ void welford_merge_element(C& count, + T& mean, + T& m2n, + const C& count_new, + const T& mean_new, + const T& m2n_new) { + T factor = T(1.0) / ::max(1, (count + count_new)); + T delta0 = mean - mean_new; + mean = (mean_new * count_new + mean * count) * factor; + m2n += m2n_new + delta0 * delta0 * count_new * count * factor; + count += count_new; +} + +// merge mean/m2n among threadIdx.y within block +template +__device__ __forceinline__ void welford_merge_block_vertical(C& count, + T& mean, + T& m2n, + C* shmem_count, + T* shmem_mean, + T* shmem_m2n) { + // write to shared memory + auto address_base = threadIdx.x + threadIdx.y * blockDim.x; + +#pragma unroll + for (int offset = blockDim.y/2; offset > 0; offset >>= 1) { + if (threadIdx.y < offset*2) { + shmem_mean[address_base] = mean; + shmem_m2n[address_base] = m2n; + shmem_count[address_base] = count; + } + __syncthreads(); + if (threadIdx.y < offset && threadIdx.y + offset < blockDim.y) { + auto address = address_base + offset * blockDim.x; + // read shared memory back to register for reduction + auto count_new = shmem_count[address]; + auto mean_new = shmem_mean[address]; + auto m2n_new = shmem_m2n[address]; + + welford_merge_element(count, mean, m2n, count_new, mean_new, m2n_new); + } + } +} + +template +__global__ void batch_norm_transform_input_kernel( + const 
GenericPackedTensorAccessor input, + GenericPackedTensorAccessor output, + const GenericPackedTensorAccessor, 1, RestrictPtrTraits, index_t> mean_, + const GenericPackedTensorAccessor, 1, RestrictPtrTraits, index_t> var_or_invstd, + const GenericPackedTensorAccessor weight, + const GenericPackedTensorAccessor bias, + stat_accscalar_t epsilon) { + + index_t plane = blockIdx.x; + + if (plane >= input.size(1)) { + return; + } + + stat_accscalar_t gamma = weight.size(0) > 0 ? static_cast(weight[plane]) : static_cast(1); + stat_accscalar_t beta = bias.size(0) > 0 ? static_cast(bias[plane]) : static_cast(0); + stat_accscalar_t mean = static_cast(mean_[plane]); + stat_accscalar_t invstd; + if (train) { + invstd = var_or_invstd[plane]; + } else { + invstd = static_cast(1) / device_sqrt(static_cast(var_or_invstd[plane]) + epsilon); + } + + index_t bs = input.size(0); + index_t fs = input.size(2); + + index_t bstep = blockDim.y * gridDim.y; + for (index_t batch = threadIdx.y + blockIdx.y * blockDim.y; batch < bs; batch += bstep) { + auto o = output[batch][plane]; + auto i = input[batch][plane]; + for (index_t feature = threadIdx.x; feature < fs; feature += blockDim.x) { + o[feature] = static_cast(gamma * (i[feature] - mean) * invstd + beta); + } + } +} + +struct InvStd { + template + __device__ __forceinline__ T operator()(T var, double epsilon) const { + T invstd = 0; + if (var != static_cast(0) || epsilon != static_cast(0)) { + invstd = static_cast(1) / device_sqrt(var + epsilon); + } + return invstd; + } +}; + +struct Var { + template + __device__ __forceinline__ T operator()(T var, double epsilon) const { + return var; + } +}; + +template +__global__ void batch_norm_collect_statistics_kernel( + const GenericPackedTensorAccessor input, + const stat_accscalar_t epsilon, + const stat_accscalar_t momentum, + GenericPackedTensorAccessor save_mean, + GenericPackedTensorAccessor save_transformed_var) { + + __shared__ int shared_n[2 * 2 * C10_WARP_SIZE + C10_WARP_SIZE]; + + int 
plane = blockIdx.x; + int N = input.size(0) * input.size(2); + int tid = threadIdx.x + threadIdx.y * blockDim.x; + + // Compute the mean and variance across (batch, x/y/z) + // this uses the Welford (in the for loop)/parallel algorithm (to sum across the block) + // https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Welford's_Online_algorithm + // and the parallel algorithm on the same page. + // We use two shuffles to reduce across the entire block. + // https://devblogs.nvidia.com/faster-parallel-reductions-kepler/ has a description. + stat_accscalar_t* shared_avg_var = (stat_accscalar_t*) &shared_n[C10_WARP_SIZE]; + + // first the reductions each thread does separately + stat_accscalar_t avg = 0; + stat_accscalar_t var_n = 0; + int n = 0; + for (int batch = threadIdx.y; batch < input.size(0); batch += blockDim.y) { +#if defined(USE_ROCM) + constexpr int UNRL = 4; + stat_accscalar_t v_[UNRL]; + for (int x = threadIdx.x; x < input.size(2); x += blockDim.x*UNRL) { + for (int u = 0; u < UNRL; u++) + v_[u] = input[batch][plane][std::min(x+u*blockDim.x, input.size(2)-1)]; + for (int u = 0; u < UNRL; u++) { + if (x+u*blockDim.x < input.size(2)) { + stat_accscalar_t d1 = v_[u] - avg; + n++; + avg += d1 / n; + var_n += d1 * (v_[u] - avg); + } + } + } +#else + for (int x = threadIdx.x; x < input.size(2); x += blockDim.x) { + stat_accscalar_t v = input[batch][plane][x]; + stat_accscalar_t d1 = v - avg; + n++; + avg += d1 / n; + var_n += d1 * (v - avg); + } +#endif + } + + // first warpSum to get one value per thread to + // one value per warp + for (int i = 0; i < getMSB(C10_WARP_SIZE); ++i) { + stat_accscalar_t o_avg = WARP_SHFL_XOR(avg, 1 << i, C10_WARP_SIZE); + int o_n = WARP_SHFL_XOR(n, 1 << i, C10_WARP_SIZE); + stat_accscalar_t factor = 1.0 / fmaxf(1.0, n+o_n); + var_n += WARP_SHFL_XOR(var_n, 1 << i, C10_WARP_SIZE) + (avg - o_avg) * (avg - o_avg) * n * o_n * factor; + avg = (n * avg + o_n * o_avg) * factor; + n += o_n; + } + + // this writes each warps 
item into shared memory + // there are at most C10_WARP_SIZE items left because + // there are at most C10_WARP_SIZE**2 threads at the beginning + __syncthreads(); + if (tid % C10_WARP_SIZE == 0) { + shared_n[tid / C10_WARP_SIZE] = n; + shared_avg_var[tid / C10_WARP_SIZE * 2] = avg; + shared_avg_var[tid / C10_WARP_SIZE * 2 + 1] = var_n; + } + __syncthreads(); + // now have a second warpSum to reduce the intermediate values + // from shared memory to a single number. The very first + // thread writes it to shared memory. + + if (tid < C10_WARP_SIZE) { + n = (tid < blockDim.x * blockDim.y / C10_WARP_SIZE ? shared_n[tid] : 0); + avg = (tid < blockDim.x * blockDim.y / C10_WARP_SIZE ? shared_avg_var[2 * tid] : stat_accscalar_t(0)); + var_n = (tid < blockDim.x * blockDim.y / C10_WARP_SIZE ? shared_avg_var[2 * tid + 1] : stat_accscalar_t(0)); + } + for (int i = 0; i < getMSB(C10_WARP_SIZE); ++i) { + stat_accscalar_t o_avg = WARP_SHFL_XOR(avg, 1 << i, C10_WARP_SIZE); + int o_n = WARP_SHFL_XOR(n, 1 << i, C10_WARP_SIZE); + stat_accscalar_t factor = 1.0 / fmaxf(1.0, n+o_n); + var_n += WARP_SHFL_XOR(var_n, 1 << i, C10_WARP_SIZE) + (avg - o_avg) * (avg - o_avg) * n * o_n * factor; + avg = (n * avg + o_n * o_avg) * factor; + n += o_n; + } + + // Save the mean, variance, and moving averages + if (tid == 0) { + if (save_mean.data() != NULL) { + save_mean[plane] = avg; + } + if (save_transformed_var.data() != NULL) { + save_transformed_var[plane] = VarTransform{}(var_n / N, epsilon); + } + } + +} + +template +__global__ void batch_norm_backward_kernel( + const GenericPackedTensorAccessor input, + const GenericPackedTensorAccessor grad_output, + GenericPackedTensorAccessor grad_input, + GenericPackedTensorAccessor grad_weight, + GenericPackedTensorAccessor grad_bias, + const GenericPackedTensorAccessor weight, + const GenericPackedTensorAccessor running_mean, + const GenericPackedTensorAccessor running_var, + const GenericPackedTensorAccessor save_mean, + const 
GenericPackedTensorAccessor save_invstd, + bool train, + stat_accscalar_t epsilon) { + + index_t plane = blockIdx.x; + index_t N = grad_output.size(0) * grad_output.size(2); + + stat_accscalar_t mean, invstd; + if (train) { + mean = save_mean[plane]; + invstd = save_invstd[plane]; + } else { + mean = static_cast(running_mean[plane]); + invstd = static_cast(1) / device_sqrt(static_cast(running_var[plane]) + epsilon); + } + + stat_accscalar_t weight_val = weight.size(0) > 0 ? static_cast(weight[plane]) : stat_accscalar_t(1); + stat_accscalar_t norm = stat_accscalar_t(1) / N; + + // Compute two values across (batch, x/y/z) in one pass: + // 1. Sum(grad_output) + // 2. DotProduct(input - mean, grad_output) + GradOp> g(mean, input, grad_output); + auto res = reduce>(g, grad_output, plane); + + stat_accscalar_t grad_output_sum = res.v1; + stat_accscalar_t dot_p = res.v2; + + stat_accscalar_t grad_mean = grad_output_sum * norm; + stat_accscalar_t proj_scale = dot_p * norm * invstd * invstd; + stat_accscalar_t grad_scale = invstd * weight_val; + + if (grad_input.data() != NULL) { + for (int batch = threadIdx.y; batch < grad_output.size(0); batch += blockDim.y) { + for (int x = threadIdx.x; x < grad_output.size(2); x += blockDim.x) { + input_scalar_t go = grad_output[batch][plane][x]; + if (train) { + stat_accscalar_t inp = input[batch][plane][x]; + stat_accscalar_t proj = (inp - mean) * proj_scale; + grad_input[batch][plane][x] = static_cast((go - proj - grad_mean) * grad_scale); + } else { + grad_input[batch][plane][x] = static_cast(go * grad_scale); + } + } + } + } + + if (grad_weight.size(0) > 0) { + if (threadIdx.x == 0) { + grad_weight[plane] = static_cast(dot_p * invstd); + } + } + + if (grad_bias.size(0) > 0) { + if (threadIdx.x == 0) { + grad_bias[plane] = static_cast(grad_output_sum); + } + } +} + +template +__global__ void batch_norm_reduce_statistics_kernel( + const GenericPackedTensorAccessor vec_mean, + const GenericPackedTensorAccessor vec_invstd, + 
GenericPackedTensorAccessor mean, + GenericPackedTensorAccessor invstd, + GenericPackedTensorAccessor running_mean, + GenericPackedTensorAccessor running_var, + const accscalar_t epsilon, + const accscalar_t momentum, + const GenericPackedTensorAccessor counts) { + + int feature_size = vec_mean.size(1); + int world_size = vec_mean.size(0); + + int bid = blockIdx.x; + int tid = threadIdx.x; + + // first the reductions each thread does separately + for (int i = bid*blockDim.x+tid; i < feature_size; i += gridDim.x*blockDim.x) { + accscalar_t avg = 0; + accscalar_t var_n = 0; + index_t n = 0; + for (int j = 0; j < world_size; j++) { + scalar_t count = counts[j]; + accscalar_t m = vec_mean[j][i]; + accscalar_t v = accscalar_t(1.0) / (vec_invstd[j][i]); + v = (v * v - epsilon) * count; + accscalar_t factor = 1.0 / (n + count); + var_n += v + (avg - m) * (avg - m) * n * count * factor; + avg = n * factor * avg + count * factor * m; + n += count; + } + mean[i] = avg; + invstd[i] = static_cast(1) / device_sqrt(var_n / n + epsilon); + if (running_mean.data() != NULL) { + running_mean[i] = static_cast((1 - momentum) * running_mean[i] + momentum * avg); + } + accscalar_t unbiasedVar = var_n / (n - 1); + if (running_var.data() != NULL) { + running_var[i] = static_cast((1 - momentum) * running_var[i] + momentum * unbiasedVar); + } + } + +} + +template +__global__ void batch_norm_backward_reduce_kernel( + const GenericPackedTensorAccessor input, + const GenericPackedTensorAccessor grad_output, + GenericPackedTensorAccessor mean, + GenericPackedTensorAccessor invstd, + GenericPackedTensorAccessor sum_dy, + GenericPackedTensorAccessor sum_dy_xmu, + GenericPackedTensorAccessor grad_weight, + GenericPackedTensorAccessor grad_bias) { + + index_t plane = blockIdx.x; + + stat_accscalar_t r_mean = mean[plane]; + stat_accscalar_t factor = invstd[plane]; + + GradOp> g(r_mean, input, grad_output); + auto res = reduce>(g, grad_output, plane); + + if (threadIdx.x == 0) { + if 
(grad_weight.size(0) > 0) { + grad_weight[plane] = static_cast(res.v2 * factor); + } + if (grad_bias.size(0) > 0) { + grad_bias[plane] = static_cast(res.v1); + } + if (sum_dy.size(0) > 0) { + sum_dy[plane] = static_cast(res.v1); + } + if (sum_dy_xmu.size(0) > 0) { + sum_dy_xmu[plane] = static_cast(res.v2); + } + } +} + +template +__device__ __forceinline__ void batch_norm_backward_elemt_kernel_impl( + const GenericPackedTensorAccessor input, + const GenericPackedTensorAccessor grad_output, + const GenericPackedTensorAccessor mean, + const GenericPackedTensorAccessor invstd, + const GenericPackedTensorAccessor weight, + const GenericPackedTensorAccessor sum_dy, + const GenericPackedTensorAccessor sum_dy_xmu, + GenericPackedTensorAccessor grad_input, + const stat_accscalar_t norm_fct) { + index_t plane = blockIdx.x; + + if (plane >= input.size(1)) { + return; + } + + stat_accscalar_t m_c = mean[plane]; + stat_accscalar_t m_dy_c = sum_dy[plane] * norm_fct; + stat_accscalar_t factor_1_c = invstd[plane]; + stat_accscalar_t factor_2_c = weight.size(0) > 0 ? 
static_cast(weight[plane]) : stat_accscalar_t(1); + factor_2_c *= factor_1_c; + factor_1_c = factor_1_c * factor_1_c * sum_dy_xmu[plane] * norm_fct; + + index_t bs = input.size(0); + index_t fs = input.size(2); + + index_t bstep = blockDim.y * gridDim.y; + for (index_t batch = threadIdx.y + blockIdx.y * blockDim.y; batch < bs; batch += bstep) { + auto g_i = grad_input[batch][plane]; + auto g_o = grad_output[batch][plane]; + auto i = input[batch][plane]; + for (index_t feature = threadIdx.x; feature < fs; feature += blockDim.x) { + g_i[feature] = static_cast((g_o[feature] - m_dy_c - (i[feature] - m_c) * factor_1_c) * factor_2_c); + } + } +} + +template +__global__ void batch_norm_backward_elemt_kernel( + const GenericPackedTensorAccessor input, + const GenericPackedTensorAccessor grad_output, + const GenericPackedTensorAccessor mean, + const GenericPackedTensorAccessor invstd, + const GenericPackedTensorAccessor weight, + const GenericPackedTensorAccessor sum_dy, + const GenericPackedTensorAccessor sum_dy_xmu, + GenericPackedTensorAccessor grad_input, + const int* __restrict__ numel, const int world_size) { + int64_t total_numel = 0; + for (int i = 0; i < world_size; i ++) { + total_numel += numel[i]; + } + + const stat_accscalar_t norm_fct = + static_cast(1) / static_cast(total_numel); + batch_norm_backward_elemt_kernel_impl( + input, grad_output, mean, invstd, weight, sum_dy, sum_dy_xmu, grad_input, norm_fct); +} + +template +__global__ void batch_norm_backward_elemt_kernel( + const GenericPackedTensorAccessor input, + const GenericPackedTensorAccessor grad_output, + const GenericPackedTensorAccessor mean, + const GenericPackedTensorAccessor invstd, + const GenericPackedTensorAccessor weight, + const GenericPackedTensorAccessor sum_dy, + const GenericPackedTensorAccessor sum_dy_xmu, + GenericPackedTensorAccessor grad_input, + const stat_accscalar_t norm_fct) { + batch_norm_backward_elemt_kernel_impl( + input, grad_output, mean, invstd, weight, sum_dy, sum_dy_xmu, 
grad_input, norm_fct); +} + +template class PtrTraits = DefaultPtrTraits, typename index_t = int64_t> +static GenericPackedTensorAccessor get_packed_accessor( + const Tensor& t, std::string_view var_name) { + constexpr auto expect_type = c10::CppTypeToScalarType>::value; + const auto actual_type = t.scalar_type(); + TORCH_CHECK(actual_type == expect_type, "Expected ", var_name, + " to have type ", expect_type, " but got ", actual_type); + return t.generic_packed_accessor(); +} + +template class PtrTraits = DefaultPtrTraits, typename index_t = int64_t> +static GenericPackedTensorAccessor packed_accessor_or_dummy( + const Tensor& t, std::string_view var_name) { + if (!t.defined()) { + const std::array zeros{{0}}; + return GenericPackedTensorAccessor(nullptr, zeros.data(), zeros.data()); + } + return get_packed_accessor(t, var_name); +} + +template +std::tuple batch_norm_backward_cuda_template(const Tensor& grad_out_, const Tensor& input_, const Tensor& weight_, + const Tensor& running_mean_, const Tensor& running_var_, const Tensor& save_mean_, const Tensor& save_invstd_, + bool train, double epsilon, std::array grad_input_mask) { + + using accscalar_t = at::acc_type; + Tensor grad_input_; + Tensor grad_input_reshaped; + Tensor grad_weight_; + Tensor grad_bias_; + auto input_reshaped = input_.reshape({input_.size(0), input_.size(1), -1}); + auto grad_output_reshaped = grad_out_.reshape(input_reshaped.sizes()); + + if (grad_input_mask[0]) { + grad_input_ = at::empty_like(input_, LEGACY_CONTIGUOUS_MEMORY_FORMAT); + grad_input_reshaped = grad_input_.view(input_reshaped.sizes()); + } + if (grad_input_mask[1]) { + grad_weight_ = at::empty_like(weight_, LEGACY_CONTIGUOUS_MEMORY_FORMAT); + } + if (grad_input_mask[2]) { + grad_bias_ = at::empty_like(weight_, LEGACY_CONTIGUOUS_MEMORY_FORMAT); + } + + auto input = get_packed_accessor< + const input_scalar_t, 3, DefaultPtrTraits, index_t>(input_reshaped, "input"); + auto grad_output = get_packed_accessor< + const 
input_scalar_t, 3, DefaultPtrTraits, index_t>(grad_output_reshaped, "grad_output"); + auto grad_input = packed_accessor_or_dummy< + input_scalar_t, 3, DefaultPtrTraits, index_t>(grad_input_reshaped, "grad_input"); + auto weight = packed_accessor_or_dummy< + const stat_scalar_t, 1, DefaultPtrTraits, index_t>(weight_, "weight"); + auto grad_weight = packed_accessor_or_dummy< + stat_scalar_t, 1, DefaultPtrTraits, index_t>(grad_weight_, "grad_weight"); + auto grad_bias = packed_accessor_or_dummy< + stat_scalar_t, 1, DefaultPtrTraits, index_t>(grad_bias_, "grad_bias"); + auto running_mean = packed_accessor_or_dummy< + const stat_scalar_t, 1, DefaultPtrTraits, index_t>(running_mean_, "running_mean"); + auto running_var = packed_accessor_or_dummy< + const stat_scalar_t, 1, DefaultPtrTraits, index_t>(running_var_, "running_var"); + auto save_mean = packed_accessor_or_dummy< + const accscalar_t, 1, DefaultPtrTraits, index_t>(save_mean_, "save_mean"); + auto save_invstd = packed_accessor_or_dummy< + const accscalar_t, 1, DefaultPtrTraits, index_t>(save_invstd_, "save_invstd"); + + auto stream = at::cuda::getCurrentCUDAStream(); + dim3 blocks(input.size(1)); + int tf = getNumThreads(input.size(2)); + dim3 threads(tf, std::max(1, MAX_BLOCK_SIZE/tf)); + + batch_norm_backward_kernel <<>> + (input, grad_output, grad_input, grad_weight, grad_bias, weight, running_mean, running_var, + save_mean, save_invstd, train, epsilon); + C10_CUDA_KERNEL_LAUNCH_CHECK(); + + return std::make_tuple(grad_input_, grad_weight_, grad_bias_); +} + +template +void batch_norm_stats_cuda_template( + const Tensor& out_mean, const Tensor& out_invstd, const Tensor& input_, double epsilon) { + + using accscalar_t = at::acc_type; + int64_t n_input = input_.size(1); + Tensor dummy_mean_; + Tensor dummy_var_; + auto input_reshaped = input_.reshape({input_.size(0), input_.size(1), -1}); // internally we merge the feature dimensions + + resize_output(out_mean, {n_input}); + resize_output(out_invstd, {n_input}); 
+ auto input = get_packed_accessor< + const scalar_t, 3, RestrictPtrTraits, index_t>(input_reshaped, "input"); + TORCH_INTERNAL_ASSERT(out_invstd.dim() == 1 && out_invstd.is_contiguous() && + out_invstd.sizes()[0]); + TORCH_INTERNAL_ASSERT(out_mean.dim() == 1 && out_mean.is_contiguous() && + out_mean.sizes()[0]); + + auto mean = packed_accessor_or_dummy< + accscalar_t, 1, RestrictPtrTraits, index_t>(out_mean, "out_mean"); + auto invstd = packed_accessor_or_dummy< + accscalar_t, 1, RestrictPtrTraits, index_t>(out_invstd, "out_invstd"); + auto stream = at::cuda::getCurrentCUDAStream(); + + dim3 blocks(input.size(1)); + int tf = getNumThreads(input.size(2)); + dim3 threads(tf, std::max(1, MAX_BLOCK_SIZE/tf)); + batch_norm_collect_statistics_kernel <<>> + (input, epsilon, 0.0, mean, invstd); + C10_CUDA_KERNEL_LAUNCH_CHECK(); +} + +template +void batch_norm_elemt_cuda_template(const Tensor& output_, const Tensor& input_, const Tensor& weight_, + const Tensor& bias_, const Tensor& mean_, const Tensor& invstd_) { + + using stat_accscalar_t = at::acc_type; + int64_t n_input = input_.size(1); + auto input_reshaped = input_.reshape({input_.size(0), input_.size(1), -1}); // internally we merge the feature dimensions + auto output_reshaped = output_.view({input_.size(0), input_.size(1), -1}); + + auto input = get_packed_accessor< + const input_scalar_t, 3, RestrictPtrTraits, index_t>(input_reshaped, "input"); + auto output = get_packed_accessor< + input_scalar_t, 3, RestrictPtrTraits, index_t>(output_reshaped, "output"); + auto weight = packed_accessor_or_dummy< + const stat_scalar_t, 1, RestrictPtrTraits, index_t>(weight_, "weight"); + auto bias = packed_accessor_or_dummy< + const stat_scalar_t, 1, RestrictPtrTraits, index_t>(bias_, "bias"); + auto mean = packed_accessor_or_dummy< + stat_accscalar_t, 1, RestrictPtrTraits, index_t>(mean_, "mean"); + auto invstd = packed_accessor_or_dummy< + stat_accscalar_t, 1, RestrictPtrTraits, index_t>(invstd_, "invstd"); + auto stream = 
at::cuda::getCurrentCUDAStream(); + + // NOTE: We use transform_input_kernel in training mode, which ignores epsilon + const double dummy_epsilon = 1e-5; + + // The input_transform kernel is pointwise, but we need to balance reading parameters (save_var/mean, + // weight/bias) - which we only do once and have a for loop afterwards - with having many threads and blocks + // and good occupancy. Quiet likely, we could go with even more blocks than 1024. + // The various planes are independent, so we use blocks for them. + int tf = std::max(getNumThreads(input.size(2)/4), + std::min(getNumThreads(input.size(2)), 64)); + int tb = std::max(64/tf, 1); + dim3 blocks_trans(input.size(1), std::max(1, std::min((256*1024)/input.size(1), + (input.size(0)+tb-1)/tb))); + blocks_trans.y = std::min(blocks_trans.y, MAX_GRID_SIZE); + dim3 threads_trans(tf, tb); + batch_norm_transform_input_kernel <<>> + (input, output, mean, invstd, weight, bias, dummy_epsilon); + C10_CUDA_KERNEL_LAUNCH_CHECK(); +} + +template +std::tuple batch_norm_gather_stats_cuda_template(const Tensor& mean_, const Tensor& invstd_, + const Tensor& running_mean_, const Tensor& running_var_, + double momentum, double epsilon, const Tensor& counts_) { + + Tensor save_mean_; + Tensor save_invstd_; + + auto features = mean_.size(1); + auto input_options = mean_.options(); + if (mean_.scalar_type() == at::ScalarType::Half || mean_.scalar_type() == at::ScalarType::BFloat16) { + input_options = input_options.dtype(ScalarType::Float); + } + save_mean_ = at::empty({features}, input_options); + save_invstd_ = at::empty({features}, input_options); + + auto mean = packed_accessor_or_dummy< + accscalar_t, 2, RestrictPtrTraits, index_t>(mean_, "mean"); + auto invstd = packed_accessor_or_dummy< + accscalar_t, 2, RestrictPtrTraits, index_t>(invstd_, "invstd"); + auto running_mean = packed_accessor_or_dummy< + scalar_t, 1, RestrictPtrTraits, index_t>(running_mean_, "running_mean"); + auto running_var = packed_accessor_or_dummy< + 
scalar_t, 1, RestrictPtrTraits, index_t>(running_var_, "running_mean"); + auto counts = packed_accessor_or_dummy< + scalar_t, 1, RestrictPtrTraits, index_t>(counts_, "counts"); + + auto save_mean = get_packed_accessor< + accscalar_t, 1, RestrictPtrTraits, index_t>(save_mean_, "save_mean"); + auto save_invstd = get_packed_accessor< + accscalar_t, 1, RestrictPtrTraits, index_t>(save_invstd_, "save_invstd"); + auto stream = at::cuda::getCurrentCUDAStream(); + + int block = getNumThreads(features); + int grid = std::max(1, features/block); + batch_norm_reduce_statistics_kernel <<>> + (mean, invstd, save_mean, save_invstd, running_mean, running_var, epsilon, momentum, counts); + C10_CUDA_KERNEL_LAUNCH_CHECK(); + + return std::make_tuple(save_mean_, save_invstd_); +} + +template +std::tuple batch_norm_backward_reduce_cuda_template(const Tensor& grad_out_, const Tensor& input_, + const Tensor& mean_, const Tensor& invstd_, const Tensor& weight_, + const bool input_g, const bool weight_g, const bool bias_g) { + + using stat_accscalar_t = at::acc_type; + int64_t n_input = input_.size(1); + Tensor sum_dy_; + Tensor sum_dy_xmu_; + Tensor grad_weight_; + Tensor grad_bias_; + auto input_reshaped = input_.reshape({input_.size(0), input_.size(1), -1}); // internally we merge the feature dimensions + auto grad_output_reshaped = grad_out_.reshape(input_reshaped.sizes()); + + if (input_g) { + sum_dy_ = at::empty_like(mean_, LEGACY_CONTIGUOUS_MEMORY_FORMAT); + sum_dy_xmu_ = at::empty_like(mean_, LEGACY_CONTIGUOUS_MEMORY_FORMAT); + } + if (weight_g) { + grad_weight_ = at::empty({n_input}, weight_.options()); + } + if (bias_g) { + grad_bias_ = at::empty({n_input}, weight_.options()); + } + + auto input = get_packed_accessor< + input_scalar_t, 3, DefaultPtrTraits, index_t>(input_reshaped, "input"); + auto grad_output = get_packed_accessor< + input_scalar_t, 3, DefaultPtrTraits, index_t>(grad_output_reshaped, "grad_output"); + auto grad_weight = packed_accessor_or_dummy< + stat_scalar_t, 
1, DefaultPtrTraits, index_t>(grad_weight_, "grad_weight"); + auto grad_bias = packed_accessor_or_dummy< + stat_scalar_t, 1, DefaultPtrTraits, index_t>(grad_bias_, "grad_bias"); + auto mean = packed_accessor_or_dummy< + stat_accscalar_t, 1, DefaultPtrTraits, index_t>(mean_, "mean"); + auto invstd = packed_accessor_or_dummy< + stat_accscalar_t, 1, DefaultPtrTraits, index_t>(invstd_, "invstd"); + auto sum_dy = packed_accessor_or_dummy< + stat_accscalar_t, 1, DefaultPtrTraits, index_t>(sum_dy_, "sum_dy"); + auto sum_dy_xmu = packed_accessor_or_dummy< + stat_accscalar_t, 1, DefaultPtrTraits, index_t>(sum_dy_xmu_, "sum_dy_xmu"); + + auto batch_size = input_reshaped.size(0); + auto feature_size = input_reshaped.size(2); + auto stream = at::cuda::getCurrentCUDAStream(); + + int warp_size = at::cuda::warp_size(); + int block_y = std::min(lastPow2(batch_size), MAX_BLOCK_SIZE/warp_size); + // We want block_x to be at least a warp width + int block_x = std::min(std::max(getNumThreads(feature_size), warp_size), MAX_BLOCK_SIZE/block_y); + const dim3 block(block_x, block_y); + const dim3 grid(n_input); + + batch_norm_backward_reduce_kernel <<>> + (input, grad_output, mean, invstd, sum_dy, sum_dy_xmu, grad_weight, grad_bias); + C10_CUDA_KERNEL_LAUNCH_CHECK(); + + return std::make_tuple(sum_dy_, sum_dy_xmu_, grad_weight_, grad_bias_); +} + +template +Tensor batch_norm_backward_elemt_cuda_template(const Tensor& grad_out_, const Tensor& input_, + const Tensor& mean_, const Tensor& invstd_, + const Tensor& weight_, const Tensor& sum_dy_, const Tensor& sum_dy_xmu_) { + + using stat_accscalar_t = at::acc_type; + int64_t n_input = input_.size(1); + auto input_reshaped = input_.reshape({input_.size(0), input_.size(1), -1}); // internally we merge the feature dimensions + auto grad_output_reshaped = grad_out_.reshape(input_reshaped.sizes()); + auto grad_input_reshaped = at::empty_like(input_reshaped, LEGACY_CONTIGUOUS_MEMORY_FORMAT); + + auto input = get_packed_accessor< + input_scalar_t, 
3, DefaultPtrTraits, index_t>(input_reshaped, "input"); + auto grad_input = get_packed_accessor< + input_scalar_t, 3, DefaultPtrTraits, index_t>(grad_input_reshaped, "grad_input"); + auto grad_output = get_packed_accessor< + input_scalar_t, 3, DefaultPtrTraits, index_t>(grad_output_reshaped, "grad_output"); + auto mean = packed_accessor_or_dummy< + stat_accscalar_t, 1, DefaultPtrTraits, index_t>(mean_, "mean"); + auto invstd = packed_accessor_or_dummy< + stat_accscalar_t, 1, DefaultPtrTraits, index_t>(invstd_, "invstd"); + auto weight = packed_accessor_or_dummy< + stat_scalar_t, 1, DefaultPtrTraits, index_t>(weight_, "weight"); + auto sum_dy = packed_accessor_or_dummy< + stat_accscalar_t, 1, DefaultPtrTraits, index_t>(sum_dy_, "sum_dy"); + auto sum_dy_xmu = packed_accessor_or_dummy< + stat_accscalar_t, 1, DefaultPtrTraits, index_t>(sum_dy_xmu_, "sum_dy_xmu"); + + auto stream = at::cuda::getCurrentCUDAStream(); + + // The kernel is pointwise, but we need to balance reading parameters (save_var/mean, + // weight/bias) - which we only do once and have a for loop afterwards - with having many threads and blocks + // and good occupancy. Quiet likely, we could go with even more blocks than 1024. + // The various planes are independent, so we use blocks for them. 
+ int tf = std::max(getNumThreads(input.size(2)/4), + std::min(getNumThreads(input.size(2)), 64)); + int tb = std::max(64/tf, 1); + dim3 blocks_trans(input.size(1), std::max(1, std::min((256*1024)/input.size(1), + (input.size(0)+tb-1)/tb))); + blocks_trans.y = std::min(blocks_trans.y, MAX_GRID_SIZE); + dim3 threads_trans(tf, tb); + auto reduction_size = input_.numel() / n_input; + auto norm_fct = static_cast(1.0 / reduction_size); + batch_norm_backward_elemt_kernel + <<>> + (input, grad_output, mean, invstd, weight, sum_dy, sum_dy_xmu, grad_input, norm_fct); + C10_CUDA_KERNEL_LAUNCH_CHECK(); + + return grad_input_reshaped.view(input_.sizes()); +} + +template +Tensor batch_norm_backward_elemt_cuda_template(const Tensor& grad_out_, const Tensor& input_, + const Tensor& mean_, const Tensor& invstd_, + const Tensor& weight_, const Tensor& sum_dy_, const Tensor& sum_dy_xmu_, const Tensor& count) { + + using stat_accscalar_t = at::acc_type; + int64_t n_input = input_.size(1); + auto input_reshaped = input_.reshape({input_.size(0), input_.size(1), -1}); // internally we merge the feature dimensions + auto grad_output_reshaped = grad_out_.reshape(input_reshaped.sizes()); + auto grad_input_reshaped = at::empty_like(input_reshaped, LEGACY_CONTIGUOUS_MEMORY_FORMAT); + + auto input = get_packed_accessor< + input_scalar_t, 3, DefaultPtrTraits, index_t>(input_reshaped, "input"); + auto grad_input = get_packed_accessor< + input_scalar_t, 3, DefaultPtrTraits, index_t>(grad_input_reshaped, "grad_input"); + auto grad_output = get_packed_accessor< + input_scalar_t, 3, DefaultPtrTraits, index_t>(grad_output_reshaped, "grad_output"); + auto mean = packed_accessor_or_dummy< + stat_accscalar_t, 1, DefaultPtrTraits, index_t>(mean_, "mean"); + auto invstd = packed_accessor_or_dummy< + stat_accscalar_t, 1, DefaultPtrTraits, index_t>(invstd_, "invstd"); + auto weight = packed_accessor_or_dummy< + stat_scalar_t, 1, DefaultPtrTraits, index_t>(weight_, "weight"); + auto sum_dy = 
packed_accessor_or_dummy< + stat_accscalar_t, 1, DefaultPtrTraits, index_t>(sum_dy_, "sum_dy"); + auto sum_dy_xmu = packed_accessor_or_dummy< + stat_accscalar_t, 1, DefaultPtrTraits, index_t>(sum_dy_xmu_, "sum_dy_xmu"); + + auto stream = at::cuda::getCurrentCUDAStream(); + + // The kernel is pointwise, but we need to balance reading parameters (save_var/mean, + // weight/bias) - which we only do once and have a for loop afterwards - with having many threads and blocks + // and good occupancy. Quiet likely, we could go with even more blocks than 1024. + // The various planes are independent, so we use blocks for them. + int tf = std::max(getNumThreads(input.size(2)/4), + std::min(getNumThreads(input.size(2)), 64)); + int tb = std::max(64/tf, 1); + dim3 blocks_trans(input.size(1), std::max(1, std::min((256*1024)/input.size(1), + (input.size(0)+tb-1)/tb))); + blocks_trans.y = std::min(blocks_trans.y, MAX_GRID_SIZE); + dim3 threads_trans(tf, tb); + batch_norm_backward_elemt_kernel <<>> + (input, grad_output, mean, invstd, weight, sum_dy, sum_dy_xmu, grad_input, count.const_data_ptr(), count.numel()); + C10_CUDA_KERNEL_LAUNCH_CHECK(); + + return grad_input_reshaped.view(input_.sizes()); +} + +// welford kernel for c last tensor calculating mean/biased_variance/unbiased_variance +// original apex name: welford_kernel_c_last +template + +__global__ void +batch_norm_collect_statistics_channels_last_kernel( + const scalar_t* __restrict__ input, + accscalar_t* __restrict__ out_mean, + accscalar_t* __restrict__ out_invstd, + volatile accscalar_t* staging_data, + int* semaphores, + const int reduction_size, + const int stride, + accscalar_t epsilon) { + // hide latency with concurrency + accscalar_t x_mean[PARALLEL_LOADS]; + accscalar_t m_2_n[PARALLEL_LOADS]; + int count[PARALLEL_LOADS]; + +#pragma unroll + for (int i = 0; i < PARALLEL_LOADS; i++) { + x_mean[i] = accscalar_t(0); + m_2_n[i] = accscalar_t(0); + count[i] = accscalar_t(0); + } + // tensor dimension (m,c) + + // 
loop along m dimension + int inner_loop_stride = blockDim.y * gridDim.y; + + // offset along m dimension + int m_offset = blockIdx.y * blockDim.y + threadIdx.y; + int c_offset = blockIdx.x * blockDim.x + threadIdx.x; + + int loop_count = 1 + (reduction_size - 1) / (inner_loop_stride * PARALLEL_LOADS); + int address_base = m_offset * stride + c_offset; + int address_increment = inner_loop_stride * stride; + + for (int i = 0; i < loop_count; i++) { + accscalar_t x_math[PARALLEL_LOADS]; + accscalar_t x_count_inv[PARALLEL_LOADS]; + accscalar_t is_valid[PARALLEL_LOADS]; + + // load multiple data in +#pragma unroll + for (int j = 0; j < PARALLEL_LOADS; j++) { + if (c_offset < stride && m_offset < reduction_size) { + x_math[j] = input[address_base]; + count[j]++; + x_count_inv[j] = accscalar_t(1) / count[j]; + is_valid[j] = accscalar_t(1); + } else { + x_math[j] = accscalar_t(0); + x_count_inv[j] = accscalar_t(0); + is_valid[j] = accscalar_t(0); + } + m_offset += inner_loop_stride; + address_base += address_increment; + } + + // calculate mean/m2n with welford +#pragma unroll + for (int j = 0; j < PARALLEL_LOADS; j++) { + accscalar_t delta0 = x_math[j] - x_mean[j]; + x_mean[j] += delta0 * x_count_inv[j]; + accscalar_t delta1 = x_math[j] - x_mean[j]; + m_2_n[j] += delta0 * delta1 * is_valid[j]; + } + } + + // thread reduction to accumulate mean/m_2_n/count between PARALLEL_LOADS +#pragma unroll + for (int j = 1; j < PARALLEL_LOADS; j++) { + welford_merge_element(count[0], x_mean[0], m_2_n[0], count[j], x_mean[j], m_2_n[j]); + } + + // release x_mean / m_2_n + auto mean_th = x_mean[0]; + auto m2_th = m_2_n[0]; + auto count_th = count[0]; + + // block-wise reduction with shared memory (since reduction cannot be done within a warp) + static __shared__ accscalar_t shmem_mean[MAX_BLOCK_SIZE]; + static __shared__ accscalar_t shmem_m2n[MAX_BLOCK_SIZE]; + static __shared__ int shmem_count[MAX_BLOCK_SIZE]; + + welford_merge_block_vertical(count_th, mean_th, m2_th, shmem_count, 
shmem_mean, shmem_m2n); + + if (gridDim.y > 1) { + volatile accscalar_t* staging_mean = staging_data; + volatile accscalar_t* staging_m2n = &staging_data[stride*gridDim.y]; + volatile int* staging_count = reinterpret_cast(&staging_m2n[stride*gridDim.y]); + + address_base = c_offset + blockIdx.y * stride; + // write data to staging_data; + if (threadIdx.y == 0 && c_offset < stride) { + staging_mean[address_base] = mean_th; + staging_m2n[address_base] = m2_th; + staging_count[address_base] = count_th; + } + + __threadfence(); + __syncthreads(); // ensuring writes to staging_ is visible to all blocks + + __shared__ bool is_last_block_done; + // mark block done + if (threadIdx.x == 0 && threadIdx.y == 0) { + int old = atomicAdd(&semaphores[blockIdx.x], 1); + is_last_block_done = (old == (gridDim.y-1)); + } + + __syncthreads(); + + // check that all data is now available in global memory + if (is_last_block_done) { + count_th = 0; + mean_th = accscalar_t(0.0); + m2_th = accscalar_t(0.0); + + for (int y = threadIdx.y; y < gridDim.y; y += blockDim.y) { + address_base = c_offset + y * stride; + int count_new = c_offset < stride ? staging_count[address_base] : 0; + accscalar_t mean_new = c_offset < stride ? staging_mean[address_base] : accscalar_t(0.0); + accscalar_t m2n_new = c_offset < stride ? 
staging_m2n[address_base] : accscalar_t(0.0); + + welford_merge_element(count_th, mean_th, m2_th, count_new, mean_new, m2n_new); + } + + welford_merge_block_vertical(count_th, mean_th, m2_th, shmem_count, shmem_mean, shmem_m2n); + if (threadIdx.y == 0 && c_offset < stride) { + out_mean[c_offset] = static_cast(mean_th); + out_invstd[c_offset] = VarTransform{}(m2_th/count_th, epsilon); + } + } + } else { + if (blockIdx.y == 0 && threadIdx.y == 0 && c_offset < stride) { + out_mean[c_offset] = static_cast(mean_th); + out_invstd[c_offset] = VarTransform{}(m2_th/count_th, epsilon); + } + } +} + +// elementwise BN kernel +// original apex name: batchnorm_forward_c_last_kernel +template < + typename scalar_t, + typename accscalar_t, + typename layerscalar_t, + int PARALLEL_LOADS> +__global__ void batch_norm_transform_input_channels_last_kernel( + const scalar_t* __restrict__ input, + const scalar_t* __restrict__ z, + const accscalar_t* __restrict__ mean, + const accscalar_t* __restrict__ inv_std, + const layerscalar_t* __restrict__ weight, + const layerscalar_t* __restrict__ shift, + scalar_t* __restrict__ out, + const int reduction_size, + const int stride, + const bool fuse_relu) { + // tensor dimension (m,c) + // loop along m dimension + int inner_loop_stride = blockDim.y * gridDim.y; + + // offset along m dimension + int m_offset = blockIdx.y * blockDim.y + threadIdx.y; + int c_offset = blockIdx.x * blockDim.x + threadIdx.x; + + if (c_offset >= stride || m_offset >= reduction_size) { + return; + } + + auto m_c = mean[c_offset]; + auto inv_std_c = static_cast(inv_std[c_offset]); + auto w_c = weight == nullptr ? accscalar_t(1.0) : static_cast(weight[c_offset]); + auto s_c = shift == nullptr ? 
accscalar_t(0.0) : static_cast(shift[c_offset]); + + int loop_count = 1 + (reduction_size - 1) / (inner_loop_stride * PARALLEL_LOADS); + int address_base = m_offset * stride + c_offset; + int address_increment = inner_loop_stride * stride; + + for (int i = 0; i < loop_count; i++) { +#pragma unroll + for (int j = 0; j < PARALLEL_LOADS; j++) { + if (c_offset < stride && m_offset < reduction_size) { + auto tmp = w_c * (static_cast(input[address_base]) - m_c ) * inv_std_c + s_c; + if (z != nullptr) { + tmp += z[address_base]; + } + out[address_base] = (fuse_relu && tmp <= accscalar_t(0.0) ? scalar_t(0.0) : static_cast(tmp)); + } + m_offset += inner_loop_stride; + address_base += address_increment; + } + } +} + +template +__device__ __forceinline__ void merge_block_vertical_backward(T& sum_dy, + T& sum_dy_xmu, + T* shmem_sum_dy, + T* shmem_sum_dy_xmu) { + // write to shared memory + auto address_base = threadIdx.x + threadIdx.y * blockDim.x; + +#pragma unroll + for (int offset = blockDim.y/2; offset > 0; offset >>= 1) { + if (threadIdx.y < offset*2) { + shmem_sum_dy[address_base] = sum_dy; + shmem_sum_dy_xmu[address_base] = sum_dy_xmu; + } + __syncthreads(); + if (threadIdx.y < offset && threadIdx.y + offset < blockDim.y) { + auto address = address_base + offset * blockDim.x; + + sum_dy += shmem_sum_dy[address]; + sum_dy_xmu += shmem_sum_dy_xmu[address]; + } + } +} + +// batchnorm backward kernel for c last tensor +// original apex name: reduce_bn_c_last_kernel +template < + int PARALLEL_LOADS, + typename scalar_t, + typename accscalar_t, + typename layerscalar_t> +__global__ void batch_norm_backward_reduce_channels_last_kernel( + const scalar_t* __restrict__ input, + const scalar_t* __restrict__ grad_output, + const accscalar_t* __restrict__ mean, + const accscalar_t* __restrict__ inv_std, + accscalar_t* __restrict__ sum_dy_o, + accscalar_t* __restrict__ sum_dy_xmu_o, + layerscalar_t* __restrict__ grad_weight, + layerscalar_t* __restrict__ grad_bias, + volatile 
accscalar_t* staging_data, + int* semaphores, + const int reduction_size, + const int stride) { + + // hide latency with concurrency + accscalar_t sum_dy[PARALLEL_LOADS]; + accscalar_t sum_dy_xmu[PARALLEL_LOADS]; + +#pragma unroll + for (int i = 0; i < PARALLEL_LOADS; i++) { + sum_dy[i] = accscalar_t(0); + sum_dy_xmu[i] = accscalar_t(0); + } + // tensor dimension (m,c) + + // loop along m dimension + int inner_loop_stride = blockDim.y * gridDim.y; + + // offset along m dimension + int m_offset = blockIdx.y * blockDim.y + threadIdx.y; + int c_offset = blockIdx.x * blockDim.x + threadIdx.x; + + if (c_offset >= stride || m_offset >= reduction_size) { + return; + } + + int loop_count = 1 + (reduction_size - 1) / (inner_loop_stride * PARALLEL_LOADS); + int address_base = m_offset * stride + c_offset; + int address_increment = inner_loop_stride * stride; + + auto r_mean = mean[c_offset]; + auto factor = inv_std[c_offset]; + + for (int i = 0; i < loop_count; i++) { + accscalar_t x_input[PARALLEL_LOADS]; + accscalar_t x_grad_output[PARALLEL_LOADS]; + + // load multiple data in +#pragma unroll + for (int j = 0; j < PARALLEL_LOADS; j++) { + if (c_offset < stride && m_offset < reduction_size) { + x_input[j] = input[address_base]; + x_grad_output[j] = grad_output[address_base]; + } else { + x_input[j] = accscalar_t(0); + x_grad_output[j] = accscalar_t(0); + } + m_offset += inner_loop_stride; + address_base += address_increment; + } + + // calculate sum_dy / sum_dy_xmu +#pragma unroll + for (int j = 0; j < PARALLEL_LOADS; j++) { + sum_dy[j] += x_grad_output[j]; + sum_dy_xmu[j] += x_grad_output[j] * (x_input[j] - r_mean); + } + } + + // thread reduction to accumulate sum_dy / sum_dy_xmu between PARALLEL_LOADS +#pragma unroll + for (int j = 1; j < PARALLEL_LOADS; j++) { + sum_dy[0] += sum_dy[j]; + sum_dy_xmu[0] += sum_dy_xmu[j]; + } + + // release array of registers + auto sum_dy_th = sum_dy[0]; + auto sum_dy_xmu_th = sum_dy_xmu[0]; + + // block-wise reduction with shared memory 
(since reduction cannot be done within a warp) + static __shared__ accscalar_t shmem_sum_dy[MAX_BLOCK_SIZE]; + static __shared__ accscalar_t shmem_sum_dy_xmu[MAX_BLOCK_SIZE]; + + merge_block_vertical_backward(sum_dy_th, sum_dy_xmu_th, shmem_sum_dy, shmem_sum_dy_xmu); + + if (gridDim.y > 1) { + volatile accscalar_t* staging_sum_dy = staging_data; + volatile accscalar_t* staging_sum_dy_xmu = &staging_data[stride*gridDim.y]; + + address_base = c_offset + blockIdx.y * stride; + // write data to staging_data; + if (threadIdx.y == 0 && c_offset < stride) { + staging_sum_dy[address_base] = sum_dy_th; + staging_sum_dy_xmu[address_base] = sum_dy_xmu_th; + } + + __threadfence(); + __syncthreads(); // ensuring writes to staging_ is visible to all blocks + + __shared__ bool is_last_block_done; + // mark block done + if (threadIdx.x == 0 && threadIdx.y == 0) { + int old = atomicAdd(&semaphores[blockIdx.x], 1); + is_last_block_done = (old == (gridDim.y-1)); + } + + __syncthreads(); + + // check that all data is now available in global memory + if (is_last_block_done) { + sum_dy_th = accscalar_t(0.0); + sum_dy_xmu_th = accscalar_t(0.0); + + for (int y = threadIdx.y; y < gridDim.y; y += blockDim.y) { + address_base = c_offset + y * stride; + sum_dy_th += (c_offset < stride ? staging_sum_dy[address_base] : accscalar_t(0.0)); + sum_dy_xmu_th += (c_offset < stride ? 
staging_sum_dy_xmu[address_base] : accscalar_t(0.0)); + } + + merge_block_vertical_backward(sum_dy_th, sum_dy_xmu_th, shmem_sum_dy, shmem_sum_dy_xmu); + if (threadIdx.y == 0 && c_offset < stride) { + if (grad_bias != nullptr) { + grad_bias[c_offset] = static_cast(sum_dy_th); + } + if (grad_weight != nullptr) { + grad_weight[c_offset] = static_cast(sum_dy_xmu_th * factor); + } + //mean_dy[c_offset] = sum_dy_th / reduction_size; + //mean_dy_xmu[c_offset] = sum_dy_xmu_th / reduction_size; + sum_dy_o[c_offset] = sum_dy_th; + sum_dy_xmu_o[c_offset] = sum_dy_xmu_th; + } + } + } else { + if (blockIdx.y == 0 && threadIdx.y == 0 && c_offset < stride) { + if (grad_bias != nullptr) { + grad_bias[c_offset] = static_cast(sum_dy_th); + } + if (grad_weight != nullptr) { + grad_weight[c_offset] = static_cast(sum_dy_xmu_th * factor); + } + //mean_dy[c_offset] = sum_dy_th / reduction_size; + //mean_dy_xmu[c_offset] = sum_dy_xmu_th / reduction_size; + sum_dy_o[c_offset] = sum_dy_th; + sum_dy_xmu_o[c_offset] = sum_dy_xmu_th; + } + } +} + +// elementwise BN kernel +// original apex name: batchnorm_backward_c_last_kernel +template < + int PARALLEL_LOADS, + typename scalar_t, + typename accscalar_t, + typename layerscalar_t> +__device__ __forceinline__ void batch_norm_backward_elemt_channels_last_kernel_impl( + const scalar_t* __restrict__ grad_output, + const scalar_t* __restrict__ input, + const accscalar_t* __restrict__ mean, + const accscalar_t* __restrict__ inv_std, + const layerscalar_t* __restrict__ weight, + const accscalar_t* __restrict__ sum_dy, + const accscalar_t* __restrict__ sum_dy_xmu, + scalar_t* __restrict__ grad_input, + const accscalar_t norm_fct, + const int reduction_size, + const int stride) { + // tensor dimension (m,c) + // loop along m dimension + int inner_loop_stride = blockDim.y * gridDim.y; + + // offset along m dimension + int m_offset = blockIdx.y * blockDim.y + threadIdx.y; + int c_offset = blockIdx.x * blockDim.x + threadIdx.x; + + if (c_offset >= stride 
|| m_offset >= reduction_size) { + return; + } + + auto m_c = mean[c_offset]; + auto m_dy_c = sum_dy[c_offset] * norm_fct; + auto factor_1_c = inv_std[c_offset]; + auto factor_2_c = (weight == nullptr? accscalar_t(1.0) : static_cast(weight[c_offset])) * factor_1_c; + factor_1_c = factor_1_c * factor_1_c * sum_dy_xmu[c_offset] * norm_fct; + + int loop_count = 1 + (reduction_size - 1) / (inner_loop_stride * PARALLEL_LOADS); + int address_base = m_offset * stride + c_offset; + int address_increment = inner_loop_stride * stride; + + for (int i = 0; i < loop_count; i++) { +#pragma unroll + for (int j = 0; j < PARALLEL_LOADS; j++) { + if (c_offset < stride && m_offset < reduction_size) { + grad_input[address_base] = static_cast( + (static_cast(grad_output[address_base]) - m_dy_c - + (static_cast(input[address_base]) - m_c) * factor_1_c) + * factor_2_c); + } + m_offset += inner_loop_stride; + address_base += address_increment; + } + } +} + +template < + int PARALLEL_LOADS, + typename scalar_t, + typename accscalar_t, + typename layerscalar_t> +__global__ void batch_norm_backward_elemt_channels_last_kernel( + const scalar_t* __restrict__ grad_output, + const scalar_t* __restrict__ input, + const accscalar_t* __restrict__ mean, + const accscalar_t* __restrict__ inv_std, + const layerscalar_t* __restrict__ weight, + const accscalar_t* __restrict__ sum_dy, + const accscalar_t* __restrict__ sum_dy_xmu, + const int* __restrict__ numel, + scalar_t* __restrict__ grad_input, + const int64_t world_size, + const int reduction_size, + const int stride) { + + int64_t total_numel = 0; + for (int i = 0; i < world_size; i++) { + total_numel += numel[i]; + } + + auto norm_fct = static_cast(1) / static_cast(total_numel); + batch_norm_backward_elemt_channels_last_kernel_impl( + grad_output, input, mean, inv_std, weight, sum_dy, sum_dy_xmu, + grad_input, norm_fct, reduction_size, stride); +} + +template < + int PARALLEL_LOADS, + typename scalar_t, + typename accscalar_t, + typename 
layerscalar_t> +__global__ void batch_norm_backward_elemt_channels_last_kernel( + const scalar_t* __restrict__ grad_output, + const scalar_t* __restrict__ input, + const accscalar_t* __restrict__ mean, + const accscalar_t* __restrict__ inv_std, + const layerscalar_t* __restrict__ weight, + const accscalar_t* __restrict__ sum_dy, + const accscalar_t* __restrict__ sum_dy_xmu, + scalar_t* __restrict__ grad_input, + const accscalar_t norm_fct, + const int reduction_size, + const int stride) { + batch_norm_backward_elemt_channels_last_kernel_impl( + grad_output, input, mean, inv_std, weight, sum_dy, sum_dy_xmu, + grad_input, norm_fct, reduction_size, stride); +} + +template +void batch_norm_stats_channels_last_cuda_template( + const Tensor& out_mean, const Tensor& out_invstd, const Tensor& input, double epsilon) { + using accscalar_t = at::acc_type; + + const auto stride = input.sizes()[1]; + const auto reduction_size = input.numel() / stride; + + resize_output(out_mean, {stride}); + resize_output(out_invstd, {stride}); + TORCH_INTERNAL_ASSERT(out_invstd.dim() == 1 && out_invstd.is_contiguous() && + out_invstd.sizes()[0]); + TORCH_INTERNAL_ASSERT(out_mean.dim() == 1 && out_mean.is_contiguous() && + out_mean.sizes()[0]); + + dim3 block; + dim3 grid; + flexible_launch_configs(reduction_size, stride, block, grid, true); + + at::Tensor staging_data; + at::Tensor semaphores; + if (grid.y > 1) { + staging_data = at::empty({4*stride*grid.y}, out_mean.options()); + semaphores = at::zeros({grid.x}, input.options().dtype(at::kInt)); + } + + accscalar_t* staging_data_ptr = grid.y > 1 ? staging_data.mutable_data_ptr() : nullptr; + int* semaphores_ptr = grid.y > 1 ? 
semaphores.mutable_data_ptr() : nullptr; + batch_norm_collect_statistics_channels_last_kernel + <<>>( + input.const_data_ptr(), + out_mean.mutable_data_ptr(), + out_invstd.mutable_data_ptr(), + staging_data_ptr, + semaphores_ptr, + reduction_size, + stride, + epsilon); + C10_CUDA_KERNEL_LAUNCH_CHECK(); +} + +void batch_norm_elemt_channels_last_cuda_template( + const at::Tensor& output, + const at::Tensor& input, + const at::Tensor& weight, + const at::Tensor& shift, // bias of BN + const at::Tensor& mean, + const at::Tensor& inv_std, + const std::optional& z = std::nullopt, // bias after BN + const bool fuse_relu = false) { + const auto stride = input.sizes()[1]; + const auto reduction_size = input.numel() / stride; + + dim3 block; + dim3 grid; + flexible_launch_configs(reduction_size, stride, block, grid); + + auto stream = at::cuda::getCurrentCUDAStream(); + const auto second_dtype = weight.defined() ? weight.scalar_type() : + (shift.defined() ? shift.scalar_type() : input.scalar_type()); + + if (input.scalar_type() != second_dtype) { + AT_DISPATCH_FLOATING_TYPES_AND2(kHalf, kBFloat16, input.scalar_type(), "batchnorm_forward", [&] { + using accscalar_t = at::acc_type; + batch_norm_transform_input_channels_last_kernel + <<>>( + input.const_data_ptr(), + z.has_value() ? z.value().const_data_ptr() : nullptr, + mean.const_data_ptr(), + inv_std.const_data_ptr(), + weight.defined() ? weight.const_data_ptr() : nullptr, + shift.defined() ? 
shift.const_data_ptr() : nullptr, + output.mutable_data_ptr(), + reduction_size, + stride, + fuse_relu); + C10_CUDA_KERNEL_LAUNCH_CHECK(); + }); + } else { + if (weight.defined()){ + TORCH_CHECK(input.scalar_type() == weight.scalar_type(), "batchnorm_forward: input.scalar_type() ", input.scalar_type(), + " is not supported with weight.scalar_type() ", weight.scalar_type()); + } + AT_DISPATCH_FLOATING_TYPES_AND2(kHalf, kBFloat16, input.scalar_type(), "batchnorm_forward", [&] { + using accscalar_t = at::acc_type; + batch_norm_transform_input_channels_last_kernel + <<>>( + input.const_data_ptr(), + z.has_value() ? z.value().const_data_ptr() : nullptr, + mean.const_data_ptr(), + inv_std.const_data_ptr(), + weight.defined() ? weight.const_data_ptr() : nullptr, + shift.defined() ? shift.const_data_ptr(): nullptr, + output.mutable_data_ptr(), + reduction_size, + stride, + fuse_relu); + C10_CUDA_KERNEL_LAUNCH_CHECK(); + }); + } +} + +std::tuple +batch_norm_backward_reduce_cuda_channels_last_template(const at::Tensor& grad_output, + const at::Tensor& input, + const at::Tensor& mean, + const at::Tensor& inv_std, + const at::Tensor& weight, + const bool input_g, const bool weight_g, const bool bias_g) { + const auto stride = input.sizes()[1]; + const auto reduction_size = input.numel() / stride; + + at::Tensor sumn_dy = at::empty({stride}, mean.options()); + at::Tensor sum_dy_xmu = at::empty({stride}, mean.options()); + + at::Tensor grad_weight; + at::Tensor grad_bias; + if (weight.defined()) { + grad_weight = at::empty({stride}, weight.options()); + grad_bias = at::empty({stride}, weight.options()); + } else { + // because I cannot return an uninitialized at::Tensor + grad_weight = at::empty({0}, mean.options()); + grad_bias = at::empty({0}, mean.options()); + } + + dim3 block; + dim3 grid; + flexible_launch_configs(reduction_size, stride, block, grid, true); + + at::Tensor staging_data; + at::Tensor semaphores; + if (grid.y > 1) { + staging_data = 
at::empty({2*stride*grid.y}, mean.options()); + semaphores = at::zeros({grid.x}, input.options().dtype(at::kInt)); + } + auto stream = at::cuda::getCurrentCUDAStream(); + + if (weight.defined() && input.scalar_type() != weight.scalar_type()) { + AT_DISPATCH_FLOATING_TYPES_AND2(kHalf, kBFloat16, input.scalar_type(), "batchnorm_backward_reduce", [&] { + using accscalar_t = at::acc_type; + accscalar_t* staging_data_ptr = grid.y > 1 ? staging_data.mutable_data_ptr() : nullptr; + int* semaphores_ptr = grid.y > 1 ? semaphores.mutable_data_ptr() : nullptr; + batch_norm_backward_reduce_channels_last_kernel + <<>>( + input.const_data_ptr(), + grad_output.const_data_ptr(), + mean.const_data_ptr(), + inv_std.const_data_ptr(), + sumn_dy.mutable_data_ptr(), + sum_dy_xmu.mutable_data_ptr(), + grad_weight.mutable_data_ptr(), + grad_bias.mutable_data_ptr(), + staging_data_ptr, + semaphores_ptr, + reduction_size, + stride); + C10_CUDA_KERNEL_LAUNCH_CHECK(); + }); + } else { + if (weight.defined()) { + TORCH_CHECK(input.scalar_type() == weight.scalar_type(), "batchnorm_backward_reduce: input.scalar_type() ", input.scalar_type(), + " is not supported with weight.scalar_type() ", weight.scalar_type()); + } + AT_DISPATCH_FLOATING_TYPES_AND2(kHalf, kBFloat16, input.scalar_type(), "batchnorm_backward_reduce", [&] { + using accscalar_t = at::acc_type; + accscalar_t* staging_data_ptr = grid.y > 1 ? staging_data.mutable_data_ptr() : nullptr; + int* semaphores_ptr = grid.y > 1 ? semaphores.mutable_data_ptr() : nullptr; + batch_norm_backward_reduce_channels_last_kernel + <<>>( + input.const_data_ptr(), + grad_output.const_data_ptr(), + mean.const_data_ptr(), + inv_std.const_data_ptr(), + sumn_dy.mutable_data_ptr(), + sum_dy_xmu.mutable_data_ptr(), + weight.defined() ? grad_weight.mutable_data_ptr() : nullptr, + weight.defined() ? 
grad_bias.mutable_data_ptr() : nullptr, + staging_data_ptr, + semaphores_ptr, + reduction_size, + stride); + C10_CUDA_KERNEL_LAUNCH_CHECK(); + }); + } + + return std::make_tuple(sumn_dy, sum_dy_xmu, grad_weight, grad_bias); +} + +at::Tensor batch_norm_backward_elemt_channels_last_cuda_template( + const at::Tensor& grad_output, + const at::Tensor& input, + const at::Tensor& mean, + const at::Tensor& inv_std, + const at::Tensor& weight, + const at::Tensor& sum_dy, + const at::Tensor& sum_dy_xmu, + const at::Tensor& count) { + const auto stride = input.sizes()[1]; + const auto reduction_size = input.numel() / stride; + + // Input is guaranteed to be channels-last compatible + at::Tensor grad_input = at::empty_like(input); + + dim3 block; + dim3 grid; + flexible_launch_configs(reduction_size, stride, block, grid); + + auto stream = at::cuda::getCurrentCUDAStream(); + + if (weight.defined() && weight.scalar_type() != input.scalar_type()) { + AT_DISPATCH_FLOATING_TYPES_AND2(kHalf, kBFloat16, input.scalar_type(), "batchnorm_backward_element", [&] { + using accscalar_t = at::acc_type; + batch_norm_backward_elemt_channels_last_kernel + <<>>( + grad_output.const_data_ptr(), + input.const_data_ptr(), + mean.const_data_ptr(), + inv_std.const_data_ptr(), + weight.const_data_ptr(), + sum_dy.const_data_ptr(), + sum_dy_xmu.const_data_ptr(), + count.const_data_ptr(), + grad_input.mutable_data_ptr(), + count.numel(), + reduction_size, + stride); + C10_CUDA_KERNEL_LAUNCH_CHECK(); + }); + } else { + if (weight.defined()) { + TORCH_CHECK(input.scalar_type() == weight.scalar_type(), "batchnorm_backward_element: input.scalar_type() ", input.scalar_type(), + " is not supported with weight.scalar_type() ", weight.scalar_type()); + } + AT_DISPATCH_FLOATING_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, input.scalar_type(), "batchnorm_backward_element", [&] { + using accscalar_t = at::acc_type; + batch_norm_backward_elemt_channels_last_kernel + <<>>( + 
grad_output.const_data_ptr(), + input.const_data_ptr(), + mean.const_data_ptr(), + inv_std.const_data_ptr(), + weight.defined() ? weight.const_data_ptr() : nullptr, + sum_dy.const_data_ptr(), + sum_dy_xmu.const_data_ptr(), + count.const_data_ptr(), + grad_input.mutable_data_ptr(), + count.numel(), + reduction_size, + stride); + C10_CUDA_KERNEL_LAUNCH_CHECK(); + }); + } + + return grad_input; +} + +at::Tensor batch_norm_backward_elemt_channels_last_cuda_template( + const at::Tensor& grad_output, + const at::Tensor& input, + const at::Tensor& mean, + const at::Tensor& inv_std, + const at::Tensor& weight, + const at::Tensor& sum_dy, + const at::Tensor& sum_dy_xmu) { + const auto stride = input.sizes()[1]; + const auto reduction_size = input.numel() / stride; + auto norm_fct = 1.0 / reduction_size; + + // Input is guaranteed to be channels-last compatible + at::Tensor grad_input = at::empty_like(input); + + dim3 block; + dim3 grid; + flexible_launch_configs(reduction_size, stride, block, grid); + + auto stream = at::cuda::getCurrentCUDAStream(); + + AT_DISPATCH_FLOATING_TYPES_AND2(kHalf, kBFloat16, input.scalar_type(), "batchnorm_backward_element", [&] { + using accscalar_t = at::acc_type; + + if (weight.defined() && weight.scalar_type() != input.scalar_type()) { + batch_norm_backward_elemt_channels_last_kernel + <<>>( + grad_output.const_data_ptr(), + input.const_data_ptr(), + mean.const_data_ptr(), + inv_std.const_data_ptr(), + weight.const_data_ptr(), + sum_dy.const_data_ptr(), + sum_dy_xmu.const_data_ptr(), + grad_input.mutable_data_ptr(), + static_cast(norm_fct), + reduction_size, + stride); + C10_CUDA_KERNEL_LAUNCH_CHECK(); + } else { + batch_norm_backward_elemt_channels_last_kernel + <<>>( + grad_output.const_data_ptr(), + input.const_data_ptr(), + mean.const_data_ptr(), + inv_std.const_data_ptr(), + weight.defined() ? 
weight.const_data_ptr() : nullptr, + sum_dy.const_data_ptr(), + sum_dy_xmu.const_data_ptr(), + grad_input.mutable_data_ptr(), + static_cast(norm_fct), + reduction_size, + stride); + C10_CUDA_KERNEL_LAUNCH_CHECK(); + } + }); + + return grad_input; +} + +} // namespace at::native + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/cuda/Reduce.cuh b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/cuda/Reduce.cuh new file mode 100644 index 0000000000000000000000000000000000000000..78104872731ac756aae3460d9852c345370f00d4 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/cuda/Reduce.cuh @@ -0,0 +1,1450 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +namespace at::native { + +static inline int64_t div_up(int64_t a, int64_t b) { + return (a + b - 1) / b; +} + +// returns floor(log2(n)) +static inline int last_pow2(int n) { + n |= (n >> 1); + n |= (n >> 2); + n |= (n >> 4); + n |= (n >> 8); + n |= (n >> 16); + return std::max(1, n - (n >> 1)); +} + +// returns reduced fraction numerator & denominator +C10_HOST_DEVICE static void reduce_fraction(size_t &numerator, size_t &denominator) { + // get GCD of num and denom using Euclid's algorithm. + // Can replace this with std::gcd if we ever support c++17. 
+ size_t a = denominator; + size_t b = numerator; + while (b != 0) { + a %= b; + // swap(a,b) + size_t tmp = a; + a = b; + b = tmp; + } + + // a is now the GCD + numerator /= a; + denominator /= a; +} + +//template for changing MAX_NUM_THREADS based on op dtype +template +struct mnt_wrapper { + static constexpr int MAX_NUM_THREADS = 512; +}; + +template <> +struct mnt_wrapper >{ + static constexpr int MAX_NUM_THREADS = 256; +}; + +constexpr int max_reduce_threads(c10::ScalarType type) { + return type == kComplexDouble ? 256 : 512; +} + +struct ReduceConfig { + static constexpr int BLOCK_X = 0; + static constexpr int BLOCK_Y = 1; + static constexpr int CTA = 2; + + ReduceConfig(int element_size_bytes, int num_outputs, int num_inputs) + : element_size_bytes(element_size_bytes) + , num_inputs(num_inputs) + , num_outputs(num_outputs) {} + int element_size_bytes; + int num_inputs; + int num_outputs; + int step_input = 1; + int step_output = 1; + int ctas_per_output = 1; + int input_mult[3] = {0, 0, 0}; + int output_mult[2] = {0, 0}; + + int block_width; + int block_height; + int num_threads; + + bool vectorize_input = false; + int output_vec_size = 1; + + template + void set_block_dimension(int64_t dim0, int64_t dim1) { + const int max_num_threads = mnt_wrapper::MAX_NUM_THREADS / output_vec_size; + int dim0_pow2 = dim0 < max_num_threads ? static_cast(last_pow2(dim0)) : max_num_threads; + int dim1_pow2 = dim1 < max_num_threads ? 
static_cast(last_pow2(dim1)) : max_num_threads; + block_width = std::min(dim0_pow2, int(at::cuda::warp_size())); + block_height = std::min(dim1_pow2, int(max_num_threads / block_width)); + block_width = std::min(dim0_pow2, int(max_num_threads / block_height)); + num_threads = block_width * block_height; + } + + int split_input(int parallelism) { + int step = step_input; + step_input *= parallelism; + return step; + } + + int split_output(int parallelism) { + int step = step_output; + step_output *= parallelism; + return step; + } + + dim3 block() const { + return dim3(block_width, block_height); + } + + dim3 grid() const { + return dim3(div_up(num_outputs / output_vec_size, step_output), ctas_per_output); + } + + C10_HOST_DEVICE bool should_block_x_reduce() const { + return input_mult[BLOCK_X] != 0; + } + + C10_HOST_DEVICE bool should_block_y_reduce() const { + return input_mult[BLOCK_Y] != 0; + } + + C10_HOST_DEVICE bool should_global_reduce() const { + return input_mult[CTA] != 0; + } + + C10_DEVICE bool should_store(int output_idx) const { + return output_idx < num_outputs && + (!should_block_x_reduce() || threadIdx.x == 0) && + (!should_block_y_reduce() || threadIdx.y == 0); + } + + C10_DEVICE bool should_reduce_tail() const { + return (!should_block_y_reduce() || threadIdx.y == 0) && + (!should_global_reduce() || blockIdx.y == 0); + } + + C10_HOST_DEVICE int input_idx() const { + int lane = threadIdx.x; + int warp = threadIdx.y; + int cta2 = blockIdx.y; + return (lane * input_mult[BLOCK_X] + + warp * input_mult[BLOCK_Y] + + cta2 * input_mult[CTA]); + } + + template + C10_HOST_DEVICE int output_idx() const { + int lane = threadIdx.x; + int warp = threadIdx.y; + int cta1 = blockIdx.x; + return (lane * output_mult[BLOCK_X] + + warp * output_mult[BLOCK_Y] + + cta1 * step_output) * output_vec_size; + } + + C10_DEVICE int shared_memory_offset(int offset) const { + return threadIdx.x + (threadIdx.y + offset) * blockDim.x; + } + + C10_DEVICE int 
staging_memory_offset(int cta2) const { + int offset = cta2 + blockIdx.x * gridDim.y; + if (!should_block_x_reduce()) { + offset = threadIdx.x + offset * blockDim.x; + } + return offset; + } + + int shared_memory_size() const { + if (!should_block_y_reduce() && + (!should_block_x_reduce() || + block_width <= at::cuda::warp_size())) { + return 0; + } + return element_size_bytes * num_threads * output_vec_size; + } + + int64_t global_memory_size() const { + if (!should_global_reduce()) { + return 0; + } + auto size = (int64_t)element_size_bytes * num_outputs * ctas_per_output; + if (!should_block_x_reduce()) { + size *= block().x * output_vec_size; + } + return size; + } + + int semaphore_size() const { + if (!should_global_reduce()) { + return 0; + } + return sizeof(int) * grid().x; + } + + int values_per_thread() const { + return div_up(num_inputs, step_input); + } + + int mock_values_per_thread(int parallelism) { + return div_up(num_inputs, step_input * parallelism); + } +}; + +std::ostream& operator<<(std::ostream& out, const ReduceConfig& config); + +template +C10_LAUNCH_BOUNDS_2(nt, 4) +__global__ void reduce_kernel(R reduction) { + reduction.template run(); +} + +template +static OffsetCalculator<2, index_t> make_output_calculator(const TensorIterator& iter) { + int num_reduce_dims = iter.num_reduce_dims(); + int num_output_dims = iter.ndim() - num_reduce_dims; + int input_index = iter.ntensors() - 1; + int output_index = 0; + std::array strides = { + iter.strides(output_index).data() + num_reduce_dims, + iter.strides(input_index).data() + num_reduce_dims, + }; + auto shape = iter.shape().data() + num_reduce_dims; + return OffsetCalculator<2, index_t>(num_output_dims, shape, strides.data()); +} + +template +static OffsetCalculator<1, index_t> make_input_calculator(const TensorIterator& iter) { + int num_reduce_dims = iter.num_reduce_dims(); + int input_index = iter.ntensors() - 1; + std::array strides = { + iter.strides(input_index).data(), + }; + return 
OffsetCalculator<1, index_t>(num_reduce_dims, iter.shape().data(), strides.data()); +} + +template +struct func_wrapper_t { + using arg_t = typename binary_function_traits::arg1_t; + using scalar_t = typename binary_function_traits::arg2_t; + + func_t combine; + static inline __device__ out_scalar_t project(arg_t arg) { + return (out_scalar_t) arg; + } + static inline __device__ arg_t warp_shfl_down(arg_t arg, int offset) { + return WARP_SHFL_DOWN(arg, offset); + } + + static __device__ arg_t translate_idx(arg_t acc, int64_t /*idx*/) { + return acc; + } + + func_wrapper_t(const func_t& op) : combine(op) { + } + + // wrap a normal reduction that ignores the index + __device__ arg_t reduce(arg_t acc, scalar_t val, int64_t idx) const { + return combine(acc, val); + } +}; + +template +func_wrapper_t func_wrapper(const func_t& op) { + return func_wrapper_t { op }; +} + +template +struct ReduceJitOp { +//ReduceJitOp is almost like ReduceOp, but it doesn't have ops functor that specifies reduction operations +//Maybe we can find a way to unify ReduceOp and ReduceJitOp + using InputCalculator = OffsetCalculator<1, uint32_t>; + using OutputCalculator = OffsetCalculator<2, uint32_t>; + //TODO for now arg_t is always opmath_t of the input, later we'll need to change it + using arg_t = at::opmath_type; + + //TODO - ReduceJitOp will probably need to be changed for reductions that need full functor, + //not just wrapper + arg_t ident; + ReduceConfig config; + InputCalculator input_calc; + OutputCalculator output_calc; + const void* src; + const char* dst[2]; //it accepts at most two destinations + // acc_buf used for accumulation among sub Tensor Iterator when accumulation on + // output is not permissible + void* acc_buf; + // cta_buf used for accumulation between blocks during global reduction + void* cta_buf; + int* semaphores; + int64_t base_idx; + bool accumulate; + bool final_output; + int noutputs; + + ReduceJitOp( + ReduceConfig config, + InputCalculator input_calc, + 
OutputCalculator output_calc, + const void* src, + char* dst0, + std::optional dst1, + void* acc_buf, + void* cta_buf, + int* semaphores, + arg_t ident, + int noutputs, + int64_t base_idx) + : ident(ident), + config(config), + input_calc(input_calc), + output_calc(output_calc), + src(src), + acc_buf(acc_buf), + cta_buf(cta_buf), + semaphores(semaphores), + base_idx(base_idx), + noutputs(noutputs) { + dst[0] = dst0; + if (dst1.has_value()) { + dst[1] = dst1.value(); + } + } +}; + +template +struct ReduceOp { + using traits = function_traits; + using arg_t = typename std::decay::type>::type; + + using InputCalculator = OffsetCalculator<1, index_t>; + using OutputCalculator = OffsetCalculator<2, index_t>; + + static constexpr bool can_accumulate_in_output = + std::is_convertible_v + && std::is_convertible_v; + + ops_t ops; + arg_t ident; + ReduceConfig config; + InputCalculator input_calc; + OutputCalculator output_calc; + const void* src; + const char* dst[2]; //it accepts at most two destinations + // acc_buf used for accumulation among sub Tensor Iterator when accumulation on + // output is not permissible + void* acc_buf; + // cta_buf used for accumulation between blocks during global reduction + void* cta_buf; + int* semaphores; + int64_t base_idx; + bool accumulate; + bool final_output; + int noutputs; + + ReduceOp( + ops_t ops, + ReduceConfig config, + InputCalculator input_calc, + OutputCalculator output_calc, + const void* src, + char* dst0, + std::optional dst1, + void* acc_buf, + void* cta_buf, + int* semaphores, + arg_t ident, + int noutputs, + int64_t base_idx) + : ops(ops), + ident(ident), + config(config), + input_calc(input_calc), + output_calc(output_calc), + src(src), + acc_buf(acc_buf), + cta_buf(cta_buf), + semaphores(semaphores), + base_idx(base_idx), + noutputs(noutputs) { + dst[0] = dst0; + if (dst1.has_value()) { + dst[1] = dst1.value(); + } + } + + template + C10_DEVICE void run() const { + extern __shared__ char shared_memory[]; + index_t 
output_idx = config.output_idx(); + index_t input_idx = config.input_idx(); + auto base_offsets1 = output_calc.get(output_idx)[1]; + + using arg_vec_t = std::array; + arg_vec_t value; + + if (output_idx < config.num_outputs && input_idx < config.num_inputs) { + const scalar_t* input_slice = (const scalar_t*)((const char*)src + base_offsets1); + value = thread_reduce(input_slice); + } + + if (config.should_block_x_reduce()) { + value = block_x_reduce(value, shared_memory); + } + if (config.should_block_y_reduce()) { + value = block_y_reduce(value, shared_memory); + } + using out_ptr_vec_t = std::array; + using offset_vec_t = std::array; + offset_vec_t base_offsets; + out_ptr_vec_t out; + + #pragma unroll + for (int i = 0; i < output_vec_size; i++) { + base_offsets[i] = output_calc.get(output_idx + i)[0]; + out[i] = (out_scalar_t*)((char*)dst[0] + base_offsets[i]); + } + + arg_vec_t* acc = nullptr; + if (acc_buf != nullptr) { + size_t numerator = sizeof(arg_t); + size_t denominator = sizeof(out_scalar_t); + reduce_fraction(numerator, denominator); + acc = (arg_vec_t*)((char*)acc_buf + (base_offsets[0] * numerator / denominator)); + } + + if (config.should_global_reduce()) { + value = global_reduce(value, acc, shared_memory); + } else if (config.should_store(output_idx)) { + if (accumulate) { + #pragma unroll + for (int i = 0; i < output_vec_size; i++) { + value[i] = ops.translate_idx(value[i], base_idx); + } + } + + if (acc == nullptr) { + if (accumulate) { + value = accumulate_in_output(out, value); + } + if (final_output) { + set_results_to_output(value, base_offsets); + } else { + #pragma unroll + for (int i = 0; i < output_vec_size; i++) { + *(out[i]) = get_accumulated_output(out[i], value[i]); + } + } + } else { + if (accumulate) { + #pragma unroll + for (int i = 0; i < output_vec_size; i++) { + value[i] = ops.combine((*acc)[i], value[i]); + } + } + if (final_output) { + set_results_to_output(value, base_offsets); + } else { + *acc = value; + } + } + } + } + + 
template + C10_DEVICE std::array thread_reduce(const scalar_t* data) const { + if (config.vectorize_input) { + CUDA_KERNEL_ASSERT(output_vec_size == 1); + // reduce at the header of input_slice where memory is not aligned, + // so that thread_reduce will have an aligned memory to work on. + return {input_vectorized_thread_reduce_impl(data)}; + } else { + index_t element_stride = input_calc.strides_[0][0] / sizeof(scalar_t); + bool is_contiguous = (input_calc.dims == 1 && element_stride == 1); + if (is_contiguous) { + return thread_reduce_impl(data, [](index_t idx) { return idx; }); + } else if (input_calc.dims == 1) { + return thread_reduce_impl(data, [&](index_t idx) { return idx * element_stride; }); + } else { + return thread_reduce_impl(data, [&](index_t idx) { return input_calc.get(idx)[0] / sizeof(scalar_t); }); + } + } + } + + C10_DEVICE arg_t input_vectorized_thread_reduce_impl(const scalar_t* data) const { + index_t end = config.num_inputs; + + // Handle the head of input slice where data is not aligned + arg_t value = ident; + constexpr int align_bytes = alignof(at::native::memory::aligned_vector); + constexpr int align_elements = align_bytes / sizeof(scalar_t); + int shift = ((uint64_t)data) % align_bytes / sizeof(scalar_t); + if (shift > 0) { + data -= shift; + end += shift; + if(threadIdx.x >= shift && threadIdx.x < align_elements && config.should_reduce_tail()){ + value = ops.reduce(value, c10::load(data + threadIdx.x), threadIdx.x - shift); + } + end -= align_elements; + data += align_elements; + shift = align_elements - shift; + } + + // Do the vectorized reduction + using load_t = at::native::memory::aligned_vector; + + index_t idx = config.input_idx(); + const index_t stride = config.step_input; + + // Multiple accumulators to remove dependency between unrolled loops. 
+ arg_t value_list[input_vec_size]; + value_list[0] = value; + + #pragma unroll + for (int i = 1; i < input_vec_size; i++) { + value_list[i] = ident; + } + + while (idx * input_vec_size + input_vec_size - 1 < end) { + const auto values_vec = memory::load_vector(data, idx); + #pragma unroll + for (index_t i = 0; i < input_vec_size; i++) { + value_list[i] = ops.reduce(value_list[i], values_vec.val[i], shift + idx * input_vec_size + i); + } + idx += stride; + } + + // tail + index_t tail_start = end - end % input_vec_size; + if (config.should_reduce_tail()) { + int idx = tail_start + threadIdx.x; + if (idx < end) { + const auto value = c10::load(data + idx); + value_list[0] = ops.reduce(value_list[0], value, idx + shift); + } + } + + // combine accumulators + #pragma unroll + for (int i = 1; i < input_vec_size; i++) { + value_list[0] = ops.combine(value_list[0], value_list[i]); + } + return value_list[0]; + } + + template + C10_DEVICE std::array thread_reduce_impl(const scalar_t* data_, offset_calc_t calc) const { + index_t idx = config.input_idx(); + const index_t end = config.num_inputs; + const index_t stride = config.step_input; + + using arg_vec_t = std::array; + using load_t = at::native::memory::aligned_vector; + + // Multiple accumulators to remove dependency between unrolled loops. 
+ arg_vec_t value_list[vt0]; + + #pragma unroll + for (int i = 0; i < vt0; i++) { + #pragma unroll + for (int j = 0; j < output_vec_size; j++) { + value_list[i][j] = ident; + } + } + + load_t values[vt0]; + + while (idx + (vt0 - 1) * stride < end) { + #pragma unroll + for (index_t i = 0; i < vt0; i++) { + const auto offset = calc(idx + i * stride) / output_vec_size; + values[i] = memory::load_vector(data_, offset); + } + #pragma unroll + for (index_t i = 0; i < vt0; i++) { + #pragma unroll + for (index_t j = 0; j < output_vec_size; j++) { + value_list[i][j] = ops.reduce(value_list[i][j], values[i].val[j], idx + i * stride); + } + } + idx += stride * vt0; + } + + // tail + int idx_ = idx; + #pragma unroll + for (index_t i = 0; i < vt0; i++) { + if (idx >= end) { + break; + } + const auto offset = calc(idx) / output_vec_size; + values[i] = memory::load_vector(data_, offset); + idx += stride; + } + idx = idx_; + #pragma unroll + for (index_t i = 0; i < vt0; i++) { + if (idx >= end) { + break; + } + #pragma unroll + for (index_t j = 0; j < output_vec_size; j++) { + value_list[i][j] = ops.reduce(value_list[i][j], values[i].val[j], idx); + } + idx += stride; + } + + // combine accumulators + #pragma unroll + for (int i = 1; i < vt0; i++) { + #pragma unroll + for (index_t j = 0; j < output_vec_size; j++) { + value_list[0][j] = ops.combine(value_list[0][j], value_list[i][j]); + } + } + return value_list[0]; + } + + template + C10_DEVICE std::array block_x_reduce(std::array value, char* shared_memory) const { + using args_vec_t = std::array; + int dim_x = blockDim.x; + args_vec_t* shared = (args_vec_t*)shared_memory; + if (dim_x > warpSize) { + int address_base = threadIdx.x + threadIdx.y*blockDim.x; + shared[address_base] = value; + for (int offset = dim_x/2; offset >= warpSize; offset >>= 1) { + __syncthreads(); + if (threadIdx.x < offset && threadIdx.x + offset < blockDim.x) { + args_vec_t other = shared[address_base + offset]; + #pragma unroll + for (int i = 0; i < 
output_vec_size; i++) { + value[i] = ops.combine(value[i], other[i]); + } + shared[address_base] = value; + } + } + dim_x = warpSize; + } + + __syncthreads(); + // Intra-warp reduction, fix CUDA to have offset decreasing for better numerics + // matching Triton, etc. + // TODO(PaulZhang12): AMD and internal + #if defined(USE_ROCM) || defined(FBCODE_CAFFE2) + for (int offset = 1; offset < dim_x; offset <<= 1) { + #else + for (int offset = dim_x >> 1; offset > 0; offset >>= 1) { + #endif + #pragma unroll + for (int i = 0; i < output_vec_size; i++) { + arg_t other = ops.warp_shfl_down(value[i], offset); + value[i] = ops.combine(value[i], other); + } + } + return value; + } + + template + C10_DEVICE std::array block_y_reduce(std::array value, char* shared_memory) const { + using args_vec_t = std::array; + args_vec_t* shared = (args_vec_t*)shared_memory; + shared[config.shared_memory_offset(0)] = value; + for (int offset = blockDim.y / 2; offset > 0; offset >>= 1) { + __syncthreads(); + if (threadIdx.y < offset && threadIdx.y + offset < blockDim.y) { + args_vec_t other = shared[config.shared_memory_offset(offset)]; + #pragma unroll + for (int i = 0; i < output_vec_size; i++) { + value[i] = ops.combine(value[i], other[i]); + } + shared[config.shared_memory_offset(0)] = value; + } + } + return value; + } + + C10_DEVICE bool mark_block_finished() const { + __shared__ bool is_last_block_done_shared; + + __syncthreads(); + if (threadIdx.x == 0 && threadIdx.y == 0) { + int prev_blocks_finished = atomicAdd(&semaphores[blockIdx.x], 1); + is_last_block_done_shared = (prev_blocks_finished == gridDim.y - 1); + } + + __syncthreads(); + + return is_last_block_done_shared; + } + + template + C10_DEVICE std::array accumulate_in_output( + std::array out, + std::array value, + typename std::enable_if_t* = nullptr + ) const { + std::array ret; + #pragma unroll + for (int i = 0; i < output_vec_size; i++) { + ret[i] = ops.combine(*(out[i]), value[i]); + } + return ret; + } + + template + 
C10_DEVICE out_scalar_t get_accumulated_output( + out_scalar_t* out, arg_t value, + typename std::enable_if_t* = nullptr + ) const { + CUDA_KERNEL_ASSERT(!final_output); + return (out_scalar_t)value; + } + + // This function should never be called -- + // it's the version of `accumulate_in_output` + // when accumulation in the output is not possible. + template + C10_DEVICE std::array accumulate_in_output( + std::array, + std::array, + typename std::enable_if_t* = nullptr + ) const { + CUDA_KERNEL_ASSERT(false); + return {arg_t{}}; + } + + // This function should never be called -- + // it's the version of `get_accumulated_output` + // when accumulation in the output is not possible. + template + C10_DEVICE out_scalar_t get_accumulated_output( + out_scalar_t* out, arg_t value, + typename std::enable_if_t* = nullptr + ) const { + CUDA_KERNEL_ASSERT(false); + return *out; + } + + template + C10_DEVICE void set_results(const T x, const index_t base_offset) const { + CUDA_KERNEL_ASSERT(noutputs == 1); + auto res = (out_scalar_t*)((char*)dst[0] + base_offset); + *res = x; + } + + //Currently implemented for max of two outputs + template + C10_DEVICE void set_results(const thrust::pair x, const index_t base_offset) const { + if (noutputs >= 1) { + auto res0 = (T1*)((char*)dst[0] + base_offset); + *res0 = x.first; + } + if (noutputs >= 2) { + // base offset is computed assuming element size being sizeof(T1), so we need to make a + // correction to obtain the correct base offset + auto res1 = (T2*) ((char *) dst[1] + base_offset / sizeof(T1) * sizeof(T2)); + *res1 = x.second; + } + } + + template + C10_DEVICE void set_results_to_output(std::array value, std::array base_offset) const { + CUDA_KERNEL_ASSERT(final_output); + #pragma unroll + for (int i = 0; i < output_vec_size; i++) { + set_results(ops.project(value[i]), base_offset[i]); + } + } + + template + C10_DEVICE std::array global_reduce(std::array value, std::array *acc, char* shared_memory) const { + using arg_vec_t 
= std::array; + using out_ptr_vec_t = std::array; + using offset_vec_t = std::array; + + arg_vec_t* reduce_buffer = (arg_vec_t*)cta_buf; + index_t output_idx = config.output_idx(); + offset_vec_t base_offsets; + out_ptr_vec_t out; + + #pragma unroll + for (int i = 0; i < output_vec_size; i++) { + base_offsets[i] = output_calc.get(output_idx + i)[0]; + out[i] = (out_scalar_t*)((char*)dst[0] + base_offsets[i]); + } + + bool should_store = config.should_store(output_idx); + if (should_store) { + index_t offset = config.staging_memory_offset(blockIdx.y); +#ifndef USE_ROCM + reduce_buffer[offset] = value; +#else // [CMTSTRS] + // In architectures with split caches, global fences are costly. + // Here we preempt need for fences by committing stores to global memory. + cmtdStore(&reduce_buffer[offset], value); +#endif + } + +#ifndef USE_ROCM // skip fence if store are committed [CMTSTRS] + __threadfence(); // make sure writes are globally visible +#endif + __syncthreads(); // if multiple warps in this block wrote to staging, make sure they're all done + bool is_last_block_done = mark_block_finished(); + + if (is_last_block_done) { +#ifndef USE_ROCM // skip fence if store are committed [CMTSTRS] + __threadfence(); // complete the acquire pattern after atomic +#endif + for (auto &v : value) { + v = ident; + } + if (config.should_block_x_reduce()) { + index_t input_offset = threadIdx.x + threadIdx.y * blockDim.x; + index_t step = blockDim.x * blockDim.y; + for (; input_offset < config.ctas_per_output; input_offset += step) { + index_t idx = config.staging_memory_offset(input_offset); + arg_vec_t next = reduce_buffer[idx]; + #pragma unroll + for (int i = 0; i < output_vec_size; i++) { + value[i] = ops.combine(value[i], next[i]); + } + } + } else { + index_t input_offset = threadIdx.y; + index_t step = blockDim.y; +#ifdef USE_ROCM // Prefetch loads to better hide their latency + #define PRFCH 4 + for (; input_offset < config.ctas_per_output; input_offset += step*PRFCH) { + 
arg_vec_t next[PRFCH]; + #pragma unroll + for (int u = 0; (u < PRFCH) && (input_offset + u*step < config.ctas_per_output); u++) { + index_t idx = config.staging_memory_offset(input_offset + u*step); + next[u] = reduce_buffer[idx]; + } + for (int u = 0; (u < PRFCH) && (input_offset + u*step < config.ctas_per_output); u++) { + #pragma unroll + for (int i = 0; i < output_vec_size; i++) { + value[i] = ops.combine(value[i], next[u][i]); + } + } + } +#else + for (; input_offset < config.ctas_per_output; input_offset += step) { + index_t idx = config.staging_memory_offset(input_offset); + arg_vec_t next = reduce_buffer[idx]; + #pragma unroll + for (int i = 0; i < output_vec_size; i++) { + value[i] = ops.combine(value[i], next[i]); + } + } +#endif + } + value = block_y_reduce(value, shared_memory); + if (config.should_block_x_reduce()) { + value = block_x_reduce(value, shared_memory); + } + if (should_store) { + if (accumulate) { + #pragma unroll + for (int i = 0; i < output_vec_size; i++) { + value[i] = ops.translate_idx(value[i], base_idx); + } + } + + if (acc == nullptr) { + if (accumulate) { + value = accumulate_in_output(out, value); + } + if (final_output) { + set_results_to_output(value, base_offsets); + } else { + #pragma unroll + for (int i = 0; i < output_vec_size; i++) { + *(out[i]) = get_accumulated_output(out[i], value[i]); + } + } + } else { + if (accumulate) { + #pragma unroll + for (int i = 0; i < output_vec_size; i++) { + value[i] = ops.combine((*acc)[i], value[i]); + } + } + if (final_output) { + set_results_to_output(value, base_offsets); + } else { + *acc = value; + } + } + } + } + + return value; + } +}; + +template +static void launch_reduce_kernel(const ReduceConfig& config, const R& reduction) { + dim3 block = config.block(); + dim3 grid = config.grid(); + + auto stream = at::cuda::getCurrentCUDAStream(); + int shared_memory = config.shared_memory_size(); + + switch(config.output_vec_size) { + case 4: + reduce_kernel<<>>(reduction); + 
C10_CUDA_KERNEL_LAUNCH_CHECK(); + break; + case 2: + reduce_kernel<<>>(reduction); + C10_CUDA_KERNEL_LAUNCH_CHECK(); + break; + default: + reduce_kernel<<>>(reduction); + C10_CUDA_KERNEL_LAUNCH_CHECK(); + } +} + +inline void launch_jitted_reduce_kernel( + std::mutex &jiterator_mutex, + std::array &fn_cache, + const at::cuda::jit::KernelDescriptor &desc, + int vt0, const ReduceConfig& config, const void *reduction) { + dim3 block = config.block(); + dim3 grid = config.grid(); + + int shared_memory = config.shared_memory_size(); + at::cuda::jit::NvrtcFunction* fn_ptr; + switch(config.output_vec_size) { + case 4: + fn_ptr = &fn_cache[0]; + break; + case 2: + fn_ptr = &fn_cache[1]; + break; + default: + fn_ptr = &fn_cache[2]; + } + if (!fn_ptr->function) { + int max_threads_codegen = + max_reduce_threads(desc.f_inputs_type) / config.output_vec_size; + auto code = at::cuda::jit::generate_reduction_code( + desc, vt0, true, false, config.output_vec_size, max_threads_codegen); + + *fn_ptr = at::cuda::jit::jit_pwise_function(code, "reduction_" + desc.name); + } + constexpr int kernel_args = 1; + const void* args[kernel_args]; + args[0] = reduction; + at::cuda::jit::launch_jitted_pwise_function(*fn_ptr, args, grid, block, shared_memory); +} + + +class AccumulationBuffer { + public: + AccumulationBuffer() {} + + AccumulationBuffer(size_t acc_t_size, size_t out_t_size, char* out_ptr, int64_t size) { + out_ptr_ = (char*)out_ptr; + if (out_t_size >= acc_t_size) { + // reusing output buffer for accumulation. 
+ acc_ptr_ = (char*)out_ptr; + numerator_ = 1; + denominator_ = 1; + } else { + auto& allocator = *c10::cuda::CUDACachingAllocator::get(); + buffer_ = allocator.allocate(size); + acc_ptr_ = (char*)buffer_.get(); + numerator_ = acc_t_size; + denominator_ = out_t_size; + reduce_fraction(numerator_, denominator_); + } + } + + char* get_acc_slice(char* out_ptr) { + if (acc_ptr_ == nullptr) { + return nullptr; + } + return acc_ptr_ + ((out_ptr - out_ptr_) * numerator_ / denominator_); + } + + private: + char* acc_ptr_ = nullptr; + char* out_ptr_ = nullptr; + size_t numerator_; + size_t denominator_; + at::DataPtr buffer_; +}; + +template +int get_output_vec_size(const TensorIterator &iter) { + int vec_size = 4; + auto update_vec_size = [&vec_size](uint64_t n) { + while(n % vec_size != 0) { + vec_size /= 2; + } + }; + + uint64_t base_address = reinterpret_cast(iter.data_ptr(iter.noutputs())) / sizeof(scalar_t); + update_vec_size(base_address); + + const int output_index = iter.num_reduce_dims(); + update_vec_size(iter.shape()[output_index]); + + int j = 0; + for(auto i : iter.strides(iter.noutputs())) { + if (j != output_index) { + update_vec_size(i / sizeof(scalar_t)); + } + j++; + } + return vec_size; +} + +template +ReduceConfig setReduceConfig(const TensorIterator& iter){ + // Start by assuming that each thread handles a single output and all + // the inputs for that output. + int64_t num_outputs = iter.num_output_elements(); + int64_t inputs_per_output = iter.numel() / num_outputs; + int input_index = iter.ntensors() - 1; + + auto config = ReduceConfig(sizeof(arg_t), num_outputs, inputs_per_output); + + int64_t dim0; + int64_t dim1; + int64_t fastest_moving_stride; + bool reduction_on_fastest_striding_dimension; + + if (iter.ndim() > 0) { + // Adjust block size to map block width to fastest changing dimension of input + // tensor. 
This grants the best possible memory accessing pattern, given that + // for non-contiguous tensor with space in between, we cannot have perfect + // memory coalescing. + reduction_on_fastest_striding_dimension = + (iter.num_reduce_dims() == iter.ndim()) || + (iter.strides(/*arg=*/input_index)[0] < + iter.strides(/*arg=*/input_index)[iter.num_reduce_dims()]); + // Notice that dim0 & dim1 does NOT guarantee any launch configuration here! + // dim0 & dim1 are more like the upper bound of the block dimension. The + // actual launch config and reduction scheme is determined by setting values + // to `config.input_mult` and `config.output_mult`. + // We try to max out dim1 so that we have enough threads per CTA to deliver + // performance for larger problem size. + if (reduction_on_fastest_striding_dimension) { + // Map block.x to the fastest reducing dimension. It implies: + // 1. block_x_reduce is required. + // 2. block.y now max out to num_outputs. + dim0 = inputs_per_output; + dim1 = num_outputs; + fastest_moving_stride = iter.strides(/*arg=*/input_index)[0]; + } else { + // Map block.x to the fastest non reducing dimension. It implies: + // 1. block_x_reduce is turned off. + // 2. block.y now max out to inputs_per_output. + dim0 = num_outputs; + dim1 = inputs_per_output; + fastest_moving_stride = iter.strides(/*arg=*/input_index)[iter.num_reduce_dims()]; + } + } else { + reduction_on_fastest_striding_dimension = true; + fastest_moving_stride = sizeof(scalar_t); + dim0 = 1; + dim1 = 1; + } + + // We do vectorization to gain better memory access, there are two cases which we call + // "vectorize along input" and "vectorize along output". Note that the "input/output" + // here does not mean we are vectorizing load/store instructions. We always only vectorize + // load instructions. + // + // Case 1: "vectorize along input" + // This case happens when we are reducing along fastest moving dimension. 
In such case, threads + // with the same threadIdx.y works on the same reduction cooperatively and will produce results + // for the same output. In such case, values in each loaded vector always correspond to the same output. + // + // Case 2: "vectorize along output" + // This case happens when the fastest moving dimension is not the dimension of reduction. In such case, + // threads with different threadIdx.x are independent and will produce results for different outputs. + // In such case, values in each loaded vector always correspond to different outputs. + if (fastest_moving_stride == sizeof(scalar_t)) { + if (reduction_on_fastest_striding_dimension && dim0 >= 128 && iter.num_reduce_dims() == 1) { + // Case 1: "vectorize along input" + // Note that if vt0 < ReduceConfig::vec_size, then this means the register pressure could be high, in such case, + // we should avoid vectorization. + config.vectorize_input = true; + dim0 /= input_vec_size; + } else if (!reduction_on_fastest_striding_dimension) { + // Case 2: "vectorize along output" + config.output_vec_size = get_output_vec_size(iter); + dim0 /= config.output_vec_size; + } + } + + // Adjust block_width and block_height + config.set_block_dimension(dim0, dim1); + + int block_width = config.block_width; + int block_height = config.block_height; + + if (iter.ndim() == 0 || reduction_on_fastest_striding_dimension) { + // Split the input across lanes if the input is contiguous in the reduced + // dimension. This will require reduction between threads using warp + // shuffle instructions and shared memory (if block_width > warpSize). + config.input_mult[0] = config.split_input(block_width); + } else { + // Otherwise split the output across lanes in a warp. 
+ config.output_mult[0] = config.split_output(block_width); + } + + constexpr int min_values_per_thread = 16; + constexpr int max_values_per_thread = 256; + + const int warp_split_threshold = + std::min(block_height * 16, max_values_per_thread); + bool split_across_warps = config.values_per_thread() >= warp_split_threshold; + const int num_mp = + at::cuda::getCurrentDeviceProperties()->multiProcessorCount; +#ifdef USE_ROCM + bool force_splitting_output = iter.ndim() == 2 && + reduction_on_fastest_striding_dimension && + config.values_per_thread() < 1024 && num_mp < 100; + split_across_warps = !force_splitting_output && split_across_warps; +#endif + + if (split_across_warps) { + // Divide the input across warps in a thread-block, if that leaves at least + // 16 elements to be summed by each thread. This will require inter-warp + // reduction using shared memory. + config.input_mult[1] = config.split_input(block_height); + } else { + // Otherwise, each warp handles a separate output. + config.output_mult[1] = config.split_output(block_height); + } + + int max_threads_per_mp = + at::cuda::getCurrentDeviceProperties()->maxThreadsPerMultiProcessor; +#ifdef USE_ROCM + // If the grid consists of a single threadblock, do not change the max threads per + // MP value. This will increase the parallelism across the y dimension of the grid. + bool uses_a_single_block = config.grid().x == config.grid().y == config.grid().z == 1; + + if (!uses_a_single_block) { + // Control the number of threadblocks by adjusting the maximum number of + // threads per multi-processor. These numbers better reflect the maximum + // theoretical achievable threads per MP for the reduction operation. 
+ if (iter.ndim() == 1 || iter.ndim() == 3) + max_threads_per_mp = 512; + else if (iter.ndim() == 2) + max_threads_per_mp = 256; + } +#endif + const int blocks_per_sm = max_threads_per_mp / config.num_threads; + const int target_grid_size = num_mp * blocks_per_sm; + int grid = config.grid().x; + if (config.input_mult[1] != 0 && config.values_per_thread() >= max_values_per_thread && grid <= target_grid_size) { + // Divide the input across thread-blocks if the amount of work per-thread + // is large enough and the size of the output is small enough. This will + // require a reduction using global memory. + // If we decide to split input across blocks, as long as we can get enough + // number of blocks (`target_grid_size`) to balance SM, we should still + // make the number of values per thread large for best performance. + int ctas_per_output1 = div_up(target_grid_size, grid); + int ctas_per_output2 = div_up(config.values_per_thread(), min_values_per_thread); + int ctas_per_output3 = div_up(config.values_per_thread(), max_values_per_thread); + // We want the minimum of ctas_per_output1 and ctas_per_output2, so that each thread can have + // a large number of values to deal with. But we don't want values_per_thread to be larger than + // max_values_per_thread + config.ctas_per_output = std::max(std::min(ctas_per_output1, ctas_per_output2), ctas_per_output3); +#ifdef USE_ROCM + // In cases where a number of threadblocks along the y direction of the grid + // is needed then make sure they are reduced to the number of MPs. For + // smaller sizes, use half the number of MPs. For smaller sizes than half + // the number of MPs use the original value unless the value is less than 16 + // blocks in which case it is more profitable to use just 1 block. + if (config.ctas_per_output > num_mp) + if (num_mp < 128) + config.ctas_per_output = + num_mp * (config.ctas_per_output > 512 ? 
4 : 2); + else + config.ctas_per_output = num_mp; + else if (config.ctas_per_output > div_up(num_mp, 2)) + config.ctas_per_output = div_up(num_mp, 2); + else if (config.ctas_per_output < 16) + config.ctas_per_output = 1; + bool is_channel_last = iter.tensor_base(1).is_contiguous(at::MemoryFormat::ChannelsLast); + if (iter.ndim() == 3 && !reduction_on_fastest_striding_dimension && !is_channel_last) { + config.ctas_per_output = 4; + int vpt = config.values_per_thread(); + // Capping the number of values per thread to 2048 for now + // based on known use cases. + while (vpt >= 2048) { + config.ctas_per_output *= 2; + // Computes the new values per thread without side effects + vpt = config.mock_values_per_thread(config.ctas_per_output); + } + } +#endif + if (config.ctas_per_output > 1) { + config.input_mult[2] = config.split_input(config.ctas_per_output); + } + } + return config; +}; + +template +inline void gpu_reduce_kernel(TensorIterator& iter, const ops_t& ops, ident_t ident=0, + AccumulationBuffer* acc_buf_ptr=nullptr, int64_t base_idx=0) { + AT_ASSERT(iter.numel() > 0 && iter.ntensors() - iter.noutputs() == 1 && iter.noutputs() >= 1); + + using traits = function_traits; + using arg_t = typename traits::template arg<0>::type; + // at::Half/at::ComplexHalf overflows easily as it's range is very small. + // So when scalar_t and out_scalar_t are at::Half/at::ComplexHalf, we + // set can_accumulate_in_output to False. + static constexpr bool is_inp_out_type_half_or_chalf = + (std::is_same_v && + std::is_same_v) || + (std::is_same_v, scalar_t> && + std::is_same_v, out_scalar_t>); + // at::BFloat16 has lower precision and can lead to rounding errors. + // So when scalar_t and out_scalar_t are at::BFloat16, we + // set can_accumulate_in_output to False. 
+ static constexpr bool is_inp_out_type_bfloat16 = + (std::is_same_v && + std::is_same_v); + static constexpr bool can_accumulate_in_output = + std::is_convertible_v && + !(is_inp_out_type_half_or_chalf || is_inp_out_type_bfloat16); + + bool can_use_32bit_indexing = iter.can_use_32bit_indexing(); + std::unique_ptr owned_buf_ptr; + // The acc_buf_ptr is a shared pointer. It is create at the first entrance and + // reused by all recursive function calls. + if (acc_buf_ptr == NULL) { + // acc_buf_ptr holds buffer used for accumulation among multiple sub_iter + // when accumulation in output is not possible. + if (!can_accumulate_in_output && !can_use_32bit_indexing) { + int64_t output_memory_size = iter.element_size(0); + for (int dim = 0; dim < iter.ndim(); dim++) { + output_memory_size = std::max(output_memory_size, iter.shape()[dim] * iter.strides(0)[dim]); + } + output_memory_size /= iter.element_size(0); //iter.strides is in bytes + owned_buf_ptr.reset(new AccumulationBuffer(sizeof(arg_t), + sizeof(out_scalar_t), + (char*) iter.data_ptr(0), + output_memory_size * sizeof(arg_t))); + } else { + owned_buf_ptr.reset(new AccumulationBuffer()); + } + acc_buf_ptr = owned_buf_ptr.get(); + } + + if (!can_use_32bit_indexing) { + for (auto& sub_iter : iter.with_32bit_indexing()) { + int64_t sub_iter_base_idx = sub_iter.view_offsets()[0]; + + gpu_reduce_kernel(sub_iter, ops, ident, + acc_buf_ptr, sub_iter_base_idx); + } + return; + } + + const char* in_data = (char*)iter.data_ptr(iter.ntensors() - 1); + char* out_data = (char*)iter.data_ptr(0); + const auto noutputs = iter.noutputs(); + std::optional out_data_extra; + if (noutputs > 1) { + out_data_extra = (char*)iter.data_ptr(1); + } else { + out_data_extra = std::nullopt; + } + char* acc_data = acc_buf_ptr->get_acc_slice(out_data); + + ReduceConfig config = setReduceConfig(iter); + at::DataPtr buffer; + at::DataPtr semaphores; + if (config.should_global_reduce()) { + auto& allocator = 
*c10::cuda::CUDACachingAllocator::get(); + buffer = allocator.allocate(config.global_memory_size()); + semaphores = allocator.allocate(config.semaphore_size()); + + auto stream = at::cuda::getCurrentCUDAStream(); + AT_CUDA_CHECK(cudaMemsetAsync(semaphores.get(), 0, config.semaphore_size(), stream)); + } + + AT_ASSERT(can_use_32bit_indexing); + auto output_calc = make_output_calculator(iter); + auto input_calc = make_input_calculator(iter); + auto reduce = ReduceOp( + ops, + config, + input_calc, + output_calc, + in_data, + out_data, + out_data_extra, + acc_data, + buffer.get(), + (int*)semaphores.get(), + ident, + noutputs, + base_idx); + reduce.accumulate = iter.should_accumulate(); + reduce.final_output = iter.is_final_output(); + + launch_reduce_kernel::MAX_NUM_THREADS>(config, reduce); +} + +//TODO this is 100 lines of almost-copy-paste, because we have to have different template args for this function +//try unifying with gpu_reduce_kernel +template +inline void jitted_gpu_reduce_kernel(TensorIterator& iter, const std::string& func, ident_t ident=0, + AccumulationBuffer* acc_buf_ptr=nullptr, int64_t base_idx=0) { + AT_ASSERT(iter.numel() > 0 && iter.ntensors() - iter.noutputs() == 1 && iter.noutputs() >= 1); + + //TODO - this will be different for more complicated reductions, but for now reductions using + //func_wrapper all have arg_t = opmath + using arg_t = at::opmath_type; + // at::Half/at::ComplexHalf overflows easily as it's range is very small. + // So when scalar_t and out_scalar_t are at::Half/at::ComplexHalf, we + // set can_accumulate_in_output to False. + static constexpr bool is_inp_out_type_half_or_chalf = + (std::is_same_v && + std::is_same_v ) || + (std::is_same_v, scalar_t> && + std::is_same_v, out_scalar_t>); + // at::BFloat16 has lower precision and can lead to rounding errors. + // So when scalar_t and out_scalar_t are at::BFloat16, we + // set can_accumulate_in_output to False. 
+ static constexpr bool is_inp_out_type_bfloat16 = + (std::is_same_v && + std::is_same_v); + static constexpr bool can_accumulate_in_output = + std::is_convertible_v && + !(is_inp_out_type_half_or_chalf || is_inp_out_type_bfloat16); + + bool can_use_32bit_indexing = iter.can_use_32bit_indexing(); + std::unique_ptr owned_buf_ptr; + + // The acc_buf_ptr is a shared pointer. It is create at the first entrance and + // reused by all recursive function calls. + if (acc_buf_ptr == NULL) { + // acc_buf_ptr holds buffer used for accumulation among multiple sub_iter + // when accumulation in output is not possible. + if (!can_accumulate_in_output && !can_use_32bit_indexing) { + int64_t output_memory_size = iter.element_size(0); + for (int dim = 0; dim < iter.ndim(); dim++) { + output_memory_size = std::max(output_memory_size, iter.shape()[dim] * iter.strides(0)[dim]); + } + output_memory_size /= iter.element_size(0); //iter.strides is in bytes + owned_buf_ptr.reset(new AccumulationBuffer(sizeof(out_scalar_t), //TODO + sizeof(out_scalar_t), + (char*) iter.data_ptr(0), + output_memory_size * sizeof(out_scalar_t))); //TODO + } else { + owned_buf_ptr.reset(new AccumulationBuffer()); + } + acc_buf_ptr = owned_buf_ptr.get(); + } + + if (!can_use_32bit_indexing) { + for (auto& sub_iter : iter.with_32bit_indexing()) { + int64_t sub_iter_base_idx = sub_iter.view_offsets()[0]; + + jitted_gpu_reduce_kernel(sub_iter, func, ident, + acc_buf_ptr, sub_iter_base_idx); + } + return; + } + + //TODO - for now we support a single input, we may be able to relax this constraint + const char* in_data = (char*)iter.data_ptr(iter.ntensors() - 1); + char* out_data = (char*)iter.data_ptr(0); + const auto noutputs = iter.noutputs(); + std::optional out_data_extra; + if (noutputs > 1) { + out_data_extra = (char*)iter.data_ptr(1); + } else { + out_data_extra = std::nullopt; + } + char* acc_data = acc_buf_ptr->get_acc_slice(out_data); + + ReduceConfig config = setReduceConfig(iter); + + at::DataPtr 
buffer; + at::DataPtr semaphores; + if (config.should_global_reduce()) { + auto& allocator = *c10::cuda::CUDACachingAllocator::get(); + buffer = allocator.allocate(config.global_memory_size()); + semaphores = allocator.allocate(config.semaphore_size()); + + auto stream = at::cuda::getCurrentCUDAStream(); + AT_CUDA_CHECK(cudaMemsetAsync(semaphores.get(), 0, config.semaphore_size(), stream)); + } + + AT_ASSERT(can_use_32bit_indexing); + auto output_calc = make_output_calculator(iter); + auto input_calc = make_input_calculator(iter); + auto reduce = ReduceJitOp( + config, + input_calc, + output_calc, + in_data, + out_data, + out_data_extra, + acc_data, + buffer.get(), + (int*)semaphores.get(), + ident, + noutputs, + base_idx); + reduce.accumulate = iter.should_accumulate(); + reduce.final_output = iter.is_final_output(); + + constexpr int nInputs = 1; + constexpr int nOutputs = 1; + static auto desc = at::cuda::jit::make_kernel_descriptor< + out_scalar_t, scalar_t>(name, func, nInputs, nOutputs); + + static std::mutex jiterator_mutex; + static std::vector> fn_cache(c10::cuda::device_count()); + auto &cache = fn_cache[iter.device().index()]; + + launch_jitted_reduce_kernel( + jiterator_mutex, cache, desc, vt0, config, &reduce); +} + +} // namespace at::native + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/cuda/ReduceOps.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/cuda/ReduceOps.h new file mode 100644 index 0000000000000000000000000000000000000000..16394d6d931a4468b129ff16eef2094ced4ece93 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/cuda/ReduceOps.h @@ -0,0 +1,25 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) + +namespace at { +struct TensorIterator; +} + +namespace c10 { +class Scalar; +} + +namespace at::native { + +void norm_launch_kernel(TensorIterator &iter, double val); +void min_launch_kernel(TensorIterator &iter); +void max_launch_kernel(TensorIterator &iter); +void aminmax_launch_kernel(TensorIterator &iter); +void min_all_launch_kernel(TensorIterator &iter); +void max_all_launch_kernel(TensorIterator &iter); +void aminmax_allreduce_launch_kernel(TensorIterator &iter); + +} // namespace at::native + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/cuda/Resize.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/cuda/Resize.h new file mode 100644 index 0000000000000000000000000000000000000000..8c0155f888a6943fb4267d639fd57baf5887c0a7 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/cuda/Resize.h @@ -0,0 +1,58 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include +#include + +#include + +namespace at::native { + +TORCH_CUDA_CPP_API void resize_bytes_cuda(StorageImpl* storage, size_t size_bytes); + +static inline void maybe_resize_storage_cuda(TensorImpl* self, size_t new_size_bytes) { + // It does not make sense to try to resize a storage + // to hold 0 elements, and this can break + // if storage_offset is positive but + // new_size is 0, so just bail in that case + // (same comment is in Resize.h) + if (self->numel() == 0) { + return; + } + + const Storage &storage = self->unsafe_storage(); + TORCH_CHECK(storage, "Tensor: invalid null storage"); + if (new_size_bytes > storage.nbytes()) { + resize_bytes_cuda(storage.unsafeGetStorageImpl(), new_size_bytes); + } +} + +inline TensorImpl* resize_impl_cuda_( + TensorImpl* self, + IntArrayRef size, + at::OptionalIntArrayRef stride) { + if (self->sizes() == size && (!stride || self->strides() == stride)) { + return self; + } + const auto itemsize = self->dtype().itemsize(); + const auto storage_offset = self->storage_offset(); + size_t storage_size = 1; + if (stride) { + self->set_sizes_and_strides(size, *stride); + storage_size = at::detail::computeStorageNbytes( + size, *stride, itemsize, storage_offset); + } else { + self->set_sizes_contiguous(size); + storage_size = at::detail::computeStorageNbytesContiguous( + size, itemsize, storage_offset); + } + 
maybe_resize_storage_cuda(self, storage_size); + + return self; +} + +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/cuda/RowwiseScaledMM.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/cuda/RowwiseScaledMM.h new file mode 100644 index 0000000000000000000000000000000000000000..613bb0705dfd668e5eeb3a94a52fe4f0142551a6 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/cuda/RowwiseScaledMM.h @@ -0,0 +1,19 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +#include +#include + +namespace at::cuda::detail { +TORCH_API void f8f8bf16_rowwise( + at::Tensor XQ, // FP8 + at::Tensor WQ, // FP8 + at::Tensor x_scale, // FP32 + at::Tensor w_scale, // FP32 + std::optional bias, // BF16 + bool use_fast_accum, + at::Tensor& out); +} // namespace at::cuda::detail + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/cuda/ScaledGroupMM.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/cuda/ScaledGroupMM.h new file mode 100644 index 0000000000000000000000000000000000000000..d246ce955364ba361cac7f48d64ff94bce23c43e --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/cuda/ScaledGroupMM.h @@ -0,0 +1,20 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +#include +#include + +namespace at::cuda::detail { +TORCH_API void f8f8bf16_grouped_mm( + at::Tensor mat_a, // FP8 + at::Tensor mat_b, // FP8 + at::Tensor scale_a, // FP32 + at::Tensor scale_b, // FP32 + std::optional offs, + std::optional bias, // BF16 + bool use_fast_accum, + at::Tensor& out); +} // namespace at::cuda::detail + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/cuda/ScanUtils.cuh b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/cuda/ScanUtils.cuh new file mode 100644 index 0000000000000000000000000000000000000000..4de7c7a0f7ad7f1cf2800505c35de5063cf7cefa --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/cuda/ScanUtils.cuh @@ -0,0 +1,485 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +#include +#include +#include +#include + +#include +#include +#include + +namespace at::native { + +template +constexpr inline integer ceil_div(integer n, integer m) { + return (n + m - 1) / m; +} + +template +constexpr inline integer get_log_num_threads_x_inner_scan(integer num_rows, integer row_size) { + integer log_num_threads_x = 0; + integer log_num_threads_y = 0; + while (((integer)1 << log_num_threads_x) < row_size) { + ++log_num_threads_x; + } + while (((integer)1 << log_num_threads_y) < num_rows) { + ++log_num_threads_y; + } + // we want to keep the ratio between the x-threads and y-threads about the same as + // the ratio between the row_size and num_rows, but the total number of threads in + // a block should be about 512 + integer diff = log_num_threads_x - log_num_threads_y; + // 9 is from log2(512) + log_num_threads_x = ((integer)9 + diff) / (integer)2; + // I found that in having larger log_num_threads_x can give significant speed up in some cases, + // but detrimental in another case, so just keep the lower bound to be log2(16) == 4 to make it + // similar to the previous implementation + // Keeping the upper bound to be log2(512) == 9 as the maximum number of threads in a block. 
+ log_num_threads_x = std::min(std::max((integer)4, log_num_threads_x), (integer)9); + return log_num_threads_x; +} + +template +__device__ void binary_op_update(const scalar_t lhs, scalar_t& rhs, const idx_t lhs_idx, idx_t& rhs_idx, BinaryOperation binary_op) { + if(!at::_isnan(rhs) && (at::_isnan(lhs) || !binary_op(rhs, lhs))) { + rhs = lhs; + rhs_idx = lhs_idx; + } +} +/* Perform an inclusive scan along the innermost dimension of a tensor. + * + * - num_rows is the size of the flattened outer dimensions; + * - row_size is the size of the innermost dimension; + * + * The outer dimensions of the tensor are considered as a single dimension, i.e. the tensor is + * considered as having 'num_rows' rows of size 'row_size'. + * Each thread block processes one or more sets of contiguous rows (processing multiple rows + * per thread block is quicker than processing a single row, especially for short rows). + */ +template +__global__ void tensor_kernel_scan_innermost_dim_with_indices(const scalar_t *self_, scalar_t *values_, int64_t *indices_, + int num_rows, int row_size, + const uint32_t num_threads, const uint32_t log_num_threads_x, + scalar_t init, BinaryFunction binary_op) { + // dynamic memory allocation for vbuf and ibuf + alignas(sizeof(double)) extern __shared__ char buf[]; + scalar_t* vbuf = reinterpret_cast(buf); // the size is num_threads * 2 + int64_t* ibuf = reinterpret_cast(vbuf + num_threads * 2); + const uint32_t num_threads_x = 1 << log_num_threads_x; + scalar_t* row_buf = vbuf + 2 * num_threads_x * threadIdx.y; + int64_t* row_idx_buf = ibuf + 2 * num_threads_x * threadIdx.y; + + for (int block_row = blockIdx.x * blockDim.y; + block_row < num_rows; + block_row += blockDim.y * gridDim.x) { + int row = block_row + threadIdx.y; + const scalar_t *row_self = self_ + row * row_size; + scalar_t *row_values = values_ + row * row_size; + int64_t *row_indices = indices_ + row * row_size; + scalar_t block_total = init; + int64_t block_idx_final = 0; + const bool 
row_exists = row < num_rows; + // Perform scan on one block at a time, keeping track of the total value of + // all blocks processed so far. + for (int block_col = 0; block_col < row_size; block_col += 2 * num_threads_x) { + // Load data into shared memory (two values per thread). + int col1 = block_col + threadIdx.x; + int col2 = block_col + num_threads_x + threadIdx.x; + if (row_exists) { + if (col1 < row_size) { + row_buf[threadIdx.x] = c10::load(&row_self[col1]); + row_idx_buf[threadIdx.x] = col1; + } else { + row_buf[threadIdx.x] = init; + // No need to set the index here as the value in init will never be selected + } + + if (col2 < row_size) { + row_buf[num_threads_x + threadIdx.x] = c10::load(&row_self[col2]); + row_idx_buf[num_threads_x + threadIdx.x] = col2; + } else { + row_buf[num_threads_x + threadIdx.x] = init; + // No need to set the index here as the value in init will never be selected + } + + // Add the total value of all previous blocks to the first value of this block. + if (threadIdx.x == 0) { + binary_op_update(block_total, row_buf[0], block_idx_final, row_idx_buf[0], binary_op); + } + } + __syncthreads(); + + // Parallel reduction with Sklansky method. The diagram can be seen on this paper: + // https://research.nvidia.com/publication/single-pass-parallel-prefix-scan-decoupled-look-back + for (uint32_t s = 1; s <= num_threads_x; s <<= 1) { + if (row_exists) { + uint32_t a = (threadIdx.x / s) * (2 * s) + s; + uint32_t ti = a + (threadIdx.x % s); + uint32_t si = a - 1; + binary_op_update(row_buf[si], row_buf[ti], row_idx_buf[si], row_idx_buf[ti], binary_op); + } + __syncthreads(); + } + + // Write back to output. 
+ if (row_exists) { + if (col1 < row_size){ + row_values[col1] = row_buf[threadIdx.x]; + row_indices[col1] = row_idx_buf[threadIdx.x]; + } + if (col2 < row_size) { + row_values[col2] = row_buf[num_threads_x + threadIdx.x]; + row_indices[col2] = row_idx_buf[num_threads_x + threadIdx.x]; + } + } + block_total = row_buf[2 * num_threads_x - 1]; + block_idx_final = row_idx_buf[2 * num_threads_x - 1]; + __syncthreads(); + } + } +} + +/* Perform an inclusive scan along an outer dimension of a tensor. + * + * - num_orows is the size of the flattened outer dimensions; + * - num_irows is the size of the flattened inner dimensions; + * - row_size is the size of the dimension along which to compute the variance; + * + * The dimensions to the outside and inside of the specified dimension are considered as flattened. + * Thread blocks with the same blockIdx.y process an "outer row" (i.e. an element of the flattened + * outer dimensions, which contains several "inner rows"). + * Each thread processes a single inner row at a time. 
+ */ +template +__global__ void tensor_kernel_scan_outer_dim_with_indices(const scalar_t *self_, scalar_t *values_, int64_t *indices_, + const uint32_t num_orows, const uint32_t num_irows, const uint32_t row_size, scalar_t init, BinaryFunction binary_op) { + for (uint32_t orow = blockIdx.x; orow < num_orows; orow += gridDim.x) { + for (uint32_t irow = blockIdx.y * blockDim.x + threadIdx.x; irow < num_irows; irow += gridDim.y * blockDim.x) { + const scalar_t *self = self_ + orow * row_size * num_irows + irow; + scalar_t *values = values_ + orow * row_size * num_irows + irow; + int64_t *indices = indices_ + orow * row_size * num_irows + irow; + scalar_t out = init; + int64_t out_idx = 0; + + for (auto col = decltype(row_size){0}; col < row_size; ++col) { + const auto val = c10::load(self); + if(at::_isnan(val) || (!at::_isnan(out) && binary_op(val, out))) { + out = val; + out_idx = col; + } + *values = out; + *indices = out_idx; + self += num_irows; + values += num_irows; + indices += num_irows; + } + } + } +} + +inline void check_fits_in_unsigned(int64_t val, const char* name) { + constexpr auto umax = std::numeric_limits::max(); + TORCH_CHECK( + val >= 0 && val <= umax, name, " must fit in a 32-bit uint32_t value"); +} + + +template +__host__ void scan_outer_dim_with_indices( + const TensorBase& self, const TensorBase& values, const TensorBase& indices, + int dim, scalar_t init, BinaryFunction binary_op) { + int64_t row_size = self.size(dim); + auto sizes = self.sizes(); + + // Treat all outer dimensions (i.e. dim_ < dim) as one. + const int64_t num_orows = c10::multiply_integers(sizes.begin(), sizes.begin() + dim); + + // Treat all inner dimensions (i.e. dim > dimension) as one. 
+ const int64_t num_irows = c10::multiply_integers(sizes.begin() + dim + 1, sizes.end()); + //for performance reasons, cuda kernels use uint32_t for loops over irows, orows and row, + //make sure that input is not bigger than supported by uint32_t + check_fits_in_unsigned(num_irows, "num_irows"); + check_fits_in_unsigned(num_orows, "num_orows"); + check_fits_in_unsigned(row_size, "row_size"); + + + dim3 threads(std::min(512, int(num_irows))); + int64_t maxGridDim = at::cuda::getCurrentDeviceProperties()->maxGridSize[1]; + dim3 grid(std::min(maxGridDim, num_orows), std::min(maxGridDim, ceil_div(num_irows, int64_t{threads.x}))); + tensor_kernel_scan_outer_dim_with_indices<<>>( + self.const_data_ptr(), values.mutable_data_ptr(), indices.mutable_data_ptr(), + num_orows, num_irows, row_size, init, binary_op); + C10_CUDA_KERNEL_LAUNCH_CHECK(); +} + +template +__host__ void scan_innermost_dim_with_indices( + const TensorBase& self, const TensorBase& values, const TensorBase& indices, + scalar_t init, BinaryFunction binary_op) { + int ndim = self.dim(); + // Treat all outer dimensions as a single dimension. 
+ int row_size = self.size(ndim - 1); + int num_rows = self.numel() / row_size; + + // assuming max_num_threads per block is 512 + const uint32_t num_threads = 512; + const uint32_t log_num_threads_x = get_log_num_threads_x_inner_scan(num_rows, row_size); + const uint32_t num_threads_x = (1 << log_num_threads_x); + const uint32_t num_threads_y = num_threads / num_threads_x; + dim3 threads(num_threads_x, num_threads_y); + dim3 grid(std::min(at::cuda::getCurrentDeviceProperties()->maxGridSize[0], ceil_div(num_rows, int(threads.y)))); + + const uint32_t mem_size = 2 * num_threads * (sizeof(scalar_t) + sizeof(int64_t)); + tensor_kernel_scan_innermost_dim_with_indices<<>>( + self.const_data_ptr(), values.mutable_data_ptr(), indices.mutable_data_ptr(), + num_rows, row_size, num_threads, log_num_threads_x, init, binary_op); + C10_CUDA_KERNEL_LAUNCH_CHECK(); +} + +template +void scan_dim_with_indices(const TensorBase& self, const TensorBase& values, const TensorBase& indices, //int64_t dim) { + int64_t dim, scalar_t init, BinaryFunction binary_op) { + int ndim = self.dim(); + auto self_ = self.expect_contiguous(); + TORCH_INTERNAL_ASSERT(values.is_contiguous() && indices.is_contiguous()); + if (dim == ndim - 1) { + scan_innermost_dim_with_indices(*self_, values, indices, init, binary_op); + } else { + scan_outer_dim_with_indices(*self_, values, indices, dim, init, binary_op); + } +} + +// TODO: The implementation of `tensor_kernel_scan_outer_dim` and +// `tensor_kernel_scan_innermost_dim` is similar to +// `tensor_kernel_scan_outer_dim_with_indices` +// `tensor_kernel_scan_outer_dim_with_indices` and should be refactored to +// remove the duplication. + +/* Perform an inclusive scan along an outer dimension of a tensor. 
+ * + * - num_orows is the size of the flattened outer dimensions; + * - num_irows is the size of the flattened inner dimensions; + * - row_size is the size of the dimension along which to scan; + * + * The dimensions to the outside and inside of the specified dimension are considered as flattened. + * Thread blocks with the same blockIdx.x process an "outer row" (i.e. an element of the flattened + * outer dimensions, which contains several "inner rows"). + * Each thread processes a single inner row at a time. + */ +template +__global__ void tensor_kernel_scan_outer_dim(scalar_t *tgt_, const scalar_t *src_, + const uint32_t num_orows, const uint32_t num_irows, const uint32_t row_size, + const scalar_t init, BinaryOp binary_op) +{ + for (uint32_t orow = blockIdx.x; orow < num_orows; orow += gridDim.x) { + for (uint32_t irow = blockIdx.y * blockDim.x + threadIdx.x; irow < num_irows; irow += gridDim.y * blockDim.x) { + const scalar_t *src = src_ + static_cast(orow) * row_size * num_irows + irow; + scalar_t *tgt = tgt_ + (index_t) orow * row_size * num_irows + irow; + scalar_t acc = init; + + for (uint32_t col = 0; col < row_size; ++col) { + acc = binary_op(acc, c10::load(src)); + *tgt = acc; + + src += num_irows; + tgt += num_irows; + } + } + } +} + +/* Perform an inclusive scan along the innermost dimension of a tensor. + * + * - num_rows is the size of the flattened outer dimensions; + * - row_size is the size of the innermost dimension; + * + * The outer dimensions of the tensor are considered as a single dimension, i.e. the tensor is + * considered as having 'num_rows' rows of size 'row_size'. + * Each thread block processes one or more sets of contiguous rows (processing multiple rows + * per thread block is quicker than processing a single row, especially for short rows). 
+ */ +template +__device__ void tensor_kernel_scan_innermost_dim_impl(T* row_buf, T *tgt_, const T *src_, + const uint32_t num_rows, const uint32_t row_size, + const uint32_t log_num_threads_x, + T init, BinaryFunction binary_op){ + const index_t num_threads_x = 1 << log_num_threads_x; + for (index_t block_row = blockIdx.x * (index_t) blockDim.y; + block_row < num_rows; + block_row += blockDim.y * gridDim.x) { + index_t row = block_row + (index_t) threadIdx.y; + T block_total = init; + + const T *row_src = src_ + row * row_size; + T *row_tgt = tgt_ + row * row_size; + const bool row_exists = row < num_rows; + + // Perform scan on one block at a time, keeping track of the total value of + // all blocks processed so far. + for (index_t block_col = 0; block_col < row_size; block_col += 2 * num_threads_x) { + // Load data into shared memory (two values per thread). + index_t col1 = block_col + (index_t) threadIdx.x; + index_t col2 = block_col + num_threads_x + (index_t) threadIdx.x; + if (row_exists) { + if (col1 < row_size) { + row_buf[threadIdx.x] = row_src[col1]; + } else { + row_buf[threadIdx.x] = init; + } + + if (col2 < row_size) { + row_buf[num_threads_x + threadIdx.x] = row_src[col2]; + } else { + row_buf[num_threads_x + threadIdx.x] = init; + } + + // Add the total value of all previous blocks to the first value of this block. + if (threadIdx.x == 0) { + row_buf[0] = binary_op(row_buf[0], block_total); + } + } + __syncthreads(); + + // Parallel reduction with Sklansky method. 
The diagram can be seen on this paper: + // https://research.nvidia.com/publication/single-pass-parallel-prefix-scan-decoupled-look-back + for (int m = 0; m <= log_num_threads_x; ++m) { + if (row_exists) { + index_t s = 1 << m; // s = 2 ^ m + auto a = static_cast((threadIdx.x >> m) << (m + 1)) | s; // a = (threadIdx.x / s) * (2 * s) + s + index_t ti = a + (threadIdx.x % s); + index_t si = a - 1; + row_buf[ti] = binary_op(row_buf[ti], row_buf[si]); + } + __syncthreads(); + } + + // Write back to output. + if (row_exists) { + if (col1 < row_size) row_tgt[col1] = row_buf[threadIdx.x]; + if (col2 < row_size) row_tgt[col2] = row_buf[num_threads_x + threadIdx.x]; + } + block_total = row_buf[2 * num_threads_x - 1]; + __syncthreads(); + } + } +} + +template < + typename T, + class BinaryFunction> +__global__ void tensor_kernel_scan_innermost_dim( + T* tgt_, + const T* src_, + const uint32_t num_rows, + const uint32_t row_size, + const uint32_t log_num_threads_x, + T init, + BinaryFunction binary_op) { + alignas(sizeof(double)) extern __shared__ char sbuf[]; + T* sbuf2 = reinterpret_cast(sbuf); + const uint32_t num_threads_x = 1 << log_num_threads_x; + T* row_buf = reinterpret_cast(sbuf2 + num_threads_x * 2 * threadIdx.y); + if (num_rows * (size_t) row_size <= UINT_MAX) { + tensor_kernel_scan_innermost_dim_impl( + row_buf, tgt_, src_, num_rows, row_size, log_num_threads_x, init, binary_op); + } else { + tensor_kernel_scan_innermost_dim_impl( + row_buf, tgt_, src_, num_rows, row_size, log_num_threads_x, init, binary_op); + } +} + + +template +__host__ void scan_outer_dim(const TensorBase& self, const TensorBase& result, + int dim, scalar_t init, BinaryFunction binary_op) { + const int64_t row_size = self.size(dim); + auto sizes = self.sizes(); + + // Treat all outer dimensions (i.e. dim_ < dim) as one. + const int64_t num_orows = c10::multiply_integers(sizes.begin(), sizes.begin() + dim); + + // Treat all inner dimensions (i.e. dim > dimension) as one. 
+ const int64_t num_irows = c10::multiply_integers(sizes.begin() + dim + 1, sizes.end()); + + dim3 threads(std::min(512, int(num_irows))); + int64_t maxGridDim = at::cuda::getCurrentDeviceProperties()->maxGridSize[1]; + dim3 grid(std::min(maxGridDim, num_orows), std::min(maxGridDim, ceil_div(num_irows, int64_t{threads.x}))); + + check_fits_in_unsigned(num_irows, "num_irows"); + check_fits_in_unsigned(num_orows, "num_orows"); + check_fits_in_unsigned(row_size, "row_size"); + if (static_cast(num_irows) * num_orows * row_size <= UINT_MAX) { + tensor_kernel_scan_outer_dim<<>>( + result.mutable_data_ptr(), self.const_data_ptr(), + num_orows, num_irows, row_size, init, binary_op); + } else { + tensor_kernel_scan_outer_dim<<>>( + result.mutable_data_ptr(), self.const_data_ptr(), + num_orows, num_irows, row_size, init, binary_op); + } + C10_CUDA_KERNEL_LAUNCH_CHECK(); +} + +template +void scan_innermost_dim(const TensorBase& self, const TensorBase& result, + scalar_t init, BinaryFunction binary_op) { + int64_t ndim = self.dim(); + // Treat all outer dimensions as a single dimension. 
+ int64_t row_size = self.size(ndim - 1); + int64_t num_rows = self.numel() / row_size; + + // assuming max_num_threads per block is 512 + const uint32_t num_threads = 512; + const uint32_t log_num_threads_x = get_log_num_threads_x_inner_scan(num_rows, row_size); + const uint32_t num_threads_x = (1 << log_num_threads_x); + const uint32_t num_threads_y = num_threads / num_threads_x; + dim3 threads(num_threads_x, num_threads_y); + int64_t maxGridDim = at::cuda::getCurrentDeviceProperties()->maxGridSize[0]; + dim3 grid(std::min(maxGridDim, ceil_div(num_rows, int64_t{threads.y}))); + + check_fits_in_unsigned(num_rows, "Number of rows (self.numel()/self.size(self.dim()-1))"); + check_fits_in_unsigned(row_size, "row_size"); + + tensor_kernel_scan_innermost_dim<<>>( + result.mutable_data_ptr(), self.const_data_ptr(), + num_rows, row_size, log_num_threads_x, init, binary_op); + C10_CUDA_KERNEL_LAUNCH_CHECK(); +} + +template +void scan_dim(const TensorBase& self, const TensorBase& result, + int64_t dim, scalar_t init, BinaryFunction binary_op) { + int ndim = self.dim(); + auto self_ = self.expect_contiguous(); + TORCH_INTERNAL_ASSERT(result.is_contiguous()); + + if (self.numel() == self.size(dim)) { + if constexpr (std::is_same_v>) { + if (C10_UNLIKELY(at::globalContext().deterministicAlgorithms()) && (self.is_floating_point() || self.is_complex())) { +#if defined(CUDA_VERSION) || defined(USE_ROCM) + cuda::cub::inclusive_deterministic_scan(self_->const_data_ptr(), result.mutable_data_ptr(), binary_op, self.numel()); +#else + globalContext().alertNotDeterministic("cumsum_cuda_kernel"); + cuda::cub::inclusive_scan(self_->const_data_ptr(), result.mutable_data_ptr(), binary_op, self.numel()); +#endif + } else { + cuda::cub::inclusive_scan(self_->const_data_ptr(), result.mutable_data_ptr(), binary_op, self.numel()); + } + } else { + cuda::cub::inclusive_scan(self_->const_data_ptr(), result.mutable_data_ptr(), binary_op, self.numel()); + } + } else if (dim == ndim - 1) { + 
scan_innermost_dim(*self_, result, init, binary_op); + } else { + scan_outer_dim(*self_, result, dim, init, binary_op); + } +} + +} // namespace at::native + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/cuda/Sort.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/cuda/Sort.h new file mode 100644 index 0000000000000000000000000000000000000000..bcee7d13a3ab300226ab07737fbe9f6aa6773d8b --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/cuda/Sort.h @@ -0,0 +1,22 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +#include +#include +#include + + +namespace at::native { + +inline bool should_use_small_sort(const TensorBase &self, int64_t dim) { + return self.size(dim) <= 4096; +} + +void sortKeyValueInplace( + const TensorBase &key, const TensorBase &value, int64_t dim, + bool descending, bool stable=false); + +} // namespace at::native + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/cuda/SortStable.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/cuda/SortStable.h new file mode 100644 index 0000000000000000000000000000000000000000..1c95d04432a28e8d9c15d30e00f93a5b7a1fb4ef --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/cuda/SortStable.h @@ -0,0 +1,22 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +#include +#include + +namespace at::native { + +// Stable-sort self into values, and set indices to the +// inverse-permutation from values back to self. +// Output tensors must be pre-allocated and contiguous. +void launch_stable_sort_kernel( + const TensorBase& self, + int64_t dim, + bool descending, + const TensorBase& values, + const TensorBase& indices); + +} // namespace at::native + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/cuda/SortUtils.cuh b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/cuda/SortUtils.cuh new file mode 100644 index 0000000000000000000000000000000000000000..f18c3c56fbddda53efba5c7b8c2065bf3c67229f --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/cuda/SortUtils.cuh @@ -0,0 +1,348 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +#include + +#include +#include +#include +#include +#include +#include +#include + +#define HAS_WARP_MERGE_SORT() (CUDA_VERSION >= 110600) + + +namespace at::native { + +template +__device__ inline void swapVars(T& t1, T& t2) { + T tmp = t1; + t1 = t2; + t2 = tmp; +} + +template +__device__ inline void bitonicSwap(K& kA, V& vA, bool& validA, + K& kB, V& vB, bool& validB, + bool dir, + const Comparator& comp) { + // Invalid entries always sort to the end + bool swap = (comp(kA, kB) && validA) || !validB; + if (swap == dir) { + swapVars(kA, kB); + swapVars(vA, vB); + swapVars(validA, validB); + } +}; + +template +__device__ inline void bitonicSort(K *keys, + V *values, + bool *valid, + const Comparator& comp) { +#if !defined(USE_ROCM) +#pragma unroll +#endif + for (unsigned int size = 2; size < Power2SortSize; size *= 2) { + bool flag = ((threadIdx.x & (size / 2)) != 0); + +#if !defined(USE_ROCM) +#pragma unroll +#endif + for (unsigned int stride = size / 2; stride > 0; stride /= 2) { + + __syncthreads(); + + unsigned int pos = 2 * threadIdx.x - (threadIdx.x & (stride - 1)); + bitonicSwap( + keys[pos], values[pos], valid[pos], + keys[pos + stride], values[pos + stride], valid[pos + stride], + flag, comp); + } + } + +#if !defined(USE_ROCM) +#pragma unroll +#endif + for (unsigned int stride = Power2SortSize / 2; stride > 0; stride /= 2) { + + 
__syncthreads(); + + unsigned int pos = 2 * threadIdx.x - (threadIdx.x & (stride - 1)); + bitonicSwap( + keys[pos], values[pos], valid[pos], + keys[pos + stride], values[pos + stride], valid[pos + stride], + false, comp); + } + + __syncthreads(); + +} + +// at::cuda::detail::TensorInfo version +// Sorts (key, value) pairs (in different tensors) in-place; i.e., +// modifies the input `keys` and `values` +template +C10_LAUNCH_BOUNDS_1(block_dim_x * max_block_dim_y) +__global__ void +bitonicSortKVInPlace(at::cuda::detail::TensorInfo keys, + IndexType keySlices, + IndexType keySliceSize, + IndexType keySliceStride, + at::cuda::detail::TensorInfo values, + IndexType valueSliceStride, + Comparator comp) { + // Find the slice of the tensor that we are sorting + // NOTE: blockDim.y may be less than max_block_dim_y + const IndexType blockIndex = getLinearBlockId(); + const IndexType linearIndex = blockIndex * blockDim.y + threadIdx.y; + + // If the entire block is out of bounds exit early + if (blockIndex * blockDim.y >= keySlices) { + return; + } + // It's also possible for some rows of a block to be out of bounds + // but all threads need to run for __syncthreads to work. 
+ const bool row_valid = linearIndex < keySlices; + + constexpr int items_per_thread = 2; + constexpr int Power2SortSize = block_dim_x * items_per_thread; + + // Storage for max_block_dim_y sorts performed in parallel + __shared__ K blockSharedKeys[max_block_dim_y][Power2SortSize]; + __shared__ V blockSharedValues[max_block_dim_y][Power2SortSize]; + __shared__ bool blockSharedValid[max_block_dim_y][Power2SortSize]; + + auto sharedKeys = blockSharedKeys[threadIdx.y]; + auto sharedValues = blockSharedValues[threadIdx.y]; + auto sharedValid = blockSharedValid[threadIdx.y]; + + const IndexType keyStartOffset = + at::cuda::detail::IndexToOffset::get(linearIndex, keys); + const IndexType valueStartOffset = + at::cuda::detail::IndexToOffset::get(linearIndex, values); + + // Load 2 values per thread into the shared workspace + #pragma unroll + for (int k = 0; k < items_per_thread; ++k) { + auto idx = threadIdx.x + k * blockDim.x; + bool valid = row_valid && idx < keySliceSize; + + sharedKeys[idx] = valid ? + keys.data[idx * keySliceStride + keyStartOffset] : K{}; + sharedValues[idx] = valid ? + values.data[idx * valueSliceStride + valueStartOffset] : V{}; + sharedValid[idx] = valid; + } + + // Sort! 
+ bitonicSort( + sharedKeys, sharedValues, sharedValid, comp); + + if (!row_valid) { + return; + } + + // Store outputs + #pragma unroll + for (int k = 0; k < items_per_thread; ++k) { + auto idx = threadIdx.x + k * blockDim.x; + if (idx < keySliceSize) { + keys.data[idx * keySliceStride + keyStartOffset] = sharedKeys[idx]; + values.data[idx * valueSliceStride + valueStartOffset] = sharedValues[idx]; + } + } +} + +#if HAS_WARP_MERGE_SORT() + +template +C10_LAUNCH_BOUNDS_1(C10_WARP_SIZE * max_block_dim_y) +__global__ void +warpMergeSortKVInPlace( + at::cuda::detail::TensorInfo keys, + IndexType keySlices, + IndexType keySliceSize, + IndexType keySliceStride, + at::cuda::detail::TensorInfo values, + IndexType valueSliceStride, + Comparator comp, + K invalid_key) { + // Find the slice of the tensor that we are sorting + // NOTE: blockDim.y may be less than max_block_dim_y + const IndexType blockIndex = getLinearBlockId(); + const IndexType linearIndex = blockIndex * blockDim.y + threadIdx.y; + + // If this row is out of bounds exit early + if (linearIndex >= keySlices) { + return; + } + + const IndexType keyStartOffset = + at::cuda::detail::IndexToOffset::get(linearIndex, keys); + const IndexType valueStartOffset = + at::cuda::detail::IndexToOffset::get(linearIndex, values); + + K *keys_slice = &keys.data[keyStartOffset]; + V *values_slice = &values.data[valueStartOffset]; + + StridedRandomAccessor keys_iter(keys_slice, keySliceStride); + StridedRandomAccessor values_iter(values_slice, valueSliceStride); + + namespace cub = ROCM_HIPCUB(at_cuda_detail::cub); + + CUDA_KERNEL_ASSERT(blockDim.x == C10_WARP_SIZE); + CUDA_KERNEL_ASSERT(blockDim.y <= max_block_dim_y); + constexpr int items_per_thread = sort_size / C10_WARP_SIZE; + static_assert( + items_per_thread * C10_WARP_SIZE == sort_size, + "sort_size must be a multiple of C10_WARP_SIZE"); + + + using LoadKeys = cub::WarpLoad; + using LoadValues = cub::WarpLoad; + using Sort = cub::WarpMergeSort; + using StoreKeys = 
cub::WarpStore; + using StoreValues = cub::WarpStore; + + __shared__ union { + typename LoadKeys::TempStorage load_keys; + typename LoadValues::TempStorage load_values; + typename Sort::TempStorage sort; + typename StoreKeys::TempStorage store_keys; + typename StoreValues::TempStorage store_values; + } tmp_storage[max_block_dim_y]; + + auto& warp_storage = tmp_storage[threadIdx.y]; + + // Load inputs + K local_keys[items_per_thread]; + V local_values[items_per_thread]; + + const auto invalid_value = V{}; + LoadKeys(warp_storage.load_keys).Load(keys_iter, local_keys, keySliceSize, invalid_key); + WARP_SYNC(); + LoadValues(warp_storage.load_values).Load(values_iter, local_values, keySliceSize, invalid_value); + WARP_SYNC(); + + // Sort! We use stable sort to ensure that invalid values are never + // sorted before valid values. In testing it performed the same as + // .Sort, so there is no down-side. + Sort(warp_storage.sort).StableSort( + local_keys, local_values, comp, keySliceSize, invalid_key); + WARP_SYNC(); + + // Store outputs + StoreKeys(warp_storage.store_keys).Store(keys_iter, local_keys, keySliceSize); + WARP_SYNC(); + StoreValues(warp_storage.store_values).Store(values_iter, local_values, keySliceSize); +} + +#endif // HAS_WARP_MERGE_SORT() + +template +C10_LAUNCH_BOUNDS_1(block_size) +__global__ void +radixSortKVInPlace(at::cuda::detail::TensorInfo keys, + IndexType keySlices, + IndexType keySliceSize, + IndexType keySliceStride, + at::cuda::detail::TensorInfo values, + IndexType valueSliceStride, + bool descending) { + static_assert(block_size > 0, ""); + + // Find the slice of the tensor that we are sorting + const IndexType linearIndex = getLinearBlockId(); + // Tiling the slices could have us be out of bounds, if there are a + // lot of slices to sort + if (linearIndex >= keySlices) { + return; + } + + const IndexType keyStartOffset = + at::cuda::detail::IndexToOffset::get(linearIndex, keys); + const IndexType valueStartOffset = + 
at::cuda::detail::IndexToOffset::get(linearIndex, values); + + K *keys_slice = &keys.data[keyStartOffset]; + V *values_slice = &values.data[valueStartOffset]; + + StridedRandomAccessor keys_iter(keys_slice, keySliceStride); + StridedRandomAccessor values_iter(values_slice, valueSliceStride); + + namespace cub = ROCM_HIPCUB(at_cuda_detail::cub); + + using key_t = typename at::cuda::cub::detail::cuda_type::type; + using LoadKeys = cub::BlockLoad; + using LoadValues = cub::BlockLoad; + using Sort = cub::BlockRadixSort; + using StoreKeys = cub::BlockStore; + using StoreValues = cub::BlockStore; + + __shared__ union { + typename LoadKeys::TempStorage load_keys; + typename LoadValues::TempStorage load_values; + typename Sort::TempStorage sort; + typename StoreKeys::TempStorage store_keys; + typename StoreValues::TempStorage store_values; + } tmp_storage; + + // cub's Block operations operate on a fixed number of items, but the + // actual slice we are sorting might be smaller. So, we need to make + // up the difference with keys that will always sort higher. + const K invalid_key = [descending] { + using radix_t = typename cub::Traits::UnsignedBits; + union { + K key; + radix_t radix; + } tmp; + tmp.radix = descending ? + cub::Traits::LOWEST_KEY : + cub::Traits::MAX_KEY; + return tmp.key; + }(); + const V invalid_value = static_cast(0); + + // Load inputs + K local_keys[items_per_thread]; + V local_values[items_per_thread]; + + LoadKeys(tmp_storage.load_keys).Load(keys_iter, local_keys, keySliceSize, invalid_key); + __syncthreads(); + LoadValues(tmp_storage.load_values).Load(values_iter, local_values, keySliceSize, invalid_value); + __syncthreads(); + + // Sort! 
+ if (descending) { + Sort(tmp_storage.sort).SortDescending( + reinterpret_cast(local_keys), + local_values); + } else { + Sort(tmp_storage.sort).Sort( + reinterpret_cast(local_keys), + local_values); + } + __syncthreads(); + + // Store outputs + StoreKeys(tmp_storage.store_keys).Store(keys_iter, local_keys, keySliceSize); + __syncthreads(); + StoreValues(tmp_storage.store_values).Store(values_iter, local_values, keySliceSize); +} + +} // namespace at::native + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/cuda/SortingCommon.cuh b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/cuda/SortingCommon.cuh new file mode 100644 index 0000000000000000000000000000000000000000..c80afbfdef4593be97846b2d35da1bd622ade0a6 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/cuda/SortingCommon.cuh @@ -0,0 +1,198 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +#include +#include +#include +#include +#include +#include +#include + +namespace at::native { + +// Is this questionable namespace pollution? +#if defined(USE_ROCM) +constexpr int MAX_BLOCK_SIZE = 256; + +#else +constexpr int MAX_BLOCK_SIZE = 1024; +#endif + +// Maximum size per grid dimension that we assume (compute capability >= 2.0) +constexpr int64_t MAX_GRID_SIZE = 65535LL; + +inline bool getGridFromTiles(int64_t gridTiles, dim3& grid) { + if (gridTiles > MAX_GRID_SIZE * MAX_GRID_SIZE * MAX_GRID_SIZE) { + return false; + } + + int64_t gridX = gridTiles > MAX_GRID_SIZE ? 
MAX_GRID_SIZE : gridTiles; + int64_t gridY = 1; + int64_t gridZ = 1; + + if (gridTiles > MAX_GRID_SIZE) { + gridTiles = ceil_div(gridTiles, MAX_GRID_SIZE); + gridY = gridTiles > MAX_GRID_SIZE ? MAX_GRID_SIZE : gridTiles; + + if (gridTiles > MAX_GRID_SIZE) { + gridTiles = ceil_div(gridTiles, MAX_GRID_SIZE); + gridZ = gridTiles > MAX_GRID_SIZE ? MAX_GRID_SIZE : gridTiles; + } + } + + grid = dim3(gridX, gridY, gridZ); + return true; +} + +template +struct GTOp { + __device__ bool operator()(const scalar_t& lhs, const scalar_t& rhs) const { + return (handleNaN && at::_isnan(lhs) && !at::_isnan(rhs)) || + (static_cast(lhs) > static_cast(rhs)); + } +}; + +template +struct LTOp { + __device__ bool operator()(const scalar_t& lhs, const scalar_t& rhs) const { + return (handleNaN && at::_isnan(rhs) && !at::_isnan(lhs)) || + (static_cast(lhs) < static_cast(rhs)); + } +}; + +template +__device__ __forceinline__ index_t getLinearBlockId() { + return blockIdx.z * gridDim.y * gridDim.x + blockIdx.y * gridDim.x + + blockIdx.x; +} + +// For slice sorting in Thrust; extracts a slice index from a linear +// index and uses that for comparison +struct SliceComp { + SliceComp(int64_t size) : sliceSize(size) {} + + __device__ bool operator()(const int64_t& a, const int64_t& b) const { + // Since the slices are guaranteed to be innermost, + // the segment is just via int64_t division + int64_t segA = a / sliceSize; + int64_t segB = b / sliceSize; + return segA < segB; + } + + const int64_t sliceSize; +}; + +// For sorting in Thrust; extracts a within-slice index from a linear index +struct GlobalIndexToPerSliceIndex { + GlobalIndexToPerSliceIndex(int64_t size) : sliceSize(size) {} + + __device__ inline void operator()(int64_t& v) const { + v = v % sliceSize; + } + + const int64_t sliceSize; +}; + +// Returns 2^(ceil(lg(n))) from Stanford bit twiddling hacks +inline uint64_t nextHighestPowerOf2(uint64_t n) { + n--; + n |= n >> 1; + n |= n >> 2; + n |= n >> 4; + n |= n >> 8; + n |= n >> 16; 
+#ifndef _MSC_VER + n |= n >> 32; +#endif + n++; + + return n; +} + + +// WARNING: This function assumes input tensors are contiguous +template +void run_launcher( + const TensorBase &values, + const TensorBase &indices, + const TensorBase &self, + int64_t dim, + Launcher l) { + auto self_info = cuda::detail::getTensorInfo(self); + auto values_info = cuda::detail::getTensorInfo(values); + auto indices_info = cuda::detail::getTensorInfo(indices); + + int64_t slice_size = self.size(dim); + /* We use these structures solely to find the offset to */ + /* each slice we are operating on */ + self_info.reduceDim(dim); + values_info.reduceDim(dim); + indices_info.reduceDim(dim); + + /* Collapse all other dims */ + int collapse_self_dim = self_info.collapseDims(dim); + int collapse_values_dim = values_info.collapseDims(dim); + int collapse_indices_dim = indices_info.collapseDims(dim); + + int64_t num_slices = 1; + for (int i = 0; i < self_info.dims; ++i) { + num_slices *= self_info.sizes[i]; + } + + /* This is used as a template parameter to calculate indices. 
*/ + /* We only specialize it if all collapsed dim sizes are the */ + /* same; otherwise, we use -1 which is the specialization */ + /* parameter for arbitrary dimensions */ + int all_dims = self_info.dims; + if (values_info.dims != all_dims || indices_info.dims != all_dims) { + all_dims = -1; + } + + if (all_dims == 1) { + l.template launch( + values_info, + collapse_values_dim, + indices_info, + collapse_indices_dim, + self_info, + collapse_self_dim, + num_slices, + slice_size); + } else if (all_dims == 2) { + l.template launch( + values_info, + collapse_values_dim, + indices_info, + collapse_indices_dim, + self_info, + collapse_self_dim, + num_slices, + slice_size); + } else if (all_dims == 3) { + l.template launch( + values_info, + collapse_values_dim, + indices_info, + collapse_indices_dim, + self_info, + collapse_self_dim, + num_slices, + slice_size); + } else { + l.template launch( + values_info, + collapse_values_dim, + indices_info, + collapse_indices_dim, + self_info, + collapse_self_dim, + num_slices, + slice_size); + } +} + +} // namespace at::native + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/cuda/TensorTopK.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/cuda/TensorTopK.h new file mode 100644 index 0000000000000000000000000000000000000000..01ae160f962d24e622663d1ed16489ed147c6c44 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/cuda/TensorTopK.h @@ -0,0 +1,18 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +#include + +namespace at { +class TensorBase; +} + +namespace at::native { +void launch_gather_topk_kernel( + const TensorBase& self, + int64_t k, int64_t dim, bool largest, + const TensorBase& values, const TensorBase& indices); +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/cuda/UpSample.cuh b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/cuda/UpSample.cuh new file mode 100644 index 0000000000000000000000000000000000000000..ca43947121b4f085038d2fc652786df8fd31ba79 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/cuda/UpSample.cuh @@ -0,0 +1,373 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +#include +#include + +#include +#include +#include + +#include +#include + +namespace at::native { + +namespace upsample { +// TODO: Remove duplicate declaration. +TORCH_API c10::SmallVector compute_output_size( + c10::IntArrayRef input_size, // Full input tensor size. 
+ at::OptionalIntArrayRef output_size, + std::optional> scale_factors); +} // namespace upsample + +namespace upsample_cuda { + +// TODO: Remove duplication with Upsample.h (CPU). +inline std::optional get_scale_value(std::optional> scales, int idx) { + if (!scales) { + return std::nullopt; + } + return scales->at(idx); +} + +} // namespace upsample_cuda + + +/* TODO: move this to a common place */ +template +__device__ inline scalar_t min(scalar_t a, scalar_t b) { + return a < b ? a : b; +} + +template +__device__ inline scalar_t max(scalar_t a, scalar_t b) { + return a > b ? a : b; +} + +// NOTE [ Nearest neighbor upsampling kernel implementation ] +// +// The nearest neighbor upsampling kernel implementation is symmetrical as +// expected. We launch kernels with threads mapping to destination tensors where +// kernels write data to, each thread reads data from the source tensor, this +// means: +// 1. In the forward kernel, +// src_xxx refers to properties of input tensors; +// dst_xxx refers to properties of output tensors; +// scale_factor is the ratio of src_size to dst_size; +// 2. In the backward kernel, +// src_xxx refers to properties of grad_output tensors; +// dst_xxx refers to properties of grad_input tensors; +// scale_factor is the ratio of src_size to dst_size; +// +// Because of this, we need to take the reciprocal of the scale defined by +// upsample layer during forward path. The motivation is to avoid slow +// division in the kernel code, so we can use faster multiplication instead. +// This is not necessary during backward path, since the scale_factor is already +// the reciprocal of corresponding scale_factor used in the forward path due to +// the swap of source and destination tensor. +// +// Similarly, since the mapping from grad_input to grad_output during backward +// is the reverse of the mapping of output to input, we need to have opposite +// mapping functions to compute the source index. 
+ +// see NOTE [ Nearest neighbor upsampling kernel implementation ] +template +__host__ __forceinline__ accscalar_t compute_scales_value( + const std::optional scale, + int64_t src_size, + int64_t dst_size) { + // FIXME: remove magic > 0 after we ensure no models were serialized with -1 defaults. + return (scale.has_value() && scale.value() > 0.) ? (accscalar_t)(1.0 / scale.value()) + : (accscalar_t)src_size / dst_size; +} + +// see NOTE [ Nearest neighbor upsampling kernel implementation ] +template +__host__ __forceinline__ accscalar_t compute_scales_value_backwards( + const std::optional scale, + int64_t src_size, + int64_t dst_size) { + // FIXME: remove magic > 0 after we ensure no models were serialized with -1 defaults. + return (scale.has_value() && scale.value() > 0.) ? (accscalar_t)scale.value() + : (accscalar_t)src_size / dst_size; +} + +template +__host__ __forceinline__ accscalar_t area_pixel_compute_scale( + int input_size, + int output_size, + bool align_corners, + const std::optional scale) { + if(align_corners) { + if(output_size > 1) { + return (accscalar_t)(input_size - 1) / (output_size - 1); + } + else { + return static_cast(0); + } + } + else{ + return compute_scales_value(scale, input_size, output_size); + } +} + +template +__device__ __forceinline__ accscalar_t area_pixel_compute_source_index( + accscalar_t scale, + int dst_index, + bool align_corners, + bool cubic) { + if (align_corners) { + return scale * dst_index; + } else { + accscalar_t src_idx = scale * (dst_index + static_cast(0.5)) - + static_cast(0.5); + // See Note[Follow Opencv resize logic] + return (!cubic && src_idx < static_cast(0)) + ? 
static_cast(0) + : src_idx; + } +} + +// see NOTE [ Nearest neighbor upsampling kernel implementation ] +__device__ __forceinline__ int nearest_neighbor_compute_source_index( + const float scale, + int dst_index, + int input_size) { + // index_f32 = (output_index) * scale + // input_index = round(index_f32) + // Same as a buggy OpenCV INTER_NEAREST + // We keep this method for BC and consider as deprecated. + // See nearest_neighbor_exact_compute_source_index as replacement + const int src_index = + min(static_cast(floorf((dst_index) * scale)), input_size - 1); + return src_index; +} + +__device__ __forceinline__ int nearest_neighbor_exact_compute_source_index( + const float scale, + int dst_index, + int input_size) { + // index_f32 = (output_index + 0.5) * scale - 0.5 + // input_index = round(index_f32) + // Same as Pillow and Scikit-Image/Scipy ndi.zoom + const int src_index = + min(static_cast(floorf((dst_index + static_cast(0.5)) * scale)), input_size - 1); + return src_index; +} + +// see NOTE [ Nearest neighbor upsampling kernel implementation ] +__device__ __forceinline__ int nearest_neighbor_bw_compute_source_index( + const float scale, + int dst_index, + int output_size) { + // Equivalent to buggy OpenCV INTER_NEAREST + // We keep this method for BC and consider as deprecated. 
+ // See nearest_neighbor_exact_bw_compute_source_index as replacement + const int src_index = + min(static_cast(ceilf(dst_index * scale)), output_size); + return src_index; +} + +// see NOTE [ Nearest neighbor upsampling kernel implementation ] +__device__ __forceinline__ int nearest_neighbor_exact_bw_compute_source_index( + const float scale, + int dst_index, + int output_size) { + // Equivalent to Pillow and Scikit-Image/Scipy ndi.zoom + const int src_index = + min(static_cast(ceilf(dst_index * scale - static_cast(0.5))), output_size); + return src_index; +} + +/* Used by UpSampleBicubic2d.cu */ +template +__device__ __forceinline__ scalar_t upsample_get_value_bounded( + const PackedTensorAccessor64& data, + int batch, + int channel, + int height, + int width, + int y, + int x) { + int access_y = max(min(y, height - 1), 0); + int access_x = max(min(x, width - 1), 0); + return data[batch][channel][access_y][access_x]; +} + +/* Used by UpSampleBicubic2d.cu */ +template +__device__ __forceinline__ void upsample_increment_value_bounded( + PackedTensorAccessor64& data, + int batch, + int channel, + int height, + int width, + int y, + int x, + accscalar_t value) { + int access_y = max(min(y, height - 1), 0); + int access_x = max(min(x, width - 1), 0); + /* TODO: result here is truncated to scalar_t, + check: https://github.com/pytorch/pytorch/pull/19630#discussion_r281426912 + */ + gpuAtomicAddNoReturn( + &data[batch][channel][access_y][access_x], static_cast(value)); +} + +// Based on +// https://en.wikipedia.org/wiki/Bicubic_interpolation#Bicubic_convolution_algorithm +template +__device__ __forceinline__ accscalar_t cubic_convolution1( + accscalar_t x, + accscalar_t A) { + return ((A + 2) * x - (A + 3)) * x * x + 1; +} + +template +__device__ __forceinline__ accscalar_t cubic_convolution2( + accscalar_t x, + accscalar_t A) { + return ((A * x - 5 * A) * x + 8 * A) * x - 4 * A; +} + +template +__device__ __forceinline__ void get_cubic_upsampling_coefficients( + 
accscalar_t coeffs[4], + accscalar_t t) { + accscalar_t A = -0.75; + + accscalar_t x1 = t; + coeffs[0] = cubic_convolution2(x1 + 1.0, A); + coeffs[1] = cubic_convolution1(x1, A); + + // opposite coefficients + accscalar_t x2 = 1.0 - t; + coeffs[2] = cubic_convolution1(x2, A); + coeffs[3] = cubic_convolution2(x2 + 1.0, A); +} + +template +__device__ __forceinline__ accscalar_t cubic_interp1d( + scalar_t x0, + scalar_t x1, + scalar_t x2, + scalar_t x3, + accscalar_t t) { + accscalar_t coeffs[4]; + get_cubic_upsampling_coefficients(coeffs, t); + + return x0 * coeffs[0] + x1 * coeffs[1] + x2 * coeffs[2] + x3 * coeffs[3]; +} + +namespace upsample_antialias { + +// taken from +// https://github.com/python-pillow/Pillow/blob/6812205f18ca4ef54372e87e1a13ce4a859434df/ +// src/libImaging/Resample.c#L20-L29 +struct BilinearFilterFunctor { + + template + __device__ accscalar_t operator()(accscalar_t x) const { + if (x < 0) { + x = -x; + } + if (x < 1) { + return 1 - x; + } + return 0; + } + + static constexpr int size = 2; +}; + +// taken from +// https://github.com/python-pillow/Pillow/blob/6812205f18ca4ef54372e87e1a13ce4a859434df/ +// src/libImaging/Resample.c#L46-L62 +struct BicubicFilterFunctor { + + template + __device__ accscalar_t operator()(accscalar_t x) const { + // https://en.wikipedia.org/wiki/Bicubic_interpolation#Bicubic_convolution_algorithm + const accscalar_t a = -0.5; + if (x < 0) { + x = -x; + } + if (x < 1) { + return ((a + 2) * x - (a + 3)) * x * x + 1; + } + if (x < 2) { + return (((x - 5) * x + 8) * x - 4) * a; + } + return 0; + } + + static constexpr int size = 4; +}; + +template +__device__ __forceinline__ void _compute_weights_span( + const int i, + const int input_size, + const accscalar_t scale, + const accscalar_t support, + int& xmin, + int& xsize, + accscalar_t& center) { + center = scale * (i + static_cast(0.5)); + xmin = max(static_cast(center - support + static_cast(0.5)), static_cast(0)); + xsize = min(static_cast(center + support + 
static_cast(0.5)), input_size) - xmin; +} + +template +__device__ __forceinline__ void _compute_weights( + scalar_t* wt_ptr, + const accscalar_t scale, + int interp_size, + const interp_filter_t& interp_filter, + accscalar_t xmin_m_center, + int xsize) { + + accscalar_t invscale = (scale >= 1.0) ? 1.0 / scale : 1.0; + accscalar_t total_w = 0.0; + int j = 0; + for (j = 0; j < xsize; j++) { + accscalar_t w = interp_filter((j + xmin_m_center + static_cast(0.5)) * invscale); + wt_ptr[j] = static_cast(w); + total_w += w; + } + for (j = 0; j < xsize; j++) { + if (total_w != 0.0) { + wt_ptr[j] /= total_w; + } + } + for (; j < interp_size; j++) { + wt_ptr[j] = static_cast(0.0); + } +} + +template +__device__ __forceinline__ accscalar_t interpolate_aa_single_dim( + const scalar_t* src, + const scalar_t* weights, + int size) { + scalar_t t = static_cast(*src); + scalar_t wts = static_cast(weights[0]); + accscalar_t output = t * wts; + + int j = 1; + for (; j < size; j++) { + wts = static_cast(weights[j]); + t = static_cast(*(src + j)); + output += t * wts; + } + return output; +} + +} + +} // namespace at::native + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/cuda/block_reduce.cuh b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/cuda/block_reduce.cuh new file mode 100644 index 0000000000000000000000000000000000000000..781567459af04a897b25217bdc5e8096c26a0190 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/cuda/block_reduce.cuh @@ -0,0 +1,152 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include +#include + +namespace at::native::cuda_utils { + +constexpr int kCUDABlockReduceNumThreads = 512; +// Algorithmic limitation: BlockReduce does two WarpReduce calls, each +// of which reduces C10_WARP_SIZE elements. So, at most +// C10_WARP_SIZE**2 elements can be reduced at a time. +// NOTE: This is >= the max block size on current hardware anyway (1024). +// ROCm NOTE: C10_WARP_SIZE should only be used inside device functions, +// and kCUDABlockReduceMaxThreads is a host-side variable. +#ifdef USE_ROCM +static int kCUDABlockReduceMaxThreads() { + return at::cuda::warp_size() * at::cuda::warp_size(); +} +#else +constexpr int kCUDABlockReduceMaxThreads() { + return C10_WARP_SIZE * C10_WARP_SIZE; +} +#endif + +// Sums `val` across all threads in a warp. +// +// Assumptions: +// - The size of each block should be a multiple of `C10_WARP_SIZE` +template +__inline__ __device__ T WarpReduceSum(T val) { +#pragma unroll + for (int offset = (C10_WARP_SIZE >> 1); offset > 0; offset >>= 1) { + val += WARP_SHFL_DOWN(val, offset); + } + return val; +} + +// Picks the maximum `val` across all threads in a warp. 
+// +// Assumptions: +// - The size of each block should be a multiple of `C10_WARP_SIZE` +template +__inline__ __device__ T WarpReduceMax(T val) { +#pragma unroll + for (int offset = (C10_WARP_SIZE >> 1); offset > 0; offset >>= 1) { + val = max_propagate_nan(val, WARP_SHFL_DOWN(val, offset)); + } + return val; +} + +struct Block1D { + static __forceinline__ __device__ int Tid() { return threadIdx.x; } + + static __forceinline__ __device__ int Warps() { + return blockDim.x / C10_WARP_SIZE; + } +}; + +struct Block2D { + static __forceinline__ __device__ int Tid() { + return threadIdx.x + threadIdx.y * blockDim.x; + } + + static __forceinline__ __device__ int Warps() { + return blockDim.x * blockDim.y / C10_WARP_SIZE; + } +}; + +// Sums `val` across all threads in a block. +// +// Warning: the return value is only valid for thread 0. +// Assumptions: +// - The size of each block should be a multiple of `C10_WARP_SIZE` +// - `shared` should be a pointer to shared memory with size of, at least, +// `sizeof(T) * number_of_warps` +template +__inline__ __device__ T BlockReduceSum(T val, T* shared) { + const int tid = B::Tid(); + const int lid = tid % C10_WARP_SIZE; + const int wid = tid / C10_WARP_SIZE; + val = WarpReduceSum(val); + __syncthreads(); // prevent races when BlockReduces are called in a row. + if (lid == 0) { + shared[wid] = val; + } + __syncthreads(); + val = (tid < B::Warps()) ? shared[lid] : T(0); + if (wid == 0) { + val = WarpReduceSum(val); + } + return val; +} + +// Picks out the maximum `val` across all threads in a block. +// +// Warning: the return value is only valid for thread 0. 
+// Assumptions: +// - The size of each block should be a multiple of `C10_WARP_SIZE` +// - `shared` should be a pointer to shared memory with size of, at least, +// `sizeof(T) * number_of_warps` +template +__inline__ __device__ T BlockReduceMax(T val, T* shared) { + const int tid = B::Tid(); + const int lid = tid % C10_WARP_SIZE; + const int wid = tid / C10_WARP_SIZE; + val = WarpReduceMax(val); + __syncthreads(); // prevent races when BlockReduces are called in a row. + if (lid == 0) { + shared[wid] = val; + } + __syncthreads(); + val = (tid < B::Warps()) ? shared[lid] : T(std::numeric_limits::lowest()); + if (wid == 0) { + val = WarpReduceMax(val); + } + return val; +} + +template +__inline__ __device__ T WarpReduce(T val, const ReduceOp& op) { +#pragma unroll + for (int offset = (C10_WARP_SIZE >> 1); offset > 0; offset >>= 1) { + val = op.combine(val, op.warp_shfl_down(val, offset)); + } + return val; +} + +template +__inline__ __device__ T +BlockReduce(T val, const ReduceOp& op, const T& identity_element, T* shared) { + const int tid = B::Tid(); + const int lid = tid % C10_WARP_SIZE; + const int wid = tid / C10_WARP_SIZE; + val = WarpReduce(val, op); + __syncthreads(); // prevent races when BlockReduces are called in a row. + if (lid == 0) { + shared[wid] = val; + } + __syncthreads(); + val = (tid < B::Warps()) ? shared[lid] : identity_element; + if (wid == 0) { + val = WarpReduce(val, op); + } + return val; +} + +} // namespace at::native::cuda_utils + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/cuda/cuBlasCommonArgs.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/cuda/cuBlasCommonArgs.h new file mode 100644 index 0000000000000000000000000000000000000000..196cdbfb62bde97e8dc8dc577d7a31062b6d71ae --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/cuda/cuBlasCommonArgs.h @@ -0,0 +1,176 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include + +namespace at::native { + +using at::blas::ScalingType; +using at::blas::SwizzleType; + +namespace { + +// TODO: https://github.com/pytorch/pytorch/pull/59380#pullrequestreview-725310492 +c10::MaybeOwned inline resolve_conj_if_indicated(const Tensor& tensor, bool resolve_conj) { + if (resolve_conj && tensor.is_conj()) { + return c10::MaybeOwned::owned(tensor.resolve_conj()); + } else { + return c10::MaybeOwned::borrowed(tensor); + } +} + +c10::MaybeOwned inline prepare_matrix_for_cublas(const Tensor& tensor, bool& transpose_tensor, bool transpose_result) { + if (tensor.is_non_overlapping_and_dense()) { // common case + transpose_tensor = tensor.is_contiguous(); + return resolve_conj_if_indicated(tensor, transpose_result ? 
transpose_tensor : !transpose_tensor); + } + IntArrayRef tensor_strides = tensor.strides(); + IntArrayRef tensor_sizes = tensor.sizes(); + if ((tensor_strides[0] == 1) && (tensor_strides[1] >= std::max(1, tensor_sizes[0]))) { + transpose_tensor = false; + return resolve_conj_if_indicated(tensor, !transpose_result); + } else if ((tensor_strides[1] == 1) && (tensor_strides[0] >= std::max(1, tensor_sizes[1]))) { + transpose_tensor = true; + return resolve_conj_if_indicated(tensor, transpose_result); + } else { + transpose_tensor = true; + return c10::MaybeOwned::owned(tensor.clone(at::MemoryFormat::Contiguous)); + } +} + +c10::MaybeOwned inline prepare_matrix_for_cublas(const Tensor& tensor, bool& transpose_tensor) { + if (tensor.is_non_overlapping_and_dense()) { // common case + transpose_tensor = tensor.is_contiguous(); + return resolve_conj_if_indicated(tensor, true); + } + + IntArrayRef tensor_strides = tensor.strides(); + IntArrayRef tensor_sizes = tensor.sizes(); + if ((tensor_strides[0] == 1) && (tensor_strides[1] >= std::max(1, tensor_sizes[0]))) { + transpose_tensor = false; + return resolve_conj_if_indicated(tensor, true); + } else if ((tensor_strides[1] == 1) && (tensor_strides[0] >= std::max(1, tensor_sizes[1]))) { + transpose_tensor = true; + return resolve_conj_if_indicated(tensor, true); + } else { + transpose_tensor = true; + return c10::MaybeOwned::owned(tensor.clone(at::MemoryFormat::Contiguous)); + } +} + +} // namespace + +/** + * @brief Prepares matrices for CUBLAS operation + * + * This constructor prepares tensors for CUBLAS + * The main difference is that PyTorch uses row-major as the default and + * CUBLAS expects column-major. + * + * @details + * To enable row-major output while using CUBLAS, + * we use the mathematical identity that (A × B)^T = B^T × A^T. 
+ * + * Transpose in this context refers to Cublas's(Fortran) definition of transpose (row-major) + * T = row-major, N = col-major + * + * Example: + * For matrices A (M×K)(row-major) and B (K×N)(row-major): + * - Standard multiplication: A × B = (M×K) × (K×N) = M×N result (row-major) + * - Using our transpose trick: (B^T × A^T) = (N×K)(T) × (K×M)(T) = N×M(N) + * - However, since the output form cublas is column-major this is + * - equivalent to an output of size MxN row-major as expected + * + * The transpose flags are derived from the layouts of the passed in tensors + * + * If the operands are in packed float4 format, `k`, `lda` and `ldb` are adjusted + * to their unpacked values to match what cuBLAS expects. + * + * @param mat1 First input matrix + * @param mat2 Second input matrix + * @param c Output matrix (result) + * @param scale_a Optional scaling factor for first matrix + * @param scale_b Optional scaling factor for second matrix + * @param scale_result Optional scaling factor for result + */ +struct cublasCommonArgs { + cublasCommonArgs( + const Tensor& mat1, + const Tensor& mat2, + Tensor& c, + const std::optional& scale_a = std::nullopt, + const std::optional& scale_b = std::nullopt, + const std::optional& scale_result = std::nullopt, + const std::optional& scaling_choice_a = std::nullopt, + const std::optional& scaling_choice_b = std::nullopt) { + bool transpose_result = false, transpose_a = false, transpose_b = false; + result = prepare_matrix_for_cublas(c, transpose_result); + mata = prepare_matrix_for_cublas(transpose_result ? mat2 : mat1, transpose_a, transpose_result); + matb = prepare_matrix_for_cublas(transpose_result ? mat1 : mat2, transpose_b, transpose_result); + + // Handle scale tensors if provided + if (scale_a && scale_b) { + // By default since we return in row-major we run the gemm + // as B.T @ A.T, check transpose_result to determine if we flip the scales + scale_mata_ptr = transpose_result ? 
scale_b->data_ptr() : scale_a->data_ptr(); + scale_mata_dtype = transpose_result ? scale_b->scalar_type() : scale_a->scalar_type(); + scaling_mata_type = transpose_result ? scaling_choice_b : scaling_choice_a; + scale_matb_ptr = transpose_result ? scale_a->data_ptr() : scale_b->data_ptr(); + scale_matb_dtype = transpose_result ? scale_a->scalar_type() : scale_b->scalar_type(); + scaling_matb_type = transpose_result ? scaling_choice_a : scaling_choice_b; + } + + if (scale_result) { + scale_result_ptr = scale_result->data_ptr(); + scale_result_dtype = scale_result->scalar_type(); + } + + // Update transpose flags + if (transpose_result) { + transpose_a = !transpose_a; + transpose_b = !transpose_b; + } + + auto sizes_a = mata->sizes(); + auto sizes_b = matb->sizes(); + + m = sizes_a[transpose_result ? 1 : 0]; + k = sizes_a[transpose_result ? 0 : 1]; + n = sizes_b[transpose_result ? 0 : 1]; + lda = mata->stride((transpose_a == transpose_result) ? 1 : 0); + ldb = matb->stride((transpose_b == transpose_result) ? 1 : 0); + result_ld = result->stride(transpose_result ? 0 : 1); + transa = transpose_a ? mata->is_conj() ? 'c' : 't' : 'n'; + transb = transpose_b ? matb->is_conj() ? 
'c' : 't' : 'n'; + + // cuBLAS expects unpacked values of `k`, `lda` and `ldb`, adjust for 4x2 packing + // if the gemm operands are in packed float4 + if (mat1.dtype() == at::kFloat4_e2m1fn_x2 && mat2.dtype() == at::kFloat4_e2m1fn_x2) { + k = k * 2; + lda = lda * 2; + ldb = ldb * 2; + } + } + + // Matrix members + char transa, transb; + int64_t m, n, k; + int64_t lda, ldb, result_ld; + c10::MaybeOwned mata, matb, result; + + // Scale members + void* scale_mata_ptr = nullptr; + void* scale_matb_ptr = nullptr; + void* scale_result_ptr = nullptr; + std::optional scale_mata_dtype; + std::optional scaling_mata_type; + std::optional scale_matb_dtype; + std::optional scaling_matb_type; + std::optional scale_result_dtype; +}; + +} // namespace at::native + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/cuda/fused_adagrad_utils.cuh b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/cuda/fused_adagrad_utils.cuh new file mode 100644 index 0000000000000000000000000000000000000000..0c77db3160f9d2828d8c578d3e9dee22d810d020 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/cuda/fused_adagrad_utils.cuh @@ -0,0 +1,141 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +#include +#include +#include + +namespace at::native { + +namespace { + +constexpr uint8_t kParamIdx = 0; +constexpr uint8_t kGradIdx = 1; +constexpr uint8_t kStateSumIdx = 2; + +template +C10_DEVICE inline void adagrad_math( + scalar_t r_args[3][kILP], + const double& corrected_lr, + const double& weight_decay, + const double& eps, + const bool& maximize, + const float* grad_scale_ptr, + const float* found_inf_ptr) { +#pragma unroll + for (int ii = 0; ii < 
kILP; ++ii) { + opmath_t param = static_cast(r_args[kParamIdx][ii]); + opmath_t grad = static_cast(r_args[kGradIdx][ii]); + opmath_t state_sum = static_cast(r_args[kStateSumIdx][ii]); + + if (grad_scale_ptr) { + grad /= (static_cast(*grad_scale_ptr)); + } + const opmath_t grad_to_store = grad; + if (maximize) { + grad = -grad; + } + if (weight_decay != 0) { + grad += param * weight_decay; // Can I change this to use std::fma? + } + state_sum += grad * grad; // Can I change this to use std::fma? + param = param - corrected_lr * grad / (std::sqrt(state_sum) + eps); + + r_args[kParamIdx][ii] = param; + if (grad_scale_ptr) { + r_args[kGradIdx][ii] = grad_to_store; + } + r_args[kStateSumIdx][ii] = state_sum; + } +} + +template +struct FusedAdagradMathFunctor { + using opmath_t = at::opmath_type; + + C10_DEVICE __forceinline__ void operator()( + int64_t chunk_size, + FusedOptimizerTensorListMetadata<3>& tl, + const float* lr_ptr, + const double& lr, + const double& lr_decay, + const double& weight_decay, + const double& eps, + const bool& maximize, + const float* grad_scale_ptr, + const float* found_inf_ptr) { + const auto tensor_loc = tl.block_to_tensor[blockIdx.x]; + const auto chunk_idx = tl.block_to_chunk[blockIdx.x]; + const double lr_double = lr_ptr ? 
*lr_ptr : lr; + + if (found_inf_ptr && *found_inf_ptr == 1) { + return; + } + + const auto corrected_lr = [&]() -> double { + auto* step_count = + reinterpret_cast(tl.state_steps_addresses[tensor_loc]); + const auto denom = 1 + (*step_count - 1) * lr_decay; + const auto corrected_lr = lr_double / denom; + return corrected_lr; + }(); + + scalar_t* args[3]; + scalar_t r_args[3][kILP]; + const auto n = tl.numel_for_tensor[tensor_loc] - + static_cast(chunk_idx * chunk_size); + + const bool all_aligned{ + init_args<3>(args, tl, chunk_idx, chunk_size, tensor_loc)}; + + if ((n % kILP == 0) && (chunk_size % kILP == 0) && all_aligned) { + for (int64_t i_start = threadIdx.x; + i_start * kILP < n && i_start * kILP < chunk_size; + i_start += blockDim.x) { + load_store(r_args[kParamIdx], args[kParamIdx], 0, i_start); + load_store(r_args[kGradIdx], args[kGradIdx], 0, i_start); + load_store(r_args[kStateSumIdx], args[kStateSumIdx], 0, i_start); + + adagrad_math( + r_args, + corrected_lr, + weight_decay, + eps, + maximize, + grad_scale_ptr, + found_inf_ptr); + + load_store(args[kParamIdx], r_args[kParamIdx], i_start, 0); + load_store(args[kStateSumIdx], r_args[kStateSumIdx], i_start, 0); + } + } else { + for (int64_t i_start = 0; i_start < n && i_start < chunk_size; + i_start += blockDim.x * kILP) { + load_args<3>(r_args, args, i_start, chunk_size, n); + + adagrad_math( + r_args, + corrected_lr, + weight_decay, + eps, + maximize, + grad_scale_ptr, + found_inf_ptr); + +#pragma unroll + for (int i = 0; i < 3; i++) { + if (i != kGradIdx || grad_scale_ptr) { + store_args(args[i], r_args[i], i_start, chunk_size, n); + } + } + } + } + } +}; + +} // namespace + +} // namespace at::native + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/cuda/fused_adam_amsgrad_impl.cuh b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/cuda/fused_adam_amsgrad_impl.cuh new file mode 100644 index 0000000000000000000000000000000000000000..b8156fb622b440f051e4bb14a745ab0fe32aa93a --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/cuda/fused_adam_amsgrad_impl.cuh @@ -0,0 +1,43 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +#include + +namespace at::native { + +void _fused_adam_amsgrad_cuda_impl_( + at::TensorList params, + at::TensorList grads, + at::TensorList exp_avgs, + at::TensorList exp_avg_sqs, + at::TensorList max_exp_avg_sqs, + at::TensorList state_steps, + const double lr, + const double beta1, + const double beta2, + const double weight_decay, + const double eps, + const bool maximize, + const std::optional& grad_scale, + const std::optional& found_inf); + +void _fused_adam_amsgrad_cuda_impl_( + at::TensorList params, + at::TensorList grads, + at::TensorList exp_avgs, + at::TensorList exp_avg_sqs, + at::TensorList max_exp_avg_sqs, + at::TensorList state_steps, + const at::Tensor& lr, + const double beta1, + const double beta2, + const double weight_decay, + const double eps, + const bool maximize, + const std::optional& grad_scale, + const std::optional& found_inf); + +} // namespace at::native + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/cuda/fused_adam_utils.cuh b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/cuda/fused_adam_utils.cuh new file mode 100644 index 0000000000000000000000000000000000000000..eb22658100a76df94ba2fbabe69d5882a8e168b6 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/cuda/fused_adam_utils.cuh @@ -0,0 +1,205 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +#include +#include +#include +#include +#include + +namespace at::native { + +enum class ADAM_MODE : uint8_t { ORIGINAL = 0, ADAMW = 1 }; + +namespace { + +constexpr uint8_t kParamIdx = 0; +constexpr uint8_t kGradIdx = 1; +constexpr uint8_t kExpAvgIdx = 2; +constexpr uint8_t kExpAvgSqIdx = 3; +constexpr uint8_t kMaxExpAvgSqIdx = 4; + +template < + typename scalar_type, + typename opmath_t, + int depth, + ADAM_MODE adam_mode, + bool amsgrad> +C10_DEVICE inline void adam_math( + scalar_type r_args[depth][kILP], + const double& lr, + const double& beta1, + const double& beta2, + const double& weight_decay, + const double& eps, + const bool& maximize, + const float* grad_scale_ptr, + const float* found_inf_ptr, + const opmath_t& bias_correction1, + const opmath_t& bias_correction2_sqrt) { + static_assert(depth == 4 || depth == 5); +#pragma unroll + for (int ii = 0; ii < kILP; ii++) { + // Load values. 
+ opmath_t param = static_cast(r_args[kParamIdx][ii]); + opmath_t grad = static_cast(r_args[kGradIdx][ii]); + if (grad_scale_ptr) { + grad /= (static_cast(*grad_scale_ptr)); + } + const opmath_t grad_to_store = grad; + if (maximize) { + grad = -grad; + } + opmath_t exp_avg = static_cast(r_args[kExpAvgIdx][ii]); + opmath_t exp_avg_sq = static_cast(r_args[kExpAvgSqIdx][ii]); + opmath_t max_exp_avg_sq; + if (amsgrad) { + max_exp_avg_sq = static_cast(r_args[kMaxExpAvgSqIdx][ii]); + } + // Update param, grad, 1st and 2nd order momentum. + if (weight_decay != 0) { + if constexpr (adam_mode == ADAM_MODE::ORIGINAL) { + grad += param * weight_decay; + } else if constexpr (adam_mode == ADAM_MODE::ADAMW) { + param -= lr * weight_decay * param; + } + } + // todo(crcrpar): use lerp + // ref: https://developer.nvidia.com/blog/lerp-faster-cuda/ + exp_avg = beta1 * exp_avg + (1 - beta1) * grad; + exp_avg_sq = beta2 * exp_avg_sq + (1 - beta2) * grad * grad; + const opmath_t step_size = lr / bias_correction1; + opmath_t denom; + if (amsgrad) { + max_exp_avg_sq = std::max(max_exp_avg_sq, exp_avg_sq); + denom = (std::sqrt(max_exp_avg_sq) / bias_correction2_sqrt) + eps; + } else { + denom = (std::sqrt(exp_avg_sq) / bias_correction2_sqrt) + eps; + } + param -= step_size * exp_avg / denom; + + // Store results. + r_args[kParamIdx][ii] = param; + if (grad_scale_ptr) { + r_args[kGradIdx][ii] = grad_to_store; + } + r_args[kExpAvgIdx][ii] = exp_avg; + r_args[kExpAvgSqIdx][ii] = exp_avg_sq; + if (amsgrad) { + r_args[kMaxExpAvgSqIdx][ii] = max_exp_avg_sq; + } + } +} + +// [note: Conditional Gradient Store when `optimizer.step` is called by +// GradScaler] When a user is training their model(s) with an FP16 AMP recipe, +// parameter updates are done via `grad_scaler.step(optimizer)` instead of +// `optimizer.step()`. For most optimizers, GradScaler unscales gradients on +// behalf of those optimizers. 
Also, before `.step`, it makes sure that all the +// gradients involved are finite, which incurs a device sync. On the other hand, +// fused optimizers set their member variable of `_step_supports_amp_scaling` to +// `True` in order to remove the device sync above. This means that fused +// optimizers have to have their CUDA kernels (a) unscale gradients and (b) skip +// parameter updates accordingly. To be functionally on par with `torch.optim` +// optimizers and `_multi_tensor` ones, the kernel below writes out gradients +// only when `grad_scale_ptr != nullptr. +template +struct FusedAdamMathFunctor { + static_assert( + depth == 4 || depth == 5, + "depth of 4 for Adam, depth of 5 for Adam with AMSGrad."); + using opmath_t = at::opmath_type; + C10_DEVICE __forceinline__ void operator()( + int64_t chunk_size, + FusedOptimizerTensorListMetadata& tl, + const float* lr_ptr, + const double& lr, + const double& beta1, + const double& beta2, + const double& weight_decay, + const double& eps, + const bool& maximize, + const float* grad_scale_ptr, + const float* found_inf_ptr) { + const auto tensor_loc = tl.block_to_tensor[blockIdx.x]; + const auto chunk_idx = tl.block_to_chunk[blockIdx.x]; + const double lr_double = lr_ptr ? 
*lr_ptr : lr; + + if (found_inf_ptr && *found_inf_ptr == 1) { + return; + } + const auto [bias_correction1, bias_correction2_sqrt] = + [&]() -> std::pair { + auto* step_count = + reinterpret_cast(tl.state_steps_addresses[tensor_loc]); + const auto bias_correction1 = 1 - at::native::pow_(beta1, *step_count); + const auto bias_correction2 = 1 - at::native::pow_(beta2, *step_count); + const auto bias_correction2_sqrt = std::sqrt(bias_correction2); + return {bias_correction1, bias_correction2_sqrt}; + }(); + + scalar_type* args[depth]; + scalar_type r_args[depth][kILP]; + const auto n = tl.numel_for_tensor[tensor_loc] - chunk_idx * chunk_size; + + const bool all_aligned{ + init_args(args, tl, chunk_idx, chunk_size, tensor_loc)}; + if ((n % kILP == 0) && (chunk_size % kILP == 0) && all_aligned) { + for (int64_t i_start = threadIdx.x; + i_start * kILP < n && i_start * kILP < chunk_size; + i_start += blockDim.x) { +#pragma unroll + for (int i = 0; i < depth; i++) { + load_store(r_args[i], args[i], 0, i_start); + } + adam_math( + r_args, + lr_double, + beta1, + beta2, + weight_decay, + eps, + maximize, + grad_scale_ptr, + found_inf_ptr, + bias_correction1, + bias_correction2_sqrt); +#pragma unroll + for (int i = 0; i < depth; i++) { + if (i != kGradIdx || grad_scale_ptr) { + load_store(args[i], r_args[i], i_start, 0); + } + } + } + } else { + for (int64_t i_start = 0; i_start < n && i_start < chunk_size; + i_start += blockDim.x * kILP) { + load_args(r_args, args, i_start, chunk_size, n); + adam_math( + r_args, + lr_double, + beta1, + beta2, + weight_decay, + eps, + maximize, + grad_scale_ptr, + found_inf_ptr, + bias_correction1, + bias_correction2_sqrt); +#pragma unroll + for (int i = 0; i < depth; i++) { + if (i != kGradIdx || grad_scale_ptr) { + store_args(args[i], r_args[i], i_start, chunk_size, n); + } + } + } + } + } +}; +} // namespace + +} // namespace at::native + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION 
is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/cuda/fused_adamw_impl.cuh b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/cuda/fused_adamw_impl.cuh new file mode 100644 index 0000000000000000000000000000000000000000..4a94840cb7c15f5449b1285a4dd1f65a60ded2bf --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/cuda/fused_adamw_impl.cuh @@ -0,0 +1,41 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +#include + +namespace at::native { + +void _fused_adamw_cuda_impl_( + at::TensorList params, + at::TensorList grads, + at::TensorList exp_avgs, + at::TensorList exp_avg_sqs, + at::TensorList state_steps, + const double lr, + const double beta1, + const double beta2, + const double weight_decay, + const double eps, + const bool maximize, + const std::optional& grad_scale, + const std::optional& found_inf); + +void _fused_adamw_cuda_impl_( + at::TensorList params, + at::TensorList grads, + at::TensorList exp_avgs, + at::TensorList exp_avg_sqs, + at::TensorList state_steps, + const at::Tensor& lr, + const double beta1, + const double beta2, + const double weight_decay, + const double eps, + const bool maximize, + const std::optional& grad_scale, + const std::optional& found_inf); + +} // namespace at::native + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/cuda/jit_utils.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/cuda/jit_utils.h new file mode 100644 index 0000000000000000000000000000000000000000..48aca14167fd999df98adbe6805bad3ea40cf63c --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/cuda/jit_utils.h @@ -0,0 +1,254 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include + +#include +#include +#include + +namespace at::cuda::jit { + +enum class BinaryFuncVariant {NoScalar, RhsScalar, LhsScalar}; + +struct NvrtcFunction { + CUmodule module = CUmodule(); + CUfunction function = nullptr; +}; + +struct KernelDescriptor { + std::string name; + std::string f; + c10::ScalarType f_inputs_type; + c10::ScalarType result_type; + c10::SmallVector extra_args_types; + int nInputs, nOutputs; +}; + +// Helper function to return a vector +// corresponding to the type of the arguments in parameter pack. +template +c10::SmallVector get_extra_args_types() { + return {c10::CppTypeToScalarType::value ...}; +} + +template < + typename result_type, + typename f_inputs_type, + typename... 
ExtraArgs> +KernelDescriptor make_kernel_descriptor( + std::string name, + std::string f, + int nInputs, + int nOutputs) { + KernelDescriptor ret; + ret.name = std::move(name); + ret.f = std::move(f); + ret.f_inputs_type = c10::CppTypeToScalarType::value; + ret.result_type = c10::CppTypeToScalarType::value; + ret.extra_args_types = get_extra_args_types(); + ret.nInputs = nInputs; + ret.nOutputs = nOutputs; + return ret; +} + +inline int can_vectorize_up_to(size_t default_alignment, void *pointer) { + auto ip = reinterpret_cast(pointer); +#ifdef USE_ROCM + if ((default_alignment == 1) && (ip % (16 * default_alignment) == 0)) { + return 16; + } + if ((default_alignment <= 2) && (ip % (8 * default_alignment) == 0)) { + return 8; + } +#else + if (ip % (8 * default_alignment) == 0) { + return 8; + } +#endif + if (ip % (4 * default_alignment) == 0) { + return 4; + } + if (ip % (2 * default_alignment) == 0) { + return 2; + } + return 1; +} + +inline int can_vectorize_up_to(const KernelDescriptor &desc, c10::ArrayRef pointers) { + TORCH_INTERNAL_ASSERT(desc.nOutputs == 1); + TORCH_INTERNAL_ASSERT(static_cast(pointers.size()) == 1 + desc.nInputs); + + // Deals with output + auto result_size = c10::scalarTypeToTypeMeta(desc.result_type).itemsize(); + auto result = can_vectorize_up_to(result_size, pointers[0]); + + // Incorporates input(s) + auto input_size = c10::scalarTypeToTypeMeta(desc.f_inputs_type).itemsize(); + for (auto i : c10::irange(1, pointers.size())) { + result = std::min(result, can_vectorize_up_to(input_size, pointers[i])); + } + + return result; +} + +//FIXME - this are defined in Loops.cuh, but including Loops.cuh here would lead to circular includes Loops.cuh -> CUDALoops.cuh -> jit_utils.h -> Loops.cuh +#ifdef USE_ROCM +#define JIT_THREAD_WORK_SIZE 4 +#else +#define JIT_THREAD_WORK_SIZE 8 +#endif + +int calc_io_size( + const int nInputs, + const int nOutputs, + const c10::ScalarType& inputs_type, + const c10::ScalarType& result_type); + +int 
calc_thread_work_size( + const int nInputs, + const int nOutputs, + const c10::ScalarType& inputs_type, + const c10::ScalarType& result_type); + +std::string generate_code( + int nInputs, + int nOutputs, + const std::string& func, + const std::string& name, + const std::string& f_inputs_type, + const std::string& compute_type, + const std::string& result_type, + bool contiguous, + bool dynamic_casting, + BinaryFuncVariant scalar_pos, + c10::SmallVector& extra_args_typenames, + int thread_work_size=JIT_THREAD_WORK_SIZE, + bool vectorized=false, + int vec_size=0, + bool return_by_ref=false); + +std::string generate_code( + const KernelDescriptor &desc, + bool contiguous, + bool dynamic_casting, + BinaryFuncVariant scalar_pos, + int thread_work_size=JIT_THREAD_WORK_SIZE, + bool vectorized=false, + int vec_size=0, + bool return_by_ref=false); + +std::string generate_reduction_code( + int nOutputs, + const std::string& func, + const std::string& name, + const int vt0, + const std::string& f_inputs_type, + const std::string& reduction_accum_type, + const std::string& result_type, + bool contiguous, + bool vectorized, + int vec_size, + int max_threads_codegen); + +std::string generate_reduction_code( + const KernelDescriptor &desc, + const int vt0, + bool contiguous, + bool vectorized, + int vec_size, + int max_threads_codegen); + +NvrtcFunction jit_pwise_function( + const std::string& code, + const std::string& kernel_name); + +void launch_jitted_pwise_function( + NvrtcFunction function, + const void* args[], + const dim3 nBlocks, + const dim3 kBlockSize, + const int smem=0); + +template +struct delayed_false : std::false_type { +}; + +// Defines type names +// NOTE: General case is instantiated only for invalid types. +// All the valid types have specialization using the TYPE_NAME_FN +// macro below. 
+template +inline std::string typeName() { + // we can't use static_assert(false) directly as the + // program will be not compiled even if the template is not + // instantiated, so we use `delayed_false` + // to make sure compiler doesn't eagerly raise + // fail this assertion. + static_assert(delayed_false::value, "invalid type for jiterator"); + return "void"; +} + +#define TYPE_NAME_FN(ctype, name) \ +template <> inline std::string typeName(){ \ + return std::string(#ctype); \ +} + +AT_FORALL_SCALAR_TYPES(TYPE_NAME_FN) +#undef TYPE_NAME_FN +// JIT uses std::complex directly, because nvRTC compile programs +// with -default-device, so there is no such issue like: +// "std::sin(complex) is __host__ only" +template <> inline std::string typeName(){ + return "bool"; +} +template <> inline std::string typeName>(){ + return "std::complex"; +} +template <> inline std::string typeName>(){ + return "std::complex"; +} +template <> inline std::string typeName>(){ + return "std::complex"; +} +template <> inline std::string typeName(){ + return "at::Half"; +} +template <> inline std::string typeName(){ + return "at::BFloat16"; +} +template <> inline std::string typeName(){ + return "at::Float8_e5m2"; +} +template <> inline std::string typeName(){ + return "at::Float8_e4m3fn"; +} +template <> inline std::string typeName() { + return "at::Float8_e5m2fnuz"; +} +template <> inline std::string typeName() { + return "at::Float8_e4m3fnuz"; +} +template <> inline std::string typeName() { + // TODO(#146647): Can the code here be made generic for any scalartype? 
+ return "at::Float8_e8m0fnu"; +} + +#define TYPE_NAME_CASE(ctype, scalartype) \ + case ScalarType::scalartype: return typeName(); +inline std::string typeName(ScalarType t) { + switch (t) { + AT_FORALL_SCALAR_TYPES_WITH_COMPLEX(TYPE_NAME_CASE) + default: + TORCH_CHECK(false, "invalid type for jiterator"); + } +} +#undef TYPE_NAME_CASE + +TORCH_CUDA_CPP_API void initializeCudaContext(); + +} // namespace at::cuda::jit + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/cuda/vol2col.cuh b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/cuda/vol2col.cuh new file mode 100644 index 0000000000000000000000000000000000000000..30cc898d11e7d836144b61e6e12a308cec60673d --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/cuda/vol2col.cuh @@ -0,0 +1,267 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include +#include +#include +#include + +#include + +namespace at::native { + +using namespace at::cuda::detail; + +// Kernel for fast unfold+copy on volumes +template +C10_LAUNCH_BOUNDS_1(1024) +__global__ void vol2col_kernel( + const int64_t n, + const T* data_vol, + const int depth, + const int height, + const int width, + const int ksize_t, + const int ksize_h, + const int ksize_w, + const int pad_t, + const int pad_h, + const int pad_w, + const int stride_t, + const int stride_h, + const int stride_w, + const int dilation_t, + const int dilation_h, + const int dilation_w, + const int depth_col, + const int height_col, + const int width_col, + T* data_col) { + CUDA_KERNEL_LOOP_TYPE(index, n, int64_t) { + auto w_out = index % width_col; + index /= width_col; + auto h_out = index % height_col; + index /= height_col; + 
auto t_out = index % depth_col; + auto channel_in = index / depth_col; + auto channel_out = channel_in * ksize_t * ksize_h * ksize_w; + auto t_in = t_out * stride_t - pad_t; + auto h_in = h_out * stride_h - pad_h; + auto w_in = w_out * stride_w - pad_w; + data_col += + ((channel_out * depth_col + t_out) * height_col + h_out) * width_col + + w_out; + data_vol += ((channel_in * depth + t_in) * height + h_in) * width + w_in; + for (int i = 0; i < ksize_t; ++i) { + for (int j = 0; j < ksize_h; ++j) { + for (int k = 0; k < ksize_w; ++k) { + auto t = t_in + i * dilation_t; + auto h = h_in + j * dilation_h; + auto w = w_in + k * dilation_w; + *data_col = (t >= 0 && h >= 0 && w >= 0 && t < depth && h < height && + w < width) + ? data_vol + [i * dilation_t * height * width + j * dilation_h * width + + k * dilation_w] + : static_cast(0); + data_col += depth_col * height_col * width_col; + } + } + } + } +} + +template +void vol2col( + cudaStream_t stream, + const T* data_vol, + const int channels, + const int depth, + const int height, + const int width, + const int depth_col, + const int height_col, + const int width_col, + const int ksize_t, + const int ksize_h, + const int ksize_w, + const int pad_t, + const int pad_h, + const int pad_w, + const int stride_t, + const int stride_h, + const int stride_w, + const int dilation_t, + const int dilation_h, + const int dilation_w, + T* data_col) { + // We are going to launch channels * depth_col * height_col * width_col + // kernels, each kernel responsible for copying a single-channel grid. 
+ // We cast an operand to int64 so that the product will not overflow + const auto num_kernels = static_cast(channels) * depth_col * height_col * width_col; + // Launch + vol2col_kernel<<>>( + num_kernels, + data_vol, + depth, + height, + width, + ksize_t, + ksize_h, + ksize_w, + pad_t, + pad_h, + pad_w, + stride_t, + stride_h, + stride_w, + dilation_t, + dilation_h, + dilation_w, + depth_col, + height_col, + width_col, + data_col); + C10_CUDA_KERNEL_LAUNCH_CHECK(); +} + +template +__global__ void vol2im_kernel( + const int64_t n, + const T* data_col, + const unsigned depth, + const unsigned height, + const unsigned width, + const unsigned channels, + const unsigned kernel_t, + const unsigned kernel_h, + const unsigned kernel_w, + const unsigned pad_t, + const unsigned pad_h, + const unsigned pad_w, + const unsigned stride_t, + const unsigned stride_h, + const unsigned stride_w, + const unsigned dilation_t, + const unsigned dilation_h, + const unsigned dilation_w, + const unsigned depth_col, + const unsigned height_col, + const unsigned width_col, + T* data_vol) { + CUDA_KERNEL_LOOP(index, n) { + accT val = static_cast(0); + const auto w_im = index % width + pad_w; + const auto h_im = (index / width) % height + pad_h; + const auto t_im = (index / width / height) % depth + pad_t; + const auto c_im = index / (width * height * depth); + auto kernel_extent_w = (kernel_w - 1) * dilation_w + 1; + auto kernel_extent_h = (kernel_h - 1) * dilation_h + 1; + auto kernel_extent_t = (kernel_t - 1) * dilation_t + 1; + // compute the start and end of the output + const auto w_col_start = + (w_im < kernel_extent_w) ? 0 : (w_im - kernel_extent_w) / stride_w + 1; + const auto w_col_end = std::min(w_im / stride_w + 1, width_col); + const auto h_col_start = + (h_im < kernel_extent_h) ? 0 : (h_im - kernel_extent_h) / stride_h + 1; + const auto h_col_end = std::min(h_im / stride_h + 1, height_col); + const auto t_col_start = + (t_im < kernel_extent_t) ? 
0 : (t_im - kernel_extent_t) / stride_t + 1; + const auto t_col_end = std::min(t_im / stride_t + 1, depth_col); + // TODO: use LCM of stride and dilation to avoid unnecessary loops + for (unsigned t_col = t_col_start; t_col < t_col_end; t_col += 1) { + for (unsigned h_col = h_col_start; h_col < h_col_end; h_col += 1) { + for (unsigned w_col = w_col_start; w_col < w_col_end; w_col += 1) { + uint64_t t_k = (t_im - t_col * stride_t); + uint64_t h_k = (h_im - h_col * stride_h); + uint64_t w_k = (w_im - w_col * stride_w); + if (t_k % dilation_t == 0 && h_k % dilation_h == 0 && + w_k % dilation_w == 0) { + t_k /= dilation_t; + h_k /= dilation_h; + w_k /= dilation_w; + const int64_t idx_k = + ((c_im * kernel_t + t_k) * kernel_h + h_k) * kernel_w + w_k; + const int64_t data_col_index = + ((idx_k * depth_col + t_col) * + height_col + h_col) * + width_col + w_col; + val += data_col[data_col_index]; + } + } + } + } + data_vol[index] = static_cast(val); + } +} + +template +void col2vol( + cudaStream_t stream, + const T* data_col, + const int64_t channels, + const int64_t depth, + const int64_t height, + const int64_t width, + const int64_t output_depth, + const int64_t output_height, + const int64_t output_width, + const int64_t patch_t, + const int64_t patch_h, + const int64_t patch_w, + const int64_t pad_t, + const int64_t pad_h, + const int64_t pad_w, + const int64_t stride_t, + const int64_t stride_h, + const int64_t stride_w, + const int64_t dilation_t, + const int64_t dilation_h, + const int64_t dilation_w, + T* data_vol) { + const auto num_kernels = channels * depth * height * width; + + auto check_fits_in_unsigned = + [](int64_t val, const char * name) { + constexpr auto umax = std::numeric_limits::max(); + TORCH_CHECK(val >= 0 && val <= umax, + name, " must fit in a 32-bit unsigned value"); + }; + check_fits_in_unsigned(num_kernels, "input size"); + check_fits_in_unsigned( + channels * patch_t * patch_h * patch_w, "channels x kernel size"); + + // To avoid involving 
atomic operations, we will launch one kernel per + // bottom dimension, and then in the kernel add up the top dimensions. + vol2im_kernel + <<>>( + num_kernels, + data_col, + depth, + height, + width, + channels, + patch_t, + patch_h, + patch_w, + pad_t, + pad_h, + pad_w, + stride_t, + stride_h, + stride_w, + dilation_t, + dilation_h, + dilation_w, + output_depth, + output_height, + output_width, + data_vol); + C10_CUDA_KERNEL_LAUNCH_CHECK(); +} + +} // namespace at::native + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/hip/bgemm_kernels/bgemm_kernel_collection.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/hip/bgemm_kernels/bgemm_kernel_collection.h new file mode 100644 index 0000000000000000000000000000000000000000..af45c9e027c88eb4f3d4718d3e9433aefa343893 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/hip/bgemm_kernels/bgemm_kernel_collection.h @@ -0,0 +1,37 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include +#include + +namespace at::native { +void bgemm_kernel_bf16bf16bf16_256_256x256x32_32x32_4x4_8x32x1_8x32x1_1x16x1x16_4_Intrawave_v4(CUDABLAS_BGEMM_ARGTYPES(at::BFloat16)); +void bgemm_kernel_bf16bf16bf16_256_256x256x32_32x32_4x4_16x16x1_16x16x1_1x16x1x16_4_Intrawave_v4(CUDABLAS_BGEMM_ARGTYPES(at::BFloat16)); +void bgemm_kernel_bf16bf16bf16_256_256x256x32_32x32_4x4_4x64x1_4x64x1_1x16x1x16_4_Intrawave_v3(CUDABLAS_BGEMM_ARGTYPES(at::BFloat16)); +void bgemm_kernel_bf16bf16bf16_256_256x256x32_32x32_4x4_4x64x1_4x64x1_1x16x1x16_4_Intrawave_v5(CUDABLAS_BGEMM_ARGTYPES(at::BFloat16)); +void 
bgemm_kernel_bf16bf16bf16_256_224x256x64_16x16_7x8_8x32x1_8x32x1_1x16x1x16_4_Intrawave_v3(CUDABLAS_BGEMM_ARGTYPES(at::BFloat16)); +void bgemm_kernel_bf16bf16bf16_256_256x224x64_16x16_8x7_8x32x1_8x32x1_1x32x1x8_4_Intrawave_v3(CUDABLAS_BGEMM_ARGTYPES(at::BFloat16)); +void bgemm_kernel_bf16bf16bf16_256_128x128x64_32x32_2x2_8x32x1_8x32x1_1x16x1x16_4_Intrawave_v3(CUDABLAS_BGEMM_ARGTYPES(at::BFloat16)); +void bgemm_kernel_bf16bf16bf16_256_128x128x64_32x32_2x2_8x32x1_8x32x1_1x16x1x16_4_Intrawave_v5(CUDABLAS_BGEMM_ARGTYPES(at::BFloat16)); +void bgemm_kernel_bf16bf16bf16_256_128x128x64_32x32_2x2_8x32x1_8x32x1_1x16x1x16_4_Intrawave_v1(CUDABLAS_BGEMM_ARGTYPES(at::BFloat16)); +void bgemm_kernel_bf16bf16bf16_128_32x16x64_16x16_1x1_8x16x1_8x16x1_1x16x1x8_2_Intrawave_v1(CUDABLAS_BGEMM_ARGTYPES(at::BFloat16)); +void bgemm_kernel_bf16bf16bf16_64_16x16x64_16x16_1x1_8x8x1_8x8x1_1x16x1x4_4_Intrawave_v1(CUDABLAS_BGEMM_ARGTYPES(at::BFloat16)); +void bgemm_kernel_bf16bf16bf16_128_16x32x64_16x16_1x1_8x16x1_8x16x1_1x16x1x8_4_Intrawave_v1(CUDABLAS_BGEMM_ARGTYPES(at::BFloat16)); +void bgemm_kernel_bf16bf16bf16_128_16x32x64_16x16_1x1_16x8x1_16x8x1_1x16x1x8_4_Intrawave_v1(CUDABLAS_BGEMM_ARGTYPES(at::BFloat16)); +void bgemm_kernel_bf16bf16bf16_128_16x32x64_16x16_1x1_32x4x1_32x4x1_1x16x1x8_4_Intrawave_v1(CUDABLAS_BGEMM_ARGTYPES(at::BFloat16)); +void bgemm_kernel_bf16bf16bf16_256_256x16x64_16x16_4x1_8x32x1_8x16x1_1x32x1x8_2_Intrawave_v2(CUDABLAS_BGEMM_ARGTYPES(at::BFloat16)); +void bgemm_kernel_bf16bf16bf16_256_256x16x64_16x16_4x1_16x16x1_16x8x1_1x32x1x8_2_Intrawave_v2(CUDABLAS_BGEMM_ARGTYPES(at::BFloat16)); +void bgemm_kernel_bf16bf16bf16_256_256x16x64_16x16_4x1_32x8x1_32x4x1_1x32x1x8_2_Intrawave_v2(CUDABLAS_BGEMM_ARGTYPES(at::BFloat16)); +void bgemm_kernel_bf16bf16bf16_128_128x16x64_16x16_4x1_8x16x1_8x16x1_1x16x1x8_2_Intrawave_v2(CUDABLAS_BGEMM_ARGTYPES(at::BFloat16)); +void 
bgemm_kernel_bf16bf16bf16_128_64x16x64_16x16_2x1_8x16x1_8x16x1_1x16x1x8_2_Intrawave_v2(CUDABLAS_BGEMM_ARGTYPES(at::BFloat16)); +void bgemm_kernel_bf16bf16bf16_128_32x16x64_16x16_1x1_8x16x1_8x16x1_1x16x1x8_2_Intrawave_v2(CUDABLAS_BGEMM_ARGTYPES(at::BFloat16)); +void bgemm_kernel_bf16bf16bf16_64_16x16x64_16x16_1x1_8x8x1_8x8x1_1x16x1x4_4_Intrawave_v2(CUDABLAS_BGEMM_ARGTYPES(at::BFloat16)); +void bgemm_kernel_bf16bf16bf16_128_16x32x64_16x16_1x1_8x16x1_8x16x1_1x16x1x8_4_Intrawave_v2(CUDABLAS_BGEMM_ARGTYPES(at::BFloat16)); +void bgemm_kernel_bf16bf16bf16_128_16x64x64_16x16_1x2_8x16x1_8x16x1_1x16x1x8_4_Intrawave_v2(CUDABLAS_BGEMM_ARGTYPES(at::BFloat16)); +void bgemm_kernel_bf16bf16bf16_128_16x128x64_16x16_1x4_8x16x1_8x16x1_1x16x1x8_4_Intrawave_v2(CUDABLAS_BGEMM_ARGTYPES(at::BFloat16)); +void bgemm_kernel_bf16bf16bf16_256_16x256x64_16x16_1x4_8x16x1_8x16x1_1x16x1x16_4_Intrawave_v2(CUDABLAS_BGEMM_ARGTYPES(at::BFloat16)); + +}; // namespace at::native +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/hip/bgemm_kernels/bgemm_kernel_template.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/hip/bgemm_kernels/bgemm_kernel_template.h new file mode 100644 index 0000000000000000000000000000000000000000..28bc774c32ea94bcde8039debea15e61c34376a8 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/hip/bgemm_kernels/bgemm_kernel_template.h @@ -0,0 +1,164 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#undef __HIP_NO_HALF_CONVERSIONS__ + + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +namespace at::native { + +// Define commonly used types. +template +using S = ck::Sequence; + +using BF16 = ck::bhalf_t; +using F32 = float; + +using AccDataType = F32; +using DsDataType = ck::Tuple<>; +using CDataType = BF16; +using CShuffleDataType = BF16; +using DsLayout = ck::Tuple<>; +using CLayout = Row; + +using PassThrough = ck::tensor_operation::element_wise::PassThrough; +using AElementOp = PassThrough; +using BElementOp = PassThrough; +using CDEElementOp = PassThrough; + +using Row = ck::tensor_layout::gemm::RowMajor; +using Col = ck::tensor_layout::gemm::ColumnMajor; + +template < + typename A_DATA_TYPE, + typename B_DATA_TYPE, + int BLOCK_SIZE, + int MBLOCK, + int NBLOCK, + int KBLOCK, + int AK1, + int BK1, + int WAVE_TILE_M, + int WAVE_TILE_N, + int WAVE_MAP_M, + int WAVE_MAP_N, + typename ABLOCK_TRANSFER, + int ABLOCK_TRANSFER_SSPV, + int ABLOCK_TRANSFER_DSPV_K1, + typename BBLOCK_TRANSFER, + int BBLOCK_TRANSFER_SSPV, + int BBLOCK_TRANSFER_SSPV_K1, + int CSHUFFLE_MXDL_PWPS, + int CSHUFFLE_NXDL_PWPS, + typename CSHUFFLEBLOCK_TRANSFER, + typename CDESHUFFLEBLOCK_TRANSFER, + 
ck::BlockGemmPipelineScheduler LOOP_SCHED, + ck::BlockGemmPipelineVersion PIPELINE_VERSION, + ck::tensor_operation::device::GemmSpecialization GEMM_SPEC = + ck::tensor_operation::device::GemmSpecialization::MNPadding, + bool TRANSA = false, + bool TRANSB = false> +void bgemm_kernel_impl(CUDABLAS_BGEMM_ARGTYPES(at::BFloat16)) { + + using ADataType = typename CkMathType::dtype; + using BDataType = typename CkMathType::dtype; + + using ALayout = typename CkTensorLayout::a_layout; + using BLayout = typename CkTensorLayout::b_layout; + + auto a_element_op = AElementOp{}; + auto b_element_op = BElementOp{}; + auto cde_element_op = CDEElementOp{}; + + auto gemm = ck::tensor_operation::device::DeviceBatchedGemmMultiD_Xdl_CShuffle_V3< + ALayout, // ALayout + BLayout, // BLayout + DsLayout, // DsLayout + CLayout, // CLayout + ADataType, // ADataType + BDataType, // BDataType + DsDataType, // DsDataType + CDataType, // CDataType + AccDataType, // AccDataType + CShuffleDataType, // CshuffleType + AElementOp, // AElementwiseOperation + BElementOp, // BElementwiseOperation + CDEElementOp, // CElementwiseOperation + GEMM_SPEC, // GEMMSpecialization + BLOCK_SIZE, // BlockSize + MBLOCK, // MPerBlock + NBLOCK, // NPerBlock + KBLOCK, // KPerBlock + AK1, // AK1 + BK1, // BK1 + WAVE_TILE_M, // MPerXDL + WAVE_TILE_N, // NPerXDL + WAVE_MAP_M, // MXdlPerWave + WAVE_MAP_N, // NXdlPerWave + ABLOCK_TRANSFER, // ABlockTransferThreadClusterLengths_AK0_M_AK1 + S<1, 0, 2>, // ABlockTransferThreadClusterArrangeOrder + S<1, 0, 2>, // ABlockTransferSrcAccessOrder + 2, // ABlockTransferSrcVectorDim + ABLOCK_TRANSFER_SSPV, // ABlockTransferSrcScalarPerVector + ABLOCK_TRANSFER_DSPV_K1, // ABlockTransferDstScalarPerVector_AK1 + 0, // ABlockLdsExtraM + BBLOCK_TRANSFER, // BBlockTransferThreadClusterLengths_BK0_N_BK1 + S<1, 0, 2>, // BBlockTransferThreadClusterArrangeOrder + S<1, 0, 2>, // BBlockTransferSrcAccessOrder + 2, // BBlockTransferSrcVectorDim + BBLOCK_TRANSFER_SSPV, // 
BBlockTransferSrcScalarPerVector + BBLOCK_TRANSFER_SSPV_K1, // BBlockTransferDstScalarPerVector_BK1 + 0, // BBlockLdsAddExtraN + CSHUFFLE_MXDL_PWPS, // CShuffleMXdlPerWavePerShuffle + CSHUFFLE_NXDL_PWPS, // CShuffleNXdlPerWavePerShuffle + CSHUFFLEBLOCK_TRANSFER, // CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock + CDESHUFFLEBLOCK_TRANSFER, // CDEShuffleBlockTransferScalarPerVectors + LOOP_SCHED, // BlockGemmPipelineScheduler + PIPELINE_VERSION // BlockGemmPipelineVersion + >{}; + auto invoker = gemm.MakeInvoker(); + auto argument = gemm.MakeArgument( + b, // A and B are swapped for CK + a, + {}, + c, + n, + m, + k, + num_batches, + ldb, + lda, + {}, + ldc, + n * k, // batch_stride_a + m * k, // batch_stride_b + {}, + m * n, // batch_stride_c + a_element_op, + b_element_op, + cde_element_op + ); + TORCH_CHECK(gemm.IsSupportedArgument(argument), "wrong! device_gemm with the specified compilation parameters does not support this GEMM problem"); + auto stream = at::cuda::getCurrentHIPStream().stream(); + invoker.Run(argument, StreamConfig{stream, false}); +} + +}; // namespace at::native + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/hip/ck_bgemm.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/hip/ck_bgemm.h new file mode 100644 index 0000000000000000000000000000000000000000..60ebc92c5d6243fc700be875fbbe05ba44ff0970 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/hip/ck_bgemm.h @@ -0,0 +1,21 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include +#include + +namespace at::native { + +template +inline void bgemm_internal_ck(CUDABLAS_BGEMM_ARGTYPES_AND_C_DTYPE(Dtype, C_Dtype)) { + static_assert(false&&sizeof(Dtype),"at::cuda::blas_bgemm_internal_ck: not implemented"); +} + +template <> +void bgemm_internal_ck(CUDABLAS_BGEMM_ARGTYPES(at::BFloat16)); + +} // namespace at::native + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/hip/ck_gemm.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/hip/ck_gemm.h new file mode 100644 index 0000000000000000000000000000000000000000..a20611c638bf9322a1a423a13ac6b43ee476ba63 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/hip/ck_gemm.h @@ -0,0 +1,30 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include +#include +namespace at::native { + + +template +inline void gemm_internal_ck(CUDABLAS_GEMM_ARGTYPES(Dtype)) { + static_assert(false&&sizeof(Dtype),"at::cuda::blas_gemm_internal_ck: not implemented"); +} + +#if defined(USE_ROCM) && defined(USE_ROCM_CK_GEMM) +template <> +void gemm_internal_ck(CUDABLAS_GEMM_ARGTYPES(double)); +template <> +void gemm_internal_ck(CUDABLAS_GEMM_ARGTYPES(float)); +template <> +void gemm_internal_ck(CUDABLAS_GEMM_ARGTYPES(at::Half)); +template <> +void gemm_internal_ck(CUDABLAS_GEMM_ARGTYPES(at::BFloat16)); +#endif + + +} // namespace at::native + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/hip/ck_gemm_template.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/hip/ck_gemm_template.h new file mode 100644 index 0000000000000000000000000000000000000000..e6c2f52e7b518c581fdbb66f68d179d09a5910ca --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/hip/ck_gemm_template.h @@ -0,0 +1,413 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +/* + * Copyright (c) Meta Platforms, Inc. 
and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ +#undef __HIP_NO_HALF_CONVERSIONS__ +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +// Define commonly used types. +template +using S = ck::Sequence; + +using Row = ck::tensor_layout::gemm::RowMajor; +using Col = ck::tensor_layout::gemm::ColumnMajor; +using PassThrough = ck::tensor_operation::element_wise::PassThrough; + +namespace at::native { + +// Elementwise Operators +struct AlphaBetaAdd +{ + AlphaBetaAdd(float alpha, float beta) : alpha_(alpha), beta_(beta){}; + + template + __host__ __device__ constexpr void operator()(C& c, const AB& ab) const; + + template<> + __host__ __device__ constexpr void operator() + (float& c, const float& ab) const + { + c = alpha_ * ab; + }; + + template<> + __host__ __device__ constexpr void operator() + (ck::bhalf_t& c, const ck::bhalf_t& ab) const + { + c = alpha_ * ab; + }; + + template<> + __host__ __device__ constexpr void operator() + (ck::half_t& c, const ck::half_t& ab) const + { + c = alpha_ * ab; + }; + + float alpha_; + // TODO: Leaving for now, will use later + float beta_; +}; + +template < + typename Dtype, + int BLOCK_SIZE, + int MBLOCK, + int NBLOCK, + int KBLOCK, + int AK1, + int BK1, + int MPER_XDL, + int NPER_XDL, + int MPER_WAVE, + int NPER_WAVE, + typename ABLOCK_CLUSTER_LENS, + typename ABLOCK_CLUSTER_ORDER, + typename ABLOCK_SRC_ORDER, + int ABLOCK_VECTOR_DIM, + int ABLOCK_SCALAR_VEC, + int ABLOCK_SCALAR_VEC_AK1, + bool ABLOCK_LDS_EXTRAM, + typename BBLOCK_CLUSTER_LENS, + typename BBLOCK_CLUSTER_ORDER, + typename BBLOCK_SRC_ORDER, + int BBLOCK_VECTOR_DIM, + int BBLOCK_SCALAR_VEC, + int BBLOCK_SCALAR_VEC_AK1, + bool BBLOCK_LDS_EXTRAN, + int 
CMPER_WAVE, + int CNPER_WAVE, + typename BLOCK_CLUSTER_LENS, + typename CDE_SCALAR_VEC, + bool PADDING = false, + bool TRANSA = false, + bool TRANSB = false> +void gemm_impl(CUDABLAS_GEMM_ARGTYPES(Dtype)) { + // Get input information. + int M = m; + int N = n; + int K = k; + + int StrideA = lda; + int StrideB = ldb; + int StrideC = ldc; + + int KBatch = 1; + + float falpha = alpha; + float fbeta = beta; + + using ADataType = typename CkMathType::dtype; + using BDataType = typename CkMathType::dtype; + using CDataType = typename CkMathType::dtype; + using DDataType = typename CkMathType::dtype; + + using AccDataType = float; + using CShuffleDataType = typename CkMathType::dtype; + + using ALayout = typename CkTensorLayout::a_layout; + using BLayout = typename CkTensorLayout::b_layout; + + using DLayout = Row; + using CLayout = Row; + + using AElementOp = PassThrough; + using BElementOp = PassThrough; + using CElementOp = AlphaBetaAdd; + + + static constexpr auto GemmDefault = + ck::tensor_operation::device::GemmSpecialization::Default; + static constexpr auto GemmMNKPadding = + ck::tensor_operation::device::GemmSpecialization::MNKPadding; + static constexpr auto GemmSpec = PADDING ? 
GemmMNKPadding : GemmDefault; + + + using DeviceGemmInstance = + ck::tensor_operation::device::DeviceGemmMultiD_Xdl_CShuffle_V3, + CLayout, + ADataType, + BDataType, + ck::Tuple<>, + CDataType, + AccDataType, + CShuffleDataType, + AElementOp, + BElementOp, + CElementOp, + GemmSpec, + BLOCK_SIZE, + MBLOCK, + NBLOCK, + KBLOCK, + AK1, + BK1, + MPER_XDL, + NPER_XDL, + MPER_WAVE, + NPER_WAVE, + ABLOCK_CLUSTER_LENS, + ABLOCK_CLUSTER_ORDER, + ABLOCK_SRC_ORDER, + ABLOCK_VECTOR_DIM, + ABLOCK_SCALAR_VEC, + ABLOCK_SCALAR_VEC_AK1, + ABLOCK_LDS_EXTRAM, + BBLOCK_CLUSTER_LENS, + BBLOCK_CLUSTER_ORDER, + BBLOCK_SRC_ORDER, + BBLOCK_VECTOR_DIM, + BBLOCK_SCALAR_VEC, + BBLOCK_SCALAR_VEC_AK1, + BBLOCK_LDS_EXTRAN, + CMPER_WAVE, + CNPER_WAVE, + BLOCK_CLUSTER_LENS, + CDE_SCALAR_VEC>; + + + auto gemm = DeviceGemmInstance{}; + auto invoker = gemm.MakeInvoker(); + + auto a_element_op = AElementOp{}; + auto b_element_op = BElementOp{}; + auto c_element_op = CElementOp{alpha, beta}; + + + using DDataArrayType = std::array; + DDataArrayType DDataArray; + + // We swap A and B inputs here as a temporary workaround + auto argument = gemm.MakeArgument( + reinterpret_cast(b), + reinterpret_cast(a), + DDataArray, + reinterpret_cast(c), + N, + M, + K, + StrideB, + StrideA, + std::array{}, + StrideC, + KBatch, + a_element_op, + b_element_op, + c_element_op); + + + TORCH_CHECK(gemm.IsSupportedArgument(argument), "wrong! 
device_gemm with the specified compilation parameters does not support this GEMM problem"); + + + auto stream = at::cuda::getCurrentHIPStream().stream(); + invoker.Run(argument, StreamConfig{stream, false}); +} + + +template < + typename Dtype, + int BLOCK_SIZE, + int MBLOCK, + int NBLOCK, + int KBLOCK, + int K1, + int MPER_WMMA, + int NPER_WMMA, + int MPER_WAVE, + int NPER_WAVE, + typename ABLOCK_CLUSTER_LENS, + typename ABLOCK_CLUSTER_ORDER, + typename ABLOCK_SRC_ORDER, + int ABLOCK_VECTOR_DIM, + int ABLOCK_SCALAR_VEC, + int ABLOCK_SCALAR_VEC_K1, + bool ABLOCK_LDS_EXTRAM, + typename BBLOCK_CLUSTER_LENS, + typename BBLOCK_CLUSTER_ORDER, + typename BBLOCK_SRC_ORDER, + int BBLOCK_VECTOR_DIM, + int BBLOCK_SCALAR_VEC, + int BBLOCK_SCALAR_VEC_AK1, + bool BBLOCK_LDS_EXTRAN, + int CMPER_WAVE, + int CNPER_WAVE, + typename CBLOCK_CLUSTER_LENS, + int CNPER_BLOCK, + bool PADDING = false, + bool TRANSA = false, + bool TRANSB = false> +void gemm_impl_wmma(CUDABLAS_GEMM_ARGTYPES(Dtype)) { + // Get input information. + int M = m; + int N = n; + int K = k; + + int StrideA = lda; + int StrideB = ldb; + int StrideC = ldc; + + int KBatch = 1; + + float falpha = alpha; + float fbeta = beta; + + using ADataType = typename CkMathType::dtype; + using BDataType = typename CkMathType::dtype; + using CDataType = typename CkMathType::dtype; + using DDataType = typename CkMathType::dtype; + + using AccDataType = float; + using CShuffleDataType = typename CkMathType::dtype; + + using ALayout = typename CkTensorLayout::a_layout; + using BLayout = typename CkTensorLayout::b_layout; + + using DLayout = Row; + using CLayout = Row; + + using AElementOp = PassThrough; + using BElementOp = PassThrough; + using CElementOp = PassThrough; + + + static constexpr auto GemmDefault = + ck::tensor_operation::device::GemmSpecialization::Default; + static constexpr auto GemmMNKPadding = + ck::tensor_operation::device::GemmSpecialization::MNKPadding; + static constexpr auto GemmSpec = PADDING ? 
GemmMNKPadding : GemmDefault; + + + using DeviceGemmInstance = + ck::tensor_operation::device::DeviceGemmWmma_CShuffle; + + auto gemm = DeviceGemmInstance{}; + auto invoker = gemm.MakeInvoker(); + + auto a_element_op = AElementOp{}; + auto b_element_op = BElementOp{}; + auto c_element_op = CElementOp{}; + + + using DDataArrayType = std::array; + DDataArrayType DDataArray; + + // We swap A and B inputs here as a temporary workaround + auto argument = gemm.MakeArgument( + reinterpret_cast(b), + reinterpret_cast(a), + reinterpret_cast(c), + N, + M, + K, + StrideB, + StrideA, + StrideC, + b_element_op, + a_element_op, + c_element_op); + + + if(!gemm.IsSupportedArgument(argument)) + { + printf("error shape = %ld %ld %ld TRANSA=%d TRANSB=%d \n", + n, m, k,TRANSA, TRANSB); + TORCH_CHECK(false, "wrong! device_gemm with the specified compilation parameters does not support this GEMM problem"); + } + + + auto stream = at::cuda::getCurrentHIPStream().stream(); +#if 1 + invoker.Run(argument, StreamConfig{stream, false}); +#else + float ave_time = invoker.Run(argument, StreamConfig{stream, true}); + std::size_t flop = std::size_t(2) * M * N * K; + + std::size_t num_btype = + sizeof(ADataType) * M * K + sizeof(BDataType) * K * N + sizeof(CDataType) * M * N; + + float tflops = static_cast(flop) / 1.E9 / ave_time; + + float gb_per_sec = num_btype / 1.E6 / ave_time; + + std::cout << "Perf: " << std::setw(10) << ave_time << " ms, " << tflops << " TFlops, " + << gb_per_sec << " GB/s, " << N <<" " < +#include +#include + +namespace at { +namespace hip { +namespace detail { +void group_gemm_ck( + const at::Tensor& mat_a, + const at::Tensor& mat_b, + const std::optional& offs, + const std::optional& bias, + at::Tensor& out); + +} // namespace detail +} // namespace hip +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/hip/ck_types.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/hip/ck_types.h new file mode 100644 index 0000000000000000000000000000000000000000..f3ea17c0b3089705ee3799f5b9e66f271fb4e9f0 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/hip/ck_types.h @@ -0,0 +1,80 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +// work around CK assuming only a single FP8 interpretation at a time +#if(defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__)) && __HIP_DEVICE_COMPILE__ +#define CK_USE_FNUZ_FP8 1 +#undef CK_USE_OCP_FP8 +#elif __HIP_DEVICE_COMPILE__ +#undef CK_USE_FNUZ_FP8 +#define CK_USE_OCP_FP8 1 +#endif + +#include +#include +#include +#include + +using Row = ck::tensor_layout::gemm::RowMajor; +using Col = ck::tensor_layout::gemm::ColumnMajor; + +namespace at::native { + +template +struct CkMathType { + using dtype = T; +}; + +template <> +struct CkMathType { + using dtype = ck::bhalf_t; +}; + +template <> +struct CkMathType { + using dtype = ck::half_t; +}; + +template +struct CkTensorLayout { + // default goes to row-wise for now + using a_layout = Row; + using b_layout = Row; +}; + +// True denotes transpose is necessary. 
Default is Col, so return Row +template <> +struct CkTensorLayout { + using a_layout = Col; + using b_layout = Col; +}; + +template <> +struct CkTensorLayout { + using a_layout = Row; + using b_layout = Col; +}; + +template <> +struct CkTensorLayout { + using a_layout = Col; + using b_layout = Row; +}; + +template <> +struct CkTensorLayout { + using a_layout = Row; + using b_layout = Row; +}; + +} // namespace at::native + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/kleidiai/kai_kernels.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/kleidiai/kai_kernels.h new file mode 100644 index 0000000000000000000000000000000000000000..27c797343b6563639c5ec82b95f5e82b10d7efc9 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/kleidiai/kai_kernels.h @@ -0,0 +1,48 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +#include +#include +#if AT_KLEIDIAI_ENABLED() + +namespace at::native::kleidiai { + +/** + * @brief Rearranges the quantized weight to support kleidiai inference + * @param bl Groupsize for quantization should be multiple of 32 + */ +void kai_pack_int4_rhs( + const Tensor& weight_packed, + const Tensor& weight, + const Tensor& scales, + const std::optional& bias, + const int64_t n, + const int64_t k, + const int64_t bl); + +/** + * @brief Outputs the buffer size for the packed weights + * @param bl Groupsize for quantization. 
32 for groupwise , 0 for channelwise + */ +size_t kai_pack_rhs_int4_size( + const int64_t n, + const int64_t k, + const int64_t bl, + at::ScalarType tensor_dtype = at::kFloat); + +/** + * @brief Run 2 operations ( Input quantize and pack -> 4 bit Matmul ) + */ +void kai_quant_pack_lhs_int4_mm( + const Tensor& output, + const Tensor& input, + const Tensor& weight, + const int64_t m, + const int64_t n, + const int64_t k, + const int64_t bl); +} // namespace at::native::kleidiai +#endif + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/kleidiai/kai_pack.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/kleidiai/kai_pack.h new file mode 100644 index 0000000000000000000000000000000000000000..30910a6add13b5f452e2f457fe3346bdf34c6014 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/kleidiai/kai_pack.h @@ -0,0 +1,114 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +#include +#include +#include +#include +#if AT_KLEIDIAI_ENABLED() + +namespace at::native::kleidiai { + +template +void kai_pack_rhs_groupwise_int4( + T& kernel, + const Tensor& weight_packed, + const Tensor& weight, + const Tensor& scales, + const std::optional& bias, + const int64_t n, + const int64_t k, + const int64_t bl, + const int64_t rhs_stride, + const int64_t scale_stride) { + const auto& ukernel = kernel.ukernel; + const size_t nr = ukernel.get_nr(); + const size_t kr = ukernel.get_kr(); + const size_t sr = ukernel.get_sr(); + auto weight_packed_data = + reinterpret_cast(weight_packed.data_ptr()); + const auto weight_data = weight.data_ptr(); + auto scales_data = scales.const_data_ptr(); + + if (weight_data == nullptr) { + 
AT_ERROR("kai_pack_rhs_channelwise_int4: Weight data pointer is null"); + } + + if (scales_data == nullptr) { + AT_ERROR("kai_pack_rhs_channelwise_int4: Scales data pointer is null"); + } + + float* bias_ptr = + bias.has_value() ? bias.value().to(kFloat).data_ptr() : NULL; + auto& params = kernel.rhs_pack_params; + + kernel.kai_run_rhs_pack( + /*num_groups=*/1, + n, + k, + nr, + kr, + sr, + bl, + (const uint8_t*)(weight_data), + rhs_stride, + bias_ptr, + scales_data, + scale_stride, + weight_packed_data, + 0, + ¶ms); +} + +template +void kai_pack_rhs_channelwise_int4( + T& kernel, + const Tensor& weight_packed, + const Tensor& weight, + const Tensor& scales, + const std::optional& bias, + const int64_t n, + const int64_t k) { + const auto& ukernel = kernel.ukernel; + const size_t nr = ukernel.get_nr(); + const size_t kr = ukernel.get_kr(); + const size_t sr = ukernel.get_sr(); + auto weight_packed_data = + reinterpret_cast(weight_packed.data_ptr()); + const auto weight_data = weight.data_ptr(); + + const auto scales_data = scales.to(kFloat).data_ptr(); + + if (weight_data == nullptr) { + AT_ERROR("kai_pack_rhs_channelwise_int4: Weight data pointer is null"); + } + + if (scales_data == nullptr) { + AT_ERROR("kai_pack_rhs_channelwise_int4: Scales data pointer is null"); + } + + float* bias_ptr = + bias.has_value() ? bias.value().to(kFloat).data_ptr() : NULL; + auto& params = kernel.rhs_pack_params; + + kernel.kai_run_rhs_pack( + /*num_groups=*/1, + n, + k, + nr, + kr, + sr, + (const uint8_t*)(weight_data), + (const float*)(bias_ptr), + (const float*)(scales_data), + weight_packed_data, + 0, + ¶ms); +} + +} // namespace at::native::kleidiai + +#endif + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/kleidiai/kai_ukernel_interface.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/kleidiai/kai_ukernel_interface.h new file mode 100644 index 0000000000000000000000000000000000000000..3def42aa5876304b811616cf76cd8299f446fd15 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/kleidiai/kai_ukernel_interface.h @@ -0,0 +1,226 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +#include +#if AT_KLEIDIAI_ENABLED() +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace at::native::kleidiai { + +enum class kai_kernel_id { + // FP32 inputs, 4-bit weights, FP32 output + matmul_clamp_f32_qai8dxp1x8_qsi4c32p8x8_1x8x32_neon_dotprod = + 0, // Groupwise 4-bit GEMV (per-group scales, NEON DOTPROD) + matmul_clamp_f32_qai8dxp4x8_qsi4c32p4x8_4x8x32_neon_i8mm = + 1, // Groupwise 4-bit GEMM (per-group scales, NEON I8MM) + matmul_clamp_f32_qai8dxp1x8_qsi4cxp8x8_1x8x32_neon_dotprod = + 2, // Channelwise 4-bit GEMV (per-channel scales, NEON DOTPROD) + matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_8x8x32_neon_i8mm = + 3, // Channelwise 4-bit GEMM (per-channel scales, NEON I8MM) + + // BF16 inputs, 4-bit weights, BF16 output + matmul_clamp_bf16_qai8dxp1x8_qsi4cxp8x8_1x8_neon_dotprod = + 4, // Channelwise 4-bit GEMV with BF16 input/output + matmul_clamp_bf16_qai8dxp4x8_qsi4cxp8x8_8x8_neon_i8mm = + 5 // Channelwise 4-bit GEMM with BF16 input/output +}; + +// Channelwise Kernel mapping +struct kai_matmul_ukernel_f32_qa8dxp_qs4cxp { + struct kai_matmul_clamp_f32_qai8dxp_qsi4cxp_ukernel ukernel; + struct kai_rhs_pack_nxk_qsi4cxp_qs4cxs1s0_params rhs_pack_params; + size_t (*kai_get_lhs_packed_size)( + 
size_t m, + size_t k, + size_t mr, + size_t kr, + size_t sr); + size_t (*kai_get_rhs_packed_size)( + size_t n, + size_t k, + size_t nr, + size_t kr, + size_t sr); + void (*kai_run_lhs_quant_pack)( + size_t m, + size_t k, + size_t mr, + size_t kr, + size_t sr, + size_t m_idx_start, + const float* lhs, + size_t lhs_stride, + void* lhs_packed); + void (*kai_run_rhs_pack)( + size_t num_groups, + size_t n, + size_t k, + size_t nr, + size_t kr, + size_t sr, + const uint8_t* rhs, + const float* bias, + const float* scale, + void* rhs_packed, + size_t extra_bytes, + const struct kai_rhs_pack_nxk_qsi4cxp_qs4cxs1s0_params* params); + size_t(*kai_get_lhs_quant_pack_offset)( + size_t m_idx, size_t k, size_t mr, size_t kr, size_t sr + ); + + kai_matmul_ukernel_f32_qa8dxp_qs4cxp( + const kai_matmul_clamp_f32_qai8dxp_qsi4cxp_ukernel& kernel) + : ukernel(kernel), + kai_get_lhs_packed_size( + &kai_get_lhs_packed_size_lhs_quant_pack_qai8dxp_f32), + kai_get_rhs_packed_size( + &kai_get_rhs_packed_size_rhs_pack_nxk_qsi4cxp_qs4cxs1s0), + kai_run_lhs_quant_pack(&kai_run_lhs_quant_pack_qai8dxp_f32), + kai_run_rhs_pack(&kai_run_rhs_pack_nxk_qsi4cxp_qs4cxs1s0), + kai_get_lhs_quant_pack_offset(&kai_get_lhs_packed_offset_lhs_quant_pack_qai8dxp_f32){} +}; + +struct kai_matmul_ukernel_f32_qa8dxp_qs4cxp +kai_select_channelwise_matmul_ukernel(const kai_kernel_id id); + +// bf16 Channelwise Kernel mapping +struct kai_matmul_ukernel_bf16_qa8dxp_qs4cxp { + struct kai_matmul_clamp_bf16_qai8dxp_qsi4cxp_ukernel ukernel; + struct kai_rhs_pack_nxk_qsi4cxp_qs4cxs1s0_params rhs_pack_params; + size_t (*kai_get_lhs_packed_size)( + size_t m, + size_t k, + size_t mr, + size_t kr, + size_t sr); + size_t (*kai_get_rhs_packed_size)( + size_t n, + size_t k, + size_t nr, + size_t kr, + size_t sr); + void (*kai_run_lhs_quant_pack)( + size_t m, + size_t k, + size_t mr, + size_t kr, + size_t sr, + size_t m_idx_start, + const void* lhs, + size_t lhs_stride, + void* lhs_packed); + void (*kai_run_rhs_pack)( + size_t 
num_groups, + size_t n, + size_t k, + size_t nr, + size_t kr, + size_t sr, + const uint8_t* rhs, + const float* bias, + const float* scale, + void* rhs_packed, + size_t extra_bytes, + const struct kai_rhs_pack_nxk_qsi4cxp_qs4cxs1s0_params* params); + size_t(*kai_get_lhs_quant_pack_offset)( + size_t m_idx, size_t k, size_t mr, size_t kr, size_t sr + ); + + kai_matmul_ukernel_bf16_qa8dxp_qs4cxp( + const kai_matmul_clamp_bf16_qai8dxp_qsi4cxp_ukernel& kernel) + : ukernel(kernel), + kai_get_lhs_packed_size( + &kai_get_lhs_packed_size_lhs_quant_pack_qai8dxp_bf16_neon), + kai_get_rhs_packed_size( + &kai_get_rhs_packed_size_rhs_pack_nxk_qsi4cxp_qs4cxs1s0), + kai_run_lhs_quant_pack(&kai_run_lhs_quant_pack_qai8dxp_bf16_neon), + kai_run_rhs_pack(&kai_run_rhs_pack_nxk_qsi4cxp_qs4cxs1s0), + kai_get_lhs_quant_pack_offset(&kai_get_lhs_packed_offset_lhs_quant_pack_qai8dxp_bf16_neon){} + }; + +struct kai_matmul_ukernel_bf16_qa8dxp_qs4cxp +kai_select_bf16_channelwise_matmul_ukernel(const kai_kernel_id id); + +// Groupwise Kernel mapping +struct kai_matmul_ukernel_f32_qa8dxp_qs4c32p { + struct kai_matmul_clamp_f32_qai8dxp_qsi4c32p_ukernel ukernel; + struct kai_rhs_pack_nxk_qsi4c32p_qsu4c32s1s0_params rhs_pack_params; + size_t (*kai_get_lhs_packed_size)( + size_t m, + size_t k, + size_t mr, + size_t kr, + size_t sr); + size_t (*kai_get_rhs_packed_size)( + size_t n, + size_t k, + size_t nr, + size_t kr, + size_t sr, + size_t bl, + enum kai_datatype scale_dt); + void (*kai_run_lhs_quant_pack)( + size_t m, + size_t k, + size_t mr, + size_t kr, + size_t sr, + size_t m_idx_start, + const float* lhs, + size_t lhs_stride, + void* lhs_packed); + void (*kai_run_rhs_pack)( + size_t num_groups, + size_t n, + size_t k, + size_t nr, + size_t kr, + size_t sr, + size_t bl, + const uint8_t* rhs, + size_t rhs_stride, + const float* bias, + const void* scale, + size_t scale_stride, + void* rhs_packed, + size_t extra_bytes, + const struct kai_rhs_pack_nxk_qsi4c32p_qsu4c32s1s0_params* params); + 
size_t(*kai_get_lhs_quant_pack_offset)( + size_t m_idx, size_t k, size_t mr, size_t kr, size_t sr + ); + + kai_matmul_ukernel_f32_qa8dxp_qs4c32p( + const kai_matmul_clamp_f32_qai8dxp_qsi4c32p_ukernel& kernel) + : ukernel(kernel), + kai_get_lhs_packed_size( + &kai_get_lhs_packed_size_lhs_quant_pack_qai8dxp_f32), + kai_get_rhs_packed_size( + &kai_get_rhs_packed_size_rhs_pack_nxk_qsi4c32p_qsu4c32s1s0), + kai_run_lhs_quant_pack(&kai_run_lhs_quant_pack_qai8dxp_f32), + kai_run_rhs_pack(&kai_run_rhs_pack_nxk_qsi4c32p_qsu4c32s1s0), + kai_get_lhs_quant_pack_offset(&kai_get_lhs_packed_offset_lhs_quant_pack_qai8dxp_f32) {} +}; + +struct kai_matmul_ukernel_f32_qa8dxp_qs4c32p kai_select_groupwise_matmul_ukernel( + const kai_kernel_id id); + +} // namespace at::native::kleidiai +#endif + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/mkldnn/xpu/Conv.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/mkldnn/xpu/Conv.h new file mode 100644 index 0000000000000000000000000000000000000000..81ede7e28c9e451b9d39edb0054244c8b3bec5ad --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/mkldnn/xpu/Conv.h @@ -0,0 +1,59 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include +#include +#include + +#if AT_MKLDNN_ENABLED() + +namespace at::native::xpu { +C10_API Tensor convolution_pointwise( + const Tensor& input_t, + const Tensor& weight_t, + const std::optional& bias_opt, + IntArrayRef padding, + IntArrayRef stride, + IntArrayRef dilation, + int64_t groups, + std::string_view attr, + torch::List> scalars, + std::optional algorithm); + +C10_API Tensor convolution_pointwise_binary( + const Tensor& input_t, + const Tensor& 
other_t, + const Tensor& weight_t, + const std::optional& bias_opt, + IntArrayRef padding, + IntArrayRef stride, + IntArrayRef dilation, + int64_t groups, + std::string_view binary_attr, + std::optional alpha, + std::optional unary_attr, + torch::List> unary_scalars, + std::optional unary_algorithm); + +C10_API Tensor& convolution_pointwise_binary_( + Tensor& other_t, + const Tensor& input_t, + const Tensor& weight_t, + const std::optional& bias_opt, + IntArrayRef padding, + IntArrayRef stride, + IntArrayRef dilation, + int64_t groups, + std::string_view binary_attr, + std::optional alpha, + std::optional unary_attr, + torch::List> unary_scalars, + std::optional unary_algorithm); + +} // namespace at::native::xpu + +#endif // AT_MKLDNN_ENABLED() + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/mkldnn/xpu/FusionUtils.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/mkldnn/xpu/FusionUtils.h new file mode 100644 index 0000000000000000000000000000000000000000..77cd91aadda0ab29144b7448a4790ae285bb9fda --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/mkldnn/xpu/FusionUtils.h @@ -0,0 +1,58 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +#include + +// +// This header file provides utility functions for constructing and managing +// oneDNN attributes used in fusion operations on XPU devices. These utilities +// include functions for creating unary and binary post-operations attributes, +// as well as mapping string representations of operations to oneDNN attributes. 
+// + +namespace at::native::xpu { +at::native::onednn::Attr& unary_attr_with_arg( + onednn::Attr& attr, + std::string_view unary, + torch::List> scalars, + std::optional algorithm); + +at::native::onednn::Attr& string_to_unary_attr( + onednn::Attr& attr, + std::string_view unary); + +at::native::onednn::Attr& construct_unary_attr( + onednn::Attr& attr, + std::string_view unary, + torch::List> scalars, + std::optional algorithm); + +template +onednn::Attr& construct_binary_attr( + onednn::Attr& attr, + std::string_view binary, + const Tensor& other) { + if (binary == "mul") { + attr.append_post_binary(attr.kind_with_binary_mul, other); + } else if (binary == "sub") { + attr.append_post_binary(attr.kind_with_binary_sub, other); + } else if (binary == "div") { + attr.append_post_binary(attr.kind_with_binary_div, other); + } else if (binary == "add") { + attr.append_post_binary(attr.kind_with_binary_add, other); + } else if (binary == "sum") { + attr.append_post_sum(1.f, 1.f, 0); + } else { + TORCH_CHECK( + binary == "none", + "Binary attr ", + binary, + "is not supported for conv/linear post binary fusion"); + } + return attr; +} + +} // namespace at::native::xpu + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/mkldnn/xpu/detail/Attr.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/mkldnn/xpu/detail/Attr.h new file mode 100644 index 0000000000000000000000000000000000000000..dba538396fe3b1c6ae6af47e8cc7ba5368c910a3 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/mkldnn/xpu/detail/Attr.h @@ -0,0 +1,468 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include +#include +#include +#include +#include + +namespace at::native::onednn { +/* oneDNN quantization usage: + https://oneapi-src.github.io/oneDNN/dev_guide_attributes_quantization.html# + + src_fp32 = scale_src * (src_int8 - zero_point) + wei_fp32 = scale_wei * (wei_int8 - zero_point) + dst_fp32 = scale_dst * (dst_int8 - zero_point) + fp32 Convolution: dst_fp32 = src_fp32 * wei_fp32 + Int8 Convolution: dst_fp32 = (src_int8 * wei_int8) * (scale_src * scale_wei) + Int8 Convolution: dst_int8 = 1 / scale_dst * dst_fp32; + + Considering zero-point (asymmetric): + dst_fp32 = (src_int8 - src_zp) * src_sc * wei_int8 * wei_sc + dst_sc * (dst_int8 - dst_zp) = (src_int8 - src_zp) * wei_int8 * src_sc * + wei_sc + dst_int8 = (src_int8 - src_zp) * wei_int8 * src_sc * wei_sc / dst_sc + + dst_zp + + considering bias: + fp32 Convolution: dst_fp32 = src_fp32 * wei_fp32 + bias + Int8 Convolution: dst_fp32 = (src_int8 * wei_int8) * (scale_src * scale_wei) + + bias Int8 Convolution: dst_fp32 = (src_int8 * wei_int8 + bias/(scale_src * + scale_wei)) * (scale_src * scale_wei) Int8 Convolution: dst_int8 = 1 / + scale_dst * dst_fp32; +*/ + +/* + oneDNN postops usage: + Currently, oneDNN supports 5 kinds of post ops. More details can be referred +to oneDNN doc. 
+ https://oneapi-src.github.io/oneDNN/dev_guide_attributes_post_ops.html#doxid-dev-guide-attributes-post-ops-1dev-guide-attributes-post-ops-eltwise + +0. without post ops + dst = Conv(src, wei) + bias; + dst_int8 = 1/q_scale * dst; q_scale is the op output quantization scale + fp32 API: Attr attr; + int8 API: Attr attr(q_scale); + +1. append eltwise post op + dst = elt_scale * Eltwise{conv_scale * [Conv(src, wei) + bias], alpha, beta} + dst_int8 = 1/q_scale * dst; + fp32 API: + Attr attr; + attr.append_post_eltwise(1.f, conv_scale, 0.f, kind_with_linear) + attr.append_post_eltwise(elt_scale, alpha, beta, eltwise_algorithm) + int8 API: + Attr attr(q_scale); + attr.append_post_eltwise(1.f, conv_scale, 0.f, kind_with_linear) + attr.append_post_eltwise(elt_scale, alpha, beta, eltwise_algorithm) + +2. append sum post op + dst = conv_scale * Conv(src, wei) + sum_scale * (dst - zp) + dst_int8 = 1/q_scale * dst; + fp32 API: + Attr attr; + attr.append_post_eltwise(1.f, conv_scale, 0.f, kind_with_linear) + attr.append_post_sum(sum_scale) + int8 API: + Attr attr(q_scale); + attr.append_post_eltwise(1.f, conv_scale, 0.f, kind_with_linear) + attr.append_post_sum(sum_scale) + +3. 
append binary post op + dst = Binary[Conv(src, wei)] + +*/ +using kind_t = dnnl::primitive::kind; +struct PostOpParam { + // eltwise post op constructor + PostOpParam( + float scale, + float alpha, + float beta, + dnnl::algorithm algo, + kind_t kind) + : scale_(scale), alpha_(alpha), beta_(beta), algo_(algo), kind_(kind) {} + // sum post op constructor + PostOpParam(float scale, kind_t kind) : scale_(scale), kind_(kind) {} + // sum post op with zp + PostOpParam(float scale, int64_t zero_point, kind_t kind) + : scale_(scale), zero_point_(zero_point), kind_(kind) {} + // binary post op constructor + PostOpParam( + at::Tensor& binary, + dnnl::memory::desc& binary_md, + dnnl::memory::desc& expected_md, + dnnl::algorithm algo, + kind_t kind) + : binary_(binary), + meta_(binary_md), + expected_meta_(expected_md), + algo_(algo), + kind_(kind) {} + // prelu post op constructor + PostOpParam(int mask, kind_t kind) : mask_(mask), kind_(kind) {} + + // post sum or binary with scale post op constructor + PostOpParam( + at::Tensor& binary, + float scale, + dnnl::algorithm algo, + kind_t kind) + : scale_(scale), binary_(binary), algo_(algo), kind_(kind) {} + + // for int8 sum/eltwise + float scale_ = 1.0; + int64_t zero_point_ = 0; + // for eltwise + float alpha_ = 0.0; + float beta_ = 0.0; + // for binary + at::Tensor binary_ = at::Tensor(); + at::Tensor expected_binary_ = at::Tensor(); + void* binary_ptr_ = nullptr; + dnnl::memory::desc meta_ = dnnl::memory::desc(); + dnnl::memory::desc expected_meta_ = dnnl::memory::desc(); + // for prelu + int mask_ = 0; + // common + dnnl::algorithm algo_ = dnnl::algorithm::eltwise_relu; + kind_t kind_ = kind_t::eltwise; +}; + +class Attr { + public: + Attr() : q_scale_(1.f) {} + Attr(float q_scale, int64_t zp = 0) : q_scale_(q_scale), q_zero_point_(zp) {} + + /***** eltwise *****/ + dnnl::algorithm kind_with_relu = dnnl::algorithm::eltwise_relu; + dnnl::algorithm kind_with_sigmoid = dnnl::algorithm::eltwise_logistic; + dnnl::algorithm 
kind_with_gelu_tanh = dnnl::algorithm::eltwise_gelu_tanh; + dnnl::algorithm kind_with_gelu_erf = dnnl::algorithm::eltwise_gelu_erf; + dnnl::algorithm kind_with_mish = dnnl::algorithm::eltwise_mish; + dnnl::algorithm kind_with_linear = dnnl::algorithm::eltwise_linear; + dnnl::algorithm kind_with_swish = dnnl::algorithm::eltwise_swish; + dnnl::algorithm kind_with_sqrt = dnnl::algorithm::eltwise_sqrt; + dnnl::algorithm kind_with_tanh = dnnl::algorithm::eltwise_tanh; + dnnl::algorithm kind_with_square = dnnl::algorithm::eltwise_square; + dnnl::algorithm kind_with_abs = dnnl::algorithm::eltwise_abs; + dnnl::algorithm kind_with_exp = dnnl::algorithm::eltwise_exp; + dnnl::algorithm kind_with_log = dnnl::algorithm::eltwise_log; + dnnl::algorithm kind_with_round = dnnl::algorithm::eltwise_round; + dnnl::algorithm kind_with_hardswish = dnnl::algorithm::eltwise_hardswish; + dnnl::algorithm kind_with_soft_relu = dnnl::algorithm::eltwise_soft_relu; + dnnl::algorithm kind_with_elu = dnnl::algorithm::eltwise_elu; + dnnl::algorithm kind_with_pow = dnnl::algorithm::eltwise_pow; + dnnl::algorithm kind_with_clip = dnnl::algorithm::eltwise_clip; + // note: hardsigmoid seems oneDNN still not support + dnnl::algorithm kind_with_hardsigmoid = dnnl::algorithm::eltwise_hardsigmoid; + + /***** binary *****/ + dnnl::algorithm kind_with_binary_mul = dnnl::algorithm::binary_mul; + dnnl::algorithm kind_with_binary_add = dnnl::algorithm::binary_add; + dnnl::algorithm kind_with_binary_sub = dnnl::algorithm::binary_sub; + dnnl::algorithm kind_with_binary_div = dnnl::algorithm::binary_div; + dnnl::algorithm kind_with_binary_eq = dnnl::algorithm::binary_eq; + dnnl::algorithm kind_with_binary_ne = dnnl::algorithm::binary_ne; + dnnl::algorithm kind_with_binary_ge = dnnl::algorithm::binary_ge; + dnnl::algorithm kind_with_binary_gt = dnnl::algorithm::binary_gt; + dnnl::algorithm kind_with_binary_le = dnnl::algorithm::binary_le; + dnnl::algorithm kind_with_binary_lt = dnnl::algorithm::binary_lt; + 
dnnl::algorithm kind_with_binary_max = dnnl::algorithm::binary_max; + dnnl::algorithm kind_with_binary_min = dnnl::algorithm::binary_min; + + // append sum post op + Attr& append_post_sum( + float sum_scale, + float sum_q_scale = 1.f, + int64_t zp = 0) { + ops_params_.push_back( + PostOpParam(/*scale_sum*/ sum_scale * sum_q_scale, zp, kind_t::sum)); + return *this; + } + + // append eltwise post op + Attr& append_post_eltwise( + float scale, + float alpha, + float beta, + dnnl::algorithm algo) { + ops_params_.push_back( + PostOpParam(scale, alpha, beta, algo, kind_t::eltwise)); + return *this; + } + + // append binary post op + template + Attr& append_post_binary(dnnl::algorithm algo, const at::Tensor& binary) { + auto binary_ = binary.is_quantized() ? at::dequantize(binary) : binary; + bool binary_is_channels_last = + (binary_.suggest_memory_format() == at::MemoryFormat::ChannelsLast || + binary_.suggest_memory_format() == at::MemoryFormat::ChannelsLast3d); + + if constexpr (!is_matmul) { + binary_ = binary_is_channels_last ? 
binary_ : binary_.contiguous(); + } + dnnl::memory::desc md = get_onednn_md(binary_); + auto expected_md = dnnl::memory::desc( + md.get_dims(), md.get_data_type(), dnnl::memory::format_tag::any); + if constexpr (is_matmul) { + ops_params_.push_back(PostOpParam(binary_, md, md, algo, kind_t::binary)); + } else { + ops_params_.push_back( + PostOpParam(binary_, md, expected_md, algo, kind_t::binary)); + } + + return *this; + } + + Attr& append_scale_binary( + dnnl::algorithm algo, + at::Tensor binary, + float scale, + float sum_q_scale = 1.f, + int64_t zp = 0) { + ops_params_.push_back(PostOpParam( + binary, /*scale_sum*/ scale * sum_q_scale, algo, kind_t::binary)); + return *this; + } + + // append bias with binary_add method (only used for QConv now) + Attr& append_bias(const at::Tensor& binary, const int ndimension) { + // In PyTorch, bias are in shape of [OC], + // we expand its shape according to Conv dimension + // Conv1d [OC, 1, 1], Conv2d [1, OC, 1, ,1], Conv3d [1, OC, 1, 1, 1] + at::Tensor binary_ = binary.contiguous(); + dnnl::memory::desc binary_md; + switch (ndimension) { + case 1: + binary_md = dnnl::memory::desc( + {binary.size(0), 1, 1}, + dnnl::memory::data_type::f32, + dnnl::memory::format_tag::abc); + break; + case 2: + binary_md = dnnl::memory::desc( + {1, binary.size(0), 1, 1}, + dnnl::memory::data_type::f32, + dnnl::memory::format_tag::abcd); + break; + case 3: + binary_md = dnnl::memory::desc( + {1, binary.size(0), 1, 1, 1}, + dnnl::memory::data_type::f32, + dnnl::memory::format_tag::abcde); + break; + default: + TORCH_INTERNAL_ASSERT( + 0, "XPU only supports append_bias for Conv1d, Conv2d and Conv3d."); + } + // In this case, expected_md = binary_md + ops_params_.push_back(PostOpParam( + binary_, binary_md, binary_md, kind_with_binary_add, kind_t::binary)); + return *this; + } + + // append prelu post op + Attr& append_post_prelu(int mask) { + ops_params_.push_back(PostOpParam(mask, kind_t::prelu)); + return *this; + } + + dnnl::post_ops 
extract_post_ops(const at::Tensor& dst) { + // this function is used to extract post ops params from the ops_params_ + // and put them into onednn post ops + for (size_t i = 0; i < ops_params_.size(); ++i) { + kind_t kind = ops_params_[i].kind_; + switch (kind) { + case kind_t::eltwise: { + dnnl::algorithm algo = ops_params_[i].algo_; + float alpha = ops_params_[i].alpha_; + float beta = ops_params_[i].beta_; + dnnl_post_ops_.append_eltwise(algo, alpha, beta); + break; + } + case kind_t::sum: { + float scale = ops_params_[i].scale_; + int64_t zero_point = ops_params_[i].zero_point_; + // TODO [Asymmetric]: + // Post-sum zp for gpu is not supported currently + dnnl_post_ops_.append_sum(scale, zero_point); + break; + } + case kind_t::binary: { + dnnl::algorithm algo = ops_params_[i].algo_; + auto expected_md = ops_params_[i].expected_meta_; + // In this case user may create src1 memory descriptor with + // format_tag::any or set a specific tag. However, in later case if + // tags mismatch with dst, it would result in suboptimal performance. + // So here we use format_tag::any to make sure the fast can be + // selected. + // Thus we use expected_md (with format_any) here to create pd instead + // of original md + dnnl_post_ops_.append_binary(algo, expected_md); + break; + } + default: + break; + } + } + + return dnnl_post_ops_; + } + + bool with_sum() { + for (size_t i = 0; i < ops_params_.size(); ++i) { + if (ops_params_[i].kind_ == kind_t::sum) { + return true; + } + } + return false; + } + + bool with_binary() { + for (size_t i = 0; i < ops_params_.size(); ++i) { + if (ops_params_[i].kind_ == kind_t::binary) { + return true; + } + } + return false; + } + + void construct_post_binary( + dnnl::primitive_desc& pd, + std::unordered_map& args) { + // This function is used to construct binary memory desc in binary post ops. 
+ // According to oneDNN doc, the binary tensor can be in shape of + // [1, 1, 1, 1], tensor broadcast + // [1, C, 1, 1], channel broadcast + // [dst.shape], no broadcast and eltwise-wise binary operations on dst + + auto& engine = GpuEngineManager::Instance().get_engine(); + for (size_t i = 0; i < ops_params_.size(); ++i) { + kind_t kind = ops_params_[i].kind_; + if (kind == kind_t::binary) { + dnnl::memory binary_m; + auto binary = ops_params_[i].binary_; + auto md = ops_params_[i].meta_; + // query expected_md to achieve peak performance + auto expected_md = pd.query_md( + dnnl::query::exec_arg_md, + DNNL_ARG_ATTR_MULTIPLE_POST_OP(i) | DNNL_ARG_SRC_1); + + binary_m = at::native::onednn::make_onednn_memory( + md, engine, binary.data_ptr()); + + args.insert( + {DNNL_ARG_ATTR_MULTIPLE_POST_OP(i) | DNNL_ARG_SRC_1, binary_m}); + } + } + } + + float q_scale_ = 1.0; // the scale used to quantize the fused result from fp32 + // to int8, only works for int8 case + int64_t q_zero_point_ = 0; + std::vector ops_params_; // series of post ops + dnnl::post_ops dnnl_post_ops_; +}; + +static inline void construct_attr_for_unary( + const std::string_view& unary_post_op, + const torch::List>& unary_post_op_args, + const std::string_view& unary_post_op_algorithm, + at::native::onednn::Attr& attr) { + if (unary_post_op == "relu") { + attr = attr.append_post_eltwise( + /* eltwise_scale */ 1.f, + /* alpha */ 0.f, + /* beta */ 0.f, + attr.kind_with_relu); + } else if (unary_post_op == "leaky_relu") { + auto alpha = unary_post_op_args[0].value().to(); + attr = attr.append_post_eltwise(1.0, alpha, 0.f, attr.kind_with_relu); + } else if (unary_post_op == "tanh") { + attr = attr.append_post_eltwise(1.0f, 0.0f, 0.0f, attr.kind_with_tanh); + } else if (unary_post_op == "gelu") { + auto post_algorithm = unary_post_op_algorithm == "none" + ? 
attr.kind_with_gelu_erf + : attr.kind_with_gelu_tanh; + attr = attr.append_post_eltwise(1.0f, 0.0f, 0.0f, post_algorithm); + } else if (unary_post_op == "hardtanh") { + auto alpha = unary_post_op_args[0].value().to(); + auto beta = unary_post_op_args[1].value().to(); + attr = attr.append_post_eltwise(1.0, alpha, beta, attr.kind_with_clip); + } else if (unary_post_op == "hardswish") { + attr = attr.append_post_eltwise( + 1.0f, 1.f / 6.f, 1.f / 2.f, attr.kind_with_hardswish); + } else if (unary_post_op == "swish") { + attr = attr.append_post_eltwise(1.0f, 1.0f, 0.0f, attr.kind_with_swish); + } else { + TORCH_CHECK( + unary_post_op == "none", + "onednn qlinear: unsupported unary post op", + unary_post_op); + } +} + +static inline void construct_attr_by_post_op( + const std::string_view& binary_post_op, + double binary_alpha, + double input1_scale, + int64_t input1_zero_point, + std::optional accum, + const std::string_view& unary_post_op, + const torch::List>& unary_post_op_args, + const std::string_view& unary_post_op_algorithm, + at::native::onednn::Attr& attr) { + bool is_none_post_op = + (binary_post_op == "none" && unary_post_op == "none"); // not post-ops + bool is_unary_post_op_only = + (binary_post_op == "none" && unary_post_op != "none"); // ex., conv + relu + bool is_valid_binary_combination = + (binary_post_op == "add" || binary_post_op == "sum") && + (unary_post_op == "none" || unary_post_op == "relu"); + TORCH_INTERNAL_ASSERT( + is_unary_post_op_only || is_none_post_op || is_valid_binary_combination, + "Please provide valid combination of unary post operators and binary post operators"); + + if (binary_post_op == "none") { + construct_attr_for_unary( + unary_post_op, unary_post_op_args, unary_post_op_algorithm, attr); + } else if (binary_post_op == "sum") { + if (unary_post_op == "none") { + if (input1_zero_point != 0) + attr = attr.append_post_eltwise( + /*scale*/ 1.f, + /*alpha*/ 1.f, + -input1_zero_point * input1_scale, + attr.kind_with_linear); + attr 
= attr.append_post_sum(1, input1_scale, /*input1_zero_point*/ 0); + } else if (unary_post_op == "relu") { + if (input1_zero_point != 0) + attr = attr.append_post_eltwise( + /*scale*/ 1.f, + /*alpha*/ 1.f, + -input1_zero_point * input1_scale, + attr.kind_with_linear); + attr = attr.append_post_sum(1, input1_scale, /*input1_zero_point*/ 0); + attr = attr.append_post_eltwise( + /* scale */ 1.f, + /* alpha */ 0.f, + /* beta */ 0.f, + attr.kind_with_relu); + } + } else if (binary_post_op == "add") { + TORCH_CHECK(accum.has_value()); + attr = attr.append_post_binary(attr.kind_with_binary_add, accum.value()); + if (unary_post_op == "relu") { + attr = attr.append_post_eltwise(1.f, 0.f, 0.f, attr.kind_with_relu); + } + } +} + +} // namespace at::native::onednn + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/mkldnn/xpu/detail/DnnlExt.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/mkldnn/xpu/detail/DnnlExt.h new file mode 100644 index 0000000000000000000000000000000000000000..eb755555172f640ee919795974124a6466e55b89 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/mkldnn/xpu/detail/DnnlExt.h @@ -0,0 +1,599 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include + +#include +#include +#include + +#include +#include + +namespace std { + +template <> +struct hash { + size_t operator()(dnnl::memory::dims const& vec) const { + size_t seed = vec.size(); + for (auto& i : vec) { + seed ^= i + 0x9e3779b9 + (seed << 6) + (seed >> 2); + } + return seed; + } +}; + +} // namespace std + +using namespace dnnl; + +namespace at::native::onednn { + +class primitive_ext : public primitive { + static constexpr int max_args 
= 12; + + public: + primitive_ext(const primitive& base) : primitive(base) {} + primitive_ext(primitive&& base) : primitive(std::move(base)) {} + + /// Returns a memory descriptor. + /// + /// @note + /// There are also convenience methods + /// #dnnl::primitive_desc_base::src_desc(), + /// #dnnl::primitive_desc_base::dst_desc(), and others. + /// + /// @param what The kind of parameter to query; can be + /// #dnnl::query::src_md, #dnnl::query::dst_md, etc. + /// @param idx Index of the parameter. For example, convolution bias can + /// be queried with what = #dnnl::query::weights_md and idx = 1. + /// @returns The requested memory descriptor. + /// @returns A zero memory descriptor if the primitive does not have a + /// parameter of the specified kind or index. + const_dnnl_memory_desc_t query_md(query what, int idx = 0) const { + std::vector valid_q{ + query::src_md, + query::diff_src_md, + query::weights_md, + query::diff_weights_md, + query::dst_md, + query::diff_dst_md, + query::workspace_md, + query::scratchpad_md, + query::exec_arg_md}; + if (!std::any_of(valid_q.cbegin(), valid_q.cend(), [=](query q) { + return what == q; + })) + DNNL_THROW_ERROR( + dnnl_invalid_arguments, "memory descriptor query is invalid"); + + const_dnnl_memory_desc_t cdesc = dnnl_primitive_desc_query_md( + this->get_primitive_desc(), dnnl::convert_to_c(what), idx); + + return cdesc ? cdesc : nullptr; + } + + /// Returns a source memory descriptor. + /// @param idx Source index. + /// @returns Source memory descriptor. + /// @returns A zero memory descriptor if the primitive does not have a + /// source parameter with index @p idx. + const_dnnl_memory_desc_t src_desc(int idx) const { + return query_md(query::src_md, idx); + } + + /// Returns a destination memory descriptor. + /// @param idx Destination index. + /// @returns Destination memory descriptor. + /// @returns A zero memory descriptor if the primitive does not have a + /// destination parameter with index @p idx. 
+ const_dnnl_memory_desc_t dst_desc(int idx) const { + return query_md(query::dst_md, idx); + } + + /// Returns a weights memory descriptor. + /// @param idx Weights index. + /// @returns Weights memory descriptor. + /// @returns A zero memory descriptor if the primitive does not have a + /// weights parameter with index @p idx. + const_dnnl_memory_desc_t weights_desc(int idx) const { + return query_md(query::weights_md, idx); + } + + /// Returns a diff source memory descriptor. + /// @param idx Diff source index. + /// @returns Diff source memory descriptor. + /// @returns A zero memory descriptor if the primitive does not have a + /// diff source parameter with index @p idx. + const_dnnl_memory_desc_t diff_src_desc(int idx) const { + return query_md(query::diff_src_md, idx); + } + + /// Returns a diff destination memory descriptor. + /// @param idx Diff destination index. + /// @returns Diff destination memory descriptor. + /// @returns A zero memory descriptor if the primitive does not have a + /// diff destination parameter with index @p idx. + const_dnnl_memory_desc_t diff_dst_desc(int idx) const { + return query_md(query::diff_dst_md, idx); + } + + /// Returns a diff weights memory descriptor. + /// @param idx Diff weights index. + /// @returns Diff weights memory descriptor. + /// @returns A zero memory descriptor if the primitive does not have a + /// diff weights parameter with index @p idx. + const_dnnl_memory_desc_t diff_weights_desc(int idx) const { + return query_md(query::diff_weights_md, idx); + } + + const_dnnl_memory_desc_t exec_arg_desc(int idx) const { + return query_md(query::exec_arg_md, idx); + } + + // Separate versions without the index argument for documentation + // purposes. + + /// Returns a source memory descriptor. + /// @returns Source memory descriptor. + /// @returns A zero memory descriptor if the primitive does not have a + /// source parameter. 
+ const_dnnl_memory_desc_t src_desc() const { + return src_desc(0); + } + + /// Returns a destination memory descriptor. + /// @returns Destination memory descriptor. + /// @returns A zero memory descriptor if the primitive does not have a + /// destination parameter. + const_dnnl_memory_desc_t dst_desc() const { + return dst_desc(0); + } + + /// Returns a weights memory descriptor. + /// @returns Weights memory descriptor. + /// @returns A zero memory descriptor if the primitive does not have a + /// weights parameter. + const_dnnl_memory_desc_t weights_desc() const { + return weights_desc(0); + } + + /// Returns a diff source memory descriptor. + /// @returns Diff source memory descriptor. + /// @returns A zero memory descriptor if the primitive does not have a + /// diff source memory with. + const_dnnl_memory_desc_t diff_src_desc() const { + return diff_src_desc(0); + } + + /// Returns a diff destination memory descriptor. + /// @returns Diff destination memory descriptor. + /// @returns A zero memory descriptor if the primitive does not have a + /// diff destination parameter. + const_dnnl_memory_desc_t diff_dst_desc() const { + return diff_dst_desc(0); + } + + /// Returns a diff weights memory descriptor. + /// @returns Diff weights memory descriptor. + /// @returns A zero memory descriptor if the primitive does not have a + /// diff weights parameter. + const_dnnl_memory_desc_t diff_weights_desc() const { + return diff_weights_desc(0); + } + + /// Returns the workspace memory descriptor. + /// @returns Workspace memory descriptor. + /// @returns A zero memory descriptor if the primitive does not require + /// workspace parameter. + const_dnnl_memory_desc_t workspace_desc() const { + return query_md(query::workspace_md, 0); + } + + /// Returns the scratchpad memory descriptor. + /// @returns scratchpad memory descriptor. + /// @returns A zero memory descriptor if the primitive does not require + /// scratchpad parameter. 
+ /// @sa @ref dev_guide_attributes_scratchpad + const_dnnl_memory_desc_t scratchpad_desc() const { + return query_md(query::scratchpad_md, 0); + } + + inline memory make_memory( + const_dnnl_memory_desc_t md_t, + const engine& aengine, + void* handle = DNNL_MEMORY_ALLOCATE) const { + sycl_interop::memory_kind kind = dnnl::sycl_interop::memory_kind::usm; + dnnl_memory_t c_memory; + error::wrap_c_api( + dnnl_sycl_interop_memory_create( + &c_memory, md_t, aengine.get(), convert_to_c(kind), handle), + "could not create a memory"); + return memory(c_memory); + } + + memory make_src(const engine& aengine, void* handle = DNNL_MEMORY_ALLOCATE) + const { + return make_memory(src_desc(), aengine, handle); + } + + memory make_weight(const engine& aengine, void* handle = DNNL_MEMORY_ALLOCATE) + const { + return make_memory(weights_desc(), aengine, handle); + } + + memory make_bias(const engine& aengine, void* handle = DNNL_MEMORY_ALLOCATE) + const { + return make_memory(weights_desc(1), aengine, handle); + } + + memory make_dst(const engine& aengine, void* handle = DNNL_MEMORY_ALLOCATE) + const { + return make_memory(dst_desc(), aengine, handle); + } + + memory make_scratchpad( + const engine& aengine, + void* handle = DNNL_MEMORY_ALLOCATE) const { + return make_memory(scratchpad_desc(), aengine, handle); + } + + size_t get_scratchpad_size() const { + return dnnl_memory_desc_get_size(scratchpad_desc()); + } + + memory make_args(int arg_class, const engine& aengine, void* handle) const { + switch (arg_class) { + case DNNL_ARG_SRC: + return make_src(aengine, handle); + case DNNL_ARG_WEIGHTS: + return make_weight(aengine, handle); + case DNNL_ARG_SCRATCHPAD: + return make_scratchpad(aengine, handle); + case DNNL_ARG_DST: + return make_dst(aengine, handle); + case DNNL_ARG_BIAS: + return make_bias(aengine, handle); + default: + TORCH_INTERNAL_ASSERT( + false, "unsupported argument class for primitive_ext"); + } + } + + template + void set_attribute(int slot, int arg_class, void* 
handle, M constructor) { + if (mem_arg_cache[slot]) + mem_arg_cache[slot].set_data_handle(handle); + else { + mem_arg_cache[slot] = constructor(); + c_args[slot].arg = arg_class; + c_args[slot].memory = mem_arg_cache[slot].get(); + } + } + + sycl::event execute( + const stream& astream, + const engine& aengine, + std::vector>&& handles, + int slot_off = 2) { + auto off = slot_off; + for (const auto& p : handles) { + auto& m_arg = mem_arg_cache[off]; + if (m_arg) + m_arg.set_data_handle(p.second); + else { + m_arg = make_args(p.first, aengine, p.second); + c_args[off].arg = p.first; + c_args[off].memory = m_arg.get(); + } + ++off; + } + + sycl::event return_event; + std::vector deps{}; + error::wrap_c_api( + dnnl_sycl_interop_primitive_execute( + this->get(), astream.get(), off, c_args, &deps, &return_event), + "could not execute a primitive"); + return return_event; + } + + private: + memory mem_arg_cache[max_args]; + dnnl_exec_arg_t c_args[max_args]; +}; + +// Specifies the combined data types of input and weight tensors. +// For example, f32 means both input and weight are FP32, +// bf16_int4 means input is BF16 and weight is INT4. +enum class joint_dtypes_t { f32 = 0, f16, bf16, int8, f16_int4, bf16_int4 }; + +// Specifies the transposition state of input and weight tensors. +// Convention: first letter = input, second letter = weight. +// 'n' = not transposed, 't' = transposed. +// For example, 'nt' means input is not transposed, weight is transposed. +enum class trans_type_t { nn = 0, nt, tn, tt }; + +// Specifies the type and placement of bias in the computation. +// 'none' = no bias, +// 'scalar' = a single scalar bias applied to all elements, +// 'm' = per-row bias (typically matched to input rows), +// 'n' = per-column bias (typically matched to output channels), +// 'mn' = full bias matrix matching the output dimensions. 
+enum class bias_type_t { none = 0, scalar, m, n, mn }; + +template +T concat(const T& t1, at::ScalarType d) { + T t; + t.insert(t.end(), t1.begin(), t1.end()); + t.push_back((int64_t)d); + + return t; +} + +template +T concat(const T& t1, bool b) { + T t; + t.insert(t.end(), t1.begin(), t1.end()); + t.push_back(b); + + return t; +} + +template +T concat(const T& t1, int b) { + T t; + t.insert(t.end(), t1.begin(), t1.end()); + t.push_back(b); + + return t; +} + +template +T concat(const T& t1, const T& t2) { + T t; + t.insert(t.end(), t1.begin(), t1.end()); + t.insert(t.end(), t2.begin(), t2.end()); + + return t; +} + +template +T1 concat(const T1& t1, const T2& t2, const Ts&... ts) { + return concat(concat(t1, t2), ts...); +} + +template +struct onednn_types_mapper; + +template <> +struct onednn_types_mapper { + static inline std::tuple + get() { + return std::make_tuple( + dnnl::memory::data_type::f16, dnnl::memory::data_type::u4); + } +}; + +template <> +struct onednn_types_mapper { + static inline std::tuple + get() { + return std::make_tuple( + dnnl::memory::data_type::bf16, dnnl::memory::data_type::u4); + } +}; + +// TODO: bias types maybe not right +static inline dnnl::memory::dims get_bias_type( + bias_type_t b_dims, + const int m, + const int n) { + switch (b_dims) { + case bias_type_t::none: + return {0}; + case bias_type_t::scalar: + return {1, 1}; + case bias_type_t::m: + return {m, 1}; + case bias_type_t::n: + return {1, n}; + case bias_type_t::mn: + return {m, n}; + default: + TORCH_INTERNAL_ASSERT(false, "unsupported bias type ..."); + } +} + +// TODO: use template specialization on struct +template +inline void get_strides( + memory::dims& src_strides, + memory::dims& wei_strides, + memory::dims& dst_strides, + const int64_t lda, + const int64_t ldb, + const int64_t ldc) {} + +template <> +inline void get_strides( + memory::dims& src_strides, + memory::dims& wei_strides, + memory::dims& dst_strides, + const int64_t lda, + const int64_t ldb, + const 
int64_t ldc) { + src_strides = {lda, 1}; + wei_strides = {1, ldb}; + dst_strides = {ldc, 1}; +} + +using primitive_cache = + at::native::onednn::lru_cache; + +template +struct matmul_primitive_cache_t { + static inline primitive_ext& get( + const int m, + const int n, + const int k, + const int64_t lda, + const int64_t ldb, + const int64_t ldc, + const bias_type_t + b_dims, // for shapeless bias, not put it into template parameter + const int device_id, + F f_attr, + const int64_t scale_group_size, + const int64_t zp_group_size) { + auto& cached = get_cache(device_id); + memory::dims src_strides, wei_strides, dst_strides; + get_strides(src_strides, wei_strides, dst_strides, lda, ldb, ldc); + auto pri_key = at::native::onednn::concat( + src_strides, + wei_strides, + m, + n, + k, + int(b_dims), + int(scale_group_size), + int(zp_group_size)); + auto iter = cached.find(pri_key); + if (iter == cached.end()) { + auto [src_dt, wei_dt] = onednn_types_mapper::get(); + auto bias_dims = get_bias_type(b_dims, m, n); + + auto src_md = memory::desc({m, k}, src_dt, src_strides); + auto wei_md = memory::desc({k, n}, wei_dt, wei_strides); + auto dst_md = memory::desc({m, n}, src_dt, dst_strides); + auto bias_format = b_dims == bias_type_t::none + ? 
dnnl::memory::format_tag::undef + : dnnl::memory::format_tag::ab; + auto bias_md = + memory::desc(bias_dims, src_dt, bias_format); // {m, n} or {1, n} + + primitive_attr pattr; + f_attr(pattr); + + dnnl::matmul::primitive_desc matmul_pd; + auto aengine = + at::native::onednn::GpuEngineManager::Instance().get_engine( + device_id); + if (b_dims == bias_type_t::none) { + matmul_pd = dnnl::matmul::primitive_desc( + aengine, src_md, wei_md, dst_md, pattr); + } else { + matmul_pd = dnnl::matmul::primitive_desc( + aengine, src_md, wei_md, bias_md, dst_md, pattr); + } + + return cached.insert({pri_key, primitive_ext(dnnl::matmul(matmul_pd))}) + .first->second; + } else { + return iter->second; + } + } + + private: + static constexpr int max_cache_capacity = 512; + // if default constructor of primitive cache could read the environment + // variable then it'll save a lot of trouble + static inline thread_local std::array mappings; + + // this won't be needed if primitive_cache have good default constructor + static inline primitive_cache& get_cache(const int device_id) { + auto& mapping = mappings[device_id]; + if (mapping.max_size() == 0) { + mapping.resize(max_cache_capacity); + } + return mapping; + } +}; + +template +static inline primitive_ext& matmul_primitive_create_and_cache( + const trans_type_t Tt, + const bias_type_t b_dims, + const int m, + const int n, + const int k, + const int64_t lda, + const int64_t ldb, + const int64_t ldc, + const int device_id, + F attr, + const int64_t scale_group_size, + const int64_t zp_group_size) { + switch (Tt) { + case trans_type_t::nt: + return matmul_primitive_cache_t::get( + m, + n, + k, + lda, + ldb, + ldc, + b_dims, + device_id, + attr, + scale_group_size, + zp_group_size); + default: + TORCH_INTERNAL_ASSERT(false, "unsupported trans type ..."); + } +} + +template +static inline primitive_ext& matmul_primitive_create_and_cache( + const joint_dtypes_t Ts, + const trans_type_t Tt, + const bias_type_t b_dims, + const int m, + 
/// Fixed-capacity least-recently-used cache.
///
/// Entries live in a list ordered from most- to least-recently used; a map
/// from key to list iterator provides lookup. Both find() and insert() move
/// the touched entry to the front, and entries are dropped from the back
/// whenever the size exceeds the capacity.
///
/// @tparam key_t    key type (must be usable as a key of @p map_t)
/// @tparam value_t  mapped type
/// @tparam map_t    associative container template used for the index
template <
    class key_t,
    class value_t,
    template <typename...> class map_t = std::unordered_map>
class lru_cache {
 public:
  using value_type = std::pair<key_t, value_t>;
  using list_type = std::list<value_type>;
  using list_iter = typename list_type::iterator;
  using map_type = map_t<key_t, list_iter>;
  using const_list_iter = typename list_type::const_iterator;
  using size_type = typename list_type::size_type;

  explicit lru_cache(size_type capacity) : capacity_(capacity) {}
  /// A default-constructed cache has capacity 0 and stores nothing until
  /// resize() is called.
  lru_cache() : capacity_(0) {}

  [[nodiscard]] size_type size() const noexcept {
    return map_.size();
  }
  [[nodiscard]] size_type max_size() const noexcept {
    return capacity_;
  }
  [[nodiscard]] bool empty() const noexcept {
    return vlist_.empty();
  }

  /// Changes the capacity, evicting least-recently-used entries if needed.
  void resize(size_type new_capacity) {
    capacity_ = new_capacity;
    trim();
  }

  list_iter begin() noexcept {
    return vlist_.begin();
  }
  const_list_iter begin() const noexcept {
    return vlist_.begin();
  }
  list_iter end() noexcept {
    return vlist_.end();
  }
  const_list_iter end() const noexcept {
    return vlist_.end();
  }

  void clear() noexcept {
    map_.clear();
    vlist_.clear();
  }

  void swap(lru_cache& other) noexcept {
    using std::swap;
    swap(vlist_, other.vlist_);
    swap(map_, other.map_);
    swap(capacity_, other.capacity_);
  }

  /// Looks up @p key and marks it most-recently used.
  /// @returns an iterator to the entry, or end() when the key is absent.
  list_iter find(const key_t& key) {
    auto it = map_.find(key);
    if (it == map_.end())
      return end();
    vlist_.splice(vlist_.begin(), vlist_, it->second);
    return it->second;
  }

  /// Inserts @p value unless its key is already present.
  ///
  /// @returns {iterator to the existing entry, false} when the key exists
  ///     (the entry is moved to the front and keeps its old value);
  ///     {iterator to the new entry, true} after a successful insertion;
  ///     {end(), false} when the capacity is 0 and nothing can be stored.
  std::pair<list_iter, bool> insert(const value_type& value) {
    auto it = map_.find(value.first);
    if (it != map_.end()) {
      // Move existing to front
      vlist_.splice(vlist_.begin(), vlist_, it->second);
      return {it->second, false};
    }

    // With zero capacity the new entry would be evicted by trim() right
    // away, and begin() would then equal end() while still being reported
    // as a successful insertion. Report the failure explicitly instead.
    if (capacity_ == 0) {
      return {vlist_.end(), false};
    }

    // Insert new at front
    vlist_.emplace_front(value);
    map_[value.first] = vlist_.begin();

    trim();

    return {vlist_.begin(), true};
  }

  /// Removes the entry at @p pos and returns the iterator following it.
  list_iter erase(list_iter pos) {
    map_.erase(pos->first);
    return vlist_.erase(pos);
  }

 private:
  // Evicts from the least-recently-used end until the size fits the capacity.
  void trim() {
    while (map_.size() > capacity_) {
      auto last = std::prev(vlist_.end());
      map_.erase(last->first);
      vlist_.pop_back();
    }
  }

  list_type vlist_;
  map_type map_;
  size_type capacity_;
};
// Evaluates to true when the oneDNN headers in use are version 3.4 or newer.
// The major and minor numbers must be compared lexicographically: testing
// "major >= 3 && minor >= 4" would wrongly reject 4.0, 4.1, ... .
// NOTE(review): presumably gates use of the deterministic primitive
// attribute -- confirm against the users of this macro.
#define ONEDNN_SUPPORT_DETERMINISTIC \
  (DNNL_VERSION_MAJOR > 3 ||         \
   (DNNL_VERSION_MAJOR == 3 && DNNL_VERSION_MINOR >= 4))
binary_valid( + const at::Tensor& self, + const at::Tensor& other, + bool is_fusion = false); + +bool use_channels_last_for_conv( + const at::Tensor& src, + const at::Tensor& weight); + +dnnl::memory::format_tag conv_src_fmt( + const int64_t ndim, + const bool is_channels_last = false); + +dnnl::memory::dims compatible_weight_dims( + const int64_t ndim, + const int64_t groups, + const int64_t oc, + const int64_t ic, + const IntArrayRef wsizes); + +dnnl::memory::format_tag conv_weight_fmt( + const int64_t ndim, + const bool grouped = false, + const bool is_channels_last = false); + +template +dnnl::memory::dims compatible_dilation(Vec&& dilation) { + dnnl::memory::dims ret = dilation.vec(); + for (auto it = ret.begin(); it != ret.end(); it++) { + *it -= 1; + } + return ret; +} + +inline std::vector padding_r( + IntArrayRef padding, + IntArrayRef output_padding) { + // ConvTranspose padding adjustment + // + // PyTorch uses padding/output_padding: + // osize = (isize - 1) * stride - 2 * padding + dilation * (kernel_size - 1) + // + output_padding + 1 + // + // MKLDNN uses padding_l/padding_r: + // osize = (isize - 1) * stride - padding_l - padding_r + dilation * + // (kernel_size - 1) + 1 + // + // So: padding_l = padding, padding_r = padding - output_padding + // + auto dim = padding.size(); + std::vector pad_r(dim); + for (const auto d : c10::irange(dim)) { + pad_r[d] = padding[d] - output_padding[d]; + } + return pad_r; +} + +template +dnnl::memory dnnl_memory_from_host_scalar( + T host_value, + Tensor& holder, + dnnl::engine& engine) { + auto options = at::TensorOptions() + .dtype(c10::CppTypeToScalarType::value) + .device(kXPU); + holder = at::empty({1}, options).fill_(host_value); + dnnl::memory::desc md = get_onednn_md(holder); + dnnl::memory mem = make_onednn_memory(md, engine, holder.data_ptr()); + return mem; +} + +struct PartitionCache { + std::unordered_map, dnnl::graph::partition> partition_map_{}; + + // The first 8 bits are reserved + // bit 0: is int8 
+ // bit 1: is uint8 + // bit 2: fp16(0) / bf16(1) + // bit 3: is fp32 + // bit 4: is sdpa pattern + // bit 5: is sdpa backward pattern + // bit 6-7: reserved for future use + // The rest of the bits depend upon the arguments provided + // However, down the line, we might have different bitsets for different + // patterns + enum class BitType : uint8_t { + Int8 = 0, + Uint8 = 1, + Bfloat16 = 2, + Float32 = 3, + SdpaPattern = 4, + SdpaBwdPattern = 5 + }; + + dnnl::graph::partition& insert_partition_cache( + std::bitset<32>& patternID, + dnnl::graph::partition& p) { + partition_map_[patternID] = std::move(p); + return partition_map_[patternID]; + } + std::optional> find_partition( + std::bitset<32>& patternID) { + auto iter = partition_map_.find(patternID); + if (iter != partition_map_.end()) { + return iter->second; + } + return std::nullopt; + } +}; + +} // namespace at::native::onednn + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/mkldnn/xpu/detail/oneDNN.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/mkldnn/xpu/detail/oneDNN.h new file mode 100644 index 0000000000000000000000000000000000000000..ba8983ec0b36d54ca060503b4dade1a73db0a395 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/mkldnn/xpu/detail/oneDNN.h @@ -0,0 +1,225 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include +#include +#include +#include +#include + +namespace at::native::onednn { + +TORCH_API sycl::event matmul( + at::Tensor& result, + const at::Tensor& mat1, + const at::Tensor& mat2, + const at::Tensor& b_raw, + bool m2_trans, + Attr attr, + const std::vector& deps = {}); + +TORCH_API sycl::event convolution( + at::Tensor& dst, + const at::Tensor& src, + const at::Tensor& weight, + const at::Tensor& bia, + IntArrayRef padding_front_top_left, + IntArrayRef padding_back_bottom_right, + IntArrayRef stride, + IntArrayRef dilation, + int64_t groups, + Attr& attr, + const std::vector& deps = {}); + +TORCH_API sycl::event convolution_backward_weights( + at::Tensor& diff_weight, + at::Tensor& diff_bia, + const at::Tensor& diff_dst, + const at::Tensor& src, + IntArrayRef diff_weight_aten_size, + IntArrayRef padding_front_top_left, + IntArrayRef padding_back_bottom_right, + IntArrayRef stride, + IntArrayRef dilation, + int64_t groups, + const std::vector& deps = {}); + +TORCH_API sycl::event convolution_backward_data( + at::Tensor& diff_src, + const at::Tensor& diff_dst, + const at::Tensor& weight, + IntArrayRef padding_front_top_left, + IntArrayRef padding_back_bottom_right, + IntArrayRef stride, + IntArrayRef dilation, + int64_t groups, + bool bias_defined, + const std::vector& deps = {}); + +TORCH_API sycl::event 
deconvolution( + at::Tensor& dst, + const at::Tensor& src, + const at::Tensor& weight, + const at::Tensor& bia, + IntArrayRef stride, + IntArrayRef padding, + IntArrayRef dst_padding, + IntArrayRef dilation, + int64_t groups, + Attr& attr, + const std::vector& deps = {}); + +TORCH_API sycl::event deconvolution_backward_data( + at::Tensor& diff_src, + const at::Tensor& diff_dst, + const at::Tensor& weight, + IntArrayRef stride, + IntArrayRef padding, + IntArrayRef dst_padding, + IntArrayRef dilation, + int64_t groups, + bool bias_defined, + const std::vector& deps = {}); + +TORCH_API sycl::event deconvolution_backward_weights( + at::Tensor& diff_weight, + at::Tensor& diff_bia, + const at::Tensor& diff_dst, + const at::Tensor& src, + IntArrayRef stride, + IntArrayRef padding, + IntArrayRef dst_padding, + IntArrayRef dilation, + int64_t groups, + const std::vector& deps = {}); + +TORCH_API void woq_matmul_int4( + at::Tensor& result, // dst, [M, N] + const at::Tensor& mat1_, // src, [M, K] + const at::Tensor& mat2_, // quantized weight, [K/8, N] + const at::Tensor& scale, // [K/group_size, N] + const at::Tensor& zp, // [k/group_size, N] + int64_t group_size, + bool pri_cache = true); + +dnnl::memory::dims conv_dst_size( + int64_t ndim, + IntArrayRef src_tz, + IntArrayRef wgh_tz, + IntArrayRef padding_front_top_left, + IntArrayRef padding_back_bottom_right, + IntArrayRef stride, + IntArrayRef dilation); + +dnnl::memory::dims deconv_dst_size( + IntArrayRef src_size, + IntArrayRef wgh_size, + IntArrayRef padding, + IntArrayRef stride, + IntArrayRef dilation, + IntArrayRef dst_padding, + int64_t groups); + +at::Tensor quantized_convolution( + at::Tensor act, + double act_scale, + int64_t act_zero_point, + at::Tensor weight, + at::Tensor weight_scales, + at::Tensor weight_zero_points, + std::optional bias, + torch::List stride, + torch::List padding, + torch::List dilation, + bool transposed, + int64_t groups, + at::Tensor output, + double inv_output_scale, + int64_t 
output_zero_point, + std::optional accum, + double accum_scale, + int64_t accum_zero_point, + std::optional output_dtype, + std::optional binary_attr, + std::optional binary_alpha, + std::optional unary_attr, + torch::List> unary_scalars, + std::optional unary_algorithm); + +void quantized_matmul( + at::Tensor mat1, // act + double input_scale, + int64_t input_zero_point, + at::Tensor mat2, // weight + at::Tensor& weight_scales, + at::Tensor& weight_zero_points, + at::Tensor& b_raw, + at::Tensor result, // output + double output_scale, + int64_t output_zero_point, + std::optional output_dtype, + std::optional other, // extra input for binary-post-op + double other_scale, + int64_t other_zero_point, + const std::string_view& binary_post_op, + double binary_alpha, + const std::string_view& unary_post_op, + torch::List>& unary_post_op_args, + std::string_view unary_post_op_algorithm, + bool m2_trnas); + +void sdpa( + int batch_size, + int seq_len_q, + int seq_len_kv, + int num_head_q, + int num_head_kv, + int head_dim_qk, + int head_dim_v, + const Tensor& query, + const Tensor& key, + const Tensor& value, + std::optional attn_mask, + bool is_causal, + float softmax_scale, + const Tensor& attention, + bool compute_logsumexp, + const Tensor& logsumexp); + +void sdpa_backward( + int batch_size, + int num_head_q, + int num_head_kv, + int seq_len_q, + int seq_len_kv, + int head_dim_qk, + int head_dim_v, + const Tensor& grad_out, + const Tensor& query, + const Tensor& key, + const Tensor& value, + const Tensor& out, + const Tensor& logsumexp, + std::optional attn_mask, + bool is_causal, + double scale, + Tensor& grad_query, + Tensor& grad_key, + Tensor& grad_value); + +sycl::event scaled_matmul( + const Tensor& mat1, + const Tensor& mat2, + Tensor& result, + const Tensor& scale_a, + const Tensor& scale_b, + at::blas::ScalingType scaling_choice_a, + at::blas::ScalingType scaling_choice_b, + const std::optional& bias, + const std::optional& scale_result, + bool 
use_fast_accum); +} // namespace at::native::onednn + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/mkldnn/xpu/detail/oneDNNContext.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/mkldnn/xpu/detail/oneDNNContext.h new file mode 100644 index 0000000000000000000000000000000000000000..9c195f19089bd945dcc66eb4f60a0aa3a5e3acfc --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/mkldnn/xpu/detail/oneDNNContext.h @@ -0,0 +1,95 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include + +#include +#include +#include +#include + +#include +#include +#include + +namespace at::native::onednn { + +TORCH_XPU_API dnnl::memory make_onednn_memory( + dnnl::memory::desc md, + dnnl::engine& engine, + void* ptr); + +// Keep non-static and non-inline +bool set_onednn_verbose(int level); + +// GpuEngineManager singleton +struct TORCH_XPU_API GpuEngineManager { + static GpuEngineManager& Instance(); // Singleton + + dnnl::engine& get_engine( + DeviceIndex device_index = c10::xpu::current_device()) { + c10::xpu::check_device_index(device_index); + return *engine_pool[device_index]; + } + + dnnl::engine& get_engine(const Device& device) { + TORCH_INTERNAL_ASSERT(device.type() == kXPU); + return get_engine(device.index()); + } + + GpuEngineManager(GpuEngineManager const&) = delete; + GpuEngineManager& operator=(GpuEngineManager const&) = delete; + GpuEngineManager(GpuEngineManager&&) = default; + GpuEngineManager& operator=(GpuEngineManager&&) = default; + + protected: + GpuEngineManager(); + ~GpuEngineManager() = default; + + private: + std::vector> engine_pool; +}; + +// GpuStreamManager singleton +struct TORCH_XPU_API 
GpuStreamManager { + static GpuStreamManager& Instance(); // Singleton + + dnnl::stream& get_stream( + DeviceIndex device_index = c10::xpu::current_device()) { + auto stream = c10::xpu::getCurrentXPUStream(device_index); + auto priority = stream.priority(); + if (stream_pool[device_index][priority].find(stream) == + stream_pool[device_index][priority].end()) { + stream_pool[device_index][priority][stream] = + std::make_shared(dnnl::sycl_interop::make_stream( + GpuEngineManager::Instance().get_engine(device_index), + stream.queue())); + } + return *stream_pool[device_index][priority][stream]; + } + + GpuStreamManager(GpuStreamManager const&) = delete; + GpuStreamManager& operator=(GpuStreamManager const&) = delete; + GpuStreamManager(GpuStreamManager&&) = default; + GpuStreamManager& operator=(GpuStreamManager&&) = default; + + protected: + GpuStreamManager() { + c10::DeviceIndex device_count = c10::xpu::device_count_ensure_non_zero(); + stream_pool.resize(device_count); + } + ~GpuStreamManager() = default; + + private: + using stream_hash_map = + ska::flat_hash_map>; + std::vector< + std::array> + stream_pool; +}; + +} // namespace at::native::onednn + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/mkldnn/xpu/qconv.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/mkldnn/xpu/qconv.h new file mode 100644 index 0000000000000000000000000000000000000000..64f87b3bf8435db713d71dc72c2ff2a15d75e99c --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/mkldnn/xpu/qconv.h @@ -0,0 +1,115 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include +#include + +namespace at::native::xpu { +class QConvoneDNNXPU final { + public: + C10_API static at::Tensor run_pointwise( + at::Tensor act, + double act_scale, + int64_t act_zero_point, + at::Tensor weight, + at::Tensor weight_scales, + at::Tensor weight_zero_points, + std::optional bias, + torch::List stride, + torch::List padding, + torch::List dilation, + int64_t groups, + double inv_output_scale, + int64_t output_zero_point, + std::optional output_dtype, + std::string_view attr, + torch::List> scalars, + std::optional algorithm); + + C10_API static at::Tensor run_pointwise_tensor( + at::Tensor act, + at::Tensor act_scale, + at::Tensor act_zero_point, + at::Tensor weight, + at::Tensor weight_scales, + at::Tensor weight_zero_points, + std::optional bias, + torch::List stride, + torch::List padding, + torch::List dilation, + int64_t groups, + double output_scale, + int64_t output_zero_point, + std::optional output_dtype, + std::string_view attr, + torch::List> scalars, + std::optional algorithm); + + C10_API static at::Tensor run_pointwise_binary( + at::Tensor act, + double act_scale, + int64_t act_zero_point, + at::Tensor weight, + at::Tensor weight_scales, + at::Tensor weight_zero_points, + at::Tensor accum, + std::optional bias, + torch::List stride, + torch::List padding, + torch::List dilation, + int64_t groups, + double 
output_scale, + int64_t output_zero_point, + std::optional output_dtype, + double accum_scale, + int64_t accum_zero_point, + std::string_view binary_attr, + std::optional alpha, + std::optional unary_attr, + torch::List> unary_scalars, + std::optional unary_algorithm); + + C10_API static at::Tensor run_pointwise_binary_tensor( + at::Tensor act, + at::Tensor act_scale, + at::Tensor act_zero_point, + at::Tensor weight, + at::Tensor weight_scales, + at::Tensor weight_zero_points, + at::Tensor accum, + std::optional bias, + torch::List stride, + torch::List padding, + torch::List dilation, + int64_t groups, + double output_scale, + int64_t output_zero_point, + std::optional output_dtype, + double accum_scale, + int64_t accum_zero_point, + std::string_view binary_attr, + std::optional alpha, + std::optional unary_attr, + torch::List> unary_scalars, + std::optional unary_algorithm); + + static inline c10::ScalarType qconv_decide_out_dtype( + const at::Tensor& act, + const std::optional output_dtype); + + static at::Tensor qconv_prepack_xpu( + at::Tensor weight, + at::Tensor weight_scales, + double input_scale, + int64_t input_zero_point, + torch::List stride, + torch::List padding, + torch::List dilation, + int64_t groups, + std::optional> input_shape); +}; + +} // namespace at::native::xpu +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/mkldnn/xpu/qlinear.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/mkldnn/xpu/qlinear.h new file mode 100644 index 0000000000000000000000000000000000000000..fd591eb16a16481c7f58b101ba675e74e720e4b0 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/mkldnn/xpu/qlinear.h @@ -0,0 +1,96 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include +#include +#include + +namespace at::native::xpu { + +class QLinearOnednnXPU final { + public: + C10_API static Tensor q_linear_pointwise( + Tensor act, + double act_scale, + int64_t act_zero_point, + Tensor weight, + Tensor weight_scales, + Tensor weight_zero_points, + std::optional bias, + double output_scale, + int64_t output_zero_point, + std::optional output_dtype, + std::string_view post_op_name, + torch::List> post_op_args, + std::string_view post_op_algorithm); + + C10_API static Tensor q_linear_pointwise_tensor( + Tensor act, + Tensor act_scale, + Tensor act_zero_point, + Tensor weight, + Tensor weight_scales, + Tensor weight_zero_points, + std::optional bias, + double output_scale, + int64_t output_zero_point, + std::optional output_dtype, + std::string_view post_op_name, + torch::List> post_op_args, + std::string_view post_op_algorithm); + + C10_API static Tensor q_linear_pointwise_binary( + Tensor act, + double act_scale, + int64_t act_zero_point, + Tensor weight, + Tensor weight_scales, + Tensor weight_zero_points, + std::optional other, + std::optional bias, + double output_scale, + int64_t output_zero_point, + std::optional output_dtype, + double other_scale, + int64_t other_zero_point, + std::string_view binary_post_op, + double binary_alpha, + std::string_view unary_post_op, + torch::List> unary_post_op_args, + 
std::string_view unary_post_op_algorithm); + + C10_API static Tensor q_linear_pointwise_binary_tensor( + Tensor act, + Tensor act_scale, + Tensor act_zero_point, + Tensor weight, + Tensor weight_scales, + Tensor weight_zero_points, + std::optional other, + std::optional bias, + double output_scale, + int64_t output_zero_point, + std::optional output_dtype, + double other_scale, + int64_t other_zero_point, + std::string_view binary_post_op, + double binary_alpha, + std::string_view unary_post_op, + torch::List> unary_post_op_args, + std::string_view unary_post_op_algorithm); + + C10_API static Tensor q_linear_prepack_onednn( + at::Tensor weight, + std::optional> input_shape); + + static inline c10::ScalarType qlinear_decide_out_dtype( + const at::Tensor& act, + const std::optional output_dtype); + +}; // class QLinearOnednnXPU + +} // namespace at::native::xpu + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/mps/Copy.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/mps/Copy.h new file mode 100644 index 0000000000000000000000000000000000000000..69caaa34e35c541dd0c1e8919817947e3adad399 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/mps/Copy.h @@ -0,0 +1,19 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +// Copyright © 2022 Apple Inc. + +#pragma once +#include + +namespace at::native::mps { + +at::Tensor& mps_copy_( + at::Tensor& dst, + const at::Tensor& src, + bool non_blocking); +void copy_blit_mps(void* dst, const void* src, size_t size); + +} // namespace at::native::mps + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/mps/MPSGraphSequoiaOps.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/mps/MPSGraphSequoiaOps.h new file mode 100644 index 0000000000000000000000000000000000000000..327d5cab3dd7e6a1c3f85ad97c1b2c4284d6fb00 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/mps/MPSGraphSequoiaOps.h @@ -0,0 +1,46 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include + +#if !defined(__MAC_15_0) && (!defined(MAC_OS_X_VERSION_15_0) || (MAC_OS_X_VERSION_MIN_REQUIRED < MAC_OS_X_VERSION_15_0)) + +@interface MPSNDArrayIdentity : MPSNDArrayUnaryKernel +- (MPSNDArray* __nullable)reshapeWithCommandBuffer:(__nullable id)cmdBuf + sourceArray:(MPSNDArray* __nonnull)sourceArray + shape:(MPSShape* __nonnull)shape + destinationArray:(MPSNDArray* __nullable)destinationArray; +@end + +@interface MPSNDArrayDescriptor () +@property(readwrite, nonatomic) BOOL preferPackedRows; +@end + +@interface MPSNDArray () +- (nonnull instancetype)initWithBuffer:(id _Nonnull)buffer + offset:(NSUInteger)offset + descriptor:(MPSNDArrayDescriptor* _Nonnull)descriptor; +- (MPSNDArray* __nullable)arrayViewWithShape:(MPSShape* _Nullable)shape strides:(MPSShape* _Nonnull)strides; +@end + +typedef NS_ENUM(NSInteger, MTLMathMode) { + MTLMathModeSafe = 0, + MTLMathModeRelaxed = 1, + MTLMathModeFast = 2, +}; + +typedef NS_ENUM(NSInteger, MTLMathFloatingPointFunctions) { + MTLMathFloatingPointFunctionsFast = 0, + MTLMathFloatingPointFunctionsPrecise = 1, +}; + +@interface MTLCompileOptions () +@property(readwrite, nonatomic) MTLMathMode mathMode; +@property(readwrite, nonatomic) MTLMathFloatingPointFunctions mathFloatingPointFunctions; +@end + +#endif + +#else +#error "This file should not be included when either 
TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/mps/MetalShaderLibrary.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/mps/MetalShaderLibrary.h new file mode 100644 index 0000000000000000000000000000000000000000..7e7fbc09d74f99130919dd3b5c5467b9006ead14 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/mps/MetalShaderLibrary.h @@ -0,0 +1,202 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +#ifdef __OBJC__ +#include +typedef id MTLLibrary_t; +typedef id MTLFunction_t; +typedef id MTLComputePipelineState_t; +typedef id MTLComputeCommandEncoder_t; +#else +typedef void MTLCompileOptions; +typedef void* MTLLibrary_t; +typedef void* MTLFunction_t; +typedef void* MTLComputePipelineState_t; +typedef void* MTLComputeCommandEncoder_t; +#endif + +#include +#include +#include +#include +#include +#include +#include +#include + +// Forward declaration of TensorBase and TensorIteratorBase +namespace at { +class TensorBase; +struct TensorIteratorBase; +} // namespace at + +namespace at::native::mps { + +namespace detail { +template +class has_size_type { + template + static constexpr std::true_type check(typename U::size_type*); + template + static constexpr std::false_type check(...); + + public: + static constexpr bool value = decltype(check(nullptr))::value; +}; + +template +constexpr bool has_size_type_v = has_size_type::value; + +} // namespace detail + +// Returns `gpuAddress` of respective `id` plus storage offset +void* get_tensor_gpu_address(const at::TensorBase&); + +class MetalKernelFunction { + public: + MetalKernelFunction(MTLComputePipelineState_t cps_, MTLFunction_t f_); + ~MetalKernelFunction(); + MetalKernelFunction(MetalKernelFunction&) = delete; + // Shader 
properties + uint64_t getMaxThreadsPerThreadgroup() const; + uint64_t getThreadExecutionWidth() const; + uint64_t getStaticThreadGroupMemoryLength() const; + void runCommandBlock(std::function f); + // Methods below should be called from runCommandBlock function + void startEncoding(); + void setArg(unsigned idx, const at::TensorBase& t); + void setArg(unsigned idx, const void* ptr, uint64_t size); + template < + typename T, + typename = std::enable_if_t< + std::is_integral_v || std::is_same_v || + (std::is_class_v && std::is_trivially_copyable_v && + !detail::has_size_type_v)>> + inline void setArg(unsigned idx, const T val) { + setArg(idx, &val, sizeof(T)); + } + + template < + typename Container, + typename = std::enable_if_t>> + inline void setArg(unsigned idx, const Container& values) { + setArg( + idx, + values.data(), + values.size() * sizeof(typename Container::value_type)); + } + void dispatch( + uint64_t length, + std::optional groupSize = std::nullopt); + void dispatch( + c10::ArrayRef length, + c10::OptionalArrayRef groupSize = std::nullopt); + + private: + MTLComputePipelineState_t cps; + MTLFunction_t func; + MTLComputeCommandEncoder_t encoder = nullptr; +}; + +class MetalShaderLibrary { + public: + MetalShaderLibrary(std::string src) + : shaderSource(std::move(src)), nparams(0), compile_options(nullptr) {} + MetalShaderLibrary(std::string src, unsigned nparams_) + : shaderSource(std::move(src)), + nparams(nparams_), + compile_options(nullptr) {} + MetalShaderLibrary( + std::string src, + unsigned nparams_, + MTLCompileOptions* compile_options_) + : shaderSource(std::move(src)), + nparams(nparams_), + compile_options(compile_options_) {} + MetalShaderLibrary(const MetalShaderLibrary&) = delete; + virtual ~MetalShaderLibrary(); + std::vector getFunctionNames(); + std::shared_ptr getKernelFunction( + const std::string& name); + // Returns a raw pointer to the kernel function for use in C APIs + MetalKernelFunction* getCachedKernelFunctionPtr(const 
std::string& name); + inline MTLComputePipelineState_t getPipelineStateForFunc( + const std::string& fname) { + return getLibraryPipelineState(getLibrary(), fname).first; + } + MTLComputePipelineState_t getPipelineStateForFunc( + const std::string& fname, + const std::initializer_list& params) { + return getLibraryPipelineState(getLibrary(params), fname).first; + } + inline MTLFunction_t getMTLFunction(const std::string& fname) { + return getLibraryPipelineState(getLibrary(), fname).second; + } + MTLFunction_t getMTLFunction( + const std::string& fname, + const std::initializer_list& params) { + return getLibraryPipelineState(getLibrary(params), fname).second; + } + static MetalShaderLibrary& getBundledLibrary(); + void exec_unary_kernel( + TensorIteratorBase& iter, + const std::string& name, + const std::optional alpha = std::nullopt, + const std::optional scalar_arg_type = std::nullopt); + void exec_binary_kernel( + TensorIteratorBase& iter, + const std::string& name, + const std::optional alpha = std::nullopt, + const std::optional scalar_arg_type = std::nullopt); + void exec_ternary_kernel(TensorIteratorBase& iter, const std::string& name); + + template + void exec_unary_kernel_with_params( + TensorIteratorBase& iter, + const std::string& name, + T params, + const std::string& params_type_name); + template + void exec_binary_kernel_with_params( + TensorIteratorBase& iter, + const std::string& name, + T params, + const std::string& params_type_name); + + protected: + virtual MTLLibrary_t getLibrary(); + virtual MTLLibrary_t getLibrary( + const std::initializer_list& params); + MTLLibrary_t library = nullptr; + + private: + std::pair getLibraryPipelineState( + MTLLibrary_t lib, + const std::string& fname); + MTLLibrary_t compileLibrary(const std::string& src); + std::string shaderSource; + unsigned nparams; + MTLCompileOptions* compile_options; + std::unordered_map libMap; + std::unordered_map< + std::string, + std::pair> + cplMap; + // Cache for kernel functions 
returned by getCachedKernelFunctionPtr + std::unordered_map> + kernelCache; +}; + +class DynamicMetalShaderLibrary : public MetalShaderLibrary { + public: + DynamicMetalShaderLibrary(const std::string& src) : MetalShaderLibrary(src) { + // Compile right away + getLibrary(); + } + ~DynamicMetalShaderLibrary() override; +}; + +} // namespace at::native::mps + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/mps/OperationUtils.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/mps/OperationUtils.h new file mode 100644 index 0000000000000000000000000000000000000000..afb6f3a2737069a80c943685ea1a34d9b2a0bd02 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/mps/OperationUtils.h @@ -0,0 +1,801 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +// Copyright © 2022 Apple Inc. 
+ +#pragma once + +#include +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifndef AT_PER_OPERATOR_HEADERS +#include +#include +#else +#include +#include +#include +#include +#endif + +#include + +@interface MPSGraph (PyTorchFixups) +- (MPSGraphTensor*)minimumWithNaNPropagationAndIntFallbackWithPrimaryTensor:(MPSGraphTensor*)primaryTensor + secondaryTensor:(MPSGraphTensor*)secondaryTensor + name:(NSString*)name; + +- (MPSGraphTensor*)maximumWithNaNPropagationAndIntFallbackWithPrimaryTensor:(MPSGraphTensor*)primaryTensor + secondaryTensor:(MPSGraphTensor*)secondaryTensor + name:(NSString*)name; +@end + +using namespace at::mps; + +namespace at::native::mps { + +struct MPSScalar { + id getMTLBuffer() const { + return __builtin_bit_cast(id, buffer.get()); + } + + size_t size = 0; + ScalarType type = ScalarType::Undefined; + c10::DataPtr buffer; // stores MTLBuffer (frees buffer if MPSScalar instance goes out of scope) + union { + float f; // MPS doesn't support 'double' + at::Half h; + int64_t i; + bool b; + c10::complex cf; + c10::complex ch; + at::BFloat16 bf16; + } value{}; +}; + +void runMPSGraph(MPSStream* mpsStream, MPSGraph* mpsGraph, NSDictionary* feeds, NSDictionary* results); + +MPSDataType getMPSDataType(ScalarType scalar_type); +static inline MPSDataType getMPSDataType(const TensorBase& t) { + return getMPSDataType(t.scalar_type()); +} +MPSDataType getMPSScalarType(ScalarType scalar_type); +static inline MPSDataType getMPSScalarType(const TensorBase& t) { + return getMPSScalarType(t.scalar_type()); +} +MPSScalar getMPSScalar(const Scalar& scalar, ScalarType type); +std::string getMPSTypeString(ScalarType scalar_type, bool short_name = false); +static inline std::string getMPSTypeString(const TensorBase& t, bool short_name = false) { + return getMPSTypeString(t.scalar_type(), short_name); +} +std::string scalarToMetalTypeString(const 
c10::ScalarType& scalar_type); +static inline std::string scalarToMetalTypeString(const TensorBase& t) { + return scalarToMetalTypeString(t.scalar_type()); +} +NSArray* getTensorAxes(const TensorBase& t); +NSArray* getTensorAxes(const IntArrayRef& sizes, at::OptionalIntArrayRef dim); +std::string getMPSShapeString(MPSShape* shape); +std::string getTensorsStringKey(const TensorList& tensors, bool short_dtype = true, bool exclude_shape = false); +std::string to_hex_key(float); +std::string getArrayRefString(const IntArrayRef s); +// use has_storage() on the returned tensor to determine if src actually is a view +Tensor gatherViewTensor(const Tensor& src, Tensor& dst); +Tensor& scatterViewTensor(const Tensor& src, Tensor& output); +MPSGraphTensor* castToIHFTypes(MPSGraph* mpsGraph, MPSGraphTensor* inputTensor, const TensorBase& input); +MPSGraphTensor* castFromIHFTypes(MPSGraph* mpsGraph, MPSGraphTensor* inputTensor, const TensorBase& input); + +MPSNDArray* getStridedMPSNDArray(const TensorBase& src, MPSNDArray* srcNDArray); +MPSNDArray* getMPSNDArray(const TensorBase& t, const IntArrayRef& sizes = {}, const IntArrayRef& strides = {}); +MPSNDArray* getMPSNDArray(const TensorBase& t, MPSShape* sizes = nil, MPSShape* strides = nil); +// The MPSShape could vary based on memory format +Tensor getTensorView(const Tensor& t, MPSShape* shape); +MPSShape* getMPSShape(const TensorBase& t, c10::MemoryFormat memory_format = MemoryFormat::Contiguous); +MPSShape* getMPSShape(IntArrayRef sizes, c10::MemoryFormat memory_format = MemoryFormat::Contiguous); + +// Determines whether a tensor is too large to use MPSGraph +bool isTooLargeForMPSGraph(const Tensor& tensor, bool useMPSStridedAPI = true); + +static inline id getMTLBufferStorage(const TensorBase& tensor) { + return __builtin_bit_cast(id, tensor.storage().data()); +} + +class Placeholder { + public: + Placeholder() : _placeholder(nullptr), _value(nullptr), _tensor(Tensor()) {} + Placeholder(MPSGraphTensor* mpsGraphTensor) : 
_placeholder(mpsGraphTensor), _value(nullptr), _tensor(Tensor()) {} + Placeholder(MPSGraphTensor* mpsGraphTensor, MPSNDArray* mpsNDArray); + Placeholder(MPSGraphTensor* mpsGraphTensor, + const Tensor& self, + MPSShape* mpsShape = nullptr, + bool gatherTensorData = true, + MPSDataType dataType = MPSDataTypeInvalid, + bool useMPSStridedAPI = true); + MPSGraphTensor* getMPSGraphTensor() { + return _placeholder; + } + MPSGraphTensorData* getMPSGraphTensorData() { + return _value; + } + bool isIntermediate() { + return _value == nullptr; + } + + private: + MPSGraphTensor* _placeholder; + MPSGraphTensorData* _value; + Tensor _tensor; +}; + +void resize_tensor(Tensor* output); +Tensor wrapped_scalar_tensor_mps(const Scalar& scalar, const Device device); +MPSGraphTensor* convertNHWCtoNCHW(MPSGraph* mpsGraph, MPSGraphTensor* tensor); +MPSGraphTensor* castMPSTensor(MPSGraph* mpsGraph, MPSGraphTensor* tensor, ScalarType toType); +MPSGraphTensor* castMPSTensor(MPSGraph* mpsGraph, MPSGraphTensor* tensor, MPSDataType toType); +MPSGraphTensorData* getMPSGraphTensorData(MPSGraph* mpsGraph, MPSStream* mpsStream, const TensorBase& tensor); +MPSGraphTensorData* getMPSGraphTensorFromScalar(MPSStream* mpsStream, MPSScalar& scalar); + +MPSGraph* make_mps_graph(); + +MPSGraphTensor* mpsGraphUnrankedPlaceHolder(MPSGraph* mpsGraph, MPSDataType dataType); +MPSGraphTensor* mpsGraphRankedPlaceHolder(MPSGraph* mpsGraph, MPSDataType dataType, MPSShape* mpsShape); +MPSGraphTensor* mpsGraphRankedPlaceHolder(MPSGraph* mpsGraph, const TensorBase& tensor); +MPSGraphTensor* mpsGraphScalarPlaceHolder(MPSGraph* mpsGraph, MPSDataType dataType); +MPSGraphTensor* mpsGraphScalarPlaceHolder(MPSGraph* mpsGraph, const Scalar& scalar); + +std::string get_mem_format_string(c10::MemoryFormat memory_format); + +using MPSCacheKey = uint64_t; + +struct MPSCachedKernel { + MPSCachedKernel(NSObject* object) : _object([object retain]) {} + virtual ~MPSCachedKernel() { + [_object release]; + _object = nullptr; + } + + 
// Delete copy constructor and assignment + MPSCachedKernel(const MPSCachedKernel&) = delete; + void operator=(const MPSCachedKernel&) = delete; + + template + inline T* kernel() const { + return (T*)_object; + } + + private: + NSObject* _object = nullptr; +}; + +// derive this class to cache a graph and its inputs/outputs +// can be used to store any NSObject +struct MPSCachedGraph { + MPSCachedGraph(NSObject* object) : _object([object retain]) {} + virtual ~MPSCachedGraph() { + [_object release]; + _object = nullptr; + } + + template + inline T* as() { + return static_cast(this); + } + + MPSGraph* graph() const { + return (MPSGraph*)_object; + } + NSObject* object() const { + return _object; + } + + private: + NSObject* _object = nullptr; +}; + +struct MPSUnaryCachedGraph : public MPSCachedGraph { + MPSUnaryCachedGraph(MPSGraph* graph) : MPSCachedGraph(graph) {} + MPSGraphTensor* inputTensor_ = nil; + MPSGraphTensor* outputTensor_ = nil; +}; + +struct MPSUnaryGradCachedGraph : public MPSCachedGraph { + MPSUnaryGradCachedGraph(MPSGraph* graph) : MPSCachedGraph(graph) {} + MPSGraphTensor* gradOutputTensor_ = nil; + MPSGraphTensor* inputTensor_ = nil; + MPSGraphTensor* outputTensor_ = nil; // some backward input is actually the forward's output + MPSGraphTensor* gradInputTensor_ = nil; +}; + +struct MPSBinaryCachedGraph : public MPSCachedGraph { + MPSBinaryCachedGraph(MPSGraph* graph) : MPSCachedGraph(graph) {} + MPSGraphTensor* inputTensor_ = nil; + MPSGraphTensor* otherTensor_ = nil; + MPSGraphTensor* outputTensor_ = nil; +}; + +struct MPSBinaryGradCachedGraph : public MPSCachedGraph { + MPSBinaryGradCachedGraph(MPSGraph* graph) : MPSCachedGraph(graph) {} + MPSGraphTensor* gradOutputTensor_ = nil; + MPSGraphTensor* inputTensor_ = nil; + MPSGraphTensor* otherTensor_ = nil; + MPSGraphTensor* gradInputTensor_ = nil; +}; + +struct MPSKernelCache { + typedef MPSCachedKernel* (^CreateCachedKernelBlock)(); + + struct CacheEntry { + CacheEntry(const std::string& key, 
MPSCachedKernel* cachedKernel) : cachedKernel_(cachedKernel), key_(key) {} + MPSCachedKernel* cachedKernel_ = nullptr; + std::string key_; + }; + + public: + static MPSKernelCache* getInstance() { + if (_instance_cache == nullptr) { + _instance_cache = new MPSKernelCache(); + } + return _instance_cache; + } + + ~MPSKernelCache() { + dispatch_release(serialQueue_); + for (const auto& i : cache_) { + delete i.second.cachedKernel_; + } + } + + // Disallow the copy constructor and operator= functions + MPSKernelCache(const MPSKernelCache&) = delete; + void operator=(const MPSKernelCache&) = delete; + + MPSCachedKernel* CreateCachedKernel(const std::string& key, CreateCachedKernelBlock createCacheBlock) { + __block MPSCachedKernel* cachedKernel = nil; + MPSCacheKey hash = std::hash{}(key); + dispatch_sync_with_rethrow(serialQueue_, ^() { + if (cache_.count(hash) != 0) { + auto& entry = cache_.at(hash); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(key == entry.key_, "Key collision in the MPS cached kernel!\n"); + cachedKernel = entry.cachedKernel_; + } else { + cachedKernel = createCacheBlock(); + CacheEntry entry(key, cachedKernel); + cache_.emplace(hash, entry); + } + }); + return cachedKernel; + } + template + inline T* CreateCachedKernelAs(const std::string& key, CreateCachedKernelBlock createCacheBlock) { + return static_cast(CreateCachedKernel(key, createCacheBlock)); + } + + MPSCachedKernel* LookUp(const std::string& key) const { + __block MPSCachedKernel* cachedKernel = nil; + + MPSCacheKey hash = std::hash{}(key); + dispatch_sync_with_rethrow(serialQueue_, ^() { + if (cache_.count(hash) != 0) { + auto& entry = cache_.at(hash); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(key == entry.key_, "Key collision in the MPS cached kernel!\n"); + cachedKernel = entry.cachedKernel_; + } + }); + return cachedKernel; + } + + template + inline T* LookUpAs(const std::string& key) const { + return static_cast(LookUp(key)); + } + + private: + MPSKernelCache() { + serialQueue_ = 
dispatch_queue_create("kernel cache queue", DISPATCH_QUEUE_SERIAL); + } + + static MPSKernelCache* _instance_cache; + std::unordered_map cache_; + dispatch_queue_t serialQueue_ = nullptr; +}; + +// Common template for creating cached kernel if missing +template +inline T* LookUpOrCreateCachedKernel(const std::string& key, std::function instantiate) { + auto cache_ = MPSKernelCache::getInstance(); + if (auto rc = cache_->LookUpAs(key)) { + return rc; + } + return cache_->CreateCachedKernelAs(key, ^mps::MPSCachedKernel*() { + auto k_ = new mps::MPSCachedKernel(instantiate()); + return k_; + }); +} + +// TODO: Improve the overall design of MPSGraphCache. +// https://github.com/pytorch/pytorch/issues/77176 +// Cache holding various keys mapped to graphs +struct MPSGraphCache { + typedef MPSCachedGraph* (^CreateCachedGraphBlock)(); + + struct CacheEntry { + CacheEntry(const std::string& key, MPSCachedGraph* cachedGraph) : cachedGraph_(cachedGraph), key_(key) {} + MPSCachedGraph* cachedGraph_ = nullptr; + std::string key_; + }; + + public: + static MPSGraphCache* getInstance() { + if (_instance_cache == nullptr) { + _instance_cache = new MPSGraphCache(); + } + return _instance_cache; + } + + ~MPSGraphCache() { + dispatch_release(serialQueue_); + + for (const auto& i : cache_) { + delete i.second.cachedGraph_; + } + } + + // Disallow the copy constructor and operator= functions + MPSGraphCache(const MPSGraphCache&) = delete; + void operator=(const MPSGraphCache&) = delete; + + MPSCachedGraph* CreateCachedGraph(const std::string& key, CreateCachedGraphBlock createCacheBlock) { + __block MPSCachedGraph* cachedGraph = nil; + + MPSCacheKey hash = std::hash{}(key); + + dispatch_sync_with_rethrow(serialQueue_, ^() { + // verify the cached entry doesn't already exist + if (cache_.count(hash) != 0) { + auto& entry = cache_.at(hash); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(key == entry.key_, "Key collision in the MPS cached graph!\n"); + cachedGraph = entry.cachedGraph_; + } else { + 
cachedGraph = createCacheBlock(); + CacheEntry entry(key, cachedGraph); + cache_.emplace(hash, entry); + profileCachedGraph(entry); + } + }); + return cachedGraph; + } + + template + inline T* CreateCachedGraphAs(const std::string& key, CreateCachedGraphBlock createCacheBlock) { + return static_cast(CreateCachedGraph(key, createCacheBlock)); + } + + MPSCachedGraph* LookUp(const std::string& key) const { + __block MPSCachedGraph* cachedGraph = nullptr; + + MPSCacheKey hash = std::hash{}(key); + + dispatch_sync(serialQueue_, ^() { + if (cache_.count(hash) != 0) { + auto& entry = cache_.at(hash); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(key == entry.key_, "Key collision in the MPS cached graph!\n"); + cachedGraph = entry.cachedGraph_; + profileCachedGraph(entry); + } + }); + return cachedGraph; + } + + template + inline T* LookUpAs(const std::string& key) const { + return static_cast(LookUp(key)); + } + + private: + MPSGraphCache() { + serialQueue_ = dispatch_queue_create("cache queue", DISPATCH_QUEUE_SERIAL); + } + // this is defined in OperationUtils.mm to not include + // MPSProfiler.h in header OperationUtils.h + void profileCachedGraph(const CacheEntry& cacheEntry) const; + + static MPSGraphCache* _instance_cache; + std::unordered_map cache_; + dispatch_queue_t serialQueue_ = nullptr; +}; + +// Common template for creating graph with a specified cache if missing +template +inline T* LookUpOrCreateCachedGraph(const std::string& key, std::function instantiate) { + auto cache_ = MPSGraphCache::getInstance(); + if (auto rc = cache_->LookUpAs(key)) { + return rc; + } + return cache_->CreateCachedGraphAs(key, ^mps::MPSCachedGraph*() { + T* newCachedGraph = nil; + @autoreleasepool { + // Initialize graph + auto mpsGraph = mps::make_mps_graph(); + newCachedGraph = new T(mpsGraph); + instantiate(mpsGraph, newCachedGraph); + } + return newCachedGraph; + }); +} + +// Common math operations +MPSGraphTensor* log1p(MPSGraph* mpsGraph, MPSGraphTensor* inputTensor); + +/** + * Returns 
distance from lowest to highest element offset in given tensor. + */ +size_t compute_storage_numel_distance(const TensorBase& t); + +/** + * Checks whether tensor is mapped to a contiguous area in the storage. + */ +inline bool is_dense_in_storage(const TensorBase& t) { + return compute_storage_numel_distance(t) == static_cast(t.numel()); +} + +template , encoder_t> || + std::is_same_v, encoder_t>>> +static inline void mtl_setBuffer(encoder_t encoder, const TensorBase& t, unsigned idx) { + if (C10_UNLIKELY(t.device().type() == kCPU)) { + if constexpr (std::is_same_v, encoder_t>) { + TORCH_CHECK(t.dim() == 0, "Passed CPU tensor to MPS op"); + // MPS does not support doubles, silently downcast CPU scalar to float + if (C10_UNLIKELY(t.scalar_type() == kDouble)) { + auto val = static_cast(*reinterpret_cast(t.const_data_ptr())); + [encoder setBytes:&val length:sizeof(val) atIndex:idx]; + return; + } + if (C10_UNLIKELY(t.scalar_type() == kComplexDouble)) { + auto val = static_cast>(*reinterpret_cast*>(t.const_data_ptr())); + [encoder setBytes:&val length:sizeof(val) atIndex:idx]; + return; + } + [encoder setBytes:t.storage().data() length:t.element_size() atIndex:idx]; + } else { + TORCH_CHECK(false, "Passed CPU tensor to MPS op"); + } + return; + } + [encoder setBuffer:getMTLBufferStorage(t) offset:t.storage_offset() * t.element_size() atIndex:idx]; +} + +// Implementation of setBytes for containers vs trivially copiable types must be separate +// Containers like `std::array` could have been uploaded directly, but `c10::ArrayRef`, +// while trivially copiable, includes padding which if copied as Metal shader parameters +// might overwrite other values +template < + typename T, + typename = std::enable_if_t || std::is_same_v || + (std::is_class_v && std::is_trivially_copyable_v && !detail::has_size_type_v)>> +static inline void mtl_setBytes(id encoder, const T val, unsigned idx) { + [encoder setBytes:&val length:sizeof(T) atIndex:idx]; +} + +template >> +static inline 
void mtl_setBytes(id encoder, const Container& values, unsigned idx) { + [encoder setBytes:values.data() length:sizeof(typename Container::value_type) * values.size() atIndex:idx]; +} + +static inline void mtl_setBytes(id encoder, const MPSScalar& s, unsigned idx) { + [encoder setBytes:&s.value length:s.size atIndex:idx]; +} + +static size_t iter_tensor_offset(TensorIteratorBase& iter, unsigned idx) { + // At the moment, MPS storage data is not the real GPU pointer, but rather a pointer to id object + // But TensorIterator constructs data_ptr as if base was just a raw pointer + // Workaround this problem by computing an offset from the start of the tensor, which works for both + // tensor views and sliced 64-bit iterators + return reinterpret_cast(iter.data_ptr(idx)) - + reinterpret_cast(iter.tensor_base(idx).storage().data()); +} + +static inline void bind_iter_tensors(id encoder, + TensorIteratorBase& iter, + std::optional ntensors = std::nullopt) { + for (auto idx : c10::irange(ntensors.value_or(iter.ntensors()))) { + auto& t = iter.tensor_base(idx); + // Handle CPU scalars + if (C10_UNLIKELY(t.device().type() == kCPU)) { + mtl_setBuffer(encoder, t, idx); + continue; + } + auto offs = iter_tensor_offset(iter, idx); + [encoder setBuffer:getMTLBufferStorage(t) offset:offs atIndex:idx]; + } +} + +namespace detail { +template +inline void mtl_setArg(id encoder, const T& val, unsigned idx) { + mtl_setBytes(encoder, val, idx); +} + +inline void mtl_setArg(id encoder, id val, unsigned idx) { + [encoder setBuffer:val offset:0 atIndex:idx]; +} + +template <> +inline void mtl_setArg(id encoder, const Tensor& val, unsigned idx) { + mtl_setBuffer(encoder, val, idx); +} + +template <> +inline void mtl_setArg(id encoder, const std::optional& val, unsigned idx) { + if (val.has_value()) { + mtl_setBuffer(encoder, val.value(), idx); + } +} + +template <> +inline void mtl_setArg(id encoder, const TensorBase& val, unsigned idx) { + mtl_setBuffer(encoder, val, idx); +} +// MPS does 
not support doubles, so cast it down to float before passing as an argument +template <> +inline void mtl_setArg(id encoder, const double& val, unsigned idx) { + float val_f = static_cast(val); + mtl_setBytes(encoder, val_f, idx); +} +} // namespace detail + +template +static inline void mtl_setArgs(id encoder, const T& val) { + detail::mtl_setArg(encoder, val, idx); +} + +template +static inline void mtl_setArgs(id encoder, const T& val, Args&&... args) { + detail::mtl_setArg(encoder, val, idx); + mtl_setArgs(encoder, std::forward(args)...); +} + +static inline void mtl_dispatch1DJob(id encoder, + id cplState, + NSUInteger length) { + static_assert(sizeof(NSUInteger) == sizeof(uint64_t)); + const auto maxThreadsPerGroup = [cplState maxTotalThreadsPerThreadgroup]; + auto size = MTLSizeMake(length, 1, 1); + auto threadGroupSize = MTLSizeMake(std::min(maxThreadsPerGroup, length), 1, 1); + [encoder dispatchThreads:size threadsPerThreadgroup:threadGroupSize]; +} + +id generateKernelDataOffsets(id commandEncoder, + const TensorIteratorBase& iter, + bool use_64bit_index = false); + +inline NSDictionary* dictionaryFromPlaceholders(Placeholder& p1) { + return @{p1.getMPSGraphTensor() : p1.getMPSGraphTensorData()}; +} + +inline NSDictionary* dictionaryFromPlaceholders(Placeholder& p1, Placeholder& p2) { + return @{ + p1.getMPSGraphTensor() : p1.getMPSGraphTensorData(), + p2.getMPSGraphTensor() : p2.getMPSGraphTensorData(), + }; +} + +inline NSDictionary* dictionaryFromPlaceholders(Placeholder& p1, Placeholder& p2, Placeholder& p3) { + return @{ + p1.getMPSGraphTensor() : p1.getMPSGraphTensorData(), + p2.getMPSGraphTensor() : p2.getMPSGraphTensorData(), + p3.getMPSGraphTensor() : p3.getMPSGraphTensorData(), + }; +} + +inline NSDictionary* dictionaryFromPlaceholders(Placeholder& p1, Placeholder& p2, Placeholder& p3, Placeholder& p4) { + return @{ + p1.getMPSGraphTensor() : p1.getMPSGraphTensorData(), + p2.getMPSGraphTensor() : p2.getMPSGraphTensorData(), + 
p3.getMPSGraphTensor() : p3.getMPSGraphTensorData(), + p4.getMPSGraphTensor() : p4.getMPSGraphTensorData(), + }; +} + +inline void runMPSGraph(MPSStream* stream, MPSGraph* graph, NSDictionary* feeds, Placeholder& result) { + runMPSGraph(stream, graph, feeds, dictionaryFromPlaceholders(result)); +} + +// MPS yet to support double types, but starting from MacOS 14, supports bfloat16 +inline bool supportedFloatingType(ScalarType dtype) { + return dtype == kFloat || dtype == kHalf || dtype == kBFloat16; +} + +inline bool supportedFloatingType(const TensorBase& t) { + return supportedFloatingType(t.scalar_type()); +} + +inline bool supportedFloatingOrComplexType(ScalarType dtype) { + if (dtype == kComplexFloat || dtype == kComplexHalf) { + return true; + } + return supportedFloatingType(dtype); +} +inline bool supportedFloatingOrComplexType(const TensorBase& t) { + return supportedFloatingOrComplexType(t.scalar_type()); +} + +inline bool needsGather(const TensorBase& t) { + static const bool is_macOS_15_0_or_newer = is_macos_13_or_newer(MacOSVersion::MACOS_VER_15_0_PLUS); + return !is_macOS_15_0_or_newer && (!t.is_contiguous() || t.storage_offset()); +} + +template +void MetalShaderLibrary::exec_unary_kernel_with_params(TensorIteratorBase& iter, + const std::string& name, + T params, + const std::string& params_type_name) { + using namespace at::mps; + // Decompose 64-bit tensor into 32-bit ones + if (!iter.can_use_32bit_indexing()) { + for (auto&& sub_iter : iter.with_32bit_indexing()) { + exec_unary_kernel_with_params(sub_iter, name, params, params_type_name); + } + return; + } + + auto inputTensor = iter.input(0); + auto outputTensor = iter.output(0); + uint32_t length = iter.numel(); + if (length == 0) { + return; + } + auto kernel_name = fmt::format("{}_{}_{}_{}{}", + name, + iter.is_contiguous() ? 
"dense" : "strided", + scalarToMetalTypeString(outputTensor), + scalarToMetalTypeString(inputTensor), + fmt::format("_{}", params_type_name)); + @autoreleasepool { + auto cplState = getPipelineStateForFunc(kernel_name); + + MPSStream* mpsStream = getCurrentMPSStream(); + dispatch_sync(mpsStream->queue(), ^() { + auto computeEncoder = mpsStream->commandEncoder(); + + getMPSProfiler().beginProfileKernel(cplState, name, {inputTensor}); + + [computeEncoder setComputePipelineState:cplState]; + bind_iter_tensors(computeEncoder, iter); + if (!iter.is_contiguous()) { + mtl_setArgs<2>(computeEncoder, + outputTensor.sizes(), + inputTensor.strides(), + outputTensor.strides(), + inputTensor.ndimension()); + } + detail::mtl_setArg(computeEncoder, params, iter.is_contiguous() ? 2 : 6); + mtl_dispatch1DJob(computeEncoder, cplState, length); + + getMPSProfiler().endProfileKernel(cplState); + }); + } +} + +template +void MetalShaderLibrary::exec_binary_kernel_with_params(TensorIteratorBase& iter, + const std::string& name, + T params, + const std::string& params_type_name) { + using namespace mps; + // TODO: Figure a better place to downcast double scalars (probably in tensor iterator itself?) 
+ // Right now running something like 1.0-torch.rand(5, device='mps') will create iterator with + // double as common dtype (because Python floating point are always 64-bit values) + TORCH_CHECK(iter.output().scalar_type() != at::kDouble, "float64 is not supported on MPS"); + + // Skip for empty iterators + if (iter.numel() == 0) { + return; + } + + // Decompose 64-bit tensor into 32-bit ones + if (!iter.can_use_32bit_indexing()) { + for (auto&& sub_iter : iter.with_32bit_indexing()) { + exec_binary_kernel_with_params(sub_iter, name, params, params_type_name); + } + return; + } + + auto convert_double_scalar = [](Tensor& t) { + if (t.dim() != 0) { + return; + } + if (t.scalar_type() == kDouble) { + t = t.to(kFloat); + } else if (t.scalar_type() == kComplexDouble) { + t = t.to(kComplexFloat); + } + }; + + Tensor input = iter.input(0); + Tensor other = iter.input(1); + Tensor out = iter.output(); + + convert_double_scalar(input); + convert_double_scalar(other); + + MPSStream* mpsStream = getCurrentMPSStream(); + const auto cast_needed = input.scalar_type() != other.scalar_type(); + const auto suffix = iter.is_contiguous() ? "dense" : "strided"; + // TODO: Implicitly pass both input and output types to non-cast kernels + const auto kernel_name = cast_needed + ? 
fmt::format("{}_{}_cast_{}_{}", name, suffix, scalarToMetalTypeString(out), params_type_name) + : fmt::format("{}_{}_{}_{}_{}", + name, + suffix, + scalarToMetalTypeString(out), + scalarToMetalTypeString(input), + params_type_name); + dispatch_sync_with_rethrow(mpsStream->queue(), ^() { + @autoreleasepool { + auto computeEncoder = mpsStream->commandEncoder(); + auto binaryPSO = getPipelineStateForFunc(kernel_name); + // this function call is a no-op if MPS Profiler is not enabled + getMPSProfiler().beginProfileKernel(binaryPSO, kernel_name, {input, other}); + [computeEncoder setComputePipelineState:binaryPSO]; + // Set input and output tensors + bind_iter_tensors(computeEncoder, iter); + // Iterator is contiguous if all of its elements are dense in storage, + // i.e. it's true for both row-first and column-first tensors + if (iter.is_contiguous()) { + detail::mtl_setArg(computeEncoder, params, 3); + if (cast_needed) { + std::array size_and_types = {static_cast(c10::elementSize(input.scalar_type())), + static_cast(c10::elementSize(other.scalar_type())), + static_cast(input.scalar_type()), + static_cast(other.scalar_type())}; + mtl_setBytes(computeEncoder, size_and_types, 4); + } + } else { + // Please note that shapes and strides of the iterator might be + // different than that of its operands, for example binary op + // between 4x4 tensor and scalar will result in 1D 16 element iterator + std::array ndim_and_types = {iter.ndim(), + static_cast(input.scalar_type()), + static_cast(other.scalar_type()), + static_cast(out.scalar_type())}; + mtl_setArgs<3>( + computeEncoder, params, iter.shape(), iter.strides(0), iter.strides(1), iter.strides(2), ndim_and_types); + } + mtl_dispatch1DJob(computeEncoder, binaryPSO, iter.numel()); + getMPSProfiler().endProfileKernel(binaryPSO); + } + }); +} + +// Checks if one tensor is broadcastable into another +static bool is_dense_broadcastable(const Tensor& from, const Tensor& into) { + if (!from.is_contiguous() || 
!into.is_contiguous()) { + return false; + } + bool checking_squeezable_dims = false; + for (const auto dim : c10::irange(from.ndimension())) { + if (checking_squeezable_dims) { + if (from.size(-dim - 1) == 1) { + continue; + } + return false; + } + checking_squeezable_dims = from.size(-dim - 1) != into.size(-dim - 1); + } + return true; +} + +} // namespace at::native::mps + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/mps/TensorFactory.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/mps/TensorFactory.h new file mode 100644 index 0000000000000000000000000000000000000000..8f41c9b54e9a1651f5466ef0d0687a922a24a3e5 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/mps/TensorFactory.h @@ -0,0 +1,19 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +// Copyright © 2022 Apple Inc. + +#define AT_DISPATCH_MPS_TYPES(TYPE, NAME, ...) \ + AT_DISPATCH_SWITCH( \ + TYPE, \ + NAME, \ + AT_DISPATCH_CASE(at::ScalarType::Float, __VA_ARGS__) AT_DISPATCH_CASE( \ + at::ScalarType::Half, \ + __VA_ARGS__) AT_DISPATCH_CASE(at::ScalarType::BFloat16, __VA_ARGS__) \ + AT_DISPATCH_CASE(at::ScalarType::Long, __VA_ARGS__) \ + AT_DISPATCH_CASE(at::ScalarType::Int, __VA_ARGS__) \ + AT_DISPATCH_CASE(at::ScalarType::Short, __VA_ARGS__) \ + AT_DISPATCH_CASE(at::ScalarType::Char, __VA_ARGS__) \ + AT_DISPATCH_CASE(at::ScalarType::Byte, __VA_ARGS__)) + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/mps/kernels/Activation.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/mps/kernels/Activation.h new file mode 100644 index 0000000000000000000000000000000000000000..39742b47768ec19ce12c2b74833b65abff0bab69 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/mps/kernels/Activation.h @@ -0,0 +1,21 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +template +struct ELUParams { + T alpha; + T scale; + T input_scale; +}; + +template +struct ELUBackwardParams { + T alpha; + T scale; + T input_scale; + bool is_result; +}; + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/mps/kernels/EmbeddingBag.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/mps/kernels/EmbeddingBag.h new file mode 100644 index 0000000000000000000000000000000000000000..e1f50d0950ee478fda8d349164e35a8932ed7264 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/mps/kernels/EmbeddingBag.h @@ -0,0 +1,53 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +#include + +#ifdef __METAL__ +enum class EmbeddingBagMode { SUM = 0, MEAN, MAX }; +#else +#include +using at::native::EmbeddingBagMode; +#endif + +template +struct EmbeddingBagParams { + ::c10::metal::array weight_strides; + ::c10::metal::array output_strides; + ::c10::metal::array max_indices_strides; + + bool use_per_sample_weights; + idx_type_t per_sample_weights_stride; + + idx_type_t num_indices; + 
idx_type_t num_bags; + idx_type_t feature_size; + idx_type_t num_weights; + + EmbeddingBagMode mode; + int64_t padding_idx; +}; + +template +struct EmbeddingBagBackwardParams { + ::c10::metal::array weight_grad_strides; + ::c10::metal::array output_grad_strides; + ::c10::metal::array max_indices_strides; + bool use_per_sample_weights; + idx_type_t per_sample_weights_stride; + idx_type_t feature_size; + EmbeddingBagMode mode; + int64_t padding_idx; +}; + +template +struct EmbeddingBagPerSampleWeightsBackwardParams { + ::c10::metal::array output_grad_strides; + ::c10::metal::array weight_strides; + idx_type_t per_sample_weights_grad_stride; + idx_type_t feature_size; + int64_t padding_idx; +}; + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/mps/kernels/GridSampler.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/mps/kernels/GridSampler.h new file mode 100644 index 0000000000000000000000000000000000000000..2d4c3f2beacf9097e0721729e08829cf638d2a05 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/mps/kernels/GridSampler.h @@ -0,0 +1,30 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +#include + +#ifdef __METAL__ +enum class GridSamplerInterpolation { Bilinear, Nearest, Bicubic }; +enum class GridSamplerPadding { Zeros, Border, Reflection }; +#else +#include +using at::native::GridSamplerInterpolation; +using at::native::GridSamplerPadding; +#endif + +template +struct GridSamplerParams { + int32_t sampler_dims; + ::c10::metal::array output_sizes; + ::c10::metal::array output_strides; + ::c10::metal::array input_sizes; + ::c10::metal::array input_strides; + ::c10::metal::array grid_sizes; + 
::c10::metal::array grid_strides; + GridSamplerInterpolation interpolation_mode; + GridSamplerPadding padding_mode; + bool align_corners; +}; + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/mps/kernels/LinearAlgebra.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/mps/kernels/LinearAlgebra.h new file mode 100644 index 0000000000000000000000000000000000000000..238252b54a734dddfc8c34637269accaa98e7ccc --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/mps/kernels/LinearAlgebra.h @@ -0,0 +1,27 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +#include + +template +struct OrgqrParams { + int32_t num_batch_dims; + + uint32_t m; + uint32_t n; + uint32_t k; + + ::c10::metal::array A_strides; + ::c10::metal::array tau_strides; + ::c10::metal::array H_strides; + ::c10::metal::array H_sizes; +}; + +struct UnpackPivotsParams { + uint32_t perm_batch_stride; + uint32_t pivots_batch_stride; + uint32_t dim_size; +}; + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/mps/kernels/Pooling.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/mps/kernels/Pooling.h new file mode 100644 index 0000000000000000000000000000000000000000..192805eb79413356fe3a5f89ce0a8fe6fcc06498 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/mps/kernels/Pooling.h @@ -0,0 +1,66 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +#include + +// N is the maximum allowed number of dimensions in the input and outputs. The +// maximum allowed pooling dimensions is N-2, because the input may have up to 2 +// leading dimensions that are not pooled. To support up to 3-D pooling, N=5 is +// the default. +template +struct PoolingParams { + int32_t dims; + int32_t pooling_dims; + ::c10::metal::array input_sizes; + ::c10::metal::array input_strides; + ::c10::metal::array output_sizes; + ::c10::metal::array output_strides; + ::c10::metal::array indices_sizes; + ::c10::metal::array indices_strides; + ::c10::metal::array kernel_size; + ::c10::metal::array stride; + ::c10::metal::array padding; + ::c10::metal::array dilation; + bool return_indices; +}; + +template +struct AvgPoolingParams { + int32_t dims; + int32_t pooling_dims; + ::c10::metal::array input_sizes; + ::c10::metal::array input_strides; + ::c10::metal::array output_sizes; + ::c10::metal::array output_strides; + ::c10::metal::array kernel_size; + ::c10::metal::array stride; + ::c10::metal::array padding; + bool count_include_pad; + bool has_divisor_override; + int32_t divisor_override; +}; + +template +struct PoolingBackwardParams { + int32_t dims; + int32_t pooling_dims; + ::c10::metal::array grad_input_sizes; + ::c10::metal::array grad_input_strides; + ::c10::metal::array grad_output_sizes; + ::c10::metal::array 
grad_output_strides; + ::c10::metal::array indices_strides; +}; + +template +struct MaxUnpoolingParams { + int32_t dims; + int32_t pooling_dims; + ::c10::metal::array input_sizes; + ::c10::metal::array input_strides; + ::c10::metal::array output_sizes; + ::c10::metal::array output_strides; + ::c10::metal::array indices_strides; +}; + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/mps/kernels/Shape.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/mps/kernels/Shape.h new file mode 100644 index 0000000000000000000000000000000000000000..e8370f69cd48c67beb1dd044a2392c1766f3441d --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/mps/kernels/Shape.h @@ -0,0 +1,23 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +#include + +template +struct CatSharedParams { + int32_t ndim; + int32_t cat_dim; + ::c10::metal::array output_strides; + ::c10::metal::array output_sizes; +}; + +template +struct CatInputParams { + idx_type_t cat_dim_offset; + idx_type_t input_element_offset; + ::c10::metal::array input_strides; + ::c10::metal::array input_sizes; +}; + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/mps/kernels/TensorCompare.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/mps/kernels/TensorCompare.h new file mode 100644 index 0000000000000000000000000000000000000000..66970743448e48dd55292912f4137c9eafe251a1 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/mps/kernels/TensorCompare.h @@ -0,0 +1,12 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +template +struct ClampScalarParams { + T min; + T max; +}; + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/mps/kernels/UpSample.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/mps/kernels/UpSample.h new file mode 100644 index 0000000000000000000000000000000000000000..14bbe274b5139017f303f705aa6d2f29ca810826 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/mps/kernels/UpSample.h @@ -0,0 +1,17 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +#include + +template +struct UpsampleParams { + ::c10::metal::array input_strides; + ::c10::metal::array input_sizes; + ::c10::metal::array output_strides; + ::c10::metal::array output_sizes; + ::c10::metal::array scales; + bool align_corners; +}; + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/mps/operations/BinaryKernel.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/mps/operations/BinaryKernel.h new file mode 100644 index 0000000000000000000000000000000000000000..bfedae8bb7dbbd5ebb1772612344e9a7de85af32 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/mps/operations/BinaryKernel.h @@ -0,0 +1,15 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +namespace at::native::mps { +void binary_op_kernel( + const std::string func_name, + const Tensor& input, + const Tensor& other, + const Tensor& output, + const std::optional alpha = std::nullopt); +} // namespace at::native::mps + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/mps/operations/FusedAdamAmsgradKernelImpl.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/mps/operations/FusedAdamAmsgradKernelImpl.h new file mode 100644 index 0000000000000000000000000000000000000000..e7d8ab12f3b09e6dddfc87447729916bf1a2c065 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/mps/operations/FusedAdamAmsgradKernelImpl.h @@ -0,0 +1,43 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +#include + +namespace at::native::mps { + +void _fused_adam_amsgrad_mps_impl_( + TensorList params, + TensorList grads, + TensorList exp_avgs, + TensorList exp_avg_sqs, + TensorList max_exp_avg_sqs, + TensorList state_steps, + const double lr, + const double beta1, + const double beta2, + const double weight_decay, + const double eps, + const bool maximize, + const std::optional& grad_scale, + const std::optional& found_inf); + +void _fused_adam_amsgrad_mps_impl_( + TensorList params, + TensorList grads, + TensorList exp_avgs, + TensorList exp_avg_sqs, + TensorList max_exp_avg_sqs, + TensorList state_steps, + const at::Tensor& lr, + const double beta1, + const double beta2, + const double weight_decay, + const double eps, + const bool maximize, + const std::optional& grad_scale, + const std::optional& found_inf); + +} // namespace at::native::mps + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/mps/operations/FusedAdamKernelImpl.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/mps/operations/FusedAdamKernelImpl.h new file mode 100644 index 0000000000000000000000000000000000000000..457cbbc46c85b4d204d5715672fe054db5229288 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/mps/operations/FusedAdamKernelImpl.h @@ -0,0 +1,40 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +#include + +namespace at::native::mps { + +void _fused_adam_mps_impl_( + TensorList params, + TensorList grads, + TensorList exp_avgs, + TensorList exp_avg_sqs, + TensorList state_steps, + const double lr, + const double beta1, + const double beta2, + const double weight_decay, + const double eps, + const bool maximize, + const std::optional& grad_scale, + const std::optional& found_inf); + +void _fused_adam_mps_impl_( + TensorList params, + TensorList grads, + TensorList exp_avgs, + TensorList exp_avg_sqs, + TensorList state_steps, + const Tensor& lr, + const double beta1, + const double beta2, + const double weight_decay, + const double eps, + const bool maximize, + const std::optional& grad_scale, + const std::optional& found_inf); +} // namespace at::native::mps + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/mps/operations/FusedAdamWAmsgradKernelImpl.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/mps/operations/FusedAdamWAmsgradKernelImpl.h new file mode 100644 index 0000000000000000000000000000000000000000..80d66bc90a748d6fe5c4ed8ad15a7e283e47c294 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/mps/operations/FusedAdamWAmsgradKernelImpl.h @@ -0,0 +1,42 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +#include + +namespace at::native::mps { + +void _fused_adamw_amsgrad_mps_impl_( + TensorList params, + TensorList grads, + TensorList exp_avgs, + TensorList exp_avg_sqs, + TensorList max_exp_avg_sqs, + TensorList state_steps, + const double lr, + const double beta1, + const double beta2, + const double weight_decay, + const double eps, + const bool maximize, + const std::optional& grad_scale, + const std::optional& found_inf); + +void _fused_adamw_amsgrad_mps_impl_( + TensorList params, + TensorList grads, + TensorList exp_avgs, + TensorList exp_avg_sqs, + TensorList max_exp_avg_sqs, + TensorList state_steps, + const Tensor& lr, + const double beta1, + const double beta2, + const double weight_decay, + const double eps, + const bool maximize, + const std::optional& grad_scale, + const std::optional& found_inf); +} // namespace at::native::mps + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/mps/operations/FusedAdamWKernelImpl.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/mps/operations/FusedAdamWKernelImpl.h new file mode 100644 index 0000000000000000000000000000000000000000..94bc73bb1d5991653f57b16d52e7c48a55fa6904 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/mps/operations/FusedAdamWKernelImpl.h @@ -0,0 +1,41 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +#include + +namespace at::native::mps { + +void _fused_adamw_mps_impl_( + TensorList params, + TensorList grads, + TensorList exp_avgs, + TensorList exp_avg_sqs, + TensorList state_steps, + const double lr, + const double beta1, + const double beta2, + const double weight_decay, + const double eps, + const bool maximize, + const std::optional& grad_scale, + const std::optional& found_inf); + +void _fused_adamw_mps_impl_( + TensorList params, + TensorList grads, + TensorList exp_avgs, + TensorList exp_avg_sqs, + TensorList state_steps, + const Tensor& lr, + const double beta1, + const double beta2, + const double weight_decay, + const double eps, + const bool maximize, + const std::optional& grad_scale, + const std::optional& found_inf); + +} // namespace at::native::mps + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/mps/operations/MultiTensorApply.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/mps/operations/MultiTensorApply.h new file mode 100644 index 0000000000000000000000000000000000000000..c156a5789ede7b044a3a10c9b7ca9efec699adc3 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/mps/operations/MultiTensorApply.h @@ -0,0 +1,367 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +#include +#include +#include + +static_assert(sizeof(bool) == 1); + +namespace at::native::mps { + +static constexpr int64_t kChunkSize = 65536; +static constexpr int64_t kmaxThreadGroups = 32; +static constexpr int64_t kmaxTensors = 32; + +struct MetadataArguments { // the size of this struct must be less than 4 kilobytes + uint64_t numels[kmaxTensors]; + uint64_t threadgroup_to_tensor[kmaxThreadGroups]; + uint64_t threadgroup_to_chunk[kmaxThreadGroups]; +}; + +struct FusedAdamEncodingFunctor { + void operator()(id& computeEncoder, + id& tensorArgumentBuffer, + const MetadataArguments& metadata_arguments, + const double lr, + const double beta1, + const double beta2, + const double weight_decay, + const double eps, + const bool maximize) const { + mtl_setArgs( + computeEncoder, tensorArgumentBuffer, metadata_arguments, lr, beta1, beta2, weight_decay, eps, maximize); + } + + void operator()(id& computeEncoder, + id& tensorArgumentBuffer, + const MetadataArguments& metadata_arguments, + const at::Tensor& lr, + const double beta1, + const double beta2, + const double weight_decay, + const double eps, + const bool maximize) const { + mtl_setArgs( + computeEncoder, tensorArgumentBuffer, metadata_arguments, lr, beta1, beta2, weight_decay, eps, maximize); + } +}; + +template +struct FusedSgdEncodingFunctor 
{}; + +template <> +struct FusedSgdEncodingFunctor { + void operator()(id& computeEncoder, + id& tensorArgumentBuffer, + const MetadataArguments& metadata_arguments, + const double weight_decay, + const double momentum, + const double lr, + const double dampening, + const bool nesterov, + const bool maximize, + const bool is_first_step) const { + mtl_setArgs(computeEncoder, + tensorArgumentBuffer, + metadata_arguments, + weight_decay, + momentum, + lr, + dampening, + nesterov, + maximize, + is_first_step); + } + + void operator()(id& computeEncoder, + id& tensorArgumentBuffer, + const MetadataArguments& metadata_arguments, + const double weight_decay, + const double momentum, + const at::Tensor& lr, + const double dampening, + const bool nesterov, + const bool maximize, + const bool is_first_step) const { + mtl_setArgs(computeEncoder, + tensorArgumentBuffer, + metadata_arguments, + weight_decay, + momentum, + lr, + dampening, + nesterov, + maximize, + is_first_step); + } +}; + +template <> +struct FusedSgdEncodingFunctor { + void operator()(id& computeEncoder, + id& tensorArgumentBuffer, + const MetadataArguments& metadata_arguments, + const double weight_decay, + const double lr, + const bool maximize) const { + mtl_setArgs(computeEncoder, tensorArgumentBuffer, metadata_arguments, weight_decay, lr, maximize); + } + + void operator()(id& computeEncoder, + id& tensorArgumentBuffer, + const MetadataArguments& metadata_arguments, + const double weight_decay, + const at::Tensor& lr, + const bool maximize) const { + mtl_setArgs(computeEncoder, tensorArgumentBuffer, metadata_arguments, weight_decay, lr, maximize); + } +}; + +std::pair, id> getFusedAdamCPLState(const std::string& fname); +template +static void multi_tensor_apply_for_fused_optimizer(const std::string& kernel_name, + std::vector>& tensor_lists, + at::TensorList state_steps, + encoder_func_t encode, + ArgTypes... 
args) { + const auto num_tensors = tensor_lists[0].size(); + + if (num_tensors == 0) { + return; + } + + TORCH_CHECK(tensor_lists.size() == depth, "Number of tensor lists has to match the depth"); + for (const auto& d : c10::irange(depth)) { + const auto scalar_type = tensor_lists[d][0].scalar_type(); + TORCH_CHECK(scalar_type == kFloat || scalar_type == kHalf || scalar_type == kBFloat16, + "Only float, bfloat and half are supported"); + } + + id device = MPSDevice::getInstance()->device(); + MPSStream* mpsStream = getCurrentMPSStream(); + + // Remove comment for debugging + /* + mpsStream->addCompletedHandler(^(id cb) { + [cb.logs enumerateObjectsUsingBlock:^(NSString* log, NSUInteger idx, BOOL* stop) { + NSLog(@"MPSStream: %@", log); + } + ]; + }); + */ + + dispatch_sync_with_rethrow(mpsStream->queue(), ^() { + @autoreleasepool { + id computeEncoder = mpsStream->commandEncoder(); + auto [fusedOptimizerPSO, fusedOptimizerFunc] = getFusedAdamCPLState(kernel_name); + + // this function call is a no-op if MPS Profiler is not enabled + getMPSProfiler().beginProfileKernel(fusedOptimizerPSO, kernel_name, {tensor_lists[0]}); + + [computeEncoder setComputePipelineState:fusedOptimizerPSO]; + + // BufferIndex is the index in the kernel function + auto tensorArgumentEncoder = [[fusedOptimizerFunc newArgumentEncoderWithBufferIndex:0] autorelease]; + id tensorArgumentBuffer = [[device newBufferWithLength:tensorArgumentEncoder.encodedLength + options:0] autorelease]; + [tensorArgumentEncoder setArgumentBuffer:tensorArgumentBuffer offset:0]; + + int64_t tensor_loc = 0; + int64_t threadgroup_loc = 0; + MetadataArguments metadata_arguments; + + for (const auto tensor_index : c10::irange(num_tensors)) { + // short-circuit to avoid adding empty tensors to tensorListMeta + if (tensor_lists[0][tensor_index].numel() == 0) { + continue; + } + + for (const auto& d : c10::irange(depth)) { + mtl_setBuffer(tensorArgumentEncoder, tensor_lists[d][tensor_index], d * kmaxTensors + tensor_loc); 
+ [computeEncoder useResource:getMTLBufferStorage(tensor_lists[d][tensor_index]) + usage:MTLResourceUsageRead | MTLResourceUsageWrite]; + } + if (!state_steps.empty()) { + mtl_setBuffer(tensorArgumentEncoder, state_steps[tensor_index], depth * kmaxTensors + tensor_loc); + [computeEncoder useResource:getMTLBufferStorage(state_steps[tensor_index]) usage:MTLResourceUsageRead]; + } + metadata_arguments.numels[tensor_loc] = tensor_lists[0][tensor_index].numel(); + + tensor_loc++; + + const auto numel = tensor_lists[0][tensor_index].numel(); + const auto chunks = numel / kChunkSize + (numel % kChunkSize != 0); + TORCH_CHECK(chunks > -1); + + for (const auto& chunk : c10::irange(chunks)) { + metadata_arguments.threadgroup_to_tensor[threadgroup_loc] = tensor_loc - 1; + metadata_arguments.threadgroup_to_chunk[threadgroup_loc] = chunk; + + threadgroup_loc++; + + const auto tensor_full = tensor_loc == kmaxTensors && chunk == chunks - 1; + // Reach the maximum threadgroups per dispatch + const auto blocks_full = threadgroup_loc == kmaxThreadGroups; + + if (tensor_full || blocks_full) { + encode(computeEncoder, tensorArgumentBuffer, metadata_arguments, args...); + MTLSize gridSize = MTLSizeMake(threadgroup_loc, 1, 1); + uint32_t maxThreadsPerGroup = [fusedOptimizerPSO maxTotalThreadsPerThreadgroup]; + MTLSize threadGroupSize = MTLSizeMake(std::min(maxThreadsPerGroup, kThreadGroupSize), 1, 1); + [computeEncoder dispatchThreadgroups:gridSize threadsPerThreadgroup:threadGroupSize]; + + // Reset + threadgroup_loc = 0; + if (chunk == chunks - 1) { + // last chunk + tensor_loc = 0; + tensorArgumentBuffer = [[device newBufferWithLength:tensorArgumentEncoder.encodedLength + options:0] autorelease]; + [tensorArgumentEncoder setArgumentBuffer:tensorArgumentBuffer offset:0]; + } else { + // reuse the current tensor since the current one isn't done. 
+ metadata_arguments.numels[0] = metadata_arguments.numels[tensor_loc - 1]; + + tensorArgumentBuffer = [[device newBufferWithLength:tensorArgumentEncoder.encodedLength + options:0] autorelease]; + [tensorArgumentEncoder setArgumentBuffer:tensorArgumentBuffer offset:0]; + + for (const auto& d : c10::irange(depth)) { + mtl_setBuffer(tensorArgumentEncoder, tensor_lists[d][tensor_index], d * kmaxTensors); + [computeEncoder useResource:getMTLBufferStorage(tensor_lists[d][tensor_index]) + usage:MTLResourceUsageWrite | MTLResourceUsageRead]; + } + if (!state_steps.empty()) { + mtl_setBuffer(tensorArgumentEncoder, state_steps[tensor_index], depth * kmaxTensors); + [computeEncoder useResource:getMTLBufferStorage(state_steps[tensor_index]) usage:MTLResourceUsageRead]; + } + tensor_loc = 1; + } + } + } + } + + if (threadgroup_loc != 0) { + encode(computeEncoder, tensorArgumentBuffer, metadata_arguments, args...); + MTLSize gridSize = MTLSizeMake(threadgroup_loc, 1, 1); + uint32_t maxThreadsPerGroup = [fusedOptimizerPSO maxTotalThreadsPerThreadgroup]; + MTLSize threadGroupSize = MTLSizeMake(std::min(maxThreadsPerGroup, kThreadGroupSize), 1, 1); + [computeEncoder dispatchThreadgroups:gridSize threadsPerThreadgroup:threadGroupSize]; + } + + getMPSProfiler().endProfileKernel(fusedOptimizerPSO); + } + }); +} + +std::pair, id> getAmpCPLState(const std::string& fname); +template +void multi_tensor_apply(const std::string& kernel_name, + std::vector>& tensor_lists, + ArgTypes... 
args) { + const auto num_tensors = tensor_lists[0].size(); + if (num_tensors == 0) { + return; + } + + TORCH_CHECK(tensor_lists.size() == depth, "Number of tensor lists must match depth."); + + id device = MPSDevice::getInstance()->device(); + MPSStream* mpsStream = getCurrentMPSStream(); + + dispatch_sync_with_rethrow(mpsStream->queue(), ^() { + @autoreleasepool { + id computeEncoder = mpsStream->commandEncoder(); + auto [pipeline, function] = getAmpCPLState(kernel_name); + [computeEncoder setComputePipelineState:pipeline]; + + id argumentEncoder = [function newArgumentEncoderWithBufferIndex:0]; + auto tensorArgumentBuffer = [[device newBufferWithLength:argumentEncoder.encodedLength options:0] autorelease]; + [argumentEncoder setArgumentBuffer:tensorArgumentBuffer offset:0]; + + int tensor_loc = 0; + int threadgroup_loc = 0; + MetadataArguments metadata_arguments; + std::memset(&metadata_arguments, 0, sizeof(metadata_arguments)); + + for (size_t t = 0; t < num_tensors; t++) { + if (tensor_lists[0][t].numel() == 0) + continue; + + // bind each tensor in this list to the correct slots across depths + for (int d = 0; d < depth; d++) { + mtl_setBuffer(argumentEncoder, tensor_lists[d][t], d * kmaxTensors + tensor_loc); + [computeEncoder useResource:getMTLBufferStorage(tensor_lists[d][t]) + usage:(MTLResourceUsageRead | MTLResourceUsageWrite)]; + } + + // save number of elements for this tensor + metadata_arguments.numels[tensor_loc] = tensor_lists[0][t].numel(); + int currentTensorIndex = tensor_loc; + tensor_loc++; + + const auto numel = tensor_lists[0][t].numel(); + const auto chunks = numel / kChunkSize + ((numel % kChunkSize) ? 
1 : 0); + + // process tensor in chunks based on max chunk size + for (uint chunk = 0; chunk < chunks; chunk++) { + metadata_arguments.threadgroup_to_tensor[threadgroup_loc] = currentTensorIndex; + metadata_arguments.threadgroup_to_chunk[threadgroup_loc] = chunk; + threadgroup_loc++; + + // dispatch when we've filled the threadgroup array or finished the chunks + const bool dispatch_now = (threadgroup_loc == kmaxThreadGroups) || (chunk == chunks - 1); + if (dispatch_now) { + // check for a partial dispatch (i.e. more chunks remain for the current tensor) + bool partial = (chunk != chunks - 1); + uint carried_numels = 0; + if (partial) { + carried_numels = metadata_arguments.numels[currentTensorIndex]; + } + + mtl_setArgs(computeEncoder, tensorArgumentBuffer, metadata_arguments, args...); + MTLSize gridSize = MTLSizeMake(threadgroup_loc, 1, 1); + uint32_t maxThreads = [pipeline maxTotalThreadsPerThreadgroup]; + MTLSize threadGroupSize = MTLSizeMake(std::min(maxThreads, (uint32_t)64), 1, 1); + [computeEncoder dispatchThreadgroups:gridSize threadsPerThreadgroup:threadGroupSize]; + + // prepare for the next batch: reset threadgroup count and create a new buffer + threadgroup_loc = 0; + tensorArgumentBuffer = [[device newBufferWithLength:argumentEncoder.encodedLength options:0] autorelease]; + [argumentEncoder setArgumentBuffer:tensorArgumentBuffer offset:0]; + + if (partial) { + // for a partial dispatch, rebind the partially processed tensor to slot 0 + // so that its metadata is in the correct location + for (int d = 0; d < depth; d++) { + mtl_setBuffer(argumentEncoder, tensor_lists[d][t], d * kmaxTensors + 0); + [computeEncoder useResource:getMTLBufferStorage(tensor_lists[d][t]) + usage:(MTLResourceUsageRead | MTLResourceUsageWrite)]; + } + metadata_arguments.numels[0] = carried_numels; + // the currently processed tensor now lives at index 0 + currentTensorIndex = 0; + tensor_loc = 1; + } else { + tensor_loc = 0; + } + } + } + } + + if (threadgroup_loc != 0) { + 
mtl_setArgs(computeEncoder, tensorArgumentBuffer, metadata_arguments, args...); + MTLSize gridSize = MTLSizeMake(threadgroup_loc, 1, 1); + uint32_t maxThreads = [pipeline maxTotalThreadsPerThreadgroup]; + MTLSize threadGroupSize = MTLSizeMake(std::min(maxThreads, static_cast(64)), 1, 1); + [computeEncoder dispatchThreadgroups:gridSize threadsPerThreadgroup:threadGroupSize]; + } + } + }); +} + +} // namespace at::native::mps + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/mtia/EmptyTensor.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/mtia/EmptyTensor.h new file mode 100644 index 0000000000000000000000000000000000000000..56150f713f660effa21dd39b99231848bdd71e2d --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/mtia/EmptyTensor.h @@ -0,0 +1,47 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) + +#pragma once +#include + +namespace at::detail { + +TensorBase empty_mtia( + IntArrayRef size, + ScalarType dtype, + std::optional device_opt, + std::optional memory_format_opt); + +TensorBase empty_mtia( + IntArrayRef size, + std::optional dtype_opt, + std::optional layout_opt, + std::optional device_opt, + std::optional pin_memory_opt, + std::optional memory_format_opt); + +TensorBase empty_mtia(IntArrayRef size, const TensorOptions& options); + +TensorBase empty_strided_mtia( + IntArrayRef size, + IntArrayRef stride, + ScalarType dtype, + std::optional device_opt); + +TensorBase empty_strided_mtia( + IntArrayRef size, + IntArrayRef stride, + std::optional dtype_opt, + std::optional layout_opt, + std::optional device_opt, + std::optional pin_memory_opt); + +TensorBase empty_strided_mtia( + IntArrayRef size, + IntArrayRef 
stride, + const TensorOptions& options); + +} // namespace at::detail + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/nested/NestedTensorBinaryOps.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/nested/NestedTensorBinaryOps.h new file mode 100644 index 0000000000000000000000000000000000000000..f6c30fe021376ab28fb17c9e7490a500d7a6a199 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/nested/NestedTensorBinaryOps.h @@ -0,0 +1,23 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include +#include + +namespace at::native { + +enum class NESTED_DENSE_OP : uint8_t { ADD, MUL }; + +using nested_dense_elementwise_fn = void (*)( + Tensor& result, + const Tensor& self, + const Tensor& other, + const NESTED_DENSE_OP& op); + +DECLARE_DISPATCH(nested_dense_elementwise_fn, nested_dense_elementwise_stub) + +} // namespace at::native + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/nested/NestedTensorMath.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/nested/NestedTensorMath.h new file mode 100644 index 0000000000000000000000000000000000000000..d90605b91eb7ba0e0f5b415720aa74b0cdb3dacc --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/nested/NestedTensorMath.h @@ -0,0 +1,84 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include +#include +#include + +namespace at::native { + +TORCH_API Tensor NestedTensor_to_padded_tensor_generic( + const Tensor& t, + double padding, + OptionalIntArrayRef output_size); + +template +Tensor map_nt(const Tensor& nt, Func f) { + auto* nt_impl = get_nested_tensor_impl(nt); + const auto& sizes = nt_impl->get_nested_sizes(); + return at::detail::make_tensor(f(nt_impl->get_buffer()), sizes); +} +template +Tensor map_nt_binary(const Tensor& nt_1, const Tensor& nt_2, Func f){ + auto* nt_impl_1 = get_nested_tensor_impl(nt_1); + auto* nt_impl_2 = get_nested_tensor_impl(nt_2); + const auto& sizes = nt_impl_1->get_nested_sizes(); + return at::detail::make_tensor(f(nt_impl_1->get_buffer(), nt_impl_2->get_buffer()), sizes); +} + +C10_ALWAYS_INLINE std::pair _check_nested_layer_norm_inputs( + const NestedTensorImpl& input, + IntArrayRef normalized_shape, + const Tensor& weight /* optional */, + const Tensor& bias /* optional */) { + + const size_t normalized_ndim = normalized_shape.size(); + TORCH_CHECK( + normalized_ndim >= 1, + "Expected normalized_shape to be at least 1-dimensional, i.e., ", + "containing at least one element, but got normalized_shape = ", + normalized_shape); + TORCH_CHECK( + !weight.defined() || weight.sizes().equals(normalized_shape), + "Expected weight to be of same shape as normalized_shape, but got 
", + "weight of shape ", + weight.sizes(), + " and normalized_shape = ", + normalized_shape); + TORCH_CHECK( + !bias.defined() || bias.sizes().equals(normalized_shape), + "Expected bias to be of same shape as normalized_shape, but got ", + "bias of shape ", + bias.sizes(), + " and normalized_shape = ", + normalized_shape); + + // Check that the normalized_shape has the exact same sizes as the last dimensions from the NestedTensor input + // Also, compute M and N considering the idiosyncrasies of NestedTensors + int64_t N = 1; + for (const auto i: c10::irange(normalized_ndim)) { + TORCH_CHECK( + input.opt_size(-normalized_ndim + i).has_value(), + "normalized_shape extends into irregular dimensions for the nested tensor" + ); + TORCH_CHECK( + normalized_shape[i] == input.opt_size(-normalized_ndim + i), + "The shape at dimension ", + i, + "of normalized_shape doesn't match the input" + ); + N *= normalized_shape[i]; + } + + const int64_t M = input.numel() / N; + + return std::make_pair(M, N); +} + +Tensor reshape_nested(const Tensor& self, IntArrayRef proposed_shape); + +} // namespace at::native + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/nested/NestedTensorTransformerFunctions.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/nested/NestedTensorTransformerFunctions.h new file mode 100644 index 0000000000000000000000000000000000000000..6377cd53b04fa6681ad59e2fa8a2f86a514b55f3 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/nested/NestedTensorTransformerFunctions.h @@ -0,0 +1,108 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +/** + * Transformer-specific NestedTensor utility functions. 
+ * + * Not co-located with NestedTensor core code yet because they only + * support specific cases needed in transformers. + */ +#pragma once + +#include + +#include +#include + +namespace c10 { +class Scalar; +} // namespace c10 + +namespace at { +class Tensor; +namespace native { +struct NestedTensorImpl; + +// Requires that self is a contiguous NestedTensor, other is not a +// NestedTensor, self.dim() == 3, and other.dim() == 2. Also, self +// must have a consistent last dimension across its included Tensors +// and that dimension must match other.size(0). +Tensor NestedTensor_matmul(const Tensor& self, const Tensor& other); + +// Requires that mat1 is a contiguous NestedTensor, self & mat2 are +// not NestedTensors, mat1.dim() == 3, mat2.dim() == 2, and that mat1 +// has a consistent last dimension across its included Tensors that +// matches mat2.size(0). +Tensor NestedTensor_times_Tensor_plus_Tensor_addmm( + const Tensor& self, + const Tensor& mat1, + const Tensor& mat2, + const c10::Scalar& beta, + const c10::Scalar& alpha, + std::optional use_gelu = std::nullopt); + +Tensor NestedTensor_add_NestedTensor_in_place( + const Tensor& self, + const Tensor& other); + +TORCH_API Tensor NestedTensor_batch_offsets_from_size_tensor( + const Tensor& sizes, + int64_t extra_elements); + +Tensor NestedTensor_from_padded_tensor_cpu( + const Tensor& padded, + const NestedTensorImpl& nt); + +TORCH_API Tensor NestedTensor_to_mask(const Tensor& nt, std::optional mask_dim, std::optional mask_dim_length); + +template +void remove_padding_kernelLauncher( + const T* input, + T* output, + const int* offsets, + const int* input_sizes, + const int* output_sizes, + int64_t output_dim, + const int64_t batch_size); + +template +void remove_padding_transform0213_kernelLauncher( + const T* input, + T* output, + const int* offsets, + const int* input_sizes, + const int* output_sizes, + int64_t output_dim, + const int64_t batch_size); + +template +void add_padding_kernelLauncher( + T* 
input, + T* output, + T padding_value, + const int* offsets, + const int* input_sizes, + int input_dim, + const std::vector& output_sizes, + const int batch_size, + const int output_batch_size); + +TORCH_API Tensor flash_attention_helper( + const Tensor& query, + const Tensor& key, + const Tensor& value, + double dropout_p, + bool need_attn_weights, + bool is_causal); + +TORCH_API std::tuple mem_efficient_helper_nested_unpacked( + const Tensor& query, + const Tensor& key, + const Tensor& value, + double dropout_p, + bool need_attn_weights, + bool is_causal); +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/nested/NestedTensorTransformerUtils.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/nested/NestedTensorTransformerUtils.h new file mode 100644 index 0000000000000000000000000000000000000000..d0f72ac63bcf4f3abbcf35169e4fc6d6b6f6be73 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/nested/NestedTensorTransformerUtils.h @@ -0,0 +1,45 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +#include + +namespace at::native::preprocessing { + +/** + * This function will take nested query, key, and value + * and will preprocess it in order to run with either + * the flash-attention or efficient-attention kernels. 
+ * @return A tuple containing all the necessary data for running the fused + * kernels + */ +std::tuple +sdpa_nested_preprocessing( + const Tensor& query, + const Tensor& key, + const Tensor& value); + +/** + * This function will take nested query, key, and value, grad_out, and out + * and will preprocess it in order to run with either + * the flash-attention or efficient-attention kernels backwards. + * We use both functions to avoid having to do the same preprocessing + * for cumulative_sequence_length_q and cumulative_sequence_length_kv + * @return A tuple containing all the necessary data for running the fused + * kernels + */ +std::tuple +sdpa_nested_preprocessing_backward( + const at::Tensor& grad_out_, + const at::Tensor& query, + const at::Tensor& key, + const at::Tensor& value, + const at::Tensor& out, + const Tensor& cumulative_sequence_length_q, + const Tensor& cumulative_sequence_length_kv, + const int64_t max_seqlen_batch_q, + const int64_t max_seqlen_batch_kv); + +} // namespace at::native::preprocessing + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/nested/NestedTensorUtils.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/nested/NestedTensorUtils.h new file mode 100644 index 0000000000000000000000000000000000000000..155f76d158df37f8202a8892427b08fa8220606d --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/nested/NestedTensorUtils.h @@ -0,0 +1,454 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include + +#ifndef AT_PER_OPERATOR_HEADERS + +#include +#include +#else +#include +#include +#include +#include +#include +#include +#endif + +#include +#include +#include + +namespace at::native { +struct NestedTensorImpl; + +// The following functions are used to construct nested tensors from buffers and +// metadata. 
+ +inline at::Tensor wrap_buffer(const at::Tensor& buffer, const at::Tensor& nested_sizes) { + TORCH_CHECK( + buffer.dim() == 1, + "Expected given buffer to be 1dim, but got ", + buffer.dim(), + " instead."); + TORCH_CHECK( + buffer.is_contiguous(), "Expected given buffer to be contiguous."); + return at::detail::make_tensor( + buffer, nested_sizes); +} + +// TODO: Figure out if we need a non-moving wrap_buffer() +inline at::Tensor wrap_buffer( + const at::Tensor& buffer, + at::Tensor nested_sizes, + at::Tensor nested_strides, + at::Tensor storage_offsets) { + TORCH_INTERNAL_ASSERT_DEBUG_ONLY( + buffer.is_contiguous(), "Given buffer must be contiguous."); + return at::detail::make_tensor( + buffer, + std::move(nested_sizes), + std::move(nested_strides), + std::move(storage_offsets)); +} + +inline at::Tensor get_buffer(const at::Tensor& tensor) { + return get_nested_tensor_impl(tensor)->get_buffer(); +} + +/** + * Create a new nested tensor that is a view of a base nested tensor + * + * create_view_tensor calls a specialized constructor that copies the + * keys from base onto the new view tensor being created. + * The storage is shared between the base and the returned view tensor + * + * All callers of this helper must: + * - Only return a view of the input + * - Must be explicit and define a derivative + * + * @param base Base tensor to construct view from. + * @param nested_sizes View tensors' sizes. + * @param nested_strides View tensors' strides. + * @param storage_offsets View tensors' offsets. 
+ * @return A newly constructed view tensor + */ +inline at::Tensor create_nested_view_tensor( + const at::Tensor& base, + at::Tensor nested_sizes, + at::Tensor nested_strides, + at::Tensor storage_offsets) { + TORCH_INTERNAL_ASSERT( + base.is_nested(), + "This function can only be used to create nested tensor views"); + TORCH_INTERNAL_ASSERT( + c10::impl::tls_local_dispatch_key_set().excluded_.has( + c10::DispatchKey::AutogradFunctionality), + "Creating a non differentiable nested tensor view in a CompositeImplicit function is not allowed."); + return at::detail::make_tensor( + c10::TensorImpl::VIEW, + base, + std::move(nested_sizes), + std::move(nested_strides), + std::move(storage_offsets)); +} +// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +// Helper functions for getting information about a nested tensor's shape. + +int64_t get_consistent_last_dim_of_nested_tensor(const NestedTensorImpl& nt); + +// The sizes of the underlying tensors +inline std::vector NestedTensor_get_sizes( + const NestedTensorImpl* self_ptr) { + int64_t ntensors = self_ptr->size(0); + std::vector sizes(ntensors); + if (ntensors == 0) { + return sizes; + } + const Tensor& sizemat = self_ptr->get_nested_sizes(); + int64_t orig_dim = sizemat.size(1); + // nesting scalars has empty sizes + if (orig_dim == 0) { + return sizes; + } + const int64_t* sizemat_ptr = sizemat.const_data_ptr(); + + for (const auto i : c10::irange(ntensors)) { + sizes[i] = IntArrayRef(sizemat_ptr, sizemat_ptr + orig_dim); + sizemat_ptr += orig_dim; + } + return sizes; +} + +TORCH_API std::vector NestedTensor_get_max_size( + const NestedTensorImpl& nt); + +std::vector NestedTensor_get_max_size_from_size_tensor( + const Tensor& sizes); + +inline std::vector NestedTensor_get_sizes(const at::Tensor& self) { + const NestedTensorImpl* self_ptr = get_nested_tensor_impl(self); + return NestedTensor_get_sizes(self_ptr); +} +// The strides of the underlying tensors +inline std::vector 
NestedTensor_get_strides( + const NestedTensorImpl* self_ptr) { + int64_t ntensors = self_ptr->size(0); + std::vector strides(ntensors); + if (ntensors == 0) { + return strides; + } + const Tensor& stridemat = self_ptr->get_nested_strides(); + int64_t orig_dim = stridemat.size(1); + // nesting scalars has empty strides + if (orig_dim == 0) { + return strides; + } + const int64_t* stridemat_ptr = stridemat.const_data_ptr(); + for (const auto i : c10::irange(ntensors)) { + strides[i] = IntArrayRef(stridemat_ptr, stridemat_ptr + orig_dim); + stridemat_ptr += orig_dim; + } + return strides; +} + +inline std::vector NestedTensor_get_strides( + const at::Tensor& self) { + const NestedTensorImpl* self_ptr = get_nested_tensor_impl(self); + return NestedTensor_get_strides(self_ptr); +} + +inline void check_numel_equals_buffer_size(const at::Tensor& self) { + auto self_impl = get_nested_tensor_impl(self); + TORCH_CHECK( + self.numel() == static_cast(self_impl->get_buffer_size()), + "Number of elements in nested tensor must match number of elements in buffer."); +} + +inline void check_numel_equals_buffer_size(const NestedTensorImpl* self_ptr) { + TORCH_CHECK( + self_ptr->numel() == static_cast(self_ptr->get_buffer_size()), + "Number of elements in nested tensor must match number of elements in buffer."); +} + +// Helper function to get size / stride / offset for a nested/normal tensor. 
+inline IntArrayRef get_size_for_index(const Tensor& tensor, int64_t i) { + if (tensor.is_nested()) { + std::vector tensor_sizes = + NestedTensor_get_sizes(get_nested_tensor_impl(tensor)); + return tensor_sizes[i]; + } else { + return tensor.sizes().slice(1); + } +} + +inline IntArrayRef get_stride_for_index(const Tensor& tensor, int64_t i) { + if (tensor.is_nested()) { + std::vector tensor_strides = + NestedTensor_get_strides(get_nested_tensor_impl(tensor)); + return tensor_strides[i]; + } else { + return tensor.strides().slice(1); + } +} + +inline int64_t get_offset_for_index(const Tensor& tensor, int64_t i) { + if (tensor.is_nested()) { + int64_t* offsets_ptr = get_nested_tensor_impl(tensor) + ->get_storage_offsets() + .data_ptr(); + return offsets_ptr[i]; + + } else { + int64_t offset = tensor.storage_offset(); + return offset + tensor.strides()[0] * i; + } +} +// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +// Data structures and functions for generically applying a function on a nested +// tensor. 
+namespace impl { + +template +struct NestedNode { + NestedNode() = delete; + explicit NestedNode(std::vector children) + : _is_leaf(false), _children(std::move(children)) {} + explicit NestedNode(TensorList children) + : _is_leaf(false), _children(children.vec()) {} + explicit NestedNode(T payload) + : _is_leaf(true), _payload(std::move(payload)) {} + NestedNode(const NestedNode&) = delete; + NestedNode& operator=(const NestedNode&) = delete; + NestedNode(NestedNode&&) noexcept = default; + NestedNode& operator=(NestedNode&&) noexcept = default; + ~NestedNode() = default; + inline bool is_leaf() const { + return _is_leaf; + } + inline size_t degree() const { + return _children.size(); + } + inline const std::vector unbind() const { + return _children; + } + inline T children(size_t i) const { + return _children[i]; + } + inline const T& payload() const { + return _payload; + } + inline T& payload() { + return _payload; + } + + private: + bool _is_leaf; + std::vector _children; + T _payload{}; +}; + +using TensorNode = NestedNode; + +template +class _map; + +template +class _map> { + public: + static A function_one(const F& fn, const Args&... nested_node) { + return fn(nested_node...); + } + static NestedNode function( + const F& fn, + const NestedNode&... nested_node) { + size_t degree = 0; + bool all_leaf = true; + c10::guts::tuple_map( + std::forward_as_tuple(nested_node...), [&all_leaf, °ree](auto n) { + all_leaf = all_leaf && (n.is_leaf()); + if (degree > 1 && n.degree() > 1) { + TORCH_CHECK( + degree == n.degree(), "NestedNodes must match in degree."); + } + if (n.degree() > degree) { + degree = n.degree(); + } + return nullptr; + }); + // All NestedNodes just wrap regular objects. + if (all_leaf) { + return NestedNode(std::forward(fn)(nested_node.payload()...)); + } + // Some NestedNodes wrap regular Tensors, some NestedTensors and some other + // types. 
+ std::vector result; + for (size_t i = 0; i < degree; i++) { + auto children = c10::guts::tuple_map( + std::forward_as_tuple(nested_node...), [&i](auto a) { + static_assert( + c10::guts::is_instantiation_of::value, + "Internal error."); + // Broadcast regular arguments across NestedTensor constituents. + // This could be a Tensor, integer or anything else really. + if (a.is_leaf()) { + return a.payload(); + } + // Broadcast NestedTensors with one constituent. + if (a.degree() == 1 && !a.is_leaf()) { + return a.children(0); + } + TORCH_CHECK(a.degree() > 0, "Internal assert."); + return a.children(i); + }); + std::apply( + [&result, &fn](Args... filtered) { + result.emplace_back(function_one(fn, filtered...)); + }, + std::move(children)); + } + return NestedNode(std::move(result)); + } +}; + +// TODO: Add static assert to verify lambda arguments match nested_node types +template +static inline NestedNode< + typename c10::guts::infer_function_traits::type::return_type> +map(F&& fn, const NestedNode&... 
nested_node) { + return _map< + F, + typename c10::guts::infer_function_traits::type::return_type, + typename c10::guts::infer_function_traits::type::parameter_types>:: + function(std::forward(fn), nested_node...); +} + +inline TensorNode get_nested_tensor_structure(at::Tensor tensor) { + if (get_nested_tensor_impl_or_null(tensor) == nullptr) { + return TensorNode(std::move(tensor)); + } + return TensorNode(tensor.unbind()); +} + +inline Tensor wrap_tensor_node( + TensorNode tensor_node, + std::optional dtype, + std::optional layout, + std::optional device, + std::optional pin_memory) { + TORCH_CHECK( + !tensor_node.is_leaf(), "Expected TensorNode to wrap a list of Tensors."); + TensorOptions options_ = + TensorOptions().dtype(dtype).layout(layout).device(device).pinned_memory( + pin_memory); + if (tensor_node.degree() == 0) { + return wrap_buffer(ones({0}, dtype, layout, device), ones({})); + } + + // Fast path: if all tensors are on CPU, have contiguous memory, and the same + // dtype, copying can be done much faster. 
+ bool all_tensors_cpu = true; + bool all_tensors_contiguous = true; + bool all_tensors_same_dtype = true; + auto first_dtype = tensor_node.children(0).dtype(); + std::vector start_offsets(tensor_node.degree()); + start_offsets[0] = 0; + long total_size = 0; + for (const auto i : c10::irange(tensor_node.degree())) { + all_tensors_cpu = all_tensors_cpu && tensor_node.children(i).is_cpu(); + all_tensors_contiguous = + all_tensors_contiguous && tensor_node.children(i).is_contiguous(); + all_tensors_same_dtype = all_tensors_same_dtype && + (first_dtype == tensor_node.children(i).dtype()); + if (!(all_tensors_cpu && all_tensors_contiguous && + all_tensors_same_dtype)) { + break; + } + if (i > 0) { + start_offsets[i] = + start_offsets[i - 1] + tensor_node.children(i - 1).numel(); + } + total_size += tensor_node.children(i).numel(); + } + + TensorOptions options; + Tensor nt_buffer, nt_sizes; + if (all_tensors_cpu && all_tensors_contiguous && all_tensors_same_dtype) { + nt_buffer = at::empty({total_size}, tensor_node.children(0).options()); + nt_sizes = at::empty( + {static_cast(tensor_node.degree()), + static_cast(tensor_node.children(0).sizes().size())}, + TensorOptions().dtype(kLong)); + AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND3( + at::ScalarType::Half, + at::ScalarType::Bool, + at::ScalarType::BFloat16, + c10::typeMetaToScalarType(first_dtype), + "create_nt_buffer", + [&]() { + at::parallel_for( + 0, tensor_node.degree(), 1, [&](int64_t begin, int64_t end) { + for (int64_t i = begin; i < end; ++i) { + // Only try copying memory if there is more than 0 elements + // for a certain tensor + if (tensor_node.children(i).numel() > 0) { + memcpy( + nt_buffer.mutable_data_ptr() + start_offsets[i], + tensor_node.children(i).const_data_ptr(), + tensor_node.children(i).numel() * sizeof(scalar_t)); + } + } + }); + }); + long sizes_offset = 0; + for (size_t i = 0; i < tensor_node.degree(); ++i) { + auto tensor_sizes = tensor_node.children(i).sizes(); + for (int64_t tensor_size : 
tensor_sizes) { + nt_sizes.mutable_data_ptr()[sizes_offset++] = tensor_size; + } + } + options = nt_buffer.options().merge_in(options_); + } else { // Slow path + std::vector flat_tensors; + std::vector sizes; + for (const auto i : c10::irange(tensor_node.degree())) { + flat_tensors.push_back(tensor_node.children(i).reshape(-1).contiguous()); + sizes.push_back( + tensor(c10::IntArrayRef(tensor_node.children(i).sizes()))); + } + options = flat_tensors[0].options().merge_in(options_); + nt_buffer = at::cat(flat_tensors); + nt_sizes = at::native::stack(sizes); + } + + return wrap_buffer(nt_buffer.to(options), nt_sizes); +} + +} // namespace impl + +// This function is meant to ease rapid operator coverage for +// NestedTensor kernels. It is not meant to be efficient. Use it judiciously. +template +inline at::Tensor map_nested_tensor(F&& fn, A... a) { + return wrap_tensor_node( + impl::map(std::forward(fn), impl::get_nested_tensor_structure(a)...), + std::nullopt, + std::nullopt, + std::nullopt, + std::nullopt); +} + +} // namespace at::native + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/quantized/AffineQuantizer.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/quantized/AffineQuantizer.h new file mode 100644 index 0000000000000000000000000000000000000000..0259b4e9f9fd3b2019534ac8c279a5deefa22a6b --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/quantized/AffineQuantizer.h @@ -0,0 +1,133 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include +#include +#include +#include + +namespace at::native { + +TORCH_API Tensor& quantize_tensor_per_tensor_affine( + const Tensor& rtensor, + Tensor& qtensor, + double scale, + int64_t zero_point); +TORCH_API Tensor& quantize_tensor_per_channel_affine( + const Tensor& rtensor, + Tensor& qtensor, + const Tensor& scales, + Tensor zero_points, + int64_t axis); + +TORCH_API Tensor& quantize_tensor_per_channel_float_qparams( + const Tensor& rtensor, + Tensor& qtensor, + const Tensor& scales, + const Tensor& zero_points, + int64_t axis); + +TORCH_API Tensor& dequantize_tensor_per_tensor_affine( + const Tensor& qtensor, + Tensor& rtensor, + double scale, + int64_t zero_point); +TORCH_API Tensor& dequantize_tensor_per_channel_affine( + const Tensor& qtensor, + Tensor& rtensor, + const Tensor& scales, + Tensor zero_points, + int64_t axis); +TORCH_API Tensor& dequantize_tensor_per_channel_float_qparams( + const Tensor& qtensor, + Tensor& rtensor, + const Tensor& scales, + const Tensor& zero_points, + int64_t axis); + +using quantize_tensor_per_tensor_affine_fn = + void (*)(const Tensor& rtensor, Tensor& qtensor, double scale, int64_t zero_point); + +using quantize_tensor_per_channel_affine_fn = void (*)( + const Tensor& rtensor, + Tensor& qtensor, + const Tensor& scales, + const Tensor& zero_points, + int64_t axis); + 
+using quantize_tensor_per_channel_float_qparams_fn = void (*)( + const Tensor& rtensor, + Tensor& qtensor, + const Tensor& scales, + const Tensor& zero_points, + int64_t axis); + +using dequantize_tensor_per_tensor_affine_fn = + void (*)(const Tensor& qtensor, Tensor& rtensor, double scale, int64_t zero_point); + +using dequantize_tensor_per_channel_affine_fn = void (*)( + const Tensor& qtensor, + Tensor& rtensor, + const Tensor& scales, + const Tensor& zero_points, + int64_t axis); + +using dequantize_tensor_per_channel_float_qparams_fn = void (*)( + const Tensor& qtensor, + Tensor& rtensor, + const Tensor& scales, + const Tensor& zero_points, + int64_t axis); + +using quantize_tensor_per_tensor_affine_sub_byte_fn = + void (*)(const Tensor& rtensor, Tensor& qtensor, float scale, float zero_point); + +using dequantize_tensor_per_tensor_affine_sub_byte_fn = + void (*)(const Tensor& qtensor, Tensor& rtensor, float scale, float zero_point); + +DECLARE_DISPATCH( + quantize_tensor_per_tensor_affine_fn, + quantize_tensor_per_tensor_affine_stub) +DECLARE_DISPATCH( + quantize_tensor_per_channel_affine_fn, + quantize_tensor_per_channel_affine_stub) +DECLARE_DISPATCH( + quantize_tensor_per_channel_float_qparams_fn, + quantize_tensor_per_channel_float_qparams_stub) + +DECLARE_DISPATCH( + dequantize_tensor_per_tensor_affine_fn, + dequantize_tensor_per_tensor_affine_stub) +DECLARE_DISPATCH( + dequantize_tensor_per_channel_affine_fn, + dequantize_tensor_per_channel_affine_stub) +DECLARE_DISPATCH( + dequantize_tensor_per_channel_float_qparams_fn, + dequantize_tensor_per_channel_float_qparams_stub) + +DECLARE_DISPATCH( + quantize_tensor_per_tensor_affine_sub_byte_fn, + quantize_tensor_per_tensor_affine_sub_byte_stub) + +DECLARE_DISPATCH( + dequantize_tensor_per_tensor_affine_sub_byte_fn, + dequantize_tensor_per_tensor_affine_sub_byte_stub) + +template +TORCH_API Tensor quantize_tensor( + Tensor rtensor, + Tensor qtensor, + double scale, + int64_t zero_point); +template +TORCH_API 
Tensor dequantize_tensor( + Tensor qtensor, + Tensor rtensor, + double scale, + int64_t zero_point); + +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/quantized/AffineQuantizerBase.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/quantized/AffineQuantizerBase.h new file mode 100644 index 0000000000000000000000000000000000000000..4f6c4e4ec47cb3511fbf34a7da353c67a977e685 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/quantized/AffineQuantizerBase.h @@ -0,0 +1,50 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +#include +#include + +namespace at::native { + +// Quantize a float value into a uint value given scale and zero_point +template +TORCH_API T quantize_val(double scale, int64_t zero_point, float value); +// TODO combine this with quantize_val once the numerics for ARM are aligned +// with it +template +T quantize_val_arm( + const float scale, + const int32_t zero_point, + const float value); +template +void quantize_vec( + double scale, + int64_t zero_point, + const float* src, + T* dst, + size_t count = 8); +template +TORCH_API float dequantize_val(double scale, int64_t zero_point, T value); +template +TORCH_API float dequantize_vec( + double scale, + int64_t zero_point, + const T* src, + float* dst, + size_t count = 8); +template +TORCH_API DST_T requantize_val(double /*src_scale*/, int64_t /*src_zero_point*/, double /*dst_scale*/, int64_t /*dst_zero_point*/, SRC_T src); + +// Given a multiplier and a zero_point, requantize int32_t computed values back +// to quantized values. 
See comment above +// make_per_tensor_affine_quantizer function for the usage of int64_t +template +TORCH_API DST_T +requantize_from_int(double multiplier, int64_t zero_point, int64_t src); + +int quantize_val_float_qparams(float scale, float zero_point, float value, int qmin, int qmax); + +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/quantized/ConvUtils.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/quantized/ConvUtils.h new file mode 100644 index 0000000000000000000000000000000000000000..b80cd4458b2cfd48d34f02caa6e73e95fe13aba4 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/quantized/ConvUtils.h @@ -0,0 +1,67 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +#include +#include + +namespace at::native::quantized { +namespace { +// MakeConvOutputShape used from both CPU and CUDA libraries +// and exporting symbol from torch_cpu would probably take more storage +// than duplicating implementation which likely be inlined away +template +at::SmallVector MakeConvOutputShape( + int N, // mini-batch + int M, // output channels + const std::array& input_image_shape, + const std::vector& kernel, + const torch::List& stride, + const torch::List& padding, + const torch::List& dilation); + +#if defined(USE_CUDA) || defined(USE_PYTORCH_QNNPACK) +template <> +at::SmallVector MakeConvOutputShape<2>( + int N, // mini-batch + int M, // output channels + const std::array& input_image_shape, + const std::vector& kernel, + const at::List& stride, + const at::List& padding, + const at::List& dilation) { + const int H = input_image_shape[0]; + const int W = input_image_shape[1]; + const int64_t 
Y_H = + (H + 2 * padding[0] - dilation[0] * (kernel[0] - 1) - 1) / stride[0] + 1; + const int64_t Y_W = + (W + 2 * padding[1] - dilation[1] * (kernel[1] - 1) - 1) / stride[1] + 1; + return {N, M, Y_H, Y_W}; +} + +template <> +at::SmallVector MakeConvOutputShape<3>( + int N, // mini-batch + int M, // output channels + const std::array& input_image_shape, + const std::vector& kernel, + const at::List& stride, + const at::List& padding, + const torch::List& dilation) { + const int D = input_image_shape[0]; + const int H = input_image_shape[1]; + const int W = input_image_shape[2]; + const int64_t Y_D = + (D + 2 * padding[0] - dilation[0] * (kernel[0] - 1) - 1) / stride[0] + 1; + const int64_t Y_H = + (H + 2 * padding[1] - dilation[1] * (kernel[1] - 1) - 1) / stride[1] + 1; + const int64_t Y_W = + (W + 2 * padding[2] - dilation[2] * (kernel[2] - 1) - 1) / stride[2] + 1; + return {N, M, Y_D, Y_H, Y_W}; +} + +#endif +} // anonymous namespace +} // namespace at::native::quantized + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/quantized/Copy.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/quantized/Copy.h new file mode 100644 index 0000000000000000000000000000000000000000..292f10e3839b4093308066f1cd4f37ca3eb30468 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/quantized/Copy.h @@ -0,0 +1,13 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include + +namespace at::native { + +Tensor& quantized_copy_from_float_(Tensor& self, const Tensor& src); +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/quantized/FakeQuantAffine.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/quantized/FakeQuantAffine.h new file mode 100644 index 0000000000000000000000000000000000000000..a8d904e848059f82dea19ac81bb53b08e201eee6 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/quantized/FakeQuantAffine.h @@ -0,0 +1,72 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include +#include +#include + +namespace at { + +struct TensorIterator; + +namespace native { + +using fake_quant_tensor_cachemask_fn = void (*)( + Tensor& output, + Tensor& mask, + const Tensor& input, + float sc, + int64_t z_point, + int64_t quant_min, + int64_t quant_max); + +using fake_quant_tensor_cachemask_tensor_qparams_fn = void (*)( + Tensor& output, + Tensor& mask, + const Tensor& input, + const Tensor& sc, + const Tensor& z_point, + const Tensor& fake_quant_enabled, + int64_t quant_min, + int64_t quant_max); + +using fake_quant_learnable_grad_tensor_fn = void (*)( + TensorIterator& iter, + float scale, + float inv_scale, + int64_t zero_point, + int64_t quant_min, + int64_t quant_max, + float grad_factor); + +DECLARE_DISPATCH(fake_quant_tensor_cachemask_fn, fake_quant_tensor_cachemask_stub) +DECLARE_DISPATCH(fake_quant_tensor_cachemask_tensor_qparams_fn, fake_quant_tensor_cachemask_tensor_qparams_stub) +DECLARE_DISPATCH(fake_quant_learnable_grad_tensor_fn, fake_quant_grad_learnable_tensor_stub) + +using fake_quant_per_channel_fn = void (*)( + TensorIterator &iter, + int64_t quant_min, + int64_t quant_max); + +using fake_quant_per_channel_cachemask_fn = void (*)( + TensorIterator &iter, + TensorIterator &iter_mask, + int64_t quant_min, + int64_t quant_max); + 
+DECLARE_DISPATCH(fake_quant_per_channel_cachemask_fn, fake_quant_per_channel_cachemask_stub) + +using fake_quant_learnable_per_channel_fn = void (*)( + TensorIterator &iter, + int64_t quant_min, + int64_t quant_max, + float grad_factor); + +DECLARE_DISPATCH(fake_quant_learnable_per_channel_fn, fake_quant_grad_learnable_channel_stub) + +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/quantized/IndexKernel.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/quantized/IndexKernel.h new file mode 100644 index 0000000000000000000000000000000000000000..ca6d5c92b47660519c64dd78e59ec539d42b9b65 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/quantized/IndexKernel.h @@ -0,0 +1,18 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +#include +#include + +namespace at::native { +using masked_fill_kernel_quantized_fn = void(*)(TensorIterator& iter, const Scalar& value, double scale, int zero_point); +using index_put_kernel_quantized_fn = void(*)(TensorIterator& iter, IntArrayRef index_size, IntArrayRef index_stride, bool accumulate, double scale, int zero_point); + +DECLARE_DISPATCH(masked_fill_kernel_quantized_fn, masked_fill_kernel_quantized_stub) +DECLARE_DISPATCH(index_put_kernel_quantized_fn, index_put_kernel_quantized_stub) + + +} // at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/quantized/PackedParams.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/quantized/PackedParams.h new file mode 100644 index 0000000000000000000000000000000000000000..6f47b5d67212f36cfa96aea8260acb5aafa4da09 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/quantized/PackedParams.h @@ -0,0 +1,139 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include +#include +#include + +struct LinearPackedParamsBase : public torch::jit::CustomClassHolder { + virtual at::Tensor apply( + at::Tensor input, + double output_scale, + int64_t output_zero_point) = 0; + virtual at::Tensor apply_relu( + at::Tensor input, + double output_scale, + int64_t output_zero_point) = 0; + + // out variant of LinearPackedParamsBase::apply + virtual at::Tensor& apply_out( + const at::Tensor& /*input*/, + double /*output_scale*/, + int64_t /*output_zero_point*/, + at::Tensor& output) { + TORCH_CHECK(false, "apply_out is not implemented for this packed parameter type"); + return output; + } + + virtual at::Tensor& apply_relu_out( + const at::Tensor& /*input*/, + double /*output_scale*/, + int64_t /*output_zero_point*/, + at::Tensor& output) { + TORCH_CHECK(false, "apply_relu_out is not implemented for this packed parameter type"); + return output; + } + + // Corresponding pattern (the ops with `*` are part of the pattern that + // represents the computation of quantized::linear_with_input_q_dq_qweight_dq_output_fp32): + // input -> q* -> dq* -> linear* -> + // qweight -> dq* / + // + // After fusion: + // input -> quantized::linear_with_input_q_dq_qweight_dq_output_fp32* -> + // qweight / + // + // Additional Note: the weight is packed as well + // Params: + // X: float32 Tensor, will be quantized to 
quint8 in the op + // W_prepack: packed qint8 quantized weight and bias + // Returns: + // Y: float32 Tensor + virtual at::Tensor apply_with_input_q_dq_qweight_dq_output_fp32( + at::Tensor input, + double input_scale, + int64_t input_zero_point) { + TORCH_CHECK(false, "apply_with_input_q_dq_qweight_dq_output_fp32 is not implemented for this packed parameter type"); + return {}; + } + + // Corresponding pattern (the ops with `*` are part of the pattern that + // represents the computation of quantized::linear_with_input_q_dq_qweight_dq_relu_output_fp32): + // input -> q* -> dq* -> linear* -> relu* -> + // qweight -> dq* / + // + // After fusion: + // input -> quantized::linear_with_input_q_dq_qweight_dq_relu_output_fp32* -> + // qweight / + // + // Additional Note: the weight is packed as well + // Params: + // input: float32 Tensor, will be quantized to quint8 in the op + // Returns: + // float32 Tensor + virtual at::Tensor apply_with_input_q_dq_qweight_dq_relu_output_fp32( + at::Tensor input, + double input_scale, + int64_t input_zero_point) { + TORCH_CHECK(false, "apply_with_input_q_dq_qweight_dq_relu_output_fp32 is not implemented for this packed parameter type"); + return {}; + } + + virtual at::Tensor apply_dynamic( + at::Tensor input, + bool reduce_range = false) = 0; + virtual at::Tensor apply_dynamic_relu( + at::Tensor input, + bool reduce_range = false) = 0; + + virtual at::Tensor& apply_dynamic_out( + const at::Tensor& /* input */, + at::Tensor& output, + bool /* reduce_range */) { + TORCH_CHECK(false, "apply_dynamic_out is not implemented for this packed parameter type"); + return output; + } + virtual at::Tensor& apply_dynamic_relu_out( + const at::Tensor& /* input */, + at::Tensor& output, + bool /* reduce_range */) { + TORCH_CHECK(false, "apply_dynamic_relu_out is not implemented for this packed parameter type"); + return output; + } + + virtual std::tuple> unpack() = 0; + + virtual std::optional bias() = 0; + + virtual void set_bias(std::optional 
/*bias*/) { + TORCH_CHECK(false, "set_bias is not implemented for this packed parameter type"); + } +}; + +template +struct ConvPackedParamsBase : public torch::jit::CustomClassHolder { + virtual at::Tensor apply( + const at::Tensor& input, + double output_scale, + int64_t output_zero_point) = 0; + virtual at::Tensor apply_relu( + const at::Tensor& input, + double output_scale, + int64_t output_zero_point) = 0; + virtual at::Tensor apply_dynamic( + const at::Tensor& input, + bool reduce_range) = 0; + + virtual std::tuple> unpack() = 0; + + virtual torch::List stride() const = 0; + virtual torch::List padding() const = 0; + virtual torch::List output_padding() const = 0; + virtual torch::List dilation() const = 0; + virtual int64_t groups() const = 0; + virtual bool transpose() const = 0; +}; + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/quantized/cpu/ACLUtils.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/quantized/cpu/ACLUtils.h new file mode 100644 index 0000000000000000000000000000000000000000..fe4c17a6bc88f2bb7dea2c015cb19aa6c4502e26 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/quantized/cpu/ACLUtils.h @@ -0,0 +1,262 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include +#if AT_MKLDNN_ACL_ENABLED() + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +// Utilities for Arm Compute Library (ACL) quantized operations +// Provides interfaces to leverage ACL's accelerated kernels for statically and +// dynamically quantized matmuls (i.e. 
qlinear and qlinear_dynamic) These are +// utalized through PackedLinearWeightsACL which extends +// PackedLinearWeightsOnednn Note that PackedLinearWeightsACL extends rather +// than replaces PackedLinearWeightsOnednn for AArch64 because ACL currently +// only supports per_tensor weight quantization. +namespace at::native::acl_utils { + +using QuantMatmulCacheKey = std::tuple< + int64_t, // M + bool, // FUSE_RELU + int64_t, // NUM_THREADS + double, // INPUT_SCALE + int64_t, // INPUT_OFFSET + double, // OUTPUT_SCALE + int64_t, // OUTPUT_OFFSET + bool // SIGNED_INPUT + >; + +enum class QuantMatmulCacheKeyIndex { + M, + FUSE_RELU, + NUM_THREADS, + INPUT_SCALE, + INPUT_OFFSET, + OUTPUT_SCALE, + OUTPUT_OFFSET, + SIGNED_INPUT +}; + +// Abstract interface to share common stuff between static/dynamic ACL matmuls. +struct QuantMatmul { + arm_compute::NEGEMMLowpMatrixMultiplyCore gemm; + // key for use in the cache + QuantMatmulCacheKey key; + + QuantMatmul( + int64_t weight_dim_0, + int64_t weight_dim_1, + double weight_scale, + int64_t weight_offset, + int8_t* weight_ptr, + std::optional bias_ptr, + const QuantMatmulCacheKey& cache_key); + + virtual ~QuantMatmul(); + virtual arm_compute::Status validate() = 0; + virtual void configure() = 0; + + protected: + arm_compute::Tensor wei_q_tensor_; + std::optional bia_tensor_; + arm_compute::GEMMInfo gemm_info_; + std::optional relu_info_; +}; + +struct DynamicQuantMatmul : public QuantMatmul { + arm_compute::Tensor src_q_tensor; + arm_compute::Tensor src_tensor; + arm_compute::Tensor dst_tensor; + arm_compute::NEQuantizationLayer quant; + // We need a ReLU layer here (unlike static quantization) because the ReLU + // cannot be "truly" fused with the GEMM through gemm_info in ACL dynamically + // quantized matmuls. 
+ std::optional relu; + + DynamicQuantMatmul( + int64_t weight_dim_0, + int64_t weight_dim_1, + double weight_scale, + int64_t weight_offset, + int8_t* weight_ptr, + std::optional bias_ptr, + const QuantMatmulCacheKey& cache_key); + + ~DynamicQuantMatmul() override; + + arm_compute::Status validate() override; + void configure() override; + + private: + at::Tensor src_q_tensor_orig_; +}; + +struct StaticQuantMatmul : public QuantMatmul { + arm_compute::Tensor src_q_tensor; + arm_compute::Tensor dst_q_tensor; + + StaticQuantMatmul( + int64_t weight_dim_0, + int64_t weight_dim_1, + double weight_scale, + int64_t weight_offset, + int8_t* weight_ptr, + std::optional bias_ptr, + const QuantMatmulCacheKey& cache_key); + + ~StaticQuantMatmul() override; + + arm_compute::Status validate() override; + void configure() override; + + private: + std::optional bia_q_tensor_; + std::optional bia_q_tensor_orig_; +}; + +struct QuantAdd { + arm_compute::Tensor qa_tensor; + arm_compute::Tensor qb_tensor; + arm_compute::Tensor qdst_tensor; + arm_compute::NEArithmeticAddition q_add; + + QuantAdd( + arm_compute::DataType dtype, + const std::vector& input_dims, + double qa_scale, + int64_t qa_offset, + double qb_scale, + int64_t qb_offset, + double dst_scale, + int64_t dst_offset); + + arm_compute::Status validate(); + void configure(); + + private: + arm_compute::ConvertPolicy policy{arm_compute::ConvertPolicy::SATURATE}; +}; + +} // namespace at::native::acl_utils +struct PackedLinearWeightsACL : public PackedLinearWeightsOnednn { + using ACLQuantMatmul = at::native::acl_utils::QuantMatmul; + using ACLDynamicQuantMatmul = at::native::acl_utils::DynamicQuantMatmul; + using ACLStaticQuantMatmul = at::native::acl_utils::StaticQuantMatmul; + using ACLQuantMatmulCacheKey = at::native::acl_utils::QuantMatmulCacheKey; + using ACLQuantMatmulCacheKeyIndex = + at::native::acl_utils::QuantMatmulCacheKeyIndex; + + PackedLinearWeightsACL( + std::unique_ptr weight, + std::optional bias, + 
at::Tensor orig_weight, + std::optional orig_bias); + + at::Tensor apply_dynamic(at::Tensor input, bool reduce_range = false) + override; + at::Tensor apply_dynamic_relu(at::Tensor input, bool reduce_range = false) + override; + + at::Tensor apply( + at::Tensor input, + double output_scale, + int64_t output_zero_point) override; + at::Tensor apply_relu( + at::Tensor input, + double output_scale, + int64_t output_zero_point) override; + + template + std::shared_ptr get_acl_quant_matmul( + const ACLQuantMatmulCacheKey& key) { + return std::dynamic_pointer_cast( + fetch_or_create_acl_quant_matmul(key)); + } + + private: + int64_t k_; + int64_t n_; + int64_t weight_zero_point_; + double weight_scale_; + + // A 2 element (per layer) cache. Given it's not intended to store more than 2 + // elements, we do not need a fancy implementation. The idea behind it is to + // allow for a (configuration free) fast path for autoregressive + // transformer-like models which usually involve 2 input tensor shapes; one + // for the prefill phase and another for the autoregressive phase + std::array, 2> cache_; + + template + std::shared_ptr fetch_or_create_acl_quant_matmul( + const ACLQuantMatmulCacheKey& key) { + // We're only maintaining a 2 element LRU cache + // hit first + if (cache_[0] != nullptr && cache_[0]->key == key) { + return cache_[0]; + } + // hit second + if (cache_[1] != nullptr && cache_[1]->key == key) { + // Update LRU + std::swap(cache_[0], cache_[1]); + return cache_[0]; + } + // miss -> replace Least Recently Used - i.e. 
element at index 1 + cache_[1] = create_acl_quant_matmul(key); + std::swap(cache_[0], cache_[1]); + return cache_[0]; + } + + template + std::shared_ptr create_acl_quant_matmul( + const ACLQuantMatmulCacheKey& key) { + std::optional bias_ptr; + if (bias_.has_value()) { + bias_ptr = (float*)bias_.value().get_data_handle(); + } + auto acl_gemm = std::make_shared( + k_, + n_, + weight_scale_, + weight_zero_point_, + (int8_t*)weight_.get()->get_data_handle(), + bias_ptr, + key); + + // validate + auto status = acl_gemm->validate(); + if (status.error_code() != arm_compute::ErrorCode::OK) { + TORCH_WARN( + "Arm Compute Library's Quantized Matmul Validation Failed: " + + status.error_description()); + return nullptr; + } + + // configure + acl_gemm->configure(); + return acl_gemm; + } + + template + at::Tensor apply_dynamic_impl(at::Tensor input, bool reduce_range = false); + + template + at::Tensor apply_impl( + at::Tensor input, + double output_scale, + int64_t output_zero_point); +}; + +#endif // AT_MKLDNN_ACL_ENABLED() + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/quantized/cpu/BinaryOps.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/quantized/cpu/BinaryOps.h new file mode 100644 index 0000000000000000000000000000000000000000..a27322debae0ebfe98adac788080b5db451e4aa8 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/quantized/cpu/BinaryOps.h @@ -0,0 +1,11 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#include + +namespace at::native { +TORCH_API Tensor +quantized_add(Tensor qa, Tensor qb, double scale, int64_t zero_point); +} // namespace at::native + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/quantized/cpu/EmbeddingPackedParams.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/quantized/cpu/EmbeddingPackedParams.h new file mode 100644 index 0000000000000000000000000000000000000000..30e18dde45a889df235adbf337cc642d89678c4c --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/quantized/cpu/EmbeddingPackedParams.h @@ -0,0 +1,34 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include +#include + +struct EmbeddingPackedParamsBase : public torch::jit::CustomClassHolder { + virtual at::Tensor embeddingbag_byte( + const at::Tensor& indices, + const std::optional& offsets, + bool pruned_weights, + const std::optional& per_sample_weights_, + const std::optional& compressed_indices_mapping, + bool include_last_offset, + bool is_embedding_op) = 0; + + virtual at::Tensor 
embeddingbag_4bit( + const at::Tensor& indices, + const std::optional& offsets, + bool pruned_weights, + const std::optional& per_sample_weights_, + const std::optional& compressed_indices_mapping, + bool include_last_offset, + bool is_embedding_op) = 0; + + virtual at::Tensor unpack() = 0; + + virtual int64_t bit_rate() const = 0; + virtual int64_t version() const = 0; +}; + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/quantized/cpu/OnednnUtils.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/quantized/cpu/OnednnUtils.h new file mode 100644 index 0000000000000000000000000000000000000000..5215485ef744b4f905c5b6f0ac48fcc7f1c33514 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/quantized/cpu/OnednnUtils.h @@ -0,0 +1,506 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include +#if AT_MKLDNN_ENABLED() +#include +#include +#include +#if !defined(__powerpc__) +#include +#endif + +#include + +using PrimitiveCacheKey = std::tuple< + double, // input_scale + int64_t, // input_zero_point + std::vector, // input_shape + double, // output_scale + int64_t, // output_zero_point + int64_t, // OMP_number_of_threads + double, // accum_scale + int64_t>; // accum_zero_point + +enum CacheKeyIndex { + InputScale, + InputZeroPoint, + InputShape, + OutputScale, + OutputZeroPoint, + NumOfThreads, +}; + +// Base class of primitive cache +struct PrimitiveCache { + PrimitiveCacheKey key; + + bool hit(const PrimitiveCacheKey& key) { + return this->key == key; + } +}; + +using LinearParams = ideep::matmul_forward_params; +using Conv = dnnl::convolution_forward; +using ConvDesc = dnnl::convolution_forward::primitive_desc; 
+using ConvParams = ideep::convolution_forward_params; +using Deconv = dnnl::deconvolution_forward; +using DeconvDesc = dnnl::deconvolution_forward::primitive_desc; +using DeconvParams = ideep::deconv_forward_params; + +struct LinearPrimitiveCache : PrimitiveCache { + LinearPrimitiveCache() = default; + + LinearPrimitiveCache( + const PrimitiveCacheKey& key, + const LinearParams& param) { + this->key = key; + this->param = param; + } + + LinearParams param; + + // For dynamic qlinear, scale and zero point + // are set at execution time. So we only need to compare + // the rest part of key. + bool hit_dynamic(const PrimitiveCacheKey& new_key) { + auto const& cached_input_shape = std::get(this->key); + auto const& new_input_shape = std::get(new_key); + return ( + cached_input_shape == new_input_shape && + std::get(this->key) == std::get(new_key)); + } + + LinearParams& get_param() { + return param; + } +}; + +struct ConvPrimitiveCache : PrimitiveCache { + ConvPrimitiveCache() = default; + + ConvPrimitiveCache( + const PrimitiveCacheKey& key, + const ConvParams& params) { + this->key = key; + this->params = params; + } + + ConvParams params; + + ConvParams& get_params() { + return params; + } +}; + +struct DeconvPrimitiveCache : PrimitiveCache { + DeconvPrimitiveCache() = default; + + DeconvPrimitiveCache( + const PrimitiveCacheKey& key, + const DeconvParams& params) { + this->key = key; + this->params = params; + } + + DeconvParams params; + + DeconvParams& get_params() { + return params; + } +}; + +enum PostOps { + NoPostOp, + Relu, + LeakyRelu, + Tanh, + Gelu +}; + + +struct PackedLinearWeightsOnednn : public LinearPackedParamsBase { + PackedLinearWeightsOnednn( + std::unique_ptr weight, + std::optional bias, + at::Tensor orig_weight, + std::optional orig_bias) + : weight_(std::move(weight)), + bias_(std::move(bias)), + orig_weight_(std::move(orig_weight)), + orig_bias_(std::move(orig_bias)) { + cache_initialized_flag = std::make_unique(); + } + std::unique_ptr 
weight_; + std::optional bias_; + at::Tensor orig_weight_; + std::optional orig_bias_; + + at::Tensor apply( + at::Tensor input, + double output_scale, + int64_t output_zero_point) override; + at::Tensor apply_relu( + at::Tensor input, + double output_scale, + int64_t output_zero_point) override; + + at::Tensor apply_dynamic(at::Tensor input, bool reduce_range=false) override; + at::Tensor apply_dynamic_relu(at::Tensor input, bool reduce_range=false) override; + + at::Tensor apply_leaky_relu( + at::Tensor input, + double output_scale, + int64_t output_zero_point, + double negative_slope); + + at::Tensor apply_tanh( + at::Tensor input, + double output_scale, + int64_t output_zero_point); + + std::tuple> unpack() override; + + std::optional bias() override { + return orig_bias_; + } + + static c10::intrusive_ptr prepack( + at::Tensor weight, + std::optional bias); + + private: + LinearPrimitiveCache prim_cache; + std::unique_ptr cache_initialized_flag; + + template + at::Tensor apply_impl( + at::Tensor input, + double output_scale, + int64_t output_zero_point, + torch::List post_op_args = torch::List()); + + template + at::Tensor apply_dynamic_impl(at::Tensor input, bool reduce_range=false); + + LinearPrimitiveCache& get_cache() { + return prim_cache; + } +}; + +template +struct PackedConvWeightsOnednn : public ConvPackedParamsBase { + PackedConvWeightsOnednn( + std::unique_ptr weight, + std::optional bias, + at::Tensor orig_weight, + std::optional orig_bias, + torch::List stride, + torch::List padding, + torch::List output_padding, + torch::List dilation, + int64_t groups, + uint8_t transpose) + : weight_(std::move(weight)), + bias_(std::move(bias)), + orig_weight_(std::move(orig_weight)), + orig_bias_(std::move(orig_bias)), + stride_(std::move(stride)), + padding_(std::move(padding)), + output_padding_(std::move(output_padding)), + dilation_(std::move(dilation)), + groups_(groups), + transpose_(transpose) { + cache_initialized_flag = std::make_unique(); + } + + 
std::unique_ptr weight_; + std::optional bias_; + at::Tensor orig_weight_; + std::optional orig_bias_; + torch::List stride_; + torch::List padding_; + torch::List output_padding_; + torch::List dilation_; + int64_t groups_; + uint8_t transpose_; + + at::Tensor apply( + const at::Tensor& input, + double output_scale, + int64_t output_zero_point) override; + + at::Tensor apply_relu( + const at::Tensor& input, + double output_scale, + int64_t output_zero_point) override; + + at::Tensor apply_dynamic( + const at::Tensor& input, + bool reduce_range) override; + + at::Tensor apply_add( + const at::Tensor& input, + const at::Tensor& accum, + double output_scale, + int64_t output_zero_point); + + at::Tensor apply_add_relu( + const at::Tensor& input, + const at::Tensor& accum, + double output_scale, + int64_t output_zero_point); + + std::tuple> unpack() override; + + static c10::intrusive_ptr> prepack( + at::Tensor weight, + std::optional bias, + torch::List stride, + torch::List padding, + torch::List output_padding, + torch::List dilation, + int64_t groups, + bool transpose); + + torch::List stride() const override { + return stride_; + } + + torch::List padding() const override { + return padding_; + } + + torch::List output_padding() const override { + return output_padding_; + } + + torch::List dilation() const override { + return dilation_; + } + + int64_t groups() const override { + return groups_; + } + + bool transpose() const override { + return (bool)transpose_; + } + + private: + ConvPrimitiveCache conv_prim_cache; + DeconvPrimitiveCache deconv_prim_cache; + std::unique_ptr cache_initialized_flag; + + template + at::Tensor apply_impl( + const at::Tensor& input, + const std::optional& accum, + double output_scale, + int64_t output_zero_point); + + ConvPrimitiveCache& get_conv_cache() { + assert(!transpose()); + return conv_prim_cache; + } + + DeconvPrimitiveCache& get_deconv_cache() { + assert(transpose()); + return deconv_prim_cache; + } +}; + +namespace 
onednn_utils { + +inline ideep::attr_t create_attr_by_post_op( + const std::string_view& binary_post_op, + double binary_alpha, + double input1_scale, + int64_t input1_zero_point, + const ideep::tensor::desc& input1_desc, + const std::string_view& unary_post_op, + const torch::List>& unary_post_op_args, + const std::string_view& unary_post_op_algorithm) { + using ideep::tensor; + if (binary_post_op == "none") { + if (unary_post_op == "relu") { + return ideep::attr_t::fuse_relu(); + } else if (unary_post_op == "leaky_relu") { + TORCH_CHECK( + unary_post_op_args.size() == 1, + "onednn qlinear: expect one argument for post op leaky_relu but got ", unary_post_op_args.size(), " args"); + auto alpha = unary_post_op_args[0].value().to(); + return ideep::attr_t::fuse_relu_v2(alpha); + } else if (unary_post_op == "tanh") { + return ideep::attr_t::fuse_tanh(); + } else if (unary_post_op == "gelu") { + TORCH_CHECK( + unary_post_op_algorithm == "none" || unary_post_op_algorithm == "tanh", + "onednn qlinear: algorithm for post op gelu must be none or tanh but got ", unary_post_op_algorithm); + auto post_algorithm = unary_post_op_algorithm == "none" ? 
+ dnnl::algorithm::eltwise_gelu_erf : + dnnl::algorithm::eltwise_gelu_tanh; + return ideep::attr_t::fuse_gelu_v2(0.f, 0.f, post_algorithm); + } else if (unary_post_op == "hardtanh") { + TORCH_CHECK( + unary_post_op_args.size() == 2 && + unary_post_op_args[0].has_value() && + unary_post_op_args[1].has_value(), + "hardtanh is expected to have two scalar input: min_val and max_val"); + auto lower_bound_value = + unary_post_op_args[0].value().to(); + auto upper_bound_value = + unary_post_op_args[1].value().to(); + return ideep::attr_t::fuse_clamp(lower_bound_value, upper_bound_value); + } else if (unary_post_op == "hardswish") { + return ideep::attr_t::fuse_hardswish(); + } else if (unary_post_op == "swish") { + return ideep::attr_t::fuse_swish(); + } else { + TORCH_CHECK( + unary_post_op == "none", + "onednn qlinear: unsupported unary post op ", unary_post_op); + } + } else if (binary_post_op == "sum") { + if (unary_post_op == "none") { + return ideep::attr_t::fuse_sum(input1_scale, input1_zero_point); + } else if (unary_post_op == "relu") { + return ideep::attr_t::residual_with_sum_zero_point(input1_scale, input1_zero_point); + } else { + TORCH_CHECK( + false, + "onednn qlinear: unsupported unary post op ", unary_post_op, " with binary post op sum"); + } + } else if (binary_post_op == "add") { + if (unary_post_op == "none") { + return ideep::attr_t::fuse_binary(ideep::algorithm::binary_add, input1_desc); + } else if (unary_post_op == "relu") { + ideep::post_ops po; + po.append_binary(ideep::algorithm::binary_add, input1_desc); + po.append_eltwise(ideep::algorithm::eltwise_relu, 0, 0); + return ideep::attr_t::attr_post_ops(po); + } else { + TORCH_CHECK( + false, + "onednn qlinear: unsupported unary post op ", unary_post_op, " with binary post op add"); + } + } else { + TORCH_CHECK( + false, + "onednn qlinear: unsupported binary post op ", binary_post_op); + } + return ideep::attr_t(); +} + +// ONEDNN requires symmetric quantization of weight +// Use this util function 
to check. +inline bool is_weight_symmetric_quant( + const at::Tensor& weight, + bool is_transposed_conv) { + bool is_symmetric = true; + const auto qtype = weight.qscheme(); + if (qtype == c10::kPerTensorAffine) { + is_symmetric &= (weight.q_zero_point() == 0); + } else if (qtype == c10::kPerChannelAffine) { + if (is_transposed_conv) { + // This case is currently not supported in PyTorch + // but we do not want to raise an error in this util function. + is_symmetric = false; + } else { + auto output_channels = weight.size(0); + for (int i = 0; i < output_channels; ++i) { + auto zp = weight.q_per_channel_zero_points()[i].item(); + is_symmetric &= (zp == 0); + } + } + } else { + // This case is currently not supported in PyTorch + // but we do not want to raise an error in this util function. + is_symmetric = false; + } + return is_symmetric; +} + +// When qengine is x86, use this util func to check if onednn kernel +// is preferred than fbgemm's to get better performance. +inline bool should_use_onednn_quant( + const at::Tensor& weight, + bool is_transposed_conv, + int groups, + torch::List output_padding) { + // Performance of onednn is only validated on Linux right now. + // Also, the heuristics for dispatching are based on perf data on Linux. + // So, for x86 qengine, we always use fbgemm kernels if OS is not Linux. + // TODO Support more OSs. 
+#if !defined(__linux__) + return false; +#else +#if defined(__powerpc__) + constexpr auto vnni_available = true; +#else + const auto vnni_available = cpuinfo_has_x86_avx512vnni(); +#endif + bool w_sym_quant = + is_weight_symmetric_quant(weight, is_transposed_conv); + bool opad_all_zero = + std::all_of(output_padding.begin(), output_padding.end(), [](int i) { return i==0; }); + return vnni_available && (groups <= 100) && w_sym_quant && opad_all_zero; +#endif +} + +} // onednn_utils + +at::Tensor _qconv_prepack_onednn( + at::Tensor weight, // from CPU backend instead of QuantizedCPU + at::Tensor weight_scales, // Weight zero points must be 0 for onednn + double input_scale, + int64_t input_zero_point, + torch::List stride, + torch::List padding, + torch::List dilation, + int64_t groups, + std::optional> input_shape=std::nullopt); + +#define FP8E4M3_MAX 448.0 + +#define CACHE_ONEDNN_CONTEXT_FLAG "ONEDNN_CACHE_CONTEXT_UNSAFE" + +struct QlinearForwardParams { + dnnl::matmul primitive; + ideep::exec_args args; + ideep::tensor packed_weight; + ideep::tensor weight_scales; + std::optional src_scale; + std::optional src_zero_point; + std::optional dst_scale; + std::optional dst_zero_point; + std::optional bias; + ideep::tensor scratchpad; + + void init_args() { + args.insert({DNNL_ARG_WEIGHTS, packed_weight}); + args.insert({DNNL_ARG_SCRATCHPAD, scratchpad}); + if (bias.has_value()) { + args.insert({DNNL_ARG_BIAS, bias.value()}); + } + if (src_scale.has_value()) { + args.insert({DNNL_ARG_ATTR_SCALES | DNNL_ARG_SRC, src_scale.value()}); + } + if (dst_scale.has_value()) { + args.insert({DNNL_ARG_ATTR_SCALES | DNNL_ARG_DST, dst_scale.value()}); + } + args.insert({DNNL_ARG_ATTR_SCALES | DNNL_ARG_WEIGHTS, weight_scales}); + if (src_zero_point.has_value()) { + args.insert({DNNL_ARG_ATTR_ZERO_POINTS | DNNL_ARG_SRC, src_zero_point.value()}); + } + if (dst_zero_point.has_value()) { + args.insert({DNNL_ARG_ATTR_ZERO_POINTS | DNNL_ARG_DST, dst_zero_point.value()}); + } + } +}; + 
+#endif // #if AT_MKLDNN_ENABLED() + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/quantized/cpu/QnnpackUtils.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/quantized/cpu/QnnpackUtils.h new file mode 100644 index 0000000000000000000000000000000000000000..f9f887f04e3c3f80e568f005d1d244f55af3ce60 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/quantized/cpu/QnnpackUtils.h @@ -0,0 +1,513 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#ifdef USE_PYTORCH_QNNPACK +#include +#include +#include +#include +#include +#include +#include + +#ifndef AT_PER_OPERATOR_HEADERS +#include +#else +#include +#endif + +#include +inline int kPaddingChannels = 8; +struct QnnpackOperatorDeleter { + void operator()(pytorch_qnnp_operator_t op) { + pytorch_qnnp_delete_operator(op); + } +}; + +// PackedWeight struct for QNNPACK stores the original Weight and Bias as +// QNNPACK currently does not support an unpack function. +// For PyTorch Mobile, once the model is scripted and serialized we don't need +// to call unpack, so we can save some memory by checking for this case and free +// the original weights after packing. +// Input scale is set to null in pre-pack step. QNNPACK needs bias quantized +// with input scale which is available at runtime in pytorch. During runtime if +// input scale value changes then we requantize bias with the updated scale. For +// inference we expect the graph to be static so the input scale should not +// change across consecutive inference calls. 
+struct PackedLinearWeightsQnnp : public LinearPackedParamsBase { + PackedLinearWeightsQnnp( + std::unique_ptr w, + at::Tensor orig_weight, + at::Tensor bias, + std::optional input_scale, + at::Tensor w_scales, + std::vector&& w_zps) + : w(std::move(w)), + orig_weight(std::move(orig_weight)), + bias_(at::native::mobile::allocate_padded_contiguous_if_needed( + bias, bias.suggest_memory_format())), + per_channel_(this->orig_weight.qscheme() == at::kPerChannelAffine), + input_scale(std::move(input_scale)), + w_scales(std::move(w_scales)), + w_zero_points(std::move(w_zps)), + q_scheme(this->orig_weight.qscheme()) { + weight_sizes = this->orig_weight.sizes().vec(); + } + + std::unique_ptr w; + at::Tensor orig_weight; + at::Tensor bias_; + bool per_channel_; + std::optional input_scale; + at::Tensor w_scales; + std::vector w_zero_points; + std::vector requantization_scales; + std::vector weight_sizes; + c10::QScheme q_scheme; + + at::Tensor apply( + at::Tensor input, + double output_scale, + int64_t output_zero_point) override; + at::Tensor apply_relu( + at::Tensor input, + double output_scale, + int64_t output_zero_point) override; + + at::Tensor apply_dynamic(at::Tensor input, bool reduce_range=false) override; + at::Tensor apply_dynamic_relu(at::Tensor input, bool reduce_range=false) override; + + std::tuple> unpack() override; + + std::optional bias() override { + return bias_; + } + + static c10::intrusive_ptr prepack( + at::Tensor weight, + std::optional bias); + + bool per_channel() const { + return per_channel_; + } + + private: + std::mutex qnnp_mutex_; + +#ifdef USE_XNNPACK + xnnpack_operator xnnp_linear_op; + + template + at::Tensor apply_impl_xnnp( + const at::Tensor& input, + double output_scale, + int64_t output_zero_point); +#endif // USE_XNNPACK + + template + at::Tensor apply_impl( + at::Tensor input, + double output_scale, + int64_t output_zero_point); + + template + at::Tensor apply_dynamic_impl(at::Tensor input, bool reduce_range); +}; + +template 
+struct PackedConvWeightsQnnp : public ConvPackedParamsBase { + PackedConvWeightsQnnp( + std::unique_ptr w, + at::Tensor orig_weight, + at::Tensor bias, + torch::List stride, + torch::List padding, + torch::List output_padding, + torch::List dilation, + int64_t groups, + bool transpose, + std::optional input_scale, + std::vector kernel, + at::Tensor w_scale, + std::vector&& w_zps, + bool is_per_channel) + : w(std::move(w)), + orig_weight(std::move(orig_weight)), + bias(std::move(bias)), + stride_(std::move(stride)), + padding_(std::move(padding)), + output_padding_(std::move(output_padding)), + dilation_(std::move(dilation)), + groups_(groups), + transpose_(transpose), + is_per_channel_(is_per_channel), + input_scale(input_scale), + kernel_(std::move(kernel)), + w_scales(std::move(w_scale)), + w_zero_points(std::move(w_zps)) { + const bool any_padding = std::any_of( + padding_.begin(), padding_.end(), [](const auto& e) { return e != 0; }); + const size_t kernel_size = + std::accumulate(kernel_.begin(), kernel_.end(), 1, std::multiplies<>()); + + const size_t group_input_channels = transpose + ? this->orig_weight.size(0) / groups + : this->orig_weight.size(1); + const size_t group_output_channels = transpose + ? this->orig_weight.size(1) + : this->orig_weight.size(0) / groups; + + const size_t kernel_depth = kSpatialDim == 3 ? 
kernel_[0] : 1; + const size_t kernel_height = kernel_[kSpatialDim - 2]; + const size_t kernel_width = kernel_[kSpatialDim - 1]; + + pytorch_qnnp_ukernel_type ukernel_type; + if (transpose_) { + ukernel_type = pytorch_qnnp_ukernel_type_conv; + } else { + ukernel_type = pytorch_qnnp_ukernel_type_none; + + const bool has_depthwise_dimensions = + (kSpatialDim == 2 && + ((kernel_height == 3 && kernel_width == 3) || + (kernel_height == 5 && kernel_width == 5))) || + (kSpatialDim == 3 && kernel_height == 3 && kernel_width == 3 && + kernel_depth == 3); + const bool has_depthwise_grouping = + group_input_channels == 1 && group_output_channels == 1 && groups > 1; + + if (has_depthwise_dimensions && has_depthwise_grouping) { + ukernel_type = pytorch_qnnp_ukernel_type_dwconv; + } else if ( + kernel_size == 1 && + std::all_of( + stride_.begin(), + stride_.end(), + [](const auto& e) { return e == 1; }) && + !any_padding) { + ukernel_type = group_input_channels >= SIZE_MAX + ? pytorch_qnnp_ukernel_type_xzp_gemm + : pytorch_qnnp_ukernel_type_gemm; + } else { + ukernel_type = pytorch_qnnp_ukernel_type_conv; + } + } + + if (is_per_channel && ukernel_type == pytorch_qnnp_ukernel_type_xzp_gemm) { + TORCH_INTERNAL_ASSERT( + false, "Per channel quantized weights are not supported for XZP kernels"); + } + + pytorch_qnnp_operator_t convolution{nullptr}; + // Initially all the params are set to zero. 
+ convolution = static_cast( + calloc(1, sizeof(struct pytorch_qnnp_operator))); + if (convolution == nullptr) { + TORCH_INTERNAL_ASSERT( + false, "failed to allocate %zu bytes for pytorch_qnnp_operator structure", + sizeof(struct pytorch_qnnp_operator)); + } + + convolution_op = + std::unique_ptr( + convolution); + + // NOLINTNEXTLINE(clang-analyzer-core.NullDereference) + convolution->ukernel_type = ukernel_type; + convolution->groups = groups; + convolution->group_input_channels = group_input_channels; + convolution->group_output_channels = group_output_channels; + convolution->kernel_depth = kernel_depth; + convolution->kernel_height = kernel_height; + convolution->kernel_width = kernel_width; + convolution->stride_depth = kSpatialDim == 3 ? stride_[0] : 1; + convolution->stride_height = stride_[kSpatialDim - 2]; + convolution->stride_width = stride_[kSpatialDim - 1]; + convolution->dilation_depth = kSpatialDim == 3 ? dilation_[0] : 1; + convolution->dilation_height = dilation_[kSpatialDim - 2]; + convolution->dilation_width = dilation_[kSpatialDim - 1]; + convolution->input_padding_height = padding_[kSpatialDim - 2]; + convolution->input_padding_width = padding_[kSpatialDim - 1]; + convolution->input_padding_depth = kSpatialDim == 3 ? 
padding_[0] : 0; + convolution->per_channel = is_per_channel_; + convolution->transpose = transpose_; + + const uint32_t kr = pytorch_qnnp_params.q8conv.kr; + const size_t k_stride = (group_input_channels + (kr - 1)) & -kr; + + size_t zero_size = sizeof(uint8_t) * k_stride; + size_t zero_offset = 0; + + if (transpose_) { + convolution->adjustment_width = output_padding_[1]; + convolution->adjustment_height = output_padding_[0]; + if (group_input_channels < 8) { + zero_size += 8; + zero_offset = 8; + } + } else { + zero_buffer_size = 0; + if (any_padding) { + zero_size = 0; + zero_offset = 0; + if (ukernel_type == pytorch_qnnp_ukernel_type_dwconv) { + const uint32_t cr = pytorch_qnnp_params.q8dw9.cr; + const size_t group_stride = (groups + (cr - 1)) & -cr; + if (groups >= 8) { + zero_size = sizeof(uint8_t) * group_stride; + zero_offset = 0; + } else { + zero_size = sizeof(uint8_t) * group_stride + 8; + zero_offset = sizeof(uint8_t) * 8; + } + } else if ( + ukernel_type == pytorch_qnnp_ukernel_type_conv || + ukernel_type == pytorch_qnnp_ukernel_type_gemm) { + if (group_input_channels >= 8) { + zero_size = sizeof(uint8_t) * k_stride; + zero_offset = 0; + } else { + zero_size = sizeof(uint8_t) * k_stride + 8; + zero_offset = 8; + } + } + } + } + + // NOLINTNEXTLINE(clang-analyzer-optin.portability.UnixAPI) + void* zero_buffer = malloc(zero_size); + if (zero_buffer == nullptr) { + pytorch_qnnp_delete_operator(convolution); + TORCH_INTERNAL_ASSERT( + false, "failed to allocate %zu bytes for zero padding", + zero_size); + } + // Need to set to input zero point + // memset(zero_buffer, input_zero_point, zero_size); + zero_buffer_size = zero_size; + convolution->zero_buffer = zero_buffer; + convolution->zero_pointer = (void*)((uintptr_t)zero_buffer + zero_offset); + } + + std::unique_ptr convolution_op; + #ifdef USE_XNNPACK + xnnpack_operator xnnp_convolution_op; + #endif // USE_XNNPACK + std::unique_ptr w; + at::Tensor orig_weight; + at::Tensor bias; + torch::List stride_; 
+ torch::List padding_; + torch::List output_padding_; + torch::List dilation_; + int64_t groups_; + bool transpose_; + bool is_per_channel_; + std::optional input_scale; + std::vector kernel_; + at::Tensor w_scales; + std::vector w_zero_points; + std::vector requantization_scales; + size_t zero_buffer_size; + + at::Tensor apply( + const at::Tensor& input, + double output_scale, + int64_t output_zero_point) override; + + at::Tensor apply_relu( + const at::Tensor& input, + double output_scale, + int64_t output_zero_point) override; + + at::Tensor apply_dynamic( + const at::Tensor& input, + bool reduce_range=false) override; + + std::tuple> unpack() override; + + static c10::intrusive_ptr> prepack( + at::Tensor weight, + std::optional bias, + torch::List stride, + torch::List padding, + torch::List output_padding, + torch::List dilation, + int64_t groups, + bool transpose); + + torch::List stride() const override { + return stride_; + } + + torch::List padding() const override { + return padding_; + } + + torch::List output_padding() const override { + return output_padding_; + } + + torch::List dilation() const override { + return dilation_; + } + + int64_t groups() const override { + return groups_; + } + + bool transpose() const override { + return transpose_; + } + + bool per_channel() const { + return is_per_channel_; + } + + private: + std::mutex qnnp_mutex_; + template + at::Tensor apply_impl( + const at::Tensor& input, + double output_scale, + int64_t output_zero_point); + +#ifdef USE_XNNPACK + template + at::Tensor apply_impl_xnnp( + const at::Tensor& input, + double output_scale, + int64_t output_zero_point); +#endif // USE_XNNPACK +}; + +enum class Activation : uint8_t { NONE = 0, RELU = 1 }; + +template +inline T QuantizeValue(float scale, int32_t zero_point, float value) { + const int32_t qmin = std::numeric_limits::min(); + const int32_t qmax = std::numeric_limits::max(); + auto r = zero_point + static_cast(std::nearbyint(value / scale)); + r = 
std::max(r, qmin); + r = std::min(r, qmax); + return static_cast(r); +} + +template +inline std::pair activationLimits( + float scale, + int32_t zero_point, + Activation Ac) { + switch (Ac) { + case Activation::NONE: + return {std::numeric_limits::min(), + std::numeric_limits::max()}; + case Activation::RELU: + return {QuantizeValue(scale, zero_point, 0.0), + std::numeric_limits::max()}; + default: +#ifdef _MSC_VER + __assume(0); +#else + __builtin_unreachable(); +#endif + } +} + +namespace at::native::qnnp_avgpool_helper { +Tensor qnnpack_avg_pool2d( + Tensor input, + IntArrayRef kernel_size, + IntArrayRef stride, + IntArrayRef padding, + bool ceil_mode, + bool count_include_pad, + std::optional divisor_override); +} // namespace at::native::qnnp_avgpool_helper + +namespace { +[[maybe_unused]] std::vector generate_requantization_scales( + const at::Tensor& weight_scales, + const float input_scale, + const float output_scale, + std::vector& requant_scales) { + // Since weight scale is allocated with padding + // weight_scales.numel() gives us padded num elements. + const auto num_output_channels_padded = weight_scales.numel(); + float *const weight_scales_data = weight_scales.data_ptr(); + if (static_cast(requant_scales.size()) < num_output_channels_padded) { + requant_scales.resize(num_output_channels_padded); + } + for (const auto i : c10::irange(num_output_channels_padded)) { + const auto inverse_output_scale = 1.f /output_scale; + requant_scales[i] = (weight_scales_data[i] * input_scale) * inverse_output_scale; + TORCH_CHECK( + (requant_scales[i] > 0.0f && std::isnormal(requant_scales[i])), + "failed to create op with requantization scale: ", + requant_scales[i], + ": requantization scale must be finite and positive"); + } + return requant_scales; +} + +[[maybe_unused]] std::pair, at::Tensor> +make_zero_points_and_scales_tensor( + const at::Tensor& weight_contig, + bool transpose = false, + uint32_t groups = 1) { + const int out_ch_idx = transpose ? 
1 : 0; + const auto num_output_channels = weight_contig.size(out_ch_idx) * (transpose ? groups : 1); + // Add 8 to account for buffering needed by QNNPACK. + const auto num_output_channels_padded = num_output_channels + kPaddingChannels; + const auto qtype = weight_contig.qscheme(); + std::vector weight_zp(num_output_channels_padded, 0); + // Adjust weight zero point, similar to weight data. + if (qtype == at::kPerTensorAffine) { + for (const auto i : c10::irange(num_output_channels)) { + weight_zp[i] = (uint8_t)(weight_contig.q_zero_point() + 128); + } + } else if (qtype == at::kPerChannelAffine) { + TORCH_CHECK( + weight_contig.q_per_channel_zero_points().scalar_type() == at::kLong, + "Per channel zero points dtype must be long int."); + const int64_t* per_channel_zero_points = + weight_contig.q_per_channel_zero_points().data_ptr(); + for (const auto i : c10::irange(num_output_channels)) { + weight_zp[i] = (uint8_t)(per_channel_zero_points[i] + 128); + } + } else { + TORCH_INTERNAL_ASSERT(false, "Unsupported quantization scheme."); + } + at:: Tensor weight_scales = + at::empty( + {num_output_channels_padded}, + at::device(at::kCPU).dtype(at::kFloat)); + float *const weight_scales_data = weight_scales.data_ptr(); + if (qtype == at::kPerTensorAffine) { + for (const auto i : c10::irange(num_output_channels)) { + weight_scales_data[i] = weight_contig.q_scale(); + } + } else if (qtype == at::kPerChannelAffine) { + TORCH_CHECK( + weight_contig.q_per_channel_scales().scalar_type() == at::kDouble, + "Per channel scales dtype must be double."); + const double *const per_channel_scales = + weight_contig.q_per_channel_scales().data_ptr(); + for (const auto i : c10::irange(num_output_channels)) { + weight_scales_data[i] = static_cast(per_channel_scales[i]); + } + } else { + TORCH_INTERNAL_ASSERT(false, "Unsupported quantization scheme."); + } + for (const auto i : c10::irange(num_output_channels, num_output_channels_padded)) { + weight_scales_data[i] = 1.f; + } + return 
{weight_zp, weight_scales}; +} +} // namespace + +#endif + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/quantized/cpu/QuantUtils.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/quantized/cpu/QuantUtils.h new file mode 100644 index 0000000000000000000000000000000000000000..02a867a1f95687119fadf4e968ae12a3776d93e3 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/quantized/cpu/QuantUtils.h @@ -0,0 +1,245 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include +#include +#include +#include +#include +#include + +#ifndef AT_PER_OPERATOR_HEADERS +#include +#include +#else +#include +#include +#include +#endif + +namespace quant_utils { +namespace { + float RawUint16ToFp16(unsigned short value) { + // Convert raw 16 bits half precision floating point number + // to single precision floating point number. + const unsigned short sign_bits = value >> 15; + const unsigned short exponent_bits = value >> 10 & 0x1f; + const unsigned short significand_bits = value & 0x3ff; + + const float sign = sign_bits ? -1 : 1; + const float significand = + 1 + significand_bits * 0.0009765625f; // 0.0009765625f = 0x1p-10 = 2^-10; + const float exponent = exponent_bits - 0xf; + + return sign * std::ldexp(significand, exponent); +} + +template +bool CheckAndSaturate(T max_val, T* element) { + if (*element > max_val) { + *element = max_val; + return true; + } + if (*element < -max_val) { + *element = -max_val; + return true; + } + return false; +} +} +using namespace std; +// A structure to hold quantization parameters 'scale' and 'zero_point'. 
+// The meaning of these values is as the constants in the quantization equation +// +// real_value = scale * (quantized_value - zero_point) +// +// In other words, 'zero_point' is the quantized value that corresponds +// to the real value 0, and 'scale' is the difference of real values +// corresponding to consecutive quantized values. +struct TensorQuantizationParams { + double scale; + std::int32_t zero_point; + int precision; +}; + +// Use fp16_min as the small scale cutoff because we don't want to use scales in +// fp16 subnormal range. This is to be consistent with Glow and FakeLowP +// implementation for NNPI. +constexpr float SMALL_SCALE_THRESHOLD = 6.1e-5f; + +// Following implementation should be identical to fbgemm::ChooseQuantizationParams +inline TensorQuantizationParams ChooseQuantizationParams( + float min, + float max, + int32_t qmin, + int32_t qmax, + bool preserve_sparsity = false, + bool force_scale_power_of_two = false, + bool reduce_range = false) { + TORCH_CHECK( + min <= max, + "In ChooseQuantizationParams, min should be less than or equal to max"); + + if (reduce_range) { + qmin = qmin/2; + qmax = qmax/2; + } + if (min < 0 && max > 0 && preserve_sparsity) { + int symmetric_qmin = -((qmax - qmin) / 2 + 1); + int symmetric_qmax = (qmax - qmin) / 2; + double max_scale = + std::max(fabs(min / symmetric_qmin), fabs(max / symmetric_qmax)); + min = max_scale * symmetric_qmin; + max = max_scale * symmetric_qmax; + } + + // We extend the [min, max] interval to ensure that it contains 0. + // Otherwise, we would not meet the requirement that 0 be an exactly + // representable value. + min = std::min(min, 0.f); + max = std::max(max, 0.f); + + TORCH_CHECK( + qmin < qmax, + "In ChooseQuantizationParams, qmin should be less than qmax"); + + // Use double precision for intermediate computation but use single precision + // in final number to reflect the actual number used during quantization. 
+ double scale = (static_cast(max) - min) / (qmax - qmin); + // If scale is 0 or too small so its reciprocal is infinity, we arbitrary + // adjust the scale to 0.1 . We want to avoid scale's reciprocal being + // infinity because some of fbgemm code pre-computes scale's reciprocal to do + // multiplication instead of division in the time critical part of code. + if (float(scale) == 0.0f || std::isinf(1.0f / float(scale))) { + scale = 0.1; + } + TORCH_CHECK(scale > 0, "quantization scale should be > 0"); + + if (force_scale_power_of_two) { + if (scale < 1) { + scale = 1.0 / (1 << static_cast(floor(log(1.0 / scale) / log(2)))); + } else { + scale = 1 << static_cast(ceil(log(scale) / log(2))); + } + } + + // Cut off small scale + if (scale < SMALL_SCALE_THRESHOLD) { + float org_scale = scale; + scale = SMALL_SCALE_THRESHOLD; + // Adjust the min and max based on the new scale + if (min == 0.0f) { + max = SMALL_SCALE_THRESHOLD * (qmax - qmin); + } else if (max == 0.0f) { + min = -SMALL_SCALE_THRESHOLD * (qmax - qmin); + } else { + float amplifier = SMALL_SCALE_THRESHOLD / org_scale; + min *= amplifier; + max *= amplifier; + } + } + + // Zero-point computation. + // First the initial floating-point computation. The zero-point can be + // determined from solving an affine equation for any known pair + // (real value, corresponding quantized value). + // We know two such pairs: (rmin, qmin) and (rmax, qmax). + // The arithmetic error on the zero point computed from either pair + // will be roughly machine_epsilon * (sum of absolute values of terms) + // so we want to use the variant that adds the smaller terms. + double zero_point_from_min = qmin - min / scale; + double zero_point_from_max = qmax - max / scale; + double zero_point_from_min_error = + std::abs(qmin) - std::abs(min / scale); + double zero_point_from_max_error = + std::abs(qmax) - std::abs(max / scale); + double initial_zero_point = + zero_point_from_min_error < zero_point_from_max_error + ? 
zero_point_from_min + : zero_point_from_max; + + // for symmetric quantization (preserve_sparsity == true), we force zero_point + // to be a middle value between qmin and qmax. + // If either min or max is 0, then we just use 0 as zero_point. + if (min < 0 && max > 0 && preserve_sparsity) { + initial_zero_point = static_cast(qmin + qmax) / 2; + } + + // Now we need to nudge the zero point to be an integer + // (our zero points are integer, and this is motivated by the requirement + // to be able to represent the real value "0" exactly as a quantized value, + // which is required in multiple places, for example in Im2col with zero + // padding). + int32_t nudged_zero_point = 0; + if (initial_zero_point < qmin) { + nudged_zero_point = qmin; + } else if (initial_zero_point > qmax) { + nudged_zero_point = qmax; + } else { + nudged_zero_point = nearbyint(initial_zero_point); + } + + TensorQuantizationParams result; + result.scale = scale; + result.zero_point = nudged_zero_point; + return result; +} + +// This function helps to convert the Conv1D dimensions usable by the Conv2d op. +constexpr int64_t kConv1dSqueezeDim = 0; +[[maybe_unused]] static torch::List MakeArgForConv1d( + const torch::List& arg, + int64_t base_value) { + TORCH_CHECK(!arg.empty(), "Argument must have elements."); + torch::List result({arg.get(0), base_value}); + if (arg.size() == 1) { + result[1] = arg.get(0); + } else { + result[1] = arg.get(1); + } + result[kConv1dSqueezeDim] = base_value; + return result; +} + +// The range for using FP16 quantization of weights requires that the elements +// should be in the range of [5.96e-8, 65504]. If it is out of range, then the +// number will be saturated to max or min representable values by FP16. 
+inline void HandleWeightsSaturation(int64_t N, float* weight) { + const float kFp16Max = RawUint16ToFp16(0x7BFF); + bool found_out_of_range = false; + for (const auto i : c10::irange(N)) { + bool saturate = CheckAndSaturate(kFp16Max, weight + i); + if (saturate) { + found_out_of_range = true; + } + } + if (found_out_of_range) { + TORCH_WARN("FOUND weight out of range "); + } +} + +// Util function for quantizing bias. +inline at::Tensor QuantizeBias( + bool is_per_channel, + const at::Tensor& bias, + const at::Tensor& weight_contig, + double input_scale) { + at::Tensor qbias; + if (is_per_channel) { + auto bias_quant_scales = + weight_contig.q_per_channel_scales() * input_scale; + auto bias_zp = at::zeros(bias_quant_scales.sizes(), c10::kInt); + qbias = at::native::quantize_per_channel( + bias, bias_quant_scales, bias_zp, 0, c10::kQInt32); + } else { + qbias = at::native::quantize_per_tensor( + bias, weight_contig.q_scale() * input_scale, 0, c10::kQInt32); + } + return qbias; +} + +} // namespace quant_utils + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/quantized/cpu/QuantizedOps.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/quantized/cpu/QuantizedOps.h new file mode 100644 index 0000000000000000000000000000000000000000..ccbae78b5516ca6b516c281d054327e3d2bfac13 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/quantized/cpu/QuantizedOps.h @@ -0,0 +1,287 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +#include +#include +#include +#include +#include +#include + +namespace at::native { + +using qrelu_fn = void (*)(const at::Tensor& /*qx*/, at::Tensor& /*qy*/); +using qrelu_leaky_fn = void (*)(Tensor& /*out*/, const Tensor& /*qx*/, + const Scalar& /*negval_*/); +using qgelu_fn = void (*)(const at::Tensor& /*qx*/, at::Tensor& /*qy*/, GeluType /* approximate */); +using qsigmoid_fn = void (*)(const at::Tensor& /*qx*/, at::Tensor& /*qy*/, double output_scale, int64_t output_zero_point); +using qhardsigmoid_fn = void (*)(const at::Tensor& /*qx*/, at::Tensor& /*qy*/); +using qclamp_fn = void (*)( + const at::Tensor& /*qx*/, + const Scalar& min, + const Scalar& max, + at::Tensor& /*qy*/); +using qclamp_minmax_fn = void (*)( + const at::Tensor& /*qx*/, + const Scalar& /*min or max*/, + at::Tensor& /*qy*/); +using qthreshold_fn = void (*)( + const at::Tensor& /*qx*/, + const Scalar& threshold, + const Scalar& value, + at::Tensor& /*qy*/); +using qtanh_fn = void (*)(const at::Tensor& /*qx*/, at::Tensor& /*qy*/); +using qelu_fn = void(*)( + const at::Tensor& /*qx*/, + const Scalar& /*alpha*/, + const Scalar& /*scale*/, + const Scalar& /*input_scale*/, + at::Tensor& /*qy*/); +using qbinary_fn = + void (*)(Tensor& /*out*/, const Tensor& /*self*/, const Tensor& /*other*/); +using qadd_scalar_fn = + void (*)(Tensor& /*out*/, const 
Tensor& /*self*/, const Scalar& other /*other*/); +using qhardswish_fn = void (*)(const at::Tensor& /*qx*/, at::Tensor& /*qy*/); +using qdropout_fn = void(*)( + const at::Tensor& /*qx*/, + const Scalar& /*p*/, + bool training /*training*/, + at::Tensor& /*qy*/); +using qmaxpool_2d_fn = void (*)( + const Tensor& qx, + int64_t iC, // input/output channels + int64_t iH, + int64_t iW, // input sizes + int64_t oH, + int64_t oW, // output sizes + int64_t kH, + int64_t kW, // kernel size + int64_t sH, + int64_t sW, // strides + int64_t pH, + int64_t pW, // padding + int64_t dH, + int64_t dW, // dilation + Tensor& qy); +using qmaxpool_3d_fn = void (*)( + const Tensor& qx, + int64_t iC, // input/output channels + int64_t iT, + int64_t iH, + int64_t iW, // input sizes + int64_t oT, + int64_t oH, + int64_t oW, // output sizes + int64_t kT, + int64_t kH, + int64_t kW, // kernel size + int64_t sT, + int64_t sH, + int64_t sW, // strides + int64_t pT, + int64_t pH, + int64_t pW, // padding + int64_t dT, + int64_t dH, + int64_t dW, // dilation + Tensor& qy); +using qadaptive_avg_pool2d_fn = void (*)( + const Tensor& qx, + Tensor& qy, + int64_t sizeB, + int64_t sizeC, + int64_t isizeH, + int64_t isizeW, + int64_t osizeH, + int64_t osizeW, + int64_t istrideB, + int64_t istrideC, + int64_t istrideH, + int64_t istrideW); +using qadaptive_avg_pool3d_fn = void (*)( + const Tensor& qx, + Tensor& qy, + int64_t sizeB, + int64_t sizeC, + int64_t isizeD, + int64_t isizeH, + int64_t isizeW, + int64_t osizeD, + int64_t osizeH, + int64_t osizeW, + int64_t istrideB, + int64_t istrideC, + int64_t istrideD, + int64_t istrideH, + int64_t istrideW); +using qavg_pool2d_fn = void (*)( + const Tensor& qx, + Tensor& qy, + int64_t nBatch, + int64_t nInputPlane, + int64_t inputWidth, + int64_t inputHeight, + int64_t outputWidth, + int64_t outputHeight, + int kW, + int kH, + int dW, + int dH, + int padW, + int padH, + bool count_include_pad, + std::optional divisor_override); + +using qavg_pool3d_fn = void 
(*)( + const Tensor& qx, + Tensor& qy, + int64_t nBatch, + int64_t nInputPlane, + int64_t inputWidth, + int64_t inputHeight, + int64_t inputDepth, + int64_t outputWidth, + int64_t outputHeight, + int64_t outputDepth, + int kW, + int kH, + int kD, + int dW, + int dH, + int dD, + int padW, + int padH, + int padD, + bool count_include_pad, + std::optional divisor_override); + +using qupsample_bilinear2d_fn = void (*)( + Tensor& output, + const Tensor& input, + int64_t input_height, + int64_t input_width, + int64_t output_height, + int64_t output_width, + int64_t nbatch, + int64_t channels, + bool align_corners, + std::optional scales_h, + std::optional scales_w); + +using qcat_nhwc_fn = Tensor (*)( + const MaterializedITensorListRef& qxs, + int64_t dim, + double scale, + int64_t zero_point); +using qtopk_fn = void(*)(Tensor&, Tensor&, const Tensor&, int64_t, int64_t, bool, bool); + +using qbatch_norm_fn = void(*)(int64_t, int64_t, int64_t, int64_t, int64_t, const Tensor&, const Tensor&, const Tensor&, Tensor&); + +using qnormalize_fn = void (*)( + const Tensor& /* X */, + const Tensor& /* gamma */, + const Tensor& /* beta */, + bool /* affine_per_channel */, + int /* num_channels */, + int /* num_groups */, + int64_t /* M */, + int64_t /* N */, + double /* eps */, + Tensor* /* Y */); + +using qmean_inner_dim_fn = void (*)( + const Tensor& /* X */, + OptionalIntArrayRef /* opt_dim */, + bool /* keepdim */, + std::optional /* opt_dtype */, + Tensor& /* Y */); + +using qstd_inner_dim_fn = void (*)( + const Tensor& /* X */, + OptionalIntArrayRef /* dim */, + const std::optional& /* correction */, + bool /* keepdim */, + Tensor& /* Y */); + +using qnormalize_nhwc_fn = void (*)( + const Tensor& /* X */, + const Tensor& /* gamma */, + const Tensor& /* beta */, + bool /* affine_per_channel */, + int /* num_channels */, + int /* num_groups */, + int64_t /* M */, + int64_t /* N */, + double /* eps */, + Tensor* /* Y */); + +using qprelu_fn = void (*)(Tensor& /*out*/, const 
Tensor& /*qx*/, + const Tensor& /*qw*/); + +using qbinary_eltwise_cpu_fn = void (*)( + Tensor& /*out*/, + const Tensor& /*qx*/, + double /*qx_scale*/, + int64_t /*qx_zero_point*/, + const Tensor& /*qy*/, + double /*qy_scale*/, + int64_t /*qy_zero_point*/, + double /*output_scale*/, + int64_t /*output_zero_point*/); + +using qbatch_norm_cpu_fn = void(*)( + int64_t /*N*/, + int64_t /*C*/, + int64_t /*H * W*/, + int64_t /*in_zero_point*/, + int64_t /*out_zero_point*/, + const Tensor& /*input*/, + const Tensor& /*a*/, + const Tensor& /*b*/, + Tensor& /*output*/); + +DECLARE_DISPATCH(qadaptive_avg_pool2d_fn, qadaptive_avg_pool2d_nhwc_stub) +DECLARE_DISPATCH(qadaptive_avg_pool3d_fn, qadaptive_avg_pool3d_ndhwc_stub) +DECLARE_DISPATCH(qadd_scalar_fn, qadd_scalar_relu_stub) +DECLARE_DISPATCH(qadd_scalar_fn, qadd_scalar_stub) +DECLARE_DISPATCH(qavg_pool2d_fn, qavg_pool2d_nhwc_stub) +DECLARE_DISPATCH(qavg_pool3d_fn, qavg_pool3d_nhwc_stub) +DECLARE_DISPATCH(qbatch_norm_fn, qbatch_norm_relu_stub) +DECLARE_DISPATCH(qbatch_norm_fn, qbatch_norm_stub) +DECLARE_DISPATCH(qbinary_fn, qadd_relu_stub) +DECLARE_DISPATCH(qbinary_fn, qadd_stub) +DECLARE_DISPATCH(qbinary_fn, qmul_relu_stub) +DECLARE_DISPATCH(qbinary_fn, qmul_stub) +DECLARE_DISPATCH(qcat_nhwc_fn, qcat_nhwc_stub) +DECLARE_DISPATCH(qcat_nhwc_fn, qcat_relu_nhwc_stub) +DECLARE_DISPATCH(qclamp_fn, qclamp_stub) +DECLARE_DISPATCH(qclamp_minmax_fn, qclamp_min_stub) +DECLARE_DISPATCH(qclamp_minmax_fn, qclamp_max_stub) +DECLARE_DISPATCH(qelu_fn, qelu_stub) +DECLARE_DISPATCH(qhardsigmoid_fn, qhardsigmoid_stub) +DECLARE_DISPATCH(qhardswish_fn, qhardswish_stub) +DECLARE_DISPATCH(qdropout_fn, qdropout_stub) +DECLARE_DISPATCH(qmaxpool_2d_fn, qmaxpool_2d_nhwc_stub) +DECLARE_DISPATCH(qmaxpool_3d_fn, qmaxpool_3d_nthwc_stub) +DECLARE_DISPATCH(qnormalize_fn, quantized_normalize_stub) +DECLARE_DISPATCH(qnormalize_nhwc_fn, quantized_groupnorm_nhwc_stub) +DECLARE_DISPATCH(qrelu_fn, qrelu_stub) +DECLARE_DISPATCH(qrelu_leaky_fn, qrelu_leaky_stub) 
+DECLARE_DISPATCH(qgelu_fn, qgelu_stub) +DECLARE_DISPATCH(qsigmoid_fn, qsigmoid_stub) +DECLARE_DISPATCH(qtanh_fn, qtanh_stub) +DECLARE_DISPATCH(qthreshold_fn, qthreshold_stub) +DECLARE_DISPATCH(qtopk_fn, qtopk_stub) +DECLARE_DISPATCH(qupsample_bilinear2d_fn, qupsample_bilinear2d_nhwc_stub) +DECLARE_DISPATCH(qmean_inner_dim_fn, qmean_inner_dim_stub) +DECLARE_DISPATCH(qstd_inner_dim_fn, qstd_inner_dim_stub) +DECLARE_DISPATCH(qprelu_fn, qprelu_stub) +DECLARE_DISPATCH(qbinary_eltwise_cpu_fn, qmul_tensor_cpu_stub) +DECLARE_DISPATCH(qbinary_eltwise_cpu_fn, qadd_tensor_cpu_stub) +DECLARE_DISPATCH(qbinary_eltwise_cpu_fn, qadd_relu_tensor_cpu_stub) +DECLARE_DISPATCH(qbatch_norm_cpu_fn, qbatch_norm_cpu_stub) + +} // namespace at::native + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/quantized/cpu/RuyUtils.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/quantized/cpu/RuyUtils.h new file mode 100644 index 0000000000000000000000000000000000000000..4baae7cf065a3b03d50ea8880af7016a329b9be1 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/quantized/cpu/RuyUtils.h @@ -0,0 +1,22 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#ifdef USE_RUY_QMATMUL + +#include + +namespace at::native::ruy_utils { + +ruy::Context* get_ruy_context(); + +void quantize_multiplier(double scale, + int* multiplier_fixedpoint, + int* multiplier_exponent); + +} // namespace at::native::ruy_utils + +#endif // USE_RUY_QMATMUL + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/quantized/cpu/XnnpackUtils.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/quantized/cpu/XnnpackUtils.h new file mode 100644 index 0000000000000000000000000000000000000000..36fabe41f6727e178890c80e4df77a11e4daf447 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/quantized/cpu/XnnpackUtils.h @@ -0,0 +1,336 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#ifdef USE_XNNPACK +#include + +#include +#include + +using xnnpack_operator = at::native::xnnpack::Operator; + +namespace at::native::xnnp_utils { + +/* + * Return shape in the same order as the memory format + * e.g. channels_last will return NHWC instead of NCHW + */ +std::vector get_mem_format_aware_shape(const at::Tensor& in); + +/* + * Input is always int8_t, output can be [int8_t, uint8_t]. + * input + offset = output + * int8_t + 128 = uint8_t + * int8_t + 0 = int8_t + */ +template +void q8_copy_int8_weight_and_add_offset(const at::Tensor& in, at::Tensor& out); + +template +Tensor convert_conv_weights_to_channel_last_tensor( + const at::Tensor& src, + int groups, + bool transpose); + +/* + * Series of create wrapper functions to call xnn_create_[de]conv* functions. 
+ */ +C10_ALWAYS_INLINE +enum xnn_status xnnp_create_convolution2d_nhwc( + uint32_t pad_top, + uint32_t pad_right, + uint32_t pad_bottom, + uint32_t pad_left, + uint32_t kernel_h, + uint32_t kernel_w, + uint32_t stride_h, + uint32_t stride_w, + uint32_t dilation_h, + uint32_t dilation_w, + uint32_t groups, + size_t group_input_channels, + size_t group_output_channels, + size_t ip_chan_stride, + size_t op_chan_stride, + int8_t izp, + float ip_scale, + int8_t kzp, + const float* k_scales, + const int8_t* kernel, + const int32_t* bias, + int8_t ozp, + float op_scale, + int8_t op_min, + int8_t op_max, + uint32_t flags, + xnn_operator_t* op, + bool per_channel, + bool transpose) { + /* Symmetric quantization forces kzp = 0 */ + TORCH_CHECK(!kzp, "XNNPACK Q[SC]8 conv kernels expects kernel zero point to be zero." + "But got: ", kzp); + + if (transpose) { + TORCH_CHECK(!per_channel, "XNNPACK Q[SC]8 does not have a per channel deconvolution!"); + return xnn_create_deconvolution2d_nhwc_qs8( + pad_top, /* uint32_t output_padding_top */ + pad_right, /* uint32_t output_padding_right */ + pad_bottom, /* uint32_t output_padding_bottom */ + pad_left, /* uint32_t output_padding_left */ + kernel_h, /* uint32_t kernel_height */ + kernel_w, /* uint32_t kernel_width */ + stride_h, /* uint32_t stride_height */ + stride_w, /* uint32_t stride_width */ + dilation_h, /* uint32_t dilation_height */ + dilation_w, /* uint32_t dilation_width */ + groups, /* uint32_t groups */ + group_input_channels, /* size_t group_input_channels */ + group_output_channels, /* size_t group_output_channels */ + ip_chan_stride, /* size_t input_pixel_stride */ + op_chan_stride, /* size_t output_pixel_stride */ + izp, /* int8_t input_zero_point */ + ip_scale, /* float input_scale */ + k_scales[0], /* float kernel_scale */ + kernel, /* const int8_t* kernel */ + bias, /* const int32_t* bias */ + ozp, /* int8_t output_zero_point */ + op_scale, /* float output_scale */ + op_min, /* int8_t output_min */ + op_max, /* 
int8_t output_max */ + flags, /* uint32_t flags */ + nullptr, /* xnn_caches_t caches */ + nullptr, /* xnn_weights_cache_t weights_cache */ + op); /* xnn_operator_t* deconvolution_op_out */ + + } + + if (!per_channel) { + return xnn_create_convolution2d_nhwc_qs8( + pad_top, /* uint32_t input_padding_top */ + pad_right, /* uint32_t input_padding_right */ + pad_bottom, /* uint32_t input_padding_bottom */ + pad_left, /* uint32_t input_padding_left */ + kernel_h, /* uint32_t kernel_height */ + kernel_w, /* uint32_t kernel_width */ + stride_h, /* uint32_t subsampling_height */ + stride_w, /* uint32_t subsampling_width */ + dilation_h, /* uint32_t dilation_height */ + dilation_w, /* uint32_t dilation_width */ + groups, /* uint32_t groups */ + group_input_channels, /* size_t group_input_channels */ + group_output_channels, /* size_t group_output_channels*/ + ip_chan_stride, /* size_t input_channel_stride */ + op_chan_stride, /* size_t output_channel_stride */ + izp, /* int8_t input_zero_point */ + ip_scale, /* float input_scale */ + k_scales[0], /* float kernel_scale */ + kernel, /* const int8_t* kernel */ + bias, /* const int32_t* bias */ + ozp, /* int8_t output_zero_point */ + op_scale, /* float output_scale */ + op_min, /* int8_t output_min */ + op_max, /* int8_t output_max */ + flags, /* uint32_t flags */ + nullptr, /* xnn_caches_t caches */ + nullptr, /* xnn_weights_cache_t weights_cache */ + op); /* xnn_operator_t* convolution_op_out */ + } else { /* per_channel */ + return xnn_create_convolution2d_nhwc_qs8_qc8w( + pad_top, /* uint32_t input_padding_top */ + pad_right, /* uint32_t input_padding_right */ + pad_bottom, /* uint32_t input_padding_bottom */ + pad_left, /* uint32_t input_padding_left */ + kernel_h, /* uint32_t kernel_height */ + kernel_w, /* uint32_t kernel_width */ + stride_h, /* uint32_t subsampling_height */ + stride_w, /* uint32_t subsampling_width */ + dilation_h, /* uint32_t dilation_height */ + dilation_w, /* uint32_t dilation_width */ + groups, /* 
uint32_t groups */ + group_input_channels, /* size_t group_input_channels */ + group_output_channels, /* size_t group_output_channels*/ + ip_chan_stride, /* size_t input_channel_stride */ + op_chan_stride, /* size_t output_channel_stride */ + izp, /* int8_t input_zero_point */ + ip_scale, /* float input_scale */ + k_scales, /* const float* kernel_scale */ + kernel, /* const int8_t* kernel */ + bias, /* const int32_t* bias */ + ozp, /* int8_t output_zero_point */ + op_scale, /* float output_scale */ + op_min, /* int8_t output_min */ + op_max, /* int8_t output_max */ + flags, /* uint32_t flags */ + nullptr, /* xnn_caches_t caches */ + nullptr, /* xnn_weights_cache_t weights_cache */ + op); /* xnn_operator_t* convolution_op_out */ + } +} + +/* + * Series of reshape wrapper functions to call xnn_reshape_[de]conv* functions. + */ +C10_ALWAYS_INLINE +enum xnn_status xnnp_reshape_convolution2d_nhwc( + xnn_operator_t op, + size_t batch, + size_t in_h, + size_t in_w, + pthreadpool_t pt_pool, + bool per_channel = false, + bool transpose = false, + uint32_t adj_h = 0, + uint32_t adj_w = 0) { + if(transpose) { + TORCH_CHECK(!per_channel, "XNNPACK Q[SC]8 does not have a per channel deconvolution!"); + return xnn_reshape_deconvolution2d_nhwc_qs8( + op, /* xnn_operator_t deconvolution_op */ + batch, /* size_t batch_size */ + in_h, /* size_t input_height */ + in_w, /* size_t input_width */ + adj_h, /* uint32_t adjustment_height */ + adj_w, /* uint32_t adjustment_width */ + nullptr, /* size_t* output_height_out */ + nullptr, /* size_t* output_width_out */ + pt_pool); /* pthreadpool_t threadpool */ + } + + size_t workspace_size = SIZE_MAX; + size_t workspace_alignment = SIZE_MAX; + + if (!per_channel) { + return xnn_reshape_convolution2d_nhwc_qs8( + op, /* xnn_operator_t convolution_op */ + batch, /* size_t batch_size */ + in_h, /* size_t input_height */ + in_w, /* size_t input_width */ + &workspace_size, /* size_t* workspace_size */ + &workspace_alignment, /* size_t* 
workspace_alignment */ + nullptr, /* size_t* output_height_out */ + nullptr, /* size_t* output_width_out */ + pt_pool); /* pthreadpool_t threadpool */ + } else { /* per_channel */ + return xnn_reshape_convolution2d_nhwc_qs8_qc8w( + op, /* xnn_operator_t convolution_op */ + batch, /* size_t batch_size */ + in_h, /* size_t input_height */ + in_w, /* size_t input_width */ + &workspace_size, /* size_t* workspace_size */ + &workspace_alignment, /* size_t* workspace_alignment */ + nullptr, /* size_t* output_height_out */ + nullptr, /* size_t* output_width_out */ + pt_pool); /* pthreadpool_t threadpool */ + } +} + + +/* + * Series of setup wrapper functions to call xnn_setup_[de]conv* functions. + */ +C10_ALWAYS_INLINE +enum xnn_status xnnp_setup_convolution2d_nhwc( + xnn_operator_t op, + const int8_t* inp, + int8_t* outp, + bool per_channel = false, + bool transpose = false) { + if(transpose) { + TORCH_CHECK(!per_channel, "XNNPACK Q[SC]8 does not have a per channel deconvolution!"); + + return xnn_setup_deconvolution2d_nhwc_qs8( + op, /* xnn_operator_t deconvolution_op */ + inp, /* const int8_t* input */ + outp); /* int8_t* output */ + } + + if (!per_channel) { + return xnn_setup_convolution2d_nhwc_qs8( + op, /* xnn_operator_t deconvolution_op */ + nullptr, /* void workspace */ + inp, /* const int8_t* input */ + outp); /* int8_t* output */ + } else { /* per_channel */ + return xnn_setup_convolution2d_nhwc_qs8_qc8w( + op, /* xnn_operator_t deconvolution_op */ + nullptr, /* void workspace */ + inp, /* const int8_t* input */ + outp); /* int8_t* output */ + } +} + + +/* + * Series of wrapper functions to call xnn_create* and xnn_setup* + * functions for linear + */ +C10_ALWAYS_INLINE +enum xnn_status xnnp_create_fully_connected_nc( + size_t input_channels, + size_t output_channels, + size_t input_stride, + size_t output_stride, + int8_t input_zero_point, + float input_scale, + int8_t kernel_zero_point, + float kernel_scale, + const int8_t* kernel, + const int32_t* bias, + 
int8_t output_zero_point, + float output_scale, + int8_t output_min, + int8_t output_max, + uint32_t flags, + xnn_operator_t* fully_connected_op_out) { + /* Symmetric quantization forces kzp = 0 */ + TORCH_CHECK(!kernel_zero_point, "XNNPACK QS8 linear kernel expects kernel zero point to be zero." + "But got: ", kernel_zero_point); + return xnn_create_fully_connected_nc_qs8( + input_channels, /* size_t input_channels */ + output_channels, /* size_t output_channels */ + input_stride, /* size_t input_stride */ + output_stride, /* size_t output_stride */ + input_zero_point, /* int8_t input_zero_point */ + input_scale, /* float input_scale */ + kernel_scale, /* float kernel_scale */ + kernel, /* const int8_t* kernel */ + bias, /* const int32_t* bias */ + output_zero_point, /* int8_t output_zero_point */ + output_scale, /* float output_scale */ + output_min, /* int8_t output_min */ + output_max, /* int8_t output_max */ + flags, /* uint32_t flags */ + nullptr, /* xnn_caches_t caches */ + nullptr, /* xnn_weights_cache_t */ + fully_connected_op_out); /* xnn_operator_t* fully_connected_op_out */ +} + +C10_ALWAYS_INLINE +enum xnn_status xnnp_reshape_fully_connected_nc( + xnn_operator_t fully_connected_op, + size_t batch_size, + pthreadpool_t threadpool) { + return xnn_reshape_fully_connected_nc_qs8( + fully_connected_op, /* xnn_operator_t fully_connected_op */ + batch_size, /* size_t batch_size */ + threadpool); /* pthreadpool_t threadpool */ +} + +C10_ALWAYS_INLINE +enum xnn_status xnnp_setup_fully_connected_nc( + xnn_operator_t fully_connected_op, + const int8_t* input, + int8_t* output) { + return xnn_setup_fully_connected_nc_qs8( + fully_connected_op, /* xnn_operator_t fully_connected_op */ + input, /* const int8_t* input */ + output /* int8_t* output */ + ); +} + +} // namespace at::native::xnnp_utils + +#endif // USE_XNNPACK + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/quantized/cpu/conv_serialization.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/quantized/cpu/conv_serialization.h new file mode 100644 index 0000000000000000000000000000000000000000..72d32a7591d3cafd0a4a20322492ec0378a1857f --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/quantized/cpu/conv_serialization.h @@ -0,0 +1,422 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include +#include +#include +#include +#include +#include +#if !defined(__s390x__) && !defined(__powerpc__) +#include +#endif + +#ifndef AT_PER_OPERATOR_HEADERS +#include +#else +#include +#endif + + +#include + +/* Convolution prepacked parameters serialization. + * + * Version 1 + * + * - Fields: + * 1. weight + * 2. bias + * 3. stride x kSpatialDim + * 4. padding x kSpatialDim + * 5. dilation x kSpatialDim + * 6. groups + * + * Version 2 + * + * - Fields: + * 0. version (string) + * 1. list of non-optional tensors + * 0: packed parameters (int16_t) + * - kSpatialDim + * - stride x kSpatialDim + * - padding x kSpatialDim + * - dilation x kSpatialDim + * - output_padding x kSpatialDim + * - groups + * - transpose (0 or 1) + * 1: weight + * 2. list of optional tensors + * 0: bias + * + * Version 3 + * + * - Fields: + * 0. version (int64_t) + * 1. list of int64_t configuration values + * - kSpatialDim + * - stride x kSpatialDim + * - padding x kSpatialDim + * - dilation x kSpatialDim + * - output_padding x kSpatialDim + * - groups + * - flags (bitmask) + * - (1 << 0) transpose (1 = yes) + * 2. 
list of optional tensors + * 0: None (helps with type inference) + * 1: weight (this must be present) + * 2: bias + */ + +using ConvParamsSerializationTypeV2 = std::tuple< + // version, for versions 2 and up + std::string, + // non-optional tensors + std::vector, + // optional tensors + std::vector>>; + +using ConvParamsSerializationTypeV3 = std::tuple< + // version, int for versions 3 and up + int64_t, + // configuration values + std::vector, + // optional tensors + std::vector>>; + +// Parses any historical conv packed params format into +// the current format. +template +ConvParamsSerializationTypeV3 parse_conv_serialized_state(const c10::IValue& v) { + + // determine the version based on IValue contents + int version = -1; + if (v.isTuple()) { + const auto& elements = v.toTupleRef().elements(); + if (!elements.empty()) { + auto firstElement = elements[0]; + if (firstElement.isTensor()) { + version = 1; + } else if (firstElement.isString()) { + const std::string& version_str = firstElement.toStringRef(); + // note: not parsing the string to automatically handle bad + // inputs + if (version_str == "2") { + version = 2; + } + } else if (firstElement.isInt()) { + auto raw_version = firstElement.toInt(); + if (raw_version == 3) { + version = 3; + } + } + } + } + TORCH_INTERNAL_ASSERT(version != -1, "Unable to parse serialization version"); + + if (version == 1) { + // version 1 - convert to version 3 manually + + const auto& elements = v.toTupleRef().elements(); + + at::Tensor weight = elements[0].toTensor(); + std::optional bias = elements[1].toOptional(); + torch::List stride_x_kSpatialDim = elements[2].toTensorList(); + torch::List padding_x_kSpatialDim = elements[3].toTensorList(); + torch::List dilation_x_kSpatialDim = elements[4].toTensorList(); + at::Tensor groups = elements[5].toTensor(); + + std::vector config_vals; + config_vals.reserve( + stride_x_kSpatialDim.size() + padding_x_kSpatialDim.size() + + dilation_x_kSpatialDim.size() + kSpatialDim + 3); + 
config_vals.push_back(kSpatialDim); + for (const auto i : c10::irange(stride_x_kSpatialDim.size())) { + auto const & stride = stride_x_kSpatialDim.get(i); + config_vals.push_back(stride[0].item()); + } + for (const auto i : c10::irange(padding_x_kSpatialDim.size())) { + auto const &padding = padding_x_kSpatialDim.get(i); + config_vals.push_back(padding[0].item()); + } + for (const auto i : c10::irange(dilation_x_kSpatialDim.size())) { + auto const &dilation = dilation_x_kSpatialDim.get(i); + config_vals.push_back(dilation[0].item()); + } + // output_padding does not exist in v1, so we fill in a default value + for ([[maybe_unused]] const auto i : c10::irange(kSpatialDim)) { + config_vals.push_back(0); + } + config_vals.push_back(groups[0].item()); + // transpose does not exist in v1, so we fill in a default value + config_vals.push_back(0); + + std::vector> tensors; + tensors.emplace_back(); + tensors.emplace_back(weight); + tensors.emplace_back(bias); + + int64_t version = 3; + return std::tie(version, config_vals, tensors); + } else if (version == 2) { + // version 2 + const auto& elements = v.toTupleRef().elements(); + std::vector non_optional = elements[1].toTensorList().vec(); + std::vector> optional; + + if (elements[2].isTensorList()) { + for (const auto& elem : elements[2].toTensorList()) { + optional.emplace_back(static_cast(elem)); + } + } else { + for (const auto& elem : elements[2].toList()) { + optional.emplace_back(static_cast(elem).toOptional()); + } + } + // create default optional value for bias + if (optional.empty()) { + optional.emplace_back(); + } + + auto config_a = non_optional[0].accessor(); + std::vector config_vals; + config_vals.reserve(config_a.size(0)); + for (const auto i : c10::irange(config_a.size(0))) { + config_vals.emplace_back(config_a[i]); + } + + auto weight = non_optional[1]; + auto bias = optional[0]; + + std::vector> tensors; + tensors.emplace_back(); + tensors.emplace_back(weight); + tensors.emplace_back(bias); + + int64_t 
version = 3; + return std::tie(version, config_vals, tensors); + } else if (version == 3) { + return v.to(); + } else { + TORCH_INTERNAL_ASSERT(false, "Unexpected serialized qconv version: ", + version); + } +} + +#define QCONV_SERIALIZATION_VERSION 2 + +#if QCONV_SERIALIZATION_VERSION == 2 +using ConvParamsSerializationType = ConvParamsSerializationTypeV2; + +template +ConvParamsSerializationTypeV2 serialize_conv( + const c10::intrusive_ptr>& params) { + + std::string version = "2"; + std::vector non_optional; + std::vector> optional; + + // create a packed int8_t tensor for conv params + std::vector params_vec; + params_vec.push_back(kSpatialDim); + auto stride = params->stride().vec(); + params_vec.insert(params_vec.end(), stride.begin(), stride.end()); + auto padding = params->padding().vec(); + params_vec.insert(params_vec.end(), padding.begin(), padding.end()); + auto dilation = params->dilation().vec(); + params_vec.insert(params_vec.end(), dilation.begin(), dilation.end()); + auto output_padding = params->output_padding().vec(); + params_vec.insert(params_vec.end(), output_padding.begin(), + output_padding.end()); + params_vec.push_back(params->groups()); + params_vec.push_back(params->transpose()); + int64_t vec_size = params_vec.size(); + at::Tensor params_tensor = at::from_blob( + params_vec.data(), {vec_size}, + at::TensorOptions().dtype(at::kShort)) + // clone to retain ownership of the data + .clone(); + + auto [weight, bias] = params->unpack(); + + non_optional.emplace_back(std::move(params_tensor)); + non_optional.emplace_back(std::move(weight)); + optional.emplace_back(std::move(bias)); + + return std::tie(version, non_optional, optional); +} + +#elif QCONV_SERIALIZATION_VERSION == 3 +using ConvParamsSerializationType = ConvParamsSerializationTypeV3; + +template +ConvParamsSerializationTypeV3 serialize_conv( + const c10::intrusive_ptr>& params) { + std::vector config_vals; + config_vals.push_back(kSpatialDim); + auto stride = 
params->stride().vec(); + config_vals.insert(config_vals.end(), stride.begin(), stride.end()); + auto padding = params->padding().vec(); + config_vals.insert(config_vals.end(), padding.begin(), padding.end()); + auto dilation = params->dilation().vec(); + config_vals.insert(config_vals.end(), dilation.begin(), dilation.end()); + auto output_padding = params->output_padding().vec(); + config_vals.insert(config_vals.end(), output_padding.begin(), + output_padding.end()); + config_vals.push_back(params->groups()); + config_vals.push_back(params->transpose()); + + auto [weight, bias] = params->unpack(); + + std::vector> tensors; + tensors.emplace_back(); + tensors.emplace_back(weight); + tensors.emplace_back(bias); + + int64_t version = 3; + return std::tie(version, config_vals, tensors); +} + +#else +#error "Invalid qconv serialization version." +#endif + +template +c10::intrusive_ptr> deserialize_conv( + ConvParamsSerializationTypeV3 state) { + auto & [version, config_vals, tensors] = state; + TORCH_INTERNAL_ASSERT(version == 3, "Unexpected serialized qconv version: ", version); + + TORCH_CHECK(tensors.size() == 3, "Wrong number of tensors", tensors.size()); + auto & weight = tensors[1]; + auto & bias [[maybe_unused]] = tensors[2]; + TORCH_INTERNAL_ASSERT(weight.has_value(), "Weight should always be present in serialized qconv."); + + torch::List stride, padding, output_padding, dilation; + // skip kSpatialDim + int idx = 1; + for ([[maybe_unused]] const auto i : c10::irange(kSpatialDim)) { + stride.emplace_back(config_vals.at(idx)); + idx++; + } + for ([[maybe_unused]] const auto i : c10::irange(kSpatialDim)) { + padding.emplace_back(config_vals.at(idx)); + idx++; + } + for ([[maybe_unused]] const auto i : c10::irange(kSpatialDim)) { + dilation.emplace_back(config_vals.at(idx)); + idx++; + } + for ([[maybe_unused]] const auto i : c10::irange(kSpatialDim)) { + TORCH_INTERNAL_ASSERT( + idx < static_cast(config_vals.size()), + "Unexpected index = ", + idx, + " for 
config_vals of size ", + config_vals.size()); + output_padding.emplace_back(config_vals.at(idx)); + idx++; + } + int64_t groups [[maybe_unused]] = config_vals.at(idx); + idx++; + int64_t flags [[maybe_unused]] = config_vals.at(idx); + idx++; + TORCH_INTERNAL_ASSERT(idx == static_cast(config_vals.size()), + "Unexpected length of config_vals, expected ", + idx, + " got ", + config_vals.size()); + + bool transpose [[maybe_unused]] = flags & (1 << 0); + + int64_t other_flags = flags & ~(1 << 0); + TORCH_INTERNAL_ASSERT(other_flags == 0, "Unexpected flags set in ", flags, "."); + + auto& ctx = at::globalContext(); + +#ifdef USE_FBGEMM + if (ctx.qEngine() == at::QEngine::X86) { +#if AT_MKLDNN_ENABLED() + bool use_onednn = onednn_utils::should_use_onednn_quant( + weight.value(), transpose, groups, output_padding); + if (use_onednn) { + return PackedConvWeightsOnednn::prepack( + std::move(weight.value()), + std::move(bias), + stride, + padding, + output_padding, + dilation, + groups, + transpose + ); + } +#endif + return PackedConvWeight::prepack( + std::move(weight.value()), + std::move(bias), + stride, + padding, + output_padding, + dilation, + groups, + transpose + ); + } // x86 +#endif + +#ifdef USE_FBGEMM + if (ctx.qEngine() == at::QEngine::FBGEMM) { + return PackedConvWeight::prepack( + std::move(weight.value()), + std::move(bias), + stride, + padding, + output_padding, + dilation, + groups, + transpose + ); + } +#endif // USE_FBGEMM +#ifdef USE_PYTORCH_QNNPACK + if (ctx.qEngine() == at::QEngine::QNNPACK) { + TORCH_CHECK( + kSpatialDim == 2, + "prepack/__setstate__: QNNPACK only supports Conv2d " + "now."); + return PackedConvWeightsQnnp::prepack( + std::move(weight.value()), + std::move(bias), + stride, + padding, + output_padding, + dilation, + groups, + transpose + ); + } +#endif // USE_PYTORCH_QNNPACK +#if AT_MKLDNN_ENABLED() + if (ctx.qEngine() == at::QEngine::ONEDNN) { + return PackedConvWeightsOnednn::prepack( + std::move(weight.value()), + std::move(bias), + 
stride, + padding, + output_padding, + dilation, + groups, + transpose + ); + } +#endif // AT_MKLDNN_ENABLED() +TORCH_CHECK( + false, + "Didn't find engine for when deserializing ConvPackedParams: ", + toString(ctx.qEngine())); +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/quantized/cpu/fbgemm_utils.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/quantized/cpu/fbgemm_utils.h new file mode 100644 index 0000000000000000000000000000000000000000..5130c769ffbfd2cd396cd8e41195a722150985d5 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/quantized/cpu/fbgemm_utils.h @@ -0,0 +1,415 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include +#include +#include +#include +#include + +#ifdef USE_FBGEMM +C10_DIAGNOSTIC_PUSH_AND_IGNORED_IF_DEFINED("-Wextra-semi") +#include +C10_DIAGNOSTIC_PUSH_AND_IGNORED_IF_DEFINED("-Winconsistent-missing-destructor-override") +#include +C10_DIAGNOSTIC_POP() +#include +C10_DIAGNOSTIC_POP() + +// The struct for the packed weight matrix (PackBMatrix) and the corresponding +// column offsets used for the fully connect layer, which are both prepared in +// the prepacking step to save the computations in the inference. Note the +// column offsets include the sum of the B columns as well as the scalar term +// B_zero_point * K, whereas the row offsets created by +// PackAWithQuantRowOffset/PackAWithIm2Col/PackAWithRowOffset are only the sum +// of the A rows. The column offsets are needed for the asymmetric quantization +// (affine quantization) of input matrix. +// Note that in JIT mode we can think of a way to fuse col_offsets with bias. 
+struct TORCH_API PackedLinearWeight : public LinearPackedParamsBase { + PackedLinearWeight( + std::unique_ptr> w, + std::optional bias, + std::vector col_offsets, + std::vector w_scale, + std::vector w_zp, + c10::QScheme q_scheme) + : w(std::move(w)), + bias_(std::move(bias)), + col_offsets(std::move(col_offsets)), + w_scale(std::move(w_scale)), + w_zp(std::move(w_zp)), + q_scheme(std::move(q_scheme)) {} + std::unique_ptr> w; + std::optional bias_; + std::vector col_offsets; + std::vector w_scale; + std::vector w_zp; + c10::QScheme q_scheme; + + at::Tensor apply( + at::Tensor input, + double output_scale, + int64_t output_zero_point) override; + + at::Tensor apply_relu( + at::Tensor input, + double output_scale, + int64_t output_zero_point) override; + + at::Tensor& apply_out( + const at::Tensor& input, + double output_scale, + int64_t output_zero_point, + at::Tensor& output) override; + + at::Tensor& apply_relu_out( + const at::Tensor& input, + double output_scale, + int64_t output_zero_point, + at::Tensor& output) override; + + at::Tensor apply_with_input_q_dq_qweight_dq_output_fp32( + at::Tensor input, + double input_scale, + int64_t input_zero_point) override; + + at::Tensor apply_with_input_q_dq_qweight_dq_relu_output_fp32( + at::Tensor input, + double input_scale, + int64_t input_zero_point) override; + + at::Tensor apply_dynamic(at::Tensor input, bool reduce_range = false) + override; + + at::Tensor apply_dynamic_relu(at::Tensor input, bool reduce_range = false) + override; + + std::tuple> unpack() override; + + std::optional bias() override { + return bias_; + } + + static c10::intrusive_ptr prepack( + at::Tensor weight, + std::optional bias); + + private: + template + at::Tensor& apply_impl( + const at::Tensor& input, + double output_scale, + int64_t output_zero_point, + at::Tensor& output); + + template + at::Tensor apply_with_input_q_dq_qweight_dq_output_fp32_impl( + const at::Tensor& input, + double input_scale, + int64_t input_zero_point); + + 
template + at::Tensor apply_dynamic_impl(at::Tensor input, bool reduce_range = false); +}; + +struct TORCH_API PackedLinearWeightFp16 : public LinearPackedParamsBase { + PackedLinearWeightFp16( + std::unique_ptr w, + std::optional bias) + : w(std::move(w)), bias_(std::move(bias)) {} + + std::unique_ptr w; + std::optional bias_; + + at::Tensor apply( + at::Tensor /*input*/, + double /*output_scale*/, + int64_t /*output_zero_point*/) override { + TORCH_INTERNAL_ASSERT(false); + } + at::Tensor apply_relu( + at::Tensor /*input*/, + double /*output_scale*/, + int64_t /*output_zero_point*/) override { + TORCH_INTERNAL_ASSERT(false); + } + + at::Tensor apply_dynamic(at::Tensor input, bool reduce_range = false) + override; + at::Tensor apply_dynamic_relu(at::Tensor input, bool reduce_range = false) + override; + + at::Tensor& apply_dynamic_out( + const at::Tensor& input, + at::Tensor& output, + bool reduce_range = false) override; + at::Tensor& apply_dynamic_relu_out( + const at::Tensor& input, + at::Tensor& output, + bool reduce_range = false) override; + + std::tuple> unpack() override; + + std::optional bias() override { + return bias_; + } + + static c10::intrusive_ptr prepack( + at::Tensor weight, + std::optional bias); + + void set_bias(std::optional bias) override; + + private: + template + at::Tensor& apply_dynamic_impl(const at::Tensor& input, at::Tensor& output); +}; + +template +struct TORCH_API PackedConvWeight : public ConvPackedParamsBase { + PackedConvWeight( + std::unique_ptr> w, + std::optional bias, + torch::List stride, + torch::List padding, + torch::List output_padding, + torch::List dilation, + int64_t groups, + uint8_t transpose, + std::vector col_offsets, + std::vector kernel, + std::vector w_scale, + std::vector w_zp, + c10::QScheme q_scheme) + : w(std::move(w)), + bias(std::move(bias)), + stride_(std::move(stride)), + padding_(std::move(padding)), + output_padding_(std::move(output_padding)), + dilation_(std::move(dilation)), + groups_(groups), + 
transpose_(transpose), + col_offsets(std::move(col_offsets)), + kernel(std::move(kernel)), + w_scale(std::move(w_scale)), + w_zp(std::move(w_zp)), + q_scheme(q_scheme) {} + + std::unique_ptr> w; + std::optional bias; + torch::List stride_; + torch::List padding_; + torch::List output_padding_; + torch::List dilation_; + int64_t groups_; + uint8_t transpose_; + std::vector col_offsets; + std::vector kernel; + std::vector w_scale; + std::vector w_zp; + c10::QScheme q_scheme; + + at::Tensor apply( + const at::Tensor& input, + double output_scale, + int64_t output_zero_point) override; + + at::Tensor apply_relu( + const at::Tensor& input, + double output_scale, + int64_t output_zero_point) override; + + at::Tensor apply_dynamic( + const at::Tensor& input, + bool reduce_range) override; + + std::tuple> unpack() override; + + static c10::intrusive_ptr> prepack( + at::Tensor weight, + std::optional bias, + torch::List stride, + torch::List padding, + torch::List output_padding, + torch::List dilation, + int64_t groups, + bool transpose); + + const float* GetBiasData(at::Tensor* bias); + + void GetQuantizationParams( + float act_scale, + float out_scale, + std::vector* output_multiplier_float, + std::vector* act_times_w_scale); + + torch::List stride() const override { + return stride_; + } + + torch::List padding() const override { + return padding_; + } + + torch::List output_padding() const override { + return output_padding_; + } + + torch::List dilation() const override { + return dilation_; + } + + int64_t groups() const override { + return groups_; + } + + bool transpose() const override { + return (bool)transpose_; + } + + private: + template + at::Tensor apply_impl( + const at::Tensor& input, + double output_scale, + int64_t output_zero_point); +}; + +// PackWeight: Convert the weight from uint8 to int8. 
+inline void convert_uint8_int8( + int len, + const uint8_t* src_uint8, + int8_t* dst_int8) { + for (const auto i : c10::irange(len)) { + dst_int8[i] = static_cast(static_cast(src_uint8[i]) - 128); + } +} + +// UnpackWeight: Convert the weight from int8 to uint8. +inline void convert_int8_uint8( + int len, + const int8_t* src_int8, + uint8_t* dst_uint8) { + for (const auto i : c10::irange(len)) { + dst_uint8[i] = + static_cast(static_cast(src_int8[i]) + 128); + } +} + +namespace at::native::fbgemm_utils { + +template +fbgemm::conv_param_t MakeFbgemmConvParam( + int N, + int C, + int M, + const std::vector& image_shape, + int groups, + const std::vector& kernels, + const std::vector& strides, + const std::vector& pads, + const std::vector& dilations, + const std::vector& output_padding = std::vector(kSpatialDim, 0), + bool transposed = false); + +// TODO: Remove functions below when ChannelsLast3d is ready. +Tensor MakeStridedQTensorCPU( + const IntArrayRef& sizes, + const IntArrayRef& strides, + const TensorOptions& options, + QuantizerPtr quantizer); + +Tensor MakeEmptyAffineQuantizedChannelsLast3dTensor( + int64_t N, + int64_t C, + int64_t D, + int64_t H, + int64_t W, + const TensorOptions& options, + double scale, + int64_t zero_point); + +Tensor MakeEmptyPerChannelAffineQuantizedChannelsLast3dTensor( + int64_t N, + int64_t C, + int64_t D, + int64_t H, + int64_t W, + const TensorOptions& options, + const Tensor& scales, + const Tensor& zero_points); + +Tensor ConvertToChannelsLast3dTensor(const Tensor& src); + +template +Tensor TransposeConvTensorUnpackConversion(const Tensor& src, int groups); + +template +Tensor ConvertConvWeightsToChannelLastTensor( + const at::Tensor& src, + int groups, + bool transpose); +} // at::native::namespace fbgemm_utils + +#endif // USE_FBGEMM + +struct TORCH_API PackedEmbeddingBagWeight : public EmbeddingPackedParamsBase { + PackedEmbeddingBagWeight( + at::Tensor packed_w, + std::vector w_scale, + std::vector w_zp, + int64_t 
bit_rate, + c10::QScheme q_scheme, + int64_t version) + : packed_w(std::move(packed_w)), + w_scale(std::move(w_scale)), + w_zp(std::move(w_zp)), + bit_rate_(bit_rate), + q_scheme(q_scheme), + version_(version) { + if (!this->packed_w.is_contiguous()) { + this->packed_w = this->packed_w.contiguous(); + } + } + + at::Tensor packed_w; + std::vector w_scale; + std::vector w_zp; + int64_t bit_rate_; + c10::QScheme q_scheme; + int64_t version_; + + at::Tensor unpack() override; + static c10::intrusive_ptr prepack( + const at::Tensor& weight); + + int64_t bit_rate() const override { + return bit_rate_; + } + + int64_t version() const override { + return version_; + } + + at::Tensor embeddingbag_byte( + const at::Tensor& indices, + const std::optional& offsets, + bool pruned_weights, + const std::optional& per_sample_weights_, + const std::optional& compressed_indices_mapping, + bool include_last_offset, + bool is_embedding_op) override; + + at::Tensor embeddingbag_4bit( + const at::Tensor& indices, + const std::optional& offsets, + bool pruned_weights, + const std::optional& per_sample_weights_, + const std::optional& compressed_indices_mapping, + bool include_last_offset, + bool is_embedding_op) override; +}; + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/quantized/cpu/init_qnnpack.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/quantized/cpu/init_qnnpack.h new file mode 100644 index 0000000000000000000000000000000000000000..b6d62a7a085864da1194565fb8a34ccf03632342 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/quantized/cpu/init_qnnpack.h @@ -0,0 +1,16 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#ifdef USE_PYTORCH_QNNPACK + +namespace at::native { + +void initQNNPACK(); + +} // namespace at::native + +#endif + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/quantized/cpu/qconv.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/quantized/cpu/qconv.h new file mode 100644 index 0000000000000000000000000000000000000000..bb480f4868ea9ee7517e7eb2d98daad9f239da48 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/quantized/cpu/qconv.h @@ -0,0 +1,105 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +#include +#include + +namespace at { +namespace native { + +class QConvoneDNN final { + public: + + C10_API static at::Tensor run_pointwise( + at::Tensor act, // contains quantized values but not QTensor + double act_scale, + int64_t act_zero_point, + at::Tensor weight, // contains quantized values but not QTensor + at::Tensor weight_scales, + at::Tensor weight_zero_points, + std::optional bias, + torch::List stride, + torch::List padding, + torch::List 
dilation, + int64_t groups, + double output_scale, + int64_t output_zero_point, + std::optional output_dtype, + std::string_view attr, + torch::List> scalars, + std::optional algorithm); + + C10_API static at::Tensor run_pointwise_tensor( + at::Tensor act, // contains quantized values but not QTensor + at::Tensor act_scale, + at::Tensor act_zero_point, + at::Tensor weight, // contains quantized values but not QTensor + at::Tensor weight_scales, + at::Tensor weight_zero_points, + std::optional bias, + torch::List stride, + torch::List padding, + torch::List dilation, + int64_t groups, + double output_scale, + int64_t output_zero_point, + std::optional output_dtype, + std::string_view attr, + torch::List> scalars, + std::optional algorithm); + + C10_API static at::Tensor run_pointwise_binary( + at::Tensor act, // contains quantized values but not QTensor + double act_scale, + int64_t act_zero_point, + at::Tensor weight, // contains quantized values but not QTensor + at::Tensor weight_scales, + at::Tensor weight_zero_points, + at::Tensor accum, // contains quantized values but not QTensor + std::optional bias, + torch::List stride, + torch::List padding, + torch::List dilation, + int64_t groups, + double output_scale, + int64_t output_zero_point, + std::optional output_dtype, + double accum_scale, + int64_t accum_zero_point, + std::string_view binary_attr, + std::optional alpha, + std::optional unary_attr, + torch::List> unary_scalars, + std::optional unary_algorithm); + + C10_API static at::Tensor run_pointwise_binary_tensor( + at::Tensor act, // contains quantized values but not QTensor + at::Tensor act_scale, + at::Tensor act_zero_point, + at::Tensor weight, // contains quantized values but not QTensor + at::Tensor weight_scales, + at::Tensor weight_zero_points, + at::Tensor accum, // contains quantized values but not QTensor + std::optional bias, + torch::List stride, + torch::List padding, + torch::List dilation, + int64_t groups, + double output_scale, + int64_t 
output_zero_point, + std::optional output_dtype, + double accum_scale, + int64_t accum_zero_point, + std::string_view binary_attr, + std::optional alpha, + std::optional unary_attr, + torch::List> unary_scalars, + std::optional unary_algorithm); + +}; + +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/quantized/cpu/qembeddingbag.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/quantized/cpu/qembeddingbag.h new file mode 100644 index 0000000000000000000000000000000000000000..8f474b79d46f4ad2791cb28f34ca3530a363a40e --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/quantized/cpu/qembeddingbag.h @@ -0,0 +1,37 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +#include +#include + +namespace at::native { +Tensor& embedding_bag_byte_rowwise_offsets_out( + Tensor& output, + const Tensor& weight, + const Tensor& indices, + const std::optional& offsets_in, + const bool /* scale_grad_by_freq */, + const int64_t /* mode */, + bool pruned_weights, + const std::optional& per_sample_weights_, + const std::optional& compressed_indices_mapping, + bool include_last_offset); + +Tensor& embedding_bag_4bit_rowwise_offsets_out( + Tensor& output, + const Tensor& weight, + const Tensor& indices, + const std::optional& offsets_in, + const bool /* scale_grad_by_freq */, + const int64_t /* mode */, + bool pruned_weights, + const std::optional& per_sample_weights_, + const std::optional& compressed_indices_mapping, + bool include_last_offset); + +Tensor& qembeddingbag_byte_unpack_out(Tensor& output, const Tensor& packed_weight); + +} // namespace at::native + +#else +#error "This file 
should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/quantized/cpu/qembeddingbag_prepack.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/quantized/cpu/qembeddingbag_prepack.h new file mode 100644 index 0000000000000000000000000000000000000000..f581236788a75dd17bfc6b9bec831571c501fbeb --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/quantized/cpu/qembeddingbag_prepack.h @@ -0,0 +1,20 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +#include + +namespace at::native { + +Tensor& qembeddingbag_byte_prepack_out( + Tensor& output, + const Tensor& weight, + const std::optional& rowwise_min_max_opt = std::nullopt); + +Tensor qembeddingbag_byte_prepack(const Tensor& weight); + +Tensor qembeddingbag_byte_prepack_meta(const Tensor& weight); + +} // namespace at::native + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/quantized/cpu/qlinear.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/quantized/cpu/qlinear.h new file mode 100644 index 0000000000000000000000000000000000000000..51d3079767fe6ec32d05194c2576d158bc07c3c0 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/quantized/cpu/qlinear.h @@ -0,0 +1,56 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +#include +#include + +namespace at::native { + +class QLinearOnednn final { + public: + C10_API static Tensor run_pointwise_tensor( + Tensor act, // int8 CPU tensor, not QTensor + Tensor act_scale, + Tensor act_zero_point, + Tensor onednn_weight, // int8 tensor from MkldnnCPU + Tensor weight_scales, + Tensor weight_zero_points, + std::optional bias, + double output_scale, + int64_t output_zero_point, + std::optional output_dtype, + std::string_view post_op_name, + c10::List> post_op_args, + std::string_view post_op_algorithm); + +C10_API static Tensor run_pointwise_binary_tensor( + Tensor act, // int8 CPU tensor, not QTensor + Tensor act_scale, + Tensor act_zero_point, + Tensor onednn_weight, // int8 tensor from MkldnnCPU + Tensor weight_scales, + Tensor weight_zero_points, + std::optional other, // extra input for binary post-op + std::optional bias, + double output_scale, + int64_t output_zero_point, + std::optional output_dtype, + double other_scale, + int64_t other_zero_point, + std::string_view binary_post_op, // e.g. "none", "sum", "add" + double binary_alpha, + std::string_view unary_post_op, // e.g. 
"none", "relu" + c10::List> unary_post_op_args, + std::string_view unary_post_op_algorithm); +}; + +C10_API Tensor _weight_int4pack_mm_cpu_tensor( + const Tensor& A, + const Tensor& B, + const Tensor& qGroupSize, + const Tensor& qScaleAndZeros); + +} // namespace at::native + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/quantized/cudnn/utils.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/quantized/cudnn/utils.h new file mode 100644 index 0000000000000000000000000000000000000000..fe55d839b2678abb11bd83054279b37c83234e0e --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/quantized/cudnn/utils.h @@ -0,0 +1,320 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +/* +This file contains some of the auxiliary functions used by both Conv.cpp & Linear.cpp (introduced in a later PR) +*/ + +#ifdef USE_CUDA +#include // for the definition of AT_CUDNN_ENABLED + +#if AT_CUDNN_ENABLED() + +#include +#include +#include +#include +#include +#include + +C10_DIAGNOSTIC_PUSH_AND_IGNORED_IF_DEFINED("-Wsuggest-override") +#include +C10_DIAGNOSTIC_POP() + +#ifndef AT_PER_OPERATOR_HEADERS +#include +#else +#include +#endif + +struct PackedLinearWeightCudnn : public LinearPackedParamsBase { + PackedLinearWeightCudnn( + at::Tensor orig_weight, + std::optional bias, + c10::QScheme q_scheme) + : orig_weight(std::move(orig_weight)), + bias_(std::move(bias)), + q_scheme(std::move(q_scheme)) {} + + at::Tensor apply( + at::Tensor input, + double output_scale, + int64_t output_zero_point) override; + at::Tensor apply_relu( + at::Tensor input, + double output_scale, + int64_t output_zero_point) override; + + at::Tensor 
apply_dynamic(at::Tensor input, bool reduce_range = false) override { + TORCH_CHECK(false, "apply_dynamic is not implemented for this packed parameter type"); + } + at::Tensor apply_dynamic_relu(at::Tensor input, bool reduce_range = false) override { + TORCH_CHECK(false, "apply_dynamic_relu is not implemented for this packed parameter type"); + } + + std::tuple> unpack() override; + + std::optional bias() override { + return bias_; + } + + static c10::intrusive_ptr prepack( + at::Tensor weight, + std::optional bias); + + private: + at::Tensor orig_weight; + std::optional bias_; + c10::QScheme q_scheme; + + template + at::Tensor apply_impl( + const at::Tensor& input, + double output_scale, + int64_t output_zero_point); + + template + void apply_impl_helper( + const at::Tensor& quantized_output, + const at::Tensor& input, + double output_scale); +}; + +template +struct PackedConvWeightCudnn : public ConvPackedParamsBase { + PackedConvWeightCudnn( + at::Tensor orig_weight, + std::optional bias, + torch::List stride, + torch::List padding, + torch::List output_padding, + torch::List dilation, + int64_t groups, + bool transpose, + c10::QScheme q_scheme, + int64_t output_channels) + : maybe_padded_weight_(std::move(orig_weight)), + bias_(std::move(bias)), + stride_(stride), + padding_(padding), + output_padding_(output_padding), + dilation_(dilation), + groups_(groups), + transpose_(transpose), + q_scheme_(q_scheme), + num_unpadded_output_channels_(output_channels) {} // the unpadded output-channel count must be stored because this dimension may be padded + + at::Tensor apply( + const at::Tensor& input, + double output_scale, + int64_t output_zero_point) override; + + at::Tensor apply_relu( + const at::Tensor& input, + double output_scale, + int64_t output_zero_point) override; + + at::Tensor apply_dynamic( + const at::Tensor& input, + bool reduce_range) override { + TORCH_CHECK(false, "apply_dynamic is currently not reported"); + } + + at::Tensor apply_dynamic_relu( + const at::Tensor& 
input, + bool reduce_range) { + TORCH_CHECK(false, "apply_dynamic_relu is currently not reported"); + } + + std::tuple> unpack() override; + + static c10::intrusive_ptr> prepack( + at::Tensor weight, + std::optional bias, + torch::List stride, + torch::List padding, + torch::List output_padding, + torch::List dilation, + int64_t groups, + bool transpose); + + const float* GetBiasData(at::Tensor* bias); + + torch::List stride() const override { + return stride_; + } + + torch::List padding() const override { + return padding_; + } + + torch::List output_padding() const override { + return output_padding_; + } + + torch::List dilation() const override { + return dilation_; + } + + int64_t groups() const override { + return groups_; + } + + bool transpose() const override { + return transpose_; + } + + private: + // cudnn v8.4.0 expects the input and output channels of conv2d's int8 weight tensor to be multiples of 4. If they are not, + // we need to explicitly pad them to a multiple of 4 ourselves, as cudnn does not currently support padding; hence the naming + // convention "maybe"_padded_weight. + // TODO: if and when cudnn enables padding in its operators, we can remove the padding on our end and rename this to orig_weight_ + at::Tensor maybe_padded_weight_; + std::optional bias_; + torch::List stride_; + torch::List padding_; + torch::List output_padding_; + torch::List dilation_; + int64_t groups_; + bool transpose_; + c10::QScheme q_scheme_; + int64_t num_unpadded_output_channels_; + + template + at::Tensor apply_impl( + const at::Tensor& input, + double output_scale, + int64_t output_zero_point); + + template + void apply_impl_helper( + const at::Tensor& quantized_output, + const at::Tensor& input, + double output_scale); +}; + +namespace cudnn_utils { + +// TODO: we can remove this function when cuDNN enables pass by value support for +// pointwise multiplication operations. 
The only reason we need this right now is that +// we use broadcasting scalar multiplication in conv, linear, and add ops, and cuDNN requires +// the scalar to be a scalar tensor with the same number of dimensions (num_dim) as the tensor we're multiplying with +inline at::Tensor getRequantMultiplierTensor(double requant_multiplier, uint8_t num_dim) { + at::SmallVector requantize_multiplier_tensor_size(num_dim, 1); + at::Tensor requantize_multiplier_tensor = at::empty(requantize_multiplier_tensor_size, at::device(at::kCUDA).dtype(at::kFloat)); + requantize_multiplier_tensor.fill_(requant_multiplier); + return requantize_multiplier_tensor; +} + +inline uint8_t getAlignment(const at::Tensor &t) { + // alignment is in bytes: the largest power of two (capped at 16) that divides the data pointer's address + uint8_t alignment = 1; + uintptr_t address = reinterpret_cast(t.data_ptr()); + for (; alignment < 16; alignment *= 2) { + if (address % (alignment * 2)) { + return alignment; + } + } + return alignment; +} + +// For the two getTensorDescriptor functions, there is an is_virtual parameter. This parameter is used to set the cudnn +// tensor as virtual or not. Setting the tensor as virtual is expected to have some performance benefits as the cudnn +// backend will no longer directly save to the tensor, allowing us to omit this tensor from the variant pack. 
+// See third_party/cudnn_frontend/samples/fusion_sample.cpp for other examples + +inline cudnn_frontend::Tensor getTensorDescriptor(const at::Tensor &t, int64_t id, uint8_t alignment, bool is_virtual = false) { + auto shape = t.sizes(); + auto strides = t.strides(); + if (is_virtual) { + return cudnn_frontend::TensorBuilder() + .setDim(shape.size(), shape.data()) + .setStrides(strides.size(), strides.data()) + .setId(id) + .setAlignment(alignment) + .setVirtual() + .setDataType(at::native::getCudnnDataType(t)) + .build(); + } + return cudnn_frontend::TensorBuilder() + .setDim(shape.size(), shape.data()) + .setStrides(strides.size(), strides.data()) + .setId(id) + .setAlignment(alignment) + .setDataType(at::native::getCudnnDataType(t)) + .build(); +} + +inline cudnn_frontend::Tensor getTensorDescriptor(const c10::IntArrayRef& shape, const c10::IntArrayRef& strides, cudnnDataType_t cudnn_dtype, int64_t id, uint8_t alignment, bool is_virtual = false) { + if (is_virtual) { + return cudnn_frontend::TensorBuilder() + .setDim(shape.size(), shape.data()) + .setStrides(strides.size(), strides.data()) + .setId(id) + .setAlignment(alignment) + .setVirtual() + .setDataType(cudnn_dtype) + .build(); + } + return cudnn_frontend::TensorBuilder() + .setDim(shape.size(), shape.data()) + .setStrides(strides.size(), strides.data()) + .setId(id) + .setAlignment(alignment) + .setDataType(cudnn_dtype) + .build(); +} + +// TODO: there is a table from input dtype to operator dtype; we can derive +// the operator dtype from the input dtype +inline cudnn_frontend::PointWiseDesc_v8 getPointWiseMulDescriptor(cudnnDataType_t dataType) { + return cudnn_frontend::PointWiseDescBuilder() + .setMode(cudnnPointwiseMode_t::CUDNN_POINTWISE_MUL) + .setMathPrecision(dataType) + .build(); +} + +// TODO: there is a table from input dtype to operator dtype; we can derive +// the operator dtype from the input dtype +inline cudnn_frontend::PointWiseDesc_v8 getPointWiseAddDescriptor(cudnnDataType_t dataType) 
{ + return cudnn_frontend::PointWiseDescBuilder() + .setMode(cudnnPointwiseMode_t::CUDNN_POINTWISE_ADD) + .setMathPrecision(dataType) + .build(); +} + +// TODO: there is a table from input dtype to operator dtype; we can derive +// the operator dtype from the input dtype +inline cudnn_frontend::PointWiseDesc_v8 getPointWiseReluDescriptor(cudnnDataType_t dataType) { + return cudnn_frontend::PointWiseDescBuilder() + .setMode(cudnnPointwiseMode_t::CUDNN_POINTWISE_RELU_FWD) + .setMathPrecision(dataType) + .build(); +} + + +inline void filterEngineConfigs( + cudnn_frontend::EngineConfigList &from, + cudnn_frontend::EngineConfigList &to, + bool deterministic, bool allow_tf32, c10::ScalarType scalar_type) +{ + auto filter = [=](cudnnBackendDescriptor_t c) { + if (deterministic) { + if (cudnn_frontend::hasNumericalNote(c)) return true; + } + if (scalar_type == at::kFloat || scalar_type == at::kChar || !allow_tf32) { + if (cudnn_frontend::hasNumericalNote(c)) return true; + if (cudnn_frontend::hasNumericalNote(c)) return true; + } + return false; + }; + cudnn_frontend::filter(from, to, filter); +} + +} // namespace cudnn_utils + +#endif // AT_CUDNN_ENABLED +#endif // USE_CUDA + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/quantized/library.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/quantized/library.h new file mode 100644 index 0000000000000000000000000000000000000000..892bc13e4ec0f2b817279739c089eef14b420a8f --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/quantized/library.h @@ -0,0 +1,13 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include + +TORCH_API int register_linear_params(); +int register_embedding_params(); + +template TORCH_API int register_conv_params(); + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/transformers/attention.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/transformers/attention.h new file mode 100644 index 0000000000000000000000000000000000000000..01af728fa9edf7729a8973d5eef435473be1268c --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/transformers/attention.h @@ -0,0 +1,75 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +#include +#include +#include +#include +#include + +namespace at::native { + +using fused_sdp_choice_fn = int64_t (*)(const Tensor& query_, const Tensor& key, const Tensor& value, + const std::optional& attn_mask_, double dropout_p, bool is_causal, std::optional scale, bool enable_gqa); + +DECLARE_DISPATCH(fused_sdp_choice_fn, _fused_sdp_choice_stub) + +TORCH_API Tensor bmm_nt(const Tensor& a, const Tensor& b); +TORCH_API Tensor masked_softmax( + Tensor& 
attn_scores, + std::optional attn_mask, + const Tensor& query, + std::optional mask_type = {}); + +using transform_bias_rescale_qkv_fn = void(*)( + at::ScalarType type, + void* _q_k_v, + const void* _qkv, + const void* _qkv_bias, + int64_t B, + int64_t T, + int64_t D, + int64_t num_head); + +DECLARE_DISPATCH(transform_bias_rescale_qkv_fn, transform_bias_rescale_qkv_stub) + +TORCH_API Tensor transform0213_gemm_nt_bias( + const Tensor& a, + const Tensor& b, + const Tensor& c, + const Tensor& query); + +TORCH_API Tensor bmm_nn(Tensor& out, const Tensor& a, const Tensor& b); + +TORCH_API void debug_assert_shape(int line, const Tensor& t, c10::IntArrayRef shape); + +TORCH_API Tensor qkv_projection( + const Tensor& query, + const Tensor& key, + const Tensor& value, + const int64_t embed_dim, + const Tensor& qkv_weight); + +using flash_attention_fn = void (*)( + const Tensor& output, const Tensor& logsumexp, + const Tensor& query, const Tensor& key, const Tensor& value, + double dropout_p, bool is_causal, + std::optional attn_mask, + std::optional scale); + +using flash_attention_backward_fn = void (*)( + const Tensor& grad_q, const Tensor& grad_k, + const Tensor& grad_v, const Tensor& grad_out, + const Tensor& query, const Tensor& key, + const Tensor& value, const Tensor& out, const Tensor& logsumexp, + double dropout_p, bool is_causal, + std::optional attn_mask, + std::optional scale); + +DECLARE_DISPATCH(flash_attention_fn, flash_attention_kernel) +DECLARE_DISPATCH(flash_attention_backward_fn, flash_attention_backward_kernel) + +} // namespace at::native + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/transformers/cuda/flash_attn/flash_api.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/transformers/cuda/flash_attn/flash_api.h new file mode 100644 index 0000000000000000000000000000000000000000..84dc545da12a5e298057e73cf0fd339cebc6c88a --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/transformers/cuda/flash_attn/flash_api.h @@ -0,0 +1,101 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +#include + +#include +#include +#include + +namespace FLASH_NAMESPACE { + +TORCH_API +std::tuple +mha_fwd(const at::Tensor &q, // batch_size x seqlen_q x num_heads x head_size + const at::Tensor &k, // batch_size x seqlen_k x num_heads_k x head_size + const at::Tensor &v, // batch_size x seqlen_k x num_heads_k x head_size + std::optional &out_, // batch_size x seqlen_q x num_heads x head_size + std::optional &alibi_slopes_, // num_heads or batch_size x num_heads + const float p_dropout, + const float softmax_scale, + bool is_causal, + int window_size_left, + int window_size_right, + const float softcap, + const bool return_softmax, + std::optional gen_); + +std::tuple +mha_varlen_fwd(const at::Tensor &q, // total_q x num_heads x head_size, total_q := \sum_{i=0}^{b} s_i + const at::Tensor &k, // total_k x num_heads_k x head_size, total_k := \sum_{i=0}^{b} s_i + const at::Tensor &v, // total_k x num_heads_k x head_size, total_k := \sum_{i=0}^{b} s_i + std::optional &out_, // total_q x num_heads x head_size, total_q := \sum_{i=0}^{b} s_i + const at::Tensor &cu_seqlens_q, // b+1 + const at::Tensor &cu_seqlens_k, // b+1 + std::optional &seqused_k, // b. If given, only this many elements of each batch element's keys are used. 
+ std::optional &block_table_, // batch_size x max_num_blocks_per_seq + std::optional &alibi_slopes_, // num_heads or b x num_heads + int max_seqlen_q, + const int max_seqlen_k, + const float p_dropout, + const float softmax_scale, + const bool zero_tensors, + bool is_causal, + int window_size_left, + int window_size_right, + const float softcap, + const bool return_softmax, + std::optional gen_); + + +std::tuple +mha_bwd(const at::Tensor &dout, // batch_size x seqlen_q x num_heads x head_size_og + const at::Tensor &q, // batch_size x seqlen_q x num_heads x head_size + const at::Tensor &k, // batch_size x seqlen_k x num_heads_k x head_size + const at::Tensor &v, // batch_size x seqlen_k x num_heads_k x head_size + const at::Tensor &out, // batch_size x seqlen_q x num_heads x head_size + const at::Tensor &softmax_lse, // b x h x seqlen_q + std::optional &dq_, // batch_size x seqlen_q x num_heads x head_size + std::optional &dk_, // batch_size x seqlen_k x num_heads_k x head_size + std::optional &dv_, // batch_size x seqlen_k x num_heads_k x head_size + std::optional &alibi_slopes_, // num_heads or batch_size x num_heads + const float p_dropout, // probability to drop + const float softmax_scale, + const bool is_causal, + int window_size_left, + int window_size_right, + const float softcap, + const bool deterministic, + const at::Tensor philox_seed, + const at::Tensor philox_offset); + +std::tuple +mha_varlen_bwd(const at::Tensor &dout, // total_q x num_heads x head_size + const at::Tensor &q, // total_q x num_heads x head_size, total_q := \sum_{i=0}^{b} s_i + const at::Tensor &k, // total_k x num_heads_k x head_size, total_k := \sum_{i=0}^{b} s_i + const at::Tensor &v, // total_k x num_heads_k x head_size, total_k := \sum_{i=0}^{b} s_i + const at::Tensor &out, // total_q x num_heads x head_size + const at::Tensor &softmax_lse, // b x h x s softmax logsumexp + std::optional &dq_, // total_q x num_heads x head_size, total_q := \sum_{i=0}^{b} s_i + std::optional 
&dk_, // total_k x num_heads_k x head_size, total_k := \sum_{i=0}^{b} s_i + std::optional &dv_, // total_k x num_heads_k x head_size, total_k := \sum_{i=0}^{b} s_i + const at::Tensor &cu_seqlens_q, // b+1 + const at::Tensor &cu_seqlens_k, // b+1 + std::optional &alibi_slopes_, // num_heads or b x num_heads + const int max_seqlen_q, + const int max_seqlen_k, // max sequence length to choose the kernel + const float p_dropout, // probability to drop + const float softmax_scale, + const bool zero_tensors, + const bool is_causal, + int window_size_left, + int window_size_right, + const float softcap, + const bool deterministic, + const at::Tensor philox_seed, + const at::Tensor philox_offset); + +} // namespace FLASH_NAMESPACE + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/transformers/cuda/flash_attn/static_switch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/transformers/cuda/flash_attn/static_switch.h new file mode 100644 index 0000000000000000000000000000000000000000..7d9c91b023a6f06cb54ae8b381c40eeda456f121 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/transformers/cuda/flash_attn/static_switch.h @@ -0,0 +1,112 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +// Inspired by +// https://github.com/NVIDIA/DALI/blob/main/include/dali/core/static_switch.h +// and https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/Dispatch.h + +#pragma once + +/// @param COND - a boolean expression to switch by +/// @param CONST_NAME - a name given for the constexpr bool variable. +/// @param ... 
- code to execute for true and false +/// +/// Usage: +/// ``` +/// BOOL_SWITCH(flag, BoolConst, [&] { +/// some_function(...); +/// }); +/// ``` + +#define BOOL_SWITCH(COND, CONST_NAME, ...) \ + [&] { \ + if (COND) { \ + constexpr static bool CONST_NAME = true; \ + return __VA_ARGS__(); \ + } else { \ + constexpr static bool CONST_NAME = false; \ + return __VA_ARGS__(); \ + } \ + }() + +#ifdef FLASHATTENTION_DISABLE_DROPOUT + #define DROPOUT_SWITCH(COND, CONST_NAME, ...) \ + [&] { \ + constexpr static bool CONST_NAME = false; \ + return __VA_ARGS__(); \ + }() +#else + #define DROPOUT_SWITCH BOOL_SWITCH +#endif + +#ifdef FLASHATTENTION_DISABLE_ALIBI + #define ALIBI_SWITCH(COND, CONST_NAME, ...) \ + [&] { \ + constexpr static bool CONST_NAME = false; \ + return __VA_ARGS__(); \ + }() +#else + #define ALIBI_SWITCH BOOL_SWITCH +#endif + +#ifdef FLASHATTENTION_DISABLE_UNEVEN_K + #define EVENK_SWITCH(COND, CONST_NAME, ...) \ + [&] { \ + constexpr static bool CONST_NAME = true; \ + return __VA_ARGS__(); \ + }() +#else + #define EVENK_SWITCH BOOL_SWITCH +#endif + +#ifdef FLASHATTENTION_DISABLE_LOCAL + #define LOCAL_SWITCH(COND, CONST_NAME, ...) \ + [&] { \ + constexpr static bool CONST_NAME = false; \ + return __VA_ARGS__(); \ + }() +#else + #define LOCAL_SWITCH BOOL_SWITCH +#endif + +#define FP16_SWITCH(COND, ...) \ + [&] { \ + if (COND) { \ + using elem_type = cutlass::half_t; \ + return __VA_ARGS__(); \ + } else { \ + using elem_type = cutlass::bfloat16_t; \ + return __VA_ARGS__(); \ + } \ + }() + +#define HEADDIM_SWITCH(HEADDIM, ...) 
\ + [&] { \ + if (HEADDIM <= 32) { \ + constexpr static int kHeadDim = 32; \ + return __VA_ARGS__(); \ + } else if (HEADDIM <= 64) { \ + constexpr static int kHeadDim = 64; \ + return __VA_ARGS__(); \ + } else if (HEADDIM <= 96) { \ + constexpr static int kHeadDim = 96; \ + return __VA_ARGS__(); \ + } else if (HEADDIM <= 128) { \ + constexpr static int kHeadDim = 128; \ + return __VA_ARGS__(); \ + } else if (HEADDIM <= 160) { \ + constexpr static int kHeadDim = 160; \ + return __VA_ARGS__(); \ + } else if (HEADDIM <= 192) { \ + constexpr static int kHeadDim = 192; \ + return __VA_ARGS__(); \ + } else if (HEADDIM <= 224) { \ + constexpr static int kHeadDim = 224; \ + return __VA_ARGS__(); \ + } else if (HEADDIM <= 256) { \ + constexpr static int kHeadDim = 256; \ + return __VA_ARGS__(); \ + } \ + }() + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/transformers/cuda/mem_eff_attention/debug_utils.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/transformers/cuda/mem_eff_attention/debug_utils.h new file mode 100644 index 0000000000000000000000000000000000000000..f1f04665613538d11b33a6ea9ecb351b8fda11da --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/transformers/cuda/mem_eff_attention/debug_utils.h @@ -0,0 +1,215 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ +#pragma once +#include +#include +#include + +//////////////////////////////////////////////////////////////////////////////// +// Debugging functions +//////////////////////////////////////////////////////////////////////////////// +// NaN & inf detection: asserts that every element of `frag` is finite +#define NANCHECK(frag) \ + { \ + for (int _i = 0; _i < frag.size(); ++_i) { \ + assert(std::isfinite(float(frag[_i]))); \ + assert(!std::isnan(float(frag[_i]))); \ + } \ + } + +// Print on the first thread of the first block +#if 1 +#define PRINT_WARP_ID 0 +#define PRINT_LANE_ID 0 +#define PRINT_B0_T0(msg, ...) \ + if (blockIdx.x == 0 && blockIdx.y == 0 && blockIdx.z == 0 && \ + threadIdx.x == PRINT_LANE_ID && threadIdx.y == PRINT_WARP_ID && \ + threadIdx.z == 0) { \ + printf(msg "\n", ##__VA_ARGS__); \ + } +#define PRINT_T0(msg, ...) \ + if (threadIdx.x == PRINT_LANE_ID && threadIdx.y == PRINT_WARP_ID && \ + threadIdx.z == 0) { \ + printf(msg "\n", ##__VA_ARGS__); \ + } +#define PRINT_TX_LX(msg, ...) \ + for (int bx = 0; bx < gridDim.x; ++bx) { \ + for (int by = 0; by < gridDim.y; ++by) { \ + for (int bz = 0; bz < gridDim.z; ++bz) { \ + for (int tx = 0; tx < blockDim.x; ++tx) { \ + for (int ty = 0; ty < blockDim.y; ++ty) { \ + for (int tz = 0; tz < blockDim.z; ++tz) { \ + __syncthreads(); \ + if (blockIdx.x == bx && blockIdx.y == by && blockIdx.z == bz && \ + threadIdx.x == tx && threadIdx.y == ty && \ + threadIdx.z == tz) { \ + printf( \ + "[%d,%d,%d][%d,%d,%d]" msg "\n", \ + bx, \ + by, \ + bz, \ + tx, \ + ty, \ + tz, \ + ##__VA_ARGS__); \ + } \ + } \ + } \ + } \ + } \ + } \ + } +#else +#define PRINT_B0_T0 +#define PRINT_TX_LX +#endif + +struct __string_view { + char const* data; + std::size_t size; +}; +#if __cplusplus >= 201402L +template +constexpr __string_view __get_type_name() { + char const* p = __PRETTY_FUNCTION__; + while (*p++ != '=') + ; + for (; *p == ' '; ++p) + ; + char const* p2 = p; + int count = 1; + for (;; ++p2) { + switch (*p2) { + case '[': + ++count; + break; + case ']': + 
--count; + if (!count) + return {p, std::size_t(p2 - p)}; + } + } + return {}; +} +#else +template +constexpr __string_view __get_type_name() { + return {"unsupported", 11}; +} +#endif + +// Print 8 consecutive elements of `accum`, starting at index `start` +#define PRINT_ACCUM8_T0_L0_START(name, accum, start) \ + PRINT_B0_T0( \ + "%s[%d:%d] - {%f, %f, %f, %f, %f, %f, %f, %f}", \ + name, \ + int(start), \ + int(start + 8), \ + float(accum[start + 0]), \ + float(accum[start + 1]), \ + float(accum[start + 2]), \ + float(accum[start + 3]), \ + float(accum[start + 4]), \ + float(accum[start + 5]), \ + float(accum[start + 6]), \ + float(accum[start + 7])); +#define PRINT_ACCUM8_T0_L0(name, accum) PRINT_ACCUM8_T0_L0_START(name, accum, 0) +#define PRINT_FRAG_T0_L0(name, frag) \ + { \ + auto typeStr = __get_type_name(); \ + PRINT_B0_T0("printing %s (%s)", name, typeStr.data); \ + for (int _start = 0; _start < frag.size(); _start += 8) { \ + PRINT_ACCUM8_T0_L0_START(" ", frag, _start); \ + } \ + /*__syncthreads(); \ + NANCHECK(frag); */ \ + } +#define PRINT_ARRAY_T0_L0_INCR(name, array, length, incr) \ + { \ + PRINT_B0_T0("printing %s (len=%d)", name, int(length)); \ + for (int _start = 0; _start < length; _start += incr) { \ + PRINT_ACCUM8_T0_L0_START(" ", array, _start); \ + } \ + } +#define PRINT_ARRAY_T0_L0(name, array, length) \ + PRINT_ARRAY_T0_L0_INCR(name, array, length, 8) + +// Print the 4x4 tile of `ref` whose first element is at (start_x, start_y) +#define PRINT_TENSOR4x4_T0_L0_START(name, ref, start_x, start_y) \ + PRINT_B0_T0( \ + "%s[%d:%d, %d:%d]:\n %f, %f, %f, %f\n %f, %f, %f, %f\n %f, %f, %f, %f\n %f, %f, %f, %f", \ + name, \ + int(start_x), \ + int(start_x + 4), \ + int(start_y), \ + int(start_y + 4), \ + float(ref.at({start_x + 0, start_y + 0})), \ + float(ref.at({start_x + 0, start_y + 1})), \ + float(ref.at({start_x + 0, start_y + 2})), \ + float(ref.at({start_x + 0, start_y + 3})), \ + float(ref.at({start_x + 1, start_y + 0})), \ + float(ref.at({start_x + 1, start_y + 1})), \ + float(ref.at({start_x + 1, start_y + 2})), \ + float(ref.at({start_x + 1, 
start_y + 3})), \ + float(ref.at({start_x + 2, start_y + 0})), \ + float(ref.at({start_x + 2, start_y + 1})), \ + float(ref.at({start_x + 2, start_y + 2})), \ + float(ref.at({start_x + 2, start_y + 3})), \ + float(ref.at({start_x + 3, start_y + 0})), \ + float(ref.at({start_x + 3, start_y + 1})), \ + float(ref.at({start_x + 3, start_y + 2})), \ + float(ref.at({start_x + 3, start_y + 3}))); +#define PRINT_TENSOR4x4_T0_L0(name, ref) \ + PRINT_TENSOR4x4_T0_L0_START(name, ref, 0, 0) + +#define PRINT_PROBLEM_SIZE(name, ps) \ + PRINT_B0_T0( \ + "%s.problem_size: {.m=%d, .n=%d, .k=%d}", \ + name, \ + int(ps.m()), \ + int(ps.n()), \ + int(ps.k())) + +template +CUTLASS_DEVICE void print_warp_accum( + AccumT accum, + LaneOffsetT lane_offset, + int32_t num_rows, + int32_t num_cols) { + bool is_main = blockIdx.x == 0 && blockIdx.y == 0 && blockIdx.z == 0 && + threadIdx.x == 0 && threadIdx.y == 0 && threadIdx.z == 0; + for (int row = 0; row < num_rows; ++row) { + for (int col = 0; col < num_cols; ++col) { + if (col % 32 == 0) { + if (is_main) { + printf("\nmat[%3d, %3d:%3d]", row, col, col + 32); + } + __syncthreads(); + } + LambdaIterator::iterateRows( + lane_offset, + [&](int accum_m) {}, + [&](int accum_m, int accum_n, int idx) { + if (row == accum_m && col == accum_n && + (blockIdx.x == 0 && blockIdx.y == 0 && blockIdx.z == 0)) { + printf(" %6.1f", float(accum[idx])); + } + }, + [&](int accum_m) {}); + __syncthreads(); + } + if (is_main) { + printf("\n"); + } + } +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/transformers/cuda/mem_eff_attention/epilogue/epilogue_pipelined.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/transformers/cuda/mem_eff_attention/epilogue/epilogue_pipelined.h new file mode 100644 index 0000000000000000000000000000000000000000..eaf7836eb027d541a23224ecc68d7cb39e9fb33b --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/transformers/cuda/mem_eff_attention/epilogue/epilogue_pipelined.h @@ -0,0 +1,636 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +/*************************************************************************************************** + * Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights + *reserved. SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + *this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + *ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + *LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + *CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + *SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + *INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + *CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + *ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + *POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Epilogue for threadblock scoped GEMMs using Tensor Ops. + + File copied from CUTLASS (header path missing here; presumably cutlass/epilogue/threadblock/epilogue.h) + then modified to: + (1) load 2 source fragments at the same time (pipelining) + (2) support reading from a different dtype + (3) pass the row id to the OutputOp if it takes it + (see MemoryEfficientAttentionNormalize) + Note that in general the fragment passed to the OutputOp could + span multiple rows but it does not happen with the configurations we have +*/ + +#pragma once + +#if defined(__CUDACC_RTC__) +#include +#else +#include +#endif + +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include + +#include +#include +#include + +//////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace epilogue { +namespace threadblock { + +template +struct ApplyEpilogueOp { + static CUTLASS_DEVICE typename Op::FragmentOutput apply( + Op const& output_op, + int row_id, + typename Op::FragmentAccumulator const& accum, + typename Op::FragmentOutput const& source) { + return output_op(accum, source); + } + static CUTLASS_DEVICE typename Op::FragmentOutput apply( + Op const& output_op, + int row_id, + typename Op::FragmentAccumulator const& accum) { + return output_op(accum); + } +}; + 
+//////////////////////////////////////////////////////////////////////////////// + +/// Epilogue operator +template < + typename Shape_, ///< Shape of threadblock tile (concept: GemmShape) + typename WarpMmaOperator_, ///< Warp-level MMA operator (concept: + ///< gemm::warp::MmaTensorOp) + int PartitionsK, ///< Number of partitions of the K dimension + typename OutputTileIterator_, ///< Tile iterator writing output tensors + typename AccumulatorFragmentIterator_, ///< Fragment iterator selecting + ///< accumulators + typename WarpTileIterator_, ///< Warp-scoped tile iterator writing + ///< accumulators to SMEM + typename SharedLoadIterator_, ///< Threadblock-scoped tile iterator loading + ///< from SMEM + typename OutputOp_, ///< Output operator + typename Padding_, ///< Padding added to SMEM allocation to avoid bank + ///< conflicts (concept: MatrixShape) + int FragmentsPerPartition = + 1, ///< Used to coarsen the epilogue granularity + int IterationsUnroll = ///< Used to reduce binary size when epilogue op is + ///< large + (!IsEpilogueFunctorHeavy::value), + typename OutputTileSourceIterator_ = + OutputTileIterator_ ///< Tile iterator reading tensors + > +class EpiloguePipelined : public EpilogueBase< + Shape_, + typename WarpMmaOperator_::Shape, + PartitionsK, + AccumulatorFragmentIterator_, + WarpTileIterator_, + Padding_, + FragmentsPerPartition> { + public: + using Base = EpilogueBase< + Shape_, + typename WarpMmaOperator_::Shape, + PartitionsK, + AccumulatorFragmentIterator_, + WarpTileIterator_, + Padding_, + FragmentsPerPartition>; + + using Shape = Shape_; + using WarpMmaOperator = WarpMmaOperator_; + static int const kPartitionsK = PartitionsK; + using OutputTileIterator = OutputTileIterator_; + using OutputTileSourceIterator = OutputTileSourceIterator_; + using AccumulatorFragmentIterator = AccumulatorFragmentIterator_; + using WarpTileIterator = WarpTileIterator_; + using SharedLoadIterator = SharedLoadIterator_; + using OutputOp = OutputOp_; + 
using Padding = Padding_; + + using Layout = layout::RowMajor; + using LongIndex = typename Layout::LongIndex; + + /// The complete warp-level accumulator tile + using AccumulatorTile = typename Base::AccumulatorTile; + + /// Accumulator element + using ElementAccumulator = typename WarpTileIterator::Element; + + /// Output element + using ElementOutput = typename OutputTileIterator::Element; + using ElementSource = typename OutputTileSourceIterator::Element; + + /// Output access size + static int const kElementsPerAccess = OutputTileIterator::kElementsPerAccess; + + /// Tensor reference to destination tensor + using TensorRef = typename OutputTileIterator::TensorRef; + + /// Tensor reference to sync tensor + using SyncTensorRef = + typename cutlass::TensorRef; + + /// Const tensor reference to source tensor + using ConstTensorRef = typename OutputTileIterator::ConstTensorRef; + + /// Array type used to output + using OutputAccessType = Array< + typename OutputTileIterator::Element, + OutputTileIterator::kElementsPerAccess>; + using SourceAccessType = Array< + typename OutputTileSourceIterator::Element, + OutputTileSourceIterator::kElementsPerAccess>; + + /// Array type used by output functor + using AccumulatorAccessType = Array< + typename WarpTileIterator::Element, + OutputTileIterator::kElementsPerAccess>; + + /// Number of warps + using WarpCount = typename Base::WarpCount; + + static int constexpr kSmemTiles = Base::kFragmentsPerIteration > 1 + ? 
Base::kFragmentsPerIteration + : kPartitionsK; + static int constexpr kSmemPointerOffset = + Base::SharedStorage::StorageShape::kCount / kSmemTiles; + + public: + static_assert( + OutputTileSourceIterator::Fragment::kElements == + OutputTileIterator::Fragment::kElements, + "Mismatch between input tile and output tile iterator (kElements)"); + static_assert( + OutputTileSourceIterator::kIterations == OutputTileIterator::kIterations, + "Mismatch between input tile and output tile iterator (kIterations)"); + static_assert( + SharedLoadIterator::Fragment::kElements == + OutputTileIterator::Fragment::kElements, + "Mismatch between shared load iterator and output tile iterator."); + + static_assert( + OutputTileIterator::kElementsPerAccess, + "OutputTileIterator::kElementsPerAccess must not be zero."); + + static_assert( + !(OutputTileIterator::Fragment::kElements % + OutputTileIterator::kElementsPerAccess), + "Divisibility"); + + private: + /// Loads fragment from shared memory aligned with output tensor + SharedLoadIterator shared_load_iterator_; + + public: + /// Constructor + CUTLASS_DEVICE + EpiloguePipelined( + typename Base::SharedStorage& shared_storage, ///< Shared storage object + int thread_idx, ///< ID of a thread within the threadblock + int warp_idx, ///< ID of warp within threadblock + int lane_idx ///< Id of thread within warp + ) + : Base(shared_storage, thread_idx, warp_idx, lane_idx), + shared_load_iterator_(shared_storage.reference(), thread_idx) {} + + /// Streams the result to global memory + CUTLASS_DEVICE + void operator()( + OutputOp const& output_op, ///< Output operator + OutputTileIterator + destination_iterator, ///< Tile iterator for destination + AccumulatorTile const& + accumulators, ///< Complete warp-level accumulator tile + OutputTileSourceIterator + source_iterator) { ///< Tile iterator for the source tensor (may differ + ///< in element type from the destination) + + if (!output_op.is_source_needed()) { + compute_source_not_needed_(output_op, 
destination_iterator, accumulators); + } else { + compute_source_needed_( + output_op, destination_iterator, accumulators, source_iterator); + } + } + CUTLASS_DEVICE + void operator()( + OutputOp const& output_op, ///< Output operator + OutputTileIterator + destination_iterator, ///< Tile iterator for destination + AccumulatorTile const& + accumulators) { ///< Complete warp-level accumulator tile + compute_source_not_needed_(output_op, destination_iterator, accumulators); + } + + private: + template + struct acc2smem_source_not_needed; + + template + struct acc2smem_source_not_needed> { + template + CUTLASS_DEVICE static void helper( + AccumulatorFragmentIterator accum_fragment_iterator, + WarpTileIterator& warp_tile_iterator) { + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < Advance; i++) { + ++accum_fragment_iterator; + } + + CUTLASS_PRAGMA_UNROLL + for (int p = 0; p < Base::kFragmentsPerIteration; ++p) { + typename AccumulatorFragmentIterator::Fragment accum_fragment; + + accum_fragment_iterator.load(accum_fragment); + ++accum_fragment_iterator; + + warp_tile_iterator.store(accum_fragment); + if (p < Base::kFragmentsPerIteration - 1) { + warp_tile_iterator.add_pointer_offset(kSmemPointerOffset); + } + } + + if (Base::kFragmentsPerIteration > 1) { + warp_tile_iterator.add_pointer_offset( + kSmemPointerOffset * (1 - Base::kFragmentsPerIteration)); + } + } + + CUTLASS_DEVICE + static void push( + size_t pos, + AccumulatorFragmentIterator const& iterator_begin, + WarpTileIterator& warp_tile_iterator) { + int dummy[] = { + (pos == (Seq * Base::kFragmentsPerIteration)) && + (helper( + iterator_begin, warp_tile_iterator), + 0)...}; + + CUTLASS_UNUSED(dummy[0]); + } + }; + + static_assert( + kPartitionsK == 1 || Base::kFragmentsPerIteration == 1, + "One of these must be exactly 1."); + + /// Streams the result to global memory + CUTLASS_DEVICE + void compute_source_not_needed_( + OutputOp const& output_op, ///< Output operator + OutputTileIterator + destination_iterator, 
///< Tile iterator for destination + AccumulatorTile const& + accumulators ///< Complete warp-level accumulator tile + ) { + // + // Iterator over warp-level accumulator fragment + // + + AccumulatorFragmentIterator accum_fragment_iterator(accumulators); + + // + // Iterate over accumulator tile + // + +#pragma unroll( \ + IterationsUnroll \ + ? OutputTileIterator::kIterations / Base::kFragmentsPerIteration \ + : 1) + for (int iter = 0; iter < OutputTileIterator::kIterations; + iter += Base::kFragmentsPerIteration) { + // + // Convert and store fragment + // + + __syncthreads(); + + acc2smem_source_not_needed>:: + push(iter, accum_fragment_iterator, this->warp_tile_iterator_); + + __syncthreads(); + + // + // Load fragments from shared memory + // + + CUTLASS_PRAGMA_UNROLL + for (int p = 0; p < Base::kFragmentsPerIteration; ++p) { + typename SharedLoadIterator::Fragment + aligned_accum_fragment[kPartitionsK]; + + shared_load_iterator_.load(aligned_accum_fragment[0]); + + if (p < Base::kFragmentsPerIteration - 1) { + shared_load_iterator_.add_pointer_offset(kSmemPointerOffset); + } else if (kPartitionsK > 1) { + plus add_fragments; + + CUTLASS_PRAGMA_UNROLL + for (int i = 1; i < kPartitionsK; ++i) { + shared_load_iterator_.add_pointer_offset(kSmemPointerOffset); + shared_load_iterator_.load(aligned_accum_fragment[i]); + aligned_accum_fragment[0] = add_fragments( + aligned_accum_fragment[0], aligned_accum_fragment[i]); + } + + shared_load_iterator_.add_pointer_offset( + (1 - kPartitionsK) * kSmemPointerOffset); + } + + // + // Compute the output result + // + + typename OutputTileIterator::Fragment output_fragment; + + apply_output_operator_source_not_needed_( + destination_iterator.thread_start_row(), + output_fragment, + output_op, + aligned_accum_fragment[0]); + + // + // Store the final result + // + + destination_iterator.store(output_fragment); + ++destination_iterator; + } + + if (Base::kFragmentsPerIteration > 1) { + shared_load_iterator_.add_pointer_offset( 
+ kSmemPointerOffset * (1 - Base::kFragmentsPerIteration)); + } + } + } + + template + struct acc2smem_source_needed; + + template + struct acc2smem_source_needed> { + template + CUTLASS_DEVICE static void helper( + AccumulatorFragmentIterator accum_fragment_iterator, + WarpTileIterator& warp_tile_iterator) { + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < Advance; i++) { + ++accum_fragment_iterator; + } + + typename AccumulatorFragmentIterator::Fragment accum_fragment; + accum_fragment_iterator.load(accum_fragment); + warp_tile_iterator.store(accum_fragment); + } + + CUTLASS_DEVICE + static void push( + size_t pos, + AccumulatorFragmentIterator const& iterator_begin, + WarpTileIterator& warp_tile_iterator) { + // NOTE(review): unlike acc2smem_source_not_needed::push, `dummy` is not + // passed to CUTLASS_UNUSED here; this may produce unused-variable + // warnings on some toolchains. + int dummy[] = { + (pos == Seq) && + (helper(iterator_begin, warp_tile_iterator), 0)...}; + } + }; + + /// Streams the result to global memory + CUTLASS_DEVICE + void compute_source_needed_( + OutputOp const& output_op, ///< Output operator + OutputTileIterator + destination_iterator, ///< Tile iterator for destination + AccumulatorTile const& + accumulators, ///< Complete warp-level accumulator tile + OutputTileSourceIterator + source_iterator ///< Tile iterator for the source tensor (may differ in + ///< element type from the destination) + ) { + typename OutputTileSourceIterator::Fragment source_fragment[2]; + + source_fragment[0].clear(); + source_iterator.load(source_fragment[0]); + ++source_iterator; + source_fragment[1].clear(); + + // + // Iterator over warp-level accumulator fragment + // + + AccumulatorFragmentIterator accum_fragment_iterator(accumulators); + + // + // Iterate over accumulator tile + // + +#pragma unroll(IterationsUnroll ? 
OutputTileIterator::kIterations : 1) + for (int iter = 0; iter < OutputTileIterator::kIterations; ++iter) { + if (iter > 0) { + __syncthreads(); + } + // + // Load the source for next iteration (pipelining) + // + + if (iter + 1 < OutputTileIterator::kIterations) { + source_iterator.load(source_fragment[(iter + 1) % 2]); + } + ++source_iterator; + acc2smem_source_needed< + cutlass::make_index_sequence>:: + push(iter, accum_fragment_iterator, this->warp_tile_iterator_); + + __syncthreads(); + + // + // Load fragments from shared memory + // + + typename SharedLoadIterator::Fragment + aligned_accum_fragment[kPartitionsK]; + + shared_load_iterator_.load(aligned_accum_fragment[0]); + + // If the number of k-slices is > 1 - perform a reduction amongst the + // k-slices + if (kPartitionsK > 1) { + plus add_fragments; + + CUTLASS_PRAGMA_UNROLL + for (int i = 1; i < kPartitionsK; ++i) { + shared_load_iterator_.add_pointer_offset(kSmemPointerOffset); + shared_load_iterator_.load(aligned_accum_fragment[i]); + aligned_accum_fragment[0] = add_fragments( + aligned_accum_fragment[0], aligned_accum_fragment[i]); + } + + shared_load_iterator_.add_pointer_offset( + (1 - kPartitionsK) * kSmemPointerOffset); + } + + // + // Compute the output result + // + + typename OutputTileIterator::Fragment output_fragment; + + apply_output_operator_( + destination_iterator.thread_start_row(), + output_fragment, + output_op, + aligned_accum_fragment[0], + source_fragment[iter % 2]); + + // + // Store the final result + // + + destination_iterator.store(output_fragment); + ++destination_iterator; + } + } + + /// Helper to invoke the output functor over each vector of output + CUTLASS_DEVICE + void apply_output_operator_( + int begin_row, + typename OutputTileIterator::Fragment& output_fragment, + OutputOp const& output_op, ///< Output operator + typename SharedLoadIterator::Fragment const& aligned_accum_fragment, + typename OutputTileSourceIterator::Fragment const& source_fragment) { + 
OutputAccessType* output_frag_ptr = + reinterpret_cast(&output_fragment); + + AccumulatorAccessType const* compute_frag_ptr = + reinterpret_cast(&aligned_accum_fragment); + + SourceAccessType const* source_frag_ptr = + reinterpret_cast(&source_fragment); + + int const kOutputOpIterations = OutputTileIterator::Fragment::kElements / + OutputTileIterator::kElementsPerAccess; + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < kOutputOpIterations; ++i) { + // Call the output operator + output_frag_ptr[i] = ApplyEpilogueOp::apply( + output_op, + begin_row + getRowOffset(i * OutputTileIterator::kElementsPerAccess), + compute_frag_ptr[i], + source_frag_ptr[i]); + } + } + + /// Helper to invoke the output functor over each vector of output + CUTLASS_DEVICE + void apply_output_operator_source_not_needed_( + int begin_row, + typename OutputTileIterator::Fragment& output_fragment, + OutputOp const& output_op, ///< Output operator + typename SharedLoadIterator::Fragment const& aligned_accum_fragment) { + OutputAccessType* output_frag_ptr = + reinterpret_cast(&output_fragment); + + AccumulatorAccessType const* compute_frag_ptr = + reinterpret_cast(&aligned_accum_fragment); + + int const kOutputOpIterations = OutputTileIterator::Fragment::kElements / + OutputTileIterator::kElementsPerAccess; + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < kOutputOpIterations; ++i) { + // Call the output operator + output_frag_ptr[i] = ApplyEpilogueOp::apply( + output_op, + begin_row + getRowOffset(i * OutputTileIterator::kElementsPerAccess), + compute_frag_ptr[i]); + } + } + + /// Walks the output ThreadMap iterations (cluster, group, row, column) in + /// order and returns the row offset of the first access whose element + /// range ends after fragment element index `i`; returns -1 if no access + /// matches. 
+ constexpr int CUTLASS_HOST_DEVICE getRowOffset(int i) { + using ThreadMap = typename OutputTileIterator::ThreadMap; + + CUTLASS_PRAGMA_UNROLL + for (int cluster = 0; cluster < ThreadMap::Iterations::kCluster; + ++cluster) { + CUTLASS_PRAGMA_UNROLL + for (int group = 0; group < ThreadMap::Iterations::kGroup; ++group) { + CUTLASS_PRAGMA_UNROLL + for (int row = 0; row < ThreadMap::Iterations::kRow; ++row) { + int row_offset = row 
* ThreadMap::Delta::kRow + + group * ThreadMap::Delta::kGroup + + cluster * ThreadMap::Delta::kCluster; + int frag_row_idx = + (row + + ThreadMap::Iterations::kRow * + (group + ThreadMap::Iterations::kGroup * cluster)); + CUTLASS_PRAGMA_UNROLL + for (int column = 0; column < ThreadMap::Iterations::kColumn; + ++column) { + int frag_idx = ThreadMap::kElementsPerAccess * + (frag_row_idx * ThreadMap::Iterations::kColumn + column); + if (i < frag_idx + ThreadMap::kElementsPerAccess) { + return row_offset; + } + } + } + } + } + return -1; + } +}; + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace threadblock +} // namespace epilogue +} // namespace cutlass + +//////////////////////////////////////////////////////////////////////////////// + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/transformers/cuda/mem_eff_attention/epilogue/epilogue_rescale_output.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/transformers/cuda/mem_eff_attention/epilogue/epilogue_rescale_output.h new file mode 100644 index 0000000000000000000000000000000000000000..44a8adde27ca17333f731f9de0316f9bcc6aa3dd --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/transformers/cuda/mem_eff_attention/epilogue/epilogue_rescale_output.h @@ -0,0 +1,243 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +/*! \file + \brief Epilogue for threadblock scoped GEMMs using Tensor Ops. + + The epilogue rearranges the result of a matrix product through shared memory + to match canonical tensor layouts in global memory. Epilogues support + conversion and reduction operations. 
+ + This is a copy of cutlass/epilogue/threadblock/epilogue.h that can + handle "row_id" as a first argument, and uses it to get the corresponding + `m_prime` / `s_prime` to rescale the output. +*/ + +#pragma once + +#if defined(__CUDACC_RTC__) +#include +#else +#include +#endif + +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include + +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#include + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace epilogue { +namespace thread { + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Applies a linear combination operator to an array of elements. +// output <- alpha * accumulator + beta * source +// with: +// alpha = 1 / s_prime (to normalize when isLast=True, 1 otherwise) +// beta = alpha * m_prime (renormalize the output when the max changes) +// source is the current output +template < + typename ElementOutput_, ///< Data type used to store tensors + typename ElementSource_, //< Data type for source (usually matches + //`ElementOutput`) + int Count, ///< Number of elements computed per operation. 
+ ///< Usually it is 128/sizeof_bits, + ///< but we use 64 or 32 sometimes when there are not enough data + ///< to store + typename ElementAccumulator_, ///< Accumulator data type + typename ElementCompute_, ///< Data type used to compute linear combination + bool isFirst, + bool isLast, + typename FragmentAlphaBeta_, + FloatRoundStyle Round = FloatRoundStyle::round_to_nearest> +class MemoryEfficientAttentionNormalize { + public: + using ElementOutput = ElementOutput_; + using ElementSource = ElementSource_; + using ElementAccumulator = ElementAccumulator_; + using ElementCompute = ElementCompute_; + + static int const kCount = Count; + + using FragmentOutput = Array; + using FragmentSource = Array; + using FragmentAccumulator = Array; + using ComputeFragment = Array; + using FragmentAlphaBeta = FragmentAlphaBeta_; + + static FloatRoundStyle const kRound = Round; + + private: + // + // Data members + // + + // Both fragments are held by reference, not copied: the objects passed to + // the constructor must outlive this functor. + FragmentAlphaBeta const& s_prime_; + FragmentAlphaBeta const& m_prime_; + + public: + /// Constructs the function object, possibly loading from pointers in host + /// memory + CUTLASS_HOST_DEVICE + MemoryEfficientAttentionNormalize( + FragmentAlphaBeta const& s_prime, + FragmentAlphaBeta const& m_prime) + : s_prime_(s_prime), m_prime_(m_prime) {} + + /// Returns true if source is needed + CUTLASS_HOST_DEVICE + bool is_source_needed() const { + return !isFirst; + } + + /// Functionally required for serial reduction in the epilogue + CUTLASS_HOST_DEVICE + void set_k_partition(int k_partition, int k_partition_count) {} + + /// Computes linear scaling: D = alpha * accumulator + beta * source + CUTLASS_HOST_DEVICE + FragmentOutput operator()( + int row, + FragmentAccumulator const& accumulator, + FragmentSource const& source) const { + assert(!isFirst); + + // Convert source to internal compute numeric type + NumericArrayConverter + source_converter; + NumericArrayConverter + accumulator_converter; + + // Convert to destination numeric type + NumericArrayConverter + 
destination_converter; + + ComputeFragment converted_source = source_converter(source); + ComputeFragment converted_accumulator = accumulator_converter(accumulator); + + // Perform binary operations + ComputeFragment intermediate; + + multiplies mul_add_source; + multiply_add mul_add_accumulator; + + // Row sums for full masked out rows are 0, we set them to 1 + // In order to avoid NaNs in the output and instead set them to 0. + ElementCompute denom = s_prime_[row] == 0 ? 1 : s_prime_[row]; + ElementCompute alpha = isLast ? (1 / denom) : 1; + ElementCompute beta = alpha * m_prime_[row]; + + intermediate = mul_add_source(beta, converted_source); // X = beta * C + + intermediate = mul_add_accumulator( + alpha, converted_accumulator, intermediate); // D = alpha * Accum + X + + return destination_converter(intermediate); + } + + /// Computes linear scaling: D = alpha * accumulator + CUTLASS_HOST_DEVICE + FragmentOutput operator()(int row, FragmentAccumulator const& accumulator) + const { + assert(isFirst); + + // Convert accumulator to internal compute numeric type + NumericArrayConverter + accumulator_converter; + + // Convert to destination numeric type + NumericArrayConverter + destination_converter; + + ComputeFragment converted_accumulator = accumulator_converter(accumulator); + + ComputeFragment intermediate; + multiplies mul_accumulator; + + // Row sums for full masked out rows are 0, we set them to 1 + // In order to avoid NaNs in the output and instead set them to 0. + ElementCompute denom = s_prime_[row] == 0 ? 1 : s_prime_[row]; + ElementCompute alpha = isLast ? 
(1 / denom) : 1; + + intermediate = mul_accumulator( + alpha, converted_accumulator); // X = alpha * Accum + + return destination_converter(intermediate); + } +}; + +} // namespace thread + +namespace threadblock { +template < + typename EO, + typename ES, + int Count, + typename EA, + typename EC, + bool F, + bool L, + typename FAB, + FloatRoundStyle R> +struct ApplyEpilogueOp> { + using Op = thread:: + MemoryEfficientAttentionNormalize; + static CUTLASS_DEVICE typename Op::FragmentOutput apply( + Op const& output_op, + int row_id, + typename Op::FragmentAccumulator const& accum, + typename Op::FragmentSource const& source) { + return output_op(row_id, accum, source); + } + static CUTLASS_DEVICE typename Op::FragmentOutput apply( + Op const& output_op, + int row_id, + typename Op::FragmentAccumulator const& accum) { + return output_op(row_id, accum); + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace threadblock +} // namespace epilogue +} // namespace cutlass + +///////////////////////////////////////////////////////////////////////////////////////////////// + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/transformers/cuda/mem_eff_attention/epilogue/epilogue_thread_apply_logsumexp.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/transformers/cuda/mem_eff_attention/epilogue/epilogue_thread_apply_logsumexp.h new file mode 100644 index 0000000000000000000000000000000000000000..2f5bf2957dad1f41639111a093b52b841842045d --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/transformers/cuda/mem_eff_attention/epilogue/epilogue_thread_apply_logsumexp.h @@ -0,0 +1,180 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +/*************************************************************************************************** + * Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights + *reserved. SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + *this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + *ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + *LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + *CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + *SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + *INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + *CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + *ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + *POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Functor performing linear combination operations used by epilogues. +*/ + +#pragma once + +#include + +#include +#include +#include +#include +#include +#include + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace epilogue { +namespace thread { + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace detail { + +template +struct ArrayExponential { + CUTLASS_HOST_DEVICE + Array operator()( + Array const& input) const { + Array result; + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < ElementsPerAccess; ++i) { + result[i] = expf(input[i]); + } + + return result; + } +}; + +template +struct ArrayExponential { + CUTLASS_DEVICE + Array operator()( + Array const& input) const { + Array result; + + // NOTE(review): elements are processed as __half2 pairs, so the loop + // below runs ElementsPerAccess / 2 times; with an odd ElementsPerAccess + // the last element of `result` is not written by this loop -- confirm + // only even counts are instantiated. 
+ int const kVectorCount = ElementsPerAccess / 2; + + __half2 const* input_ptr = + reinterpret_cast<__half2 const*>(input.raw_data()); + __half2* res_ptr = reinterpret_cast<__half2*>(result.raw_data()); + + CUTLASS_PRAGMA_UNROLL + for (int 
i = 0; i < kVectorCount; ++i) { + res_ptr[i] = h2exp(input_ptr[i]); + } + + return result; + } +}; +} // namespace detail + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Applies: +/// output <- (input - lse).exp() +template < + typename ElementOutput_, // output + typename ElementLSE_, // accumulator from LSE + typename ElementAccumulator_, // accumulator from matmul + typename ElementCompute_, // intermediate compute (and exp calculation) + int ElementsPerAccess> +class ApplyLogSumExp { + public: + using ElementOutput = ElementOutput_; + using ElementAccumulator = ElementAccumulator_; + using ElementCompute = ElementCompute_; + using ElementLSE = ElementLSE_; + + static int constexpr kElementsPerAccess = ElementsPerAccess; + static int constexpr kCount = kElementsPerAccess; + static constexpr ScaleType::Kind kScale = + cutlass::epilogue::thread::ScaleType::NoBetaScaling; + + using FragmentOutput = Array; + using FragmentAccumulator = Array; + using FragmentCompute = Array; + using FragmentLSE = Array; + using FragmentScaleBias = FragmentLSE; // Used by epilogue_smem_accumulator.h + + public: + // + // Methods + // + + CUTLASS_HOST_DEVICE + ApplyLogSumExp() {} + + /// Returns true if source is needed + CUTLASS_HOST_DEVICE + bool is_source_needed() const { + return true; + } + + /// Functionally required for serial reduction in the epilogue + CUTLASS_HOST_DEVICE + void set_k_partition(int k_partition, int k_partition_count) {} + + /// Returns exp(AB - bias) element-wise, where `bias` carries the LSE + /// values; `scale_unused` is not read. + CUTLASS_HOST_DEVICE + FragmentOutput operator()( + FragmentAccumulator const& AB, + FragmentLSE const& scale_unused, + // bias used as LSE + FragmentLSE const& bias) const { + FragmentCompute frag_AB = NumericArrayConverter< + ElementCompute, + ElementAccumulator, + kElementsPerAccess>()(AB); + FragmentCompute frag_lse_compute = + NumericArrayConverter()( + bias); + FragmentCompute frag_compute; + + minus minus_lse; + detail::ArrayExponential apply_exp; + frag_compute = 
minus_lse(frag_AB, frag_lse_compute); + frag_compute = apply_exp(frag_compute); + + return NumericArrayConverter< + ElementOutput, + ElementCompute, + kElementsPerAccess>()(frag_compute); + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace thread +} // namespace epilogue +} // namespace cutlass + +///////////////////////////////////////////////////////////////////////////////////////////////// + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/transformers/cuda/mem_eff_attention/gemm/custom_mma.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/transformers/cuda/mem_eff_attention/gemm/custom_mma.h new file mode 100644 index 0000000000000000000000000000000000000000..3da0bfc9d2e5634dbf97c85f91d13423e5962036 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/transformers/cuda/mem_eff_attention/gemm/custom_mma.h @@ -0,0 +1,105 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ +#pragma once + +#include +#include + +#include +#include +/// Maps a cutlass threadblock Mma type to its Custom* counterpart, exposed +/// as the nested `Mma` alias of the matching specialization below. +template +struct MakeCustomMma; + +template < + typename Shape, + typename IteratorA, + typename SmemIteratorA, + cutlass::arch::CacheOperation::Kind CacheOpA, + typename IteratorB, + typename SmemIteratorB, + cutlass::arch::CacheOperation::Kind CacheOpB, + typename ElementC, + typename LayoutC, + typename Policy, + int Stages, + cutlass::gemm::SharedMemoryClearOption SharedMemoryClear, + int kMaxK> +struct MakeCustomMma< + cutlass::gemm::threadblock::MmaMultistage< + Shape, + IteratorA, + SmemIteratorA, + CacheOpA, + IteratorB, + SmemIteratorB, + CacheOpB, + ElementC, + LayoutC, + Policy, + Stages, + SharedMemoryClear>, + kMaxK> { + // Reduce the number of stages if we don't need that many: unless kMaxK is + // the numeric_limits max sentinel, use at most ceil(kMaxK / Shape::kK) + // stages. + static int constexpr kStages = + kMaxK == cutlass::platform::numeric_limits::max() + ? Stages + : cutlass::const_min( + Stages, + (kMaxK + int(Shape::kK) - 1) / int(Shape::kK)); + using Mma = cutlass::gemm::threadblock::CustomMmaMultistage< + Shape, + IteratorA, + SmemIteratorA, + CacheOpA, + IteratorB, + SmemIteratorB, + CacheOpB, + ElementC, + LayoutC, + Policy, + kStages, + SharedMemoryClear, + kMaxK>; +}; + +template < + typename Shape, + typename IteratorA, + typename SmemIteratorA, + typename IteratorB, + typename SmemIteratorB, + typename ElementC, + typename LayoutC, + typename Policy, + int kMaxK> +struct MakeCustomMma< + cutlass::gemm::threadblock::MmaPipelined< + Shape, + IteratorA, + SmemIteratorA, + IteratorB, + SmemIteratorB, + ElementC, + LayoutC, + Policy>, + kMaxK> { + using Mma = cutlass::gemm::threadblock::CustomMmaPipelined< + Shape, + IteratorA, + SmemIteratorA, + IteratorB, + SmemIteratorB, + ElementC, + LayoutC, + Policy>; +}; + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/transformers/cuda/mem_eff_attention/gemm/custom_mma_base.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/transformers/cuda/mem_eff_attention/gemm/custom_mma_base.h new file mode 100644 index 0000000000000000000000000000000000000000..d0ef20a972efe604f0ff2c8bc47c810b2d0ced34 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/transformers/cuda/mem_eff_attention/gemm/custom_mma_base.h @@ -0,0 +1,188 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +/*************************************************************************************************** + * Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights + *reserved. SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + *this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + *ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + *LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + *CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + *SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + *INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + *CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + *ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + *POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Template for a double-buffered threadblock-scoped GEMM kernel. +*/ + +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include + +//////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace gemm { +namespace threadblock { + +//////////////////////////////////////////////////////////////////////////////// + +/// Structure to compute the matrix product targeting CUDA cores and SIMT math +/// instructions. +template < + /// Size of the Gemm problem - concept: gemm::GemmShape<> + typename Shape_, + /// Policy describing tuning details (concept: MmaPolicy) + typename Policy_, + /// Number of stages, + int Stages, + /// Used for partial specialization + typename Enable = bool> +class CustomMmaBase { + public: + ///< Size of the Gemm problem - concept: gemm::GemmShape<> + using Shape = Shape_; + + ///< Policy describing tuning details + using Policy = Policy_; + + // + // Dependent types + // + + /// Warp-level Mma + using Operator = typename Policy::Operator; + + /// Shape describing the overall GEMM computed from shared memory + /// by each warp. 
+ using WarpGemm = typename Policy::Operator::Shape; + + /// Shape describing the number of warps filling the CTA + using WarpCount = GemmShape< + Shape::kM / WarpGemm::kM, + Shape::kN / WarpGemm::kN, + Shape::kK / WarpGemm::kK>; + + /// Number of warp-level GEMM operations + static int const kWarpGemmIterations = + (WarpGemm::kK / Operator::Policy::MmaShape::kK); + + /// Number of stages + static int const kStages = Stages; + + // + // Nested structs + // + + /// Shared storage object needed by threadblock-scoped GEMM + template + struct OperandSharedStorage { + AlignedBuffer buffer; + using TensorRef = TensorRef; + + CUTLASS_DEVICE + static OperandLayout Layout() { + return OperandLayout::packed({OperandShape::kRow, OperandShape::kColumn}); + } + + /// Returns a TensorRef to the operand + CUTLASS_HOST_DEVICE + TensorRef ref() { + return TensorRef{buffer.data(), Layout()}; + } + }; + + /// Shape of the A matrix operand in shared memory + using ShapeA = MatrixShape< + Shape::kM + Policy::SmemPaddingA::kRow, + Shape::kK * kStages + Policy::SmemPaddingA::kColumn>; + + /// Shape of the B matrix operand in shared memory + using ShapeB = MatrixShape< + Shape::kK * kStages + Policy::SmemPaddingB::kRow, + Shape::kN + Policy::SmemPaddingB::kColumn>; + + using SharedStorageA = OperandSharedStorage< + typename Operator::ElementA, + ShapeA, + typename Operator::LayoutA>; + using SharedStorageB = OperandSharedStorage< + typename Operator::ElementB, + ShapeB, + typename Operator::LayoutB>; + using TensorRefA = typename SharedStorageA::TensorRef; + using TensorRefB = typename SharedStorageB::TensorRef; + + struct SharedStorage { + /// Buffer for A operand + SharedStorageA operand_A; + + /// Buffer for B operand + SharedStorageB operand_B; + }; + + protected: + // + // Data members + // + + /// Iterator to load a warp-scoped tile of A operand from shared memory + typename Operator::IteratorA warp_tile_iterator_A_; + + /// Iterator to load a warp-scoped tile of B operand from 
shared memory + typename Operator::IteratorB warp_tile_iterator_B_; + + public: + /// Construct from tensor references + CUTLASS_DEVICE + CustomMmaBase( + ///< Shared storage needed for internal use by threadblock-scoped GEMM + SharedStorageA& shared_storageA, + SharedStorageB& shared_storageB, + ///< ID within the threadblock + int thread_idx, + ///< ID of warp + int warp_idx, + ///< ID of each thread within a warp + int lane_idx) + : warp_tile_iterator_A_(shared_storageA.ref(), lane_idx), + warp_tile_iterator_B_(shared_storageB.ref(), lane_idx) {} +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace threadblock +} // namespace gemm +} // namespace cutlass + +///////////////////////////////////////////////////////////////////////////////////////////////// + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/transformers/cuda/mem_eff_attention/gemm/custom_mma_multistage.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/transformers/cuda/mem_eff_attention/gemm/custom_mma_multistage.h new file mode 100644 index 0000000000000000000000000000000000000000..137624bdfb00d9320ec9787400051f34c1fe5180 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/transformers/cuda/mem_eff_attention/gemm/custom_mma_multistage.h @@ -0,0 +1,773 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +/*************************************************************************************************** + * Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights + *reserved. 
SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + *this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + *ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + *LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + *CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + *SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + *INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + *CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + *ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + *POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Template for a double-buffered threadblock-scoped GEMM kernel. 
+*/ + +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace gemm { +namespace threadblock { + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Structure to compute the matrix product targeting CUDA cores and SIMT math +/// instructions. +template < + /// Size of the Gemm problem - concept: gemm::GemmShape<> + typename Shape_, + /// Iterates over tiles of A operand in global memory + // (concept: ReadableTileIterator | ForwardTileIterator | + // MaskedTileIterator) + typename IteratorA_, + /// Iterates over tiles of A operand in shared memory + /// (concept: WriteableTileIterator | RandomAccessTileIterator) + typename SmemIteratorA_, + /// Cache operation for operand A + cutlass::arch::CacheOperation::Kind CacheOpA, + /// Iterates over tiles of B operand in global memory + // (concept: ReadableTileIterator | ForwardTileIterator | + // MaskedTileIterator) + typename IteratorB_, + /// Iterates over tiles of B operand in shared memory + /// (concept: WriteableTileIterator | RandomAccessTileIterator) + typename SmemIteratorB_, + /// Cache operation for operand B + cutlass::arch::CacheOperation::Kind CacheOpB, + /// Data type of accumulator matrix + typename ElementC_, + /// Data type of accumulator matrix + typename LayoutC_, + /// Policy describing tuning details (concept: MmaPolicy) + typename Policy_, + /// Number of stages, + int Stages, + /// Use zfill or predicate for out-of-bound cp.async + SharedMemoryClearOption SharedMemoryClear = SharedMemoryClearOption::kNone, + /// Upper boundon the K dimension + int kMaxK = cutlass::platform::numeric_limits::max(), + /// Used for partial specialization + typename Enable = bool> +class CustomMmaMultistage : public CustomMmaBase { + public: + ///< Base class + using Base = 
CustomMmaBase; + ///< Size of the Gemm problem - concept: gemm::GemmShape<> + using Shape = Shape_; + ///< Iterates over tiles of A operand in global memory + using IteratorA = IteratorA_; + ///< Iterates over tiles of B operand in global memory + using IteratorB = IteratorB_; + ///< Data type of accumulator matrix + using ElementC = ElementC_; + ///< Layout of accumulator matrix + using LayoutC = LayoutC_; + ///< Policy describing tuning details + using Policy = Policy_; + + using SmemIteratorA = SmemIteratorA_; + using SmemIteratorB = SmemIteratorB_; + + static cutlass::arch::CacheOperation::Kind const kCacheOpA = CacheOpA; + static cutlass::arch::CacheOperation::Kind const kCacheOpB = CacheOpB; + + // + // Dependent types + // + + /// Fragment of accumulator tile + using FragmentC = typename Policy::Operator::FragmentC; + + /// Warp-level Mma + using Operator = typename Policy::Operator; + + /// Minimum architecture is Sm80 to support cp.async + using ArchTag = arch::Sm80; + + /// Complex transform on A operand + static ComplexTransform const kTransformA = Operator::kTransformA; + + /// Complex transform on B operand + static ComplexTransform const kTransformB = Operator::kTransformB; + + /// Internal structure exposed for introspection. 
+ struct Detail { + static_assert( + Base::kWarpGemmIterations > 1, + "The pipelined structure requires at least two warp-level " + "GEMM operations."); + + /// Number of cp.async instructions to load one stage of operand A + static int const AsyncCopyIterationsPerStageA = + IteratorA::ThreadMap::Iterations::kCount; + + /// Number of cp.async instructions to load one stage of operand B + static int const AsyncCopyIterationsPerStageB = + IteratorB::ThreadMap::Iterations::kCount; + + /// Number of stages + static int const kStages = Stages; + + /// Number of cp.async instructions to load on group of operand A + static int const kAccessesPerGroupA = + (AsyncCopyIterationsPerStageA + Base::kWarpGemmIterations - 1) / + Base::kWarpGemmIterations; + + /// Number of cp.async instructions to load on group of operand B + static int const kAccessesPerGroupB = + (AsyncCopyIterationsPerStageB + Base::kWarpGemmIterations - 1) / + Base::kWarpGemmIterations; + }; + + static bool const kSmemContainsEntireMat = kMaxK <= Shape::kK * Stages; + static constexpr int kNumStagesConcurrentLoad = + kSmemContainsEntireMat ? 
Stages : Stages - 1; + + private: + using WarpLoadedFragmentA = typename Operator::FragmentA; + using WarpLoadedFragmentB = typename Operator::FragmentB; + using WarpTransformedFragmentA = typename Operator::TransformedFragmentA; + using WarpTransformedFragmentB = typename Operator::TransformedFragmentB; + + private: + // + // Data members + // + + /// Iterator to write threadblock-scoped tile of A operand to shared memory + SmemIteratorA smem_iterator_A_; + + /// Iterator to write threadblock-scoped tile of B operand to shared memory + SmemIteratorB smem_iterator_B_; + + bool prologue_done_; + + // Set to `True` to ensure the accumulator will be zero outside the GEMM + // footprint + bool zero_outside_bounds_; + + public: + /// Construct from tensor references + CUTLASS_DEVICE + CustomMmaMultistage( + ///< Shared storage needed for internal use by threadblock-scoped GEMM + typename Base::SharedStorageA& shared_storageA, + typename Base::SharedStorageB& shared_storageB, + ///< ID within the threadblock + int thread_idx, + ///< ID of warp + int warp_idx, + ///< ID of each thread within a warp + int lane_idx) + : Base(shared_storageA, shared_storageB, thread_idx, warp_idx, lane_idx), + smem_iterator_A_(shared_storageA.ref(), thread_idx), + smem_iterator_B_(shared_storageB.ref(), thread_idx), + prologue_done_(false), + zero_outside_bounds_(false) { + // Compute warp location within threadblock tile by mapping the warp_id to + // three coordinates: + // _m: the warp's position within the threadblock along the M dimension + // _n: the warp's position within the threadblock along the N dimension + // _k: the warp's position within the threadblock along the K dimension + + int warp_idx_mn = warp_idx % (Base::WarpCount::kM * Base::WarpCount::kN); + int warp_idx_k = warp_idx / (Base::WarpCount::kM * Base::WarpCount::kN); + + int warp_idx_m = warp_idx_mn % Base::WarpCount::kM; + int warp_idx_n = warp_idx_mn / Base::WarpCount::kM; + + // Add per-warp offsets in units of 
warp-level tiles + this->warp_tile_iterator_A_.add_tile_offset( + {warp_idx_m, Base::kWarpGemmIterations * warp_idx_k}); + this->warp_tile_iterator_B_.add_tile_offset( + {Base::kWarpGemmIterations * warp_idx_k, warp_idx_n}); + } + CUTLASS_DEVICE + CustomMmaMultistage( + ///< Shared storage needed for internal use by threadblock-scoped GEMM + typename Base::SharedStorage& st, + ///< ID within the threadblock + int thread_idx, + ///< ID of warp + int warp_idx, + ///< ID of each thread within a warp + int lane_idx) + : CustomMmaMultistage( + st.operand_A, + st.operand_B, + thread_idx, + warp_idx, + lane_idx) {} + + CUTLASS_DEVICE + void set_prologue_done(bool value) { + prologue_done_ = value; + } + + CUTLASS_DEVICE + void set_zero_outside_bounds(bool value) { + zero_outside_bounds_ = value; + } + + template + CUTLASS_DEVICE static void prologue( + typename Base::SharedStorage& shared_storage, + ///< iterator over A operand in global memory + IteratorA iterator_A, + ///< iterator over B operand in global memory + IteratorB iterator_B, + int thread_idx, + int problem_size_k) { + prologue( + shared_storage.operand_A, + shared_storage.operand_B, + iterator_A, + iterator_B, + thread_idx, + problem_size_k); + } + + template + CUTLASS_DEVICE static void prologue( + typename Base::SharedStorageA& shared_storageA, + typename Base::SharedStorageB& shared_storageB, + ///< iterator over A operand in global memory + IteratorA iterator_A, + ///< iterator over B operand in global memory + IteratorB iterator_B, + int thread_idx, + int problem_size_k) { + SmemIteratorA smem_iterator_A(shared_storageA.ref(), thread_idx); + SmemIteratorB smem_iterator_B(shared_storageB.ref(), thread_idx); + int32_t iter = (problem_size_k + Base::Shape::kK - 1) / Base::Shape::kK; + _prologue( + iterator_A, iterator_B, iter, smem_iterator_A, smem_iterator_B); + } + + CUTLASS_DEVICE + void copy_tiles_and_advance( + IteratorA& iterator_A, + IteratorB& iterator_B, + int group_start_A = 0, + int 
group_start_B = 0) { + iterator_A.set_iteration_index( + group_start_A * IteratorA::kAccessesPerVector); + this->smem_iterator_A_.set_iteration_index(group_start_A); + + // Async Copy for operand A + CUTLASS_PRAGMA_UNROLL + for (int j = 0; j < Detail::kAccessesPerGroupA; ++j) { + if (group_start_A + j < Detail::AsyncCopyIterationsPerStageA) { + typename IteratorA::AccessType* dst_ptr = + reinterpret_cast( + this->smem_iterator_A_.get()); + + int const kSrcBytes = sizeof_bits::value * + IteratorA::ThreadMap::kElementsPerAccess / + IteratorA::kAccessesPerVector / 8; + + CUTLASS_PRAGMA_UNROLL + for (int v = 0; v < IteratorA::kAccessesPerVector; ++v) { + auto gmem_ptr = iterator_A.get(); + + if (zero_outside_bounds_ || + SharedMemoryClear == SharedMemoryClearOption::kZfill) { + cutlass::arch::cp_async_zfill( + dst_ptr + v, gmem_ptr, iterator_A.valid()); + } else { + cutlass::arch::cp_async( + dst_ptr + v, gmem_ptr, iterator_A.valid()); + } + + ++iterator_A; + } + + ++this->smem_iterator_A_; + } + } + + iterator_B.set_iteration_index( + group_start_B * IteratorB::kAccessesPerVector); + this->smem_iterator_B_.set_iteration_index(group_start_B); + + // Async Copy for operand B + CUTLASS_PRAGMA_UNROLL + for (int j = 0; j < Detail::kAccessesPerGroupB; ++j) { + if (group_start_B + j < Detail::AsyncCopyIterationsPerStageB) { + typename IteratorB::AccessType* dst_ptr = + reinterpret_cast( + this->smem_iterator_B_.get()); + + int const kSrcBytes = sizeof_bits::value * + IteratorB::ThreadMap::kElementsPerAccess / + IteratorB::kAccessesPerVector / 8; + + CUTLASS_PRAGMA_UNROLL + for (int v = 0; v < IteratorB::kAccessesPerVector; ++v) { + auto gmem_ptr = iterator_B.get(); + + if (zero_outside_bounds_ || + SharedMemoryClear == SharedMemoryClearOption::kZfill) { + cutlass::arch::cp_async_zfill( + dst_ptr + v, gmem_ptr, iterator_B.valid()); + } else { + cutlass::arch::cp_async( + dst_ptr + v, gmem_ptr, iterator_B.valid()); + } + + ++iterator_B; + } + ++this->smem_iterator_B_; + } + } 
+ } + + template + CUTLASS_DEVICE static void _prologue( + IteratorA& iterator_A, + IteratorB& iterator_B, + int32_t& gemm_k_iterations, + SmemIteratorA& smem_iterator_A_, + SmemIteratorB& smem_iterator_B_) { + // Issue several complete stages + CUTLASS_PRAGMA_UNROLL + for (int stage = 0; stage < kNumStagesConcurrentLoad; + ++stage, --gemm_k_iterations) { + iterator_A.clear_mask(gemm_k_iterations == 0); + iterator_B.clear_mask(gemm_k_iterations == 0); + + iterator_A.set_iteration_index(0); + smem_iterator_A_.set_iteration_index(0); + + // Async Copy for operand A + CUTLASS_PRAGMA_UNROLL + for (int j = 0; j < Detail::AsyncCopyIterationsPerStageA; ++j) { + typename IteratorA::AccessType* dst_ptr = + reinterpret_cast( + smem_iterator_A_.get()); + + CUTLASS_PRAGMA_UNROLL + for (int v = 0; v < IteratorA::kAccessesPerVector; ++v) { + int const kSrcBytes = + sizeof_bits::value * + IteratorA::ThreadMap::kElementsPerAccess / + IteratorA::kAccessesPerVector / 8; + + int src_bytes = (iterator_A.valid() ? 
kSrcBytes : 0); + + if (kLoadA) { + cutlass::arch::cp_async_zfill( + dst_ptr + v, iterator_A.get(), iterator_A.valid()); + } + + ++iterator_A; + } + + ++smem_iterator_A_; + } + + iterator_B.set_iteration_index(0); + smem_iterator_B_.set_iteration_index(0); + + // Async Copy for operand B + CUTLASS_PRAGMA_UNROLL + for (int j = 0; j < Detail::AsyncCopyIterationsPerStageB; ++j) { + typename IteratorB::AccessType* dst_ptr = + reinterpret_cast( + smem_iterator_B_.get()); + + CUTLASS_PRAGMA_UNROLL + for (int v = 0; v < IteratorB::kAccessesPerVector; ++v) { + int const kSrcBytes = + sizeof_bits::value * + IteratorB::ThreadMap::kElementsPerAccess / + IteratorB::kAccessesPerVector / 8; + + if (kLoadB) { + cutlass::arch::cp_async_zfill( + dst_ptr + v, iterator_B.get(), iterator_B.valid()); + } + + ++iterator_B; + } + + ++smem_iterator_B_; + } + + // Move to the next stage + iterator_A.add_tile_offset({0, 1}); + iterator_B.add_tile_offset({1, 0}); + + smem_iterator_A_.add_tile_offset({0, 1}); + smem_iterator_B_.add_tile_offset({1, 0}); + + // Defines the boundary of a stage of cp.async. 
+ cutlass::arch::cp_async_fence(); + } + } + + /// Perform a threadblock-scoped matrix multiply-accumulate + CUTLASS_DEVICE + void operator()( + ///< problem size of GEMM + int gemm_k_iterations, + ///< destination accumulator tile + FragmentC& accum, + ///< iterator over A operand in global memory + IteratorA iterator_A, + ///< iterator over B operand in global memory + IteratorB iterator_B, + ///< initial value of accumulator + FragmentC const& src_accum) { + // + // Prologue + // + + if (!prologue_done_) { + _prologue( + iterator_A, + iterator_B, + gemm_k_iterations, + smem_iterator_A_, + smem_iterator_B_); + } else if (!kSmemContainsEntireMat) { + _prologue( + iterator_A, + iterator_B, + gemm_k_iterations, + smem_iterator_A_, + smem_iterator_B_); + } else { + gemm_k_iterations -= kNumStagesConcurrentLoad; + } + + // Perform accumulation in the 'd' output operand + accum = src_accum; + + // + // Clear the remaining tiles of SMEM. This is a functional requirement for + // some kernels so that all accumulator elements outside the GEMM footprint + // are zero. 
+ // + + if (SharedMemoryClear == SharedMemoryClearOption::kClearLastStage) { + /// Iterator to write threadblock-scoped tile of A operand to shared + /// memory + SmemIteratorA last_smem_iterator_A(this->smem_iterator_A_); + + typename IteratorA::AccessType zero_A; + zero_A.clear(); + + last_smem_iterator_A.set_iteration_index(0); + + // Async Copy for operand A + CUTLASS_PRAGMA_UNROLL + for (int j = 0; j < Detail::AsyncCopyIterationsPerStageA; ++j) { + typename IteratorA::AccessType* dst_ptr = + reinterpret_cast( + last_smem_iterator_A.get()); + + *dst_ptr = zero_A; + + ++last_smem_iterator_A; + } + + /// Iterator to write threadblock-scoped tile of B operand to shared + /// memory + SmemIteratorB last_smem_iterator_B(this->smem_iterator_B_); + typename IteratorB::AccessType zero_B; + + zero_B.clear(); + last_smem_iterator_B.set_iteration_index(0); + + // Async Copy for operand B + CUTLASS_PRAGMA_UNROLL + for (int j = 0; j < Detail::AsyncCopyIterationsPerStageB; ++j) { + typename IteratorB::AccessType* dst_ptr = + reinterpret_cast( + last_smem_iterator_B.get()); + + *dst_ptr = zero_B; + + ++last_smem_iterator_B; + } + } + + // Waits until kStages-2 stages have committed. 
+ cutlass::arch::cp_async_wait(); + __syncthreads(); + + // Pair of fragments used to overlap shared memory loads and math + // instructions + WarpLoadedFragmentA warp_loaded_frag_A[2]; + WarpLoadedFragmentB warp_loaded_frag_B[2]; + WarpTransformedFragmentA warp_transformed_frag_A[2]; + WarpTransformedFragmentB warp_transformed_frag_B[2]; + + Operator warp_mma; + + this->warp_tile_iterator_A_.set_kgroup_index(0); + this->warp_tile_iterator_B_.set_kgroup_index(0); + + this->warp_tile_iterator_A_.load(warp_loaded_frag_A[0]); + this->warp_tile_iterator_B_.load(warp_loaded_frag_B[0]); + + ++this->warp_tile_iterator_A_; + ++this->warp_tile_iterator_B_; + + iterator_A.clear_mask(gemm_k_iterations == 0); + iterator_B.clear_mask(gemm_k_iterations == 0); + + int smem_write_stage_idx = Base::kStages - 1; + int smem_read_stage_idx = 0; + + warp_mma.transform( + warp_transformed_frag_A[0], + warp_transformed_frag_B[0], + warp_loaded_frag_A[0], + warp_loaded_frag_B[0]); + + // tf32x3 kernels use staging accumulation. warp_mma uses a temporary + // accumulator and this temporary accumulator is added to the final + // accumulator once in every mainloop iteration. + plus plus_accum; + + FragmentC tmp_accum; + + if (platform::is_same< + typename Operator::MathOperator, + arch::OpMultiplyAddFastF32>::value || + platform::is_same< + typename Operator::MathOperator, + arch::OpMultiplyAddComplexFastF32>::value) { + tmp_accum.clear(); + } + + // + // Mainloop + // + + CUTLASS_GEMM_LOOP + for (; gemm_k_iterations > (-kNumStagesConcurrentLoad);) { + // + // Loop over GEMM K dimension + // + + // Computes a warp-level GEMM on data held in shared memory + // Each "warp_mma_k" refers to a warp-level matrix multiply-accumulate + CUTLASS_PRAGMA_UNROLL + for (int warp_mma_k = 0; warp_mma_k < Base::kWarpGemmIterations; + ++warp_mma_k) { + // Load warp-level tiles from shared memory, wrapping to k offset if + // this is the last group as the case may be. 
+ + this->warp_tile_iterator_A_.set_kgroup_index( + (warp_mma_k + 1) % Base::kWarpGemmIterations); + this->warp_tile_iterator_B_.set_kgroup_index( + (warp_mma_k + 1) % Base::kWarpGemmIterations); + + // In case of a non-circular buffer ("kSmemContainsEntireMat") + // make sure we don't load out of bounds data. + if (!kSmemContainsEntireMat || + gemm_k_iterations > (-kNumStagesConcurrentLoad) || + warp_mma_k < Base::kWarpGemmIterations - 1) { + this->warp_tile_iterator_A_.load( + warp_loaded_frag_A[(warp_mma_k + 1) % 2]); + this->warp_tile_iterator_B_.load( + warp_loaded_frag_B[(warp_mma_k + 1) % 2]); + } + + ++this->warp_tile_iterator_A_; + ++this->warp_tile_iterator_B_; + + if (warp_mma_k > 0) + warp_mma.transform( + warp_transformed_frag_A[warp_mma_k % 2], + warp_transformed_frag_B[warp_mma_k % 2], + warp_loaded_frag_A[warp_mma_k % 2], + warp_loaded_frag_B[warp_mma_k % 2]); + + if (platform::is_same< + typename Operator::MathOperator, + arch::OpMultiplyAddFastF32>::value || + platform::is_same< + typename Operator::MathOperator, + arch::OpMultiplyAddComplexFastF32>::value) { + warp_mma( + tmp_accum, + warp_transformed_frag_A[warp_mma_k % 2], + warp_transformed_frag_B[warp_mma_k % 2], + tmp_accum); + + if (warp_mma_k == 0) { + accum = plus_accum(accum, tmp_accum); + tmp_accum.clear(); + } + } else { + warp_mma( + accum, + warp_transformed_frag_A[warp_mma_k % 2], + warp_transformed_frag_B[warp_mma_k % 2], + accum); + } + + // Issue global->shared copies for the this stage + if (!kSmemContainsEntireMat && + warp_mma_k < Base::kWarpGemmIterations - 1) { + int group_start_iteration_A, group_start_iteration_B; + + group_start_iteration_A = warp_mma_k * Detail::kAccessesPerGroupA; + group_start_iteration_B = warp_mma_k * Detail::kAccessesPerGroupB; + + copy_tiles_and_advance( + iterator_A, + iterator_B, + group_start_iteration_A, + group_start_iteration_B); + } + + if (warp_mma_k + 2 == Base::kWarpGemmIterations) { + if (!kSmemContainsEntireMat) { + int 
group_start_iteration_A, group_start_iteration_B; + group_start_iteration_A = + (warp_mma_k + 1) * Detail::kAccessesPerGroupA; + group_start_iteration_B = + (warp_mma_k + 1) * Detail::kAccessesPerGroupB; + + copy_tiles_and_advance( + iterator_A, + iterator_B, + group_start_iteration_A, + group_start_iteration_B); + } + + // Inserts a memory fence between stages of cp.async instructions. + cutlass::arch::cp_async_fence(); + + // Waits until kStages-2 stages have committed. + cutlass::arch::cp_async_wait(); + __syncthreads(); + + // Move to the next stage + iterator_A.add_tile_offset({0, 1}); + iterator_B.add_tile_offset({1, 0}); + + this->smem_iterator_A_.add_tile_offset({0, 1}); + this->smem_iterator_B_.add_tile_offset({1, 0}); + + // Add negative offsets to return iterators to the 'start' of the + // circular buffer in shared memory + if (smem_write_stage_idx == (Base::kStages - 1)) { + this->smem_iterator_A_.add_tile_offset({0, -Base::kStages}); + this->smem_iterator_B_.add_tile_offset({-Base::kStages, 0}); + smem_write_stage_idx = 0; + } else { + ++smem_write_stage_idx; + } + + if (!kSmemContainsEntireMat && + smem_read_stage_idx == (Base::kStages - 1)) { + this->warp_tile_iterator_A_.add_tile_offset( + {0, + -Base::kStages * Policy::kPartitionsK * + Base::kWarpGemmIterations}); + this->warp_tile_iterator_B_.add_tile_offset( + {-Base::kStages * Policy::kPartitionsK * + Base::kWarpGemmIterations, + 0}); + smem_read_stage_idx = 0; + } else { + ++smem_read_stage_idx; + } + + --gemm_k_iterations; + iterator_A.clear_mask(gemm_k_iterations == 0); + iterator_B.clear_mask(gemm_k_iterations == 0); + } + + // Do any conversions feeding the first stage at the end of the loop so + // we can start right away on mma instructions + if (warp_mma_k + 1 == Base::kWarpGemmIterations) + warp_mma.transform( + warp_transformed_frag_A[(warp_mma_k + 1) % 2], + warp_transformed_frag_B[(warp_mma_k + 1) % 2], + warp_loaded_frag_A[(warp_mma_k + 1) % 2], + warp_loaded_frag_B[(warp_mma_k + 
1) % 2]); + } + } + + if (platform::is_same< + typename Operator::MathOperator, + arch::OpMultiplyAddFastF32>::value || + platform::is_same< + typename Operator::MathOperator, + arch::OpMultiplyAddComplexFastF32>::value) { + accum = plus_accum(accum, tmp_accum); + } + + if (SharedMemoryClear == SharedMemoryClearOption::kZfill) { + // commit and drain all pending and predicated cp.async pnz from the GEMM + // mainloop + cutlass::arch::cp_async_fence(); + cutlass::arch::cp_async_wait<0>(); + __syncthreads(); + } + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace threadblock +} // namespace gemm +} // namespace cutlass + +///////////////////////////////////////////////////////////////////////////////////////////////// + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/transformers/cuda/mem_eff_attention/gemm/custom_mma_pipelined.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/transformers/cuda/mem_eff_attention/gemm/custom_mma_pipelined.h new file mode 100644 index 0000000000000000000000000000000000000000..a36d7cdb58242e9e43b8050cf9068ee11024aed8 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/transformers/cuda/mem_eff_attention/gemm/custom_mma_pipelined.h @@ -0,0 +1,407 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +/*************************************************************************************************** + * Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights + *reserved. 
SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + *this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + *ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + *LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + *CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + *SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + *INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + *CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + *ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + *POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Template for a double-buffered threadblock-scoped GEMM kernel. 
+*/ + +#pragma once + +#include +#include +#include +#include + +#include +#include + +#include + +#include + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace gemm { +namespace threadblock { + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Structure to compute the matrix product targeting CUDA cores and SIMT math +/// instructions. +template < + /// Size of the Gemm problem - concept: gemm::GemmShape<> + typename Shape_, + /// Iterates over tiles of A operand in global memory + // (concept: ReadableTileIterator | ForwardTileIterator | + // MaskedTileIterator) + typename IteratorA_, + /// Iterates over tiles of A operand in shared memory + /// (concept: WriteableTileIterator | RandomAccessTileIterator) + typename SmemIteratorA_, + /// Iterates over tiles of B operand in global memory + // (concept: ReadableTileIterator | ForwardTileIterator | + // MaskedTileIterator) + typename IteratorB_, + /// Iterates over tiles of B operand in shared memory + /// (concept: WriteableTileIterator | RandomAccessTileIterator) + typename SmemIteratorB_, + /// Data type of accumulator matrix + typename ElementC_, + /// Data type of accumulator matrix + typename LayoutC_, + /// Policy describing tuning details (concept: MmaPolicy) + typename Policy_, + /// Transformation applied to A operand + typename TransformA_ = NumericArrayConverter< + typename SmemIteratorA_::Element, + typename IteratorA_::Element, + IteratorA_::Fragment::kElements>, + /// + /// Transformation applied to B operand + typename TransformB_ = NumericArrayConverter< + typename SmemIteratorB_::Element, + typename IteratorB_::Element, + IteratorB_::Fragment::kElements>, + /// Used for partial specialization + typename Enable = bool> +class CustomMmaPipelined : public CustomMmaBase { + public: + ///< Base class + using Base = CustomMmaBase; + + using Shape = + Shape_; ///< Size 
of the Gemm problem - concept: gemm::GemmShape<> + using IteratorA = + IteratorA_; ///< Iterates over tiles of A operand in global memory + using IteratorB = + IteratorB_; ///< Iterates over tiles of B operand in global memory + using ElementC = ElementC_; ///< Data type of accumulator matrix + using LayoutC = LayoutC_; ///< Layout of accumulator matrix + using Policy = Policy_; ///< Policy describing tuning details + + using SmemIteratorA = SmemIteratorA_; + using SmemIteratorB = SmemIteratorB_; + + using TransformA = TransformA_; + using TransformB = TransformB_; + + // + // Dependent types + // + + /// Fragment of operand A loaded from global memory + using FragmentA = typename IteratorA::Fragment; + + /// Fragment of operand B loaded from global memory + using FragmentB = typename IteratorB::Fragment; + + /// Fragment of accumulator tile + using FragmentC = typename Policy::Operator::FragmentC; + + /// Warp-level Mma + using Operator = typename Policy::Operator; + + /// Obtain the arch tag from the warp-level operator + using ArchTag = typename Policy::Operator::ArchTag; + + /// Complex transform on A operand + static ComplexTransform const kTransformA = Operator::kTransformA; + + /// Complex transform on B operand + static ComplexTransform const kTransformB = Operator::kTransformB; + + // statically assert kStages for MmaPipelined is two (Double-buffered pipeline) + static_assert( + (Base::kStages == 2), + "MmaPipelined requires kStages set to value 2"); + + static bool const kSmemContainsEntireMat = false; + + private: + using WarpFragmentA = typename Operator::FragmentA; + using WarpFragmentB = typename Operator::FragmentB; + + protected: + /// Iterator to write threadblock-scoped tile of A operand to shared memory + SmemIteratorA smem_iterator_A_; + + /// Iterator to write threadblock-scoped tile of B operand to shared memory + SmemIteratorB smem_iterator_B_; + + public: + /// Construct from tensor references + CUTLASS_DEVICE + CustomMmaPipelined( + 
typename Base::SharedStorageA& shared_storageA, + typename Base::SharedStorageB& shared_storageB, + int thread_idx, ///< ID within the threadblock + int warp_idx, ///< ID of warp + int lane_idx ///< ID of each thread within a warp + ) + : Base(shared_storageA, shared_storageB, thread_idx, warp_idx, lane_idx), + smem_iterator_A_(shared_storageA.ref(), thread_idx), + smem_iterator_B_(shared_storageB.ref(), thread_idx) { + // Compute warp location within threadblock tile by mapping the warp_id to + // three coordinates: + // _m: the warp's position within the threadblock along the M dimension + // _n: the warp's position within the threadblock along the N dimension + // _k: the warp's position within the threadblock along the K dimension + + int warp_idx_mn = warp_idx % (Base::WarpCount::kM * Base::WarpCount::kN); + int warp_idx_k = warp_idx / (Base::WarpCount::kM * Base::WarpCount::kN); + + int warp_idx_m = warp_idx_mn % Base::WarpCount::kM; + int warp_idx_n = warp_idx_mn / Base::WarpCount::kM; + + // Add per-warp offsets in units of warp-level tiles + this->warp_tile_iterator_A_.add_tile_offset( + {warp_idx_m, Base::kWarpGemmIterations * warp_idx_k}); + this->warp_tile_iterator_B_.add_tile_offset( + {Base::kWarpGemmIterations * warp_idx_k, warp_idx_n}); + } + CUTLASS_DEVICE + CustomMmaPipelined( + ///< Shared storage needed for internal use by threadblock-scoped GEMM + typename Base::SharedStorage& st, + ///< ID within the threadblock + int thread_idx, + ///< ID of warp + int warp_idx, + ///< ID of each thread within a warp + int lane_idx) + : CustomMmaPipelined( + st.operand_A, + st.operand_B, + thread_idx, + warp_idx, + lane_idx) {} + + CUTLASS_DEVICE + void set_prologue_done(bool value) { + // NOT IMPLEMENTED FOR PIPELINED + } + + CUTLASS_DEVICE + void set_zero_outside_bounds(bool value) { + // NOT NEEDED FOR PIPELINED + // shared memory will always be zero-filled + } + + template + CUTLASS_DEVICE static void prologue( + typename Base::SharedStorage& 
shared_storage, + ///< iterator over A operand in global memory + IteratorA iterator_A, + ///< iterator over B operand in global memory + IteratorB iterator_B, + int thread_idx, + int problem_size_k) { + prologue( + shared_storage.operand_A, + shared_storage.operand_B, + iterator_A, + iterator_B, + thread_idx, + problem_size_k); + } + + template + CUTLASS_DEVICE static void prologue( + typename Base::SharedStorageA& shared_storageA, + typename Base::SharedStorageB& shared_storageB, + ///< iterator over A operand in global memory + IteratorA iterator_A, + ///< iterator over B operand in global memory + IteratorB iterator_B, + int thread_idx, + int problem_size_k) { + // NOT IMPLEMENTED FOR PIPELINED + } + + /// Perform a threadblock-scoped matrix multiply-accumulate + CUTLASS_DEVICE + void operator()( + int gemm_k_iterations, ///< number of iterations of the mainloop + FragmentC& accum, ///< destination accumulator tile + IteratorA iterator_A, ///< iterator over A operand in global memory + IteratorB iterator_B, ///< iterator over B operand in global memory + FragmentC const& src_accum, ///< source accumulator tile + TransformA transform_A = + TransformA(), ///< transformation applied to A fragment + TransformB transform_B = + TransformB()) { ///< transformation applied to B fragment + + // + // Prologue + // + + // Perform accumulation in the 'd' output operand + accum = src_accum; + + FragmentA tb_frag_A; + FragmentB tb_frag_B; + + tb_frag_A.clear(); + tb_frag_B.clear(); + + // The last kblock is loaded in the prolog + iterator_A.load(tb_frag_A); + iterator_B.load(tb_frag_B); + + ++iterator_A; + ++iterator_B; + + this->smem_iterator_A_.store(transform_A(tb_frag_A)); + this->smem_iterator_B_.store(transform_B(tb_frag_B)); + + ++this->smem_iterator_A_; + ++this->smem_iterator_B_; + + __syncthreads(); + + // Pair of fragments used to overlap shared memory loads and math + // instructions + WarpFragmentA warp_frag_A[2]; + WarpFragmentB warp_frag_B[2]; + + 
this->warp_tile_iterator_A_.set_kgroup_index(0); + this->warp_tile_iterator_B_.set_kgroup_index(0); + + this->warp_tile_iterator_A_.load(warp_frag_A[0]); + this->warp_tile_iterator_B_.load(warp_frag_B[0]); + + ++this->warp_tile_iterator_A_; + ++this->warp_tile_iterator_B_; + + Operator warp_mma; + + int smem_write_stage_idx = 1; + + // Avoid reading out of bounds + iterator_A.clear_mask(gemm_k_iterations <= 1); + iterator_B.clear_mask(gemm_k_iterations <= 1); + + // Issue loads during the first warp-level matrix multiply-add *AFTER* + // issuing shared memory loads (which have the tighest latency requirement). + + // + // Mainloop + // + + // Note: The main loop does not support Base::kWarpGemmIterations == 2. + CUTLASS_GEMM_LOOP + for (; gemm_k_iterations > 0; --gemm_k_iterations) { + // + // Loop over GEMM K dimension + // + + CUTLASS_PRAGMA_UNROLL + for (int warp_mma_k = 0; warp_mma_k < Base::kWarpGemmIterations; + ++warp_mma_k) { + // Load warp-level tiles from shared memory, wrapping to k offset if + // this is the last group as the case may be. 
+ + if (warp_mma_k == Base::kWarpGemmIterations - 1) { + // Write fragments to shared memory + this->smem_iterator_A_.store(transform_A(tb_frag_A)); + + this->smem_iterator_B_.store(transform_B(tb_frag_B)); + + __syncthreads(); + + ++this->smem_iterator_A_; + ++this->smem_iterator_B_; + + // Add negative offsets to return iterators to the 'start' of the + // circular buffer in shared memory + if (smem_write_stage_idx == 1) { + this->smem_iterator_A_.add_tile_offset({0, -Base::kStages}); + this->smem_iterator_B_.add_tile_offset({-Base::kStages, 0}); + } else { + this->warp_tile_iterator_A_.add_tile_offset( + {0, + -Base::kStages * Policy::kPartitionsK * + Base::kWarpGemmIterations}); + this->warp_tile_iterator_B_.add_tile_offset( + {-Base::kStages * Policy::kPartitionsK * + Base::kWarpGemmIterations, + 0}); + } + + smem_write_stage_idx ^= 1; + } + + this->warp_tile_iterator_A_.set_kgroup_index( + (warp_mma_k + 1) % Base::kWarpGemmIterations); + this->warp_tile_iterator_B_.set_kgroup_index( + (warp_mma_k + 1) % Base::kWarpGemmIterations); + + this->warp_tile_iterator_A_.load(warp_frag_A[(warp_mma_k + 1) % 2]); + this->warp_tile_iterator_B_.load(warp_frag_B[(warp_mma_k + 1) % 2]); + + ++this->warp_tile_iterator_A_; + ++this->warp_tile_iterator_B_; + + if (warp_mma_k == 0) { + iterator_A.load(tb_frag_A); + iterator_B.load(tb_frag_B); + + ++iterator_A; + ++iterator_B; + + // Avoid reading out of bounds if this was the last loop iteration + iterator_A.clear_mask(gemm_k_iterations <= 2); + iterator_B.clear_mask(gemm_k_iterations <= 2); + } + + warp_mma( + accum, + warp_frag_A[warp_mma_k % 2], + warp_frag_B[warp_mma_k % 2], + accum); + } + } + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace threadblock +} // namespace gemm +} // namespace cutlass + +///////////////////////////////////////////////////////////////////////////////////////////////// + +#else +#error "This file should not be included 
when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/transformers/cuda/mem_eff_attention/gemm/find_default_mma.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/transformers/cuda/mem_eff_attention/gemm/find_default_mma.h new file mode 100644 index 0000000000000000000000000000000000000000..e7408deec401b691837bf4f063ae71da79093f11 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/transformers/cuda/mem_eff_attention/gemm/find_default_mma.h @@ -0,0 +1,172 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ +/*! \file + \brief Cutlass provides helper template functions to figure out the right + data structures to instantiate to run a GEMM with various parameters (see + `cutlass/gemm/threadblock/default_mma.h`). However, due to template + instantiation priority rules, it will only create an MmaMultiStage with + kStages=3 (otherwise creates an MmePipelined - which is not compatible with + FastF32). kStages=3 uses too much shared memory and we want to use kStages=2, + so we just copy-pasted some code from `default_mma.h` and + `default_mma_core.h` files and wrapped this template to allow our use case. + + This is really only for the FastF32 case - aka using TensorCores with fp32. 
+*/ + +#pragma once + +#include +#include +#include +#include +#include + +namespace cutlass { +namespace gemm { +namespace threadblock { + +template < + /// Element type for A matrix operand + typename ElementA, + /// Layout type for A matrix operand + typename LayoutA, + /// Access granularity of A matrix in units of elements + int kAlignmentA, + /// Element type for B matrix operand + typename ElementB, + /// Layout type for B matrix operand + typename LayoutB, + /// Access granularity of B matrix in units of elements + int kAlignmentB, + /// Element type for internal accumulation + typename ElementAccumulator, + /// Layout type for C and D matrix operand + typename LayoutC, + /// Operator class tag + typename OperatorClass, + /// Tag indicating architecture to tune for + typename ArchTag, + /// Threadblock-level tile size (concept: GemmShape) + typename ThreadblockShape, + /// Warp-level tile size (concept: GemmShape) + typename WarpShape, + /// Instruction-level tile size (concept: GemmShape) + typename InstructionShape, + /// Number of stages used in the pipelined mainloop + int Stages, + /// Operation performed by GEMM + typename Operator, + typename Enable_ = void> +struct FindDefaultMma { + static constexpr bool AccumulatorsInRowMajor = false; + static constexpr SharedMemoryClearOption SharedMemoryClear = + SharedMemoryClearOption::kNone; + using DefaultMma = cutlass::gemm::threadblock::DefaultMma< + ElementA, + LayoutA, + kAlignmentA, + ElementB, + LayoutB, + kAlignmentB, + ElementAccumulator, + LayoutC, + OperatorClass, + ArchTag, + ThreadblockShape, + WarpShape, + InstructionShape, + Stages, + Operator, + AccumulatorsInRowMajor, + SharedMemoryClear>; +}; + +/// Specialization for sm80 / FastF32 / multistage with kStages=2 +template < + typename ElementA_, + /// Layout type for A matrix operand + typename LayoutA_, + /// Access granularity of A matrix in units of elements + int kAlignmentA, + typename ElementB_, + /// Layout type for B matrix operand + 
typename LayoutB_, + /// Access granularity of B matrix in units of elements + int kAlignmentB, + typename ElementAccumulator, + /// Threadblock-level tile size (concept: GemmShape) + typename ThreadblockShape, + /// Warp-level tile size (concept: GemmShape) + typename WarpShape, + /// Instruction-level tile size (concept: GemmShape) + typename InstructionShape, + int kStages, + typename Operator> +struct FindDefaultMma< + ElementA_, + LayoutA_, + kAlignmentA, + ElementB_, + LayoutB_, + kAlignmentB, + ElementAccumulator, + layout::RowMajor, + arch::OpClassTensorOp, + arch::Sm80, + ThreadblockShape, + WarpShape, + InstructionShape, + kStages, + Operator, + typename cutlass::platform::enable_if<(kAlignmentA > 1)>::type> { + using LayoutC = layout::RowMajor; + using OperatorClass = arch::OpClassTensorOp; + using ArchTag = arch::Sm80; + + using DefaultMma_ = cutlass::gemm::threadblock::DefaultMma< + ElementA_, + LayoutA_, + kAlignmentA, + ElementB_, + LayoutB_, + kAlignmentB, + ElementAccumulator, + LayoutC, + OperatorClass, + ArchTag, + ThreadblockShape, + WarpShape, + InstructionShape, + 3, + Operator>; + struct DefaultMma : DefaultMma_ { + using MmaCore_ = typename DefaultMma_::MmaCore; + // Define the threadblock-scoped multistage matrix multiply + using ThreadblockMma = cutlass::gemm::threadblock::MmaMultistage< + typename MmaCore_::Shape, + typename DefaultMma_::IteratorA, + typename MmaCore_::SmemIteratorA, + MmaCore_::kCacheOpA, + typename DefaultMma_::IteratorB, + typename MmaCore_::SmemIteratorB, + MmaCore_::kCacheOpB, + ElementAccumulator, + LayoutC, + typename MmaCore_::MmaPolicy, + kStages>; + }; +}; + +} // namespace threadblock +} // namespace gemm +} // namespace cutlass + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/transformers/cuda/mem_eff_attention/gemm/mma_accum_lambda_iterator.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/transformers/cuda/mem_eff_attention/gemm/mma_accum_lambda_iterator.h new file mode 100644 index 0000000000000000000000000000000000000000..67502a7dd12dfbd8074aff0482d462f8e2272b76 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/transformers/cuda/mem_eff_attention/gemm/mma_accum_lambda_iterator.h @@ -0,0 +1,359 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ +#pragma once + +#include +#include +#include +#include +#include + +/* +TensorCores have different accumulator layouts. +This file provides a class to easily map the accumulator +i-th element with the corresponding matrix row/col. 
+*/ + +template +struct AccumLambdaIteratorSm80 { + static_assert( + cutlass::platform:: + is_same::value, + "only RowMajor is supported"); + + using Policy = typename T::Policy; + using InstructionShape = typename T::InstructionShape; + using OpDelta = typename T::OpDelta; + using Shape = typename T::Shape; + static int const kElementsPerAccess = InstructionShape::kN / 4; + static int const kRowsPerTile = 8; + static int const kAccumulatorRows = InstructionShape::kM / kRowsPerTile; + + static cutlass::MatrixCoord CUTLASS_DEVICE get_lane_offset( + int8_t lane_id, + int8_t warp_id, + typename T::TensorCoord const& tile_offset) { + int quad = (lane_id >> 2); + int lane_in_quad = (lane_id & 3); + return cutlass::MatrixCoord( + quad + tile_offset.row() * Shape::kRow, + lane_in_quad * kElementsPerAccess + + tile_offset.column() * Shape::kColumn); + } + + template + CUTLASS_DEVICE static void iterateRows( + cutlass::MatrixCoord& lane_offset, + FA beginRow, + FB op, + FC endRow) { + // See cutlass/gemm/warp/mma_tensor_op_tile_iterator.h + CUTLASS_PRAGMA_UNROLL + for (int mma_m = 0; mma_m < Policy::MmaIterations::kRow; ++mma_m) { + CUTLASS_PRAGMA_UNROLL + for (int row = 0; row < kAccumulatorRows; ++row) { + int accum_m = mma_m * InstructionShape::kM * OpDelta::kRow + + row * kRowsPerTile + lane_offset.row(); + beginRow(accum_m); + + CUTLASS_PRAGMA_UNROLL + for (int mma_n = 0; mma_n < Policy::MmaIterations::kColumn; ++mma_n) { + int mma_accum_start = kAccumulatorRows * kElementsPerAccess * + (mma_n * Policy::MmaIterations::kRow + mma_m); + CUTLASS_PRAGMA_UNROLL + for (int col = 0; col < kElementsPerAccess; ++col) { + int accum_n = mma_n * InstructionShape::kN * OpDelta::kColumn + + col + lane_offset.column(); + int idx = mma_accum_start + row * kElementsPerAccess + col; + op(accum_m, accum_n, idx); + } + } + + endRow(accum_m); + } + } + } + + template + CUTLASS_DEVICE static bool reduceSameRow(int lane_id, DT& myValue, F fn) { + // In each warp, 4 threads will work on the 
same row + // - the ones with the same `quad` + auto otherV = __shfl_xor_sync(0xffffffff, myValue, 1); + myValue = fn(myValue, otherV); + otherV = __shfl_xor_sync(0xffffffff, myValue, 2); + myValue = fn(myValue, otherV); + int lane_in_quad = (lane_id & 3); + return lane_in_quad == 0; + } +}; + +template +struct AccumLambdaIteratorSm70 { + static_assert( + cutlass::platform:: + is_same::value, + "only RowMajor is supported"); + + using Policy = typename T::Policy; + using InstructionShape = typename T::InstructionShape; + using OpDelta = typename T::OpDelta; + using Shape = typename T::Shape; + using Element = accum_t; + + static int const kElementsPerPartial = 4; + using EleShapePerPatial = typename cutlass::platform::conditional< + cutlass::platform::is_same::value, + cutlass::MatrixShape<2, 2>, + cutlass::MatrixShape<1, 4>>::type; + static int const kElementsPerMma = 8; + static int const kAccumulatorPatials = 2; + using QuadShapePerPatialMma = cutlass::MatrixShape<4, 4>; + + static cutlass::MatrixCoord CUTLASS_DEVICE get_lane_offset( + int8_t lane_id, + int8_t warp_id, + typename T::TensorCoord const& tile_offset) { + int quad = (lane_id >> 2); + int lane_in_quad = (lane_id & 3); + int accum_m, accum_n; + + if (cutlass::platform::is_same::value) { + // (quad[2],quad[0])+lane_in_quad[0] + accum_m = (((quad & 0x4) >> 1) + (quad & 0x1)) * 8 + (lane_in_quad & 1); + // (quad[1])+lane_in_quad[1] + accum_n = + ((quad >> 1) & 0x1) * kElementsPerPartial * kAccumulatorPatials + + (lane_in_quad & 2); + } else { + accum_m = (((quad & 0x4) >> 1) + (quad & 0x1)) * 8 + + lane_in_quad; // (quad[2],quad[0]) + accum_n = ((quad >> 1) & 0x1) * kElementsPerPartial * kAccumulatorPatials; + } + return cutlass::MatrixCoord( + accum_m + tile_offset.row() * Shape::kRow, + accum_n + tile_offset.column() * Shape::kColumn); + } + + template + CUTLASS_DEVICE static bool reduceSameRow(int lane_id, DT& myValue, F fn) { + static_assert( + cutlass::platform::is_same::value, + "update to support 
non-float accum"); + // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#warp-level-matrix-fragment-mma-884-f16 + // T0 & T2 share same line within a quad + auto otherV = __shfl_xor_sync(0xffffffff, myValue, 1 << 1); + myValue = fn(myValue, otherV); + // quad 0 and quad 2 are on the same lines + otherV = __shfl_xor_sync(0xffffffff, myValue, 1 << 3); + myValue = fn(myValue, otherV); + return (lane_id & ((1 << 1) | (1 << 3))) == 0; + } + + template + CUTLASS_DEVICE static void iterateRows( + cutlass::MatrixCoord& lane_offset, + FA beginRow, + FB op, + FC endRow) { + CUTLASS_PRAGMA_UNROLL + for (int tile_m = 0; tile_m < Policy::TileIterations::kRow; ++tile_m) { + CUTLASS_PRAGMA_UNROLL + for (int mma_m = 0; mma_m < Policy::MmaIterations::kRow; ++mma_m) { + CUTLASS_PRAGMA_UNROLL + for (int m = 0; m < EleShapePerPatial::kRow; ++m) { + int accum_m = tile_m * Policy::InterleavedTile::kRow + + mma_m * QuadShapePerPatialMma::kRow + m * 2 + lane_offset.row(); + beginRow(accum_m); + + CUTLASS_PRAGMA_UNROLL + for (int tile_n = 0; tile_n < Policy::TileIterations::kColumn; + ++tile_n) { + CUTLASS_PRAGMA_UNROLL + for (int mma_n = 0; mma_n < Policy::MmaIterations::kColumn; + ++mma_n) { + CUTLASS_PRAGMA_UNROLL + for (int p = 0; p < kAccumulatorPatials; ++p) { + CUTLASS_PRAGMA_UNROLL + for (int n = 0; n < EleShapePerPatial::kColumn; ++n) { + int mma_accum_start = + (((tile_n * Policy::TileIterations::kRow + tile_m) * + Policy::MmaIterations::kColumn + + mma_n) * + Policy::MmaIterations::kRow + + mma_m) * + kElementsPerMma; + int accum_n = tile_n * Policy::InterleavedTile::kColumn + + mma_n * QuadShapePerPatialMma::kColumn + + p * Policy::InterleavedTile::kColumn / 2 + n + + lane_offset.column(); + int idx = mma_accum_start + p * kElementsPerPartial + + m * EleShapePerPatial::kColumn + n; + op(accum_m, accum_n, idx); + } + } + } + } + endRow(accum_m); + } + } + } + } +}; + +template +struct AccumLambdaIteratorSimt { + using Policy = typename T::Policy; + using 
Iterations = typename T::Iterations; + using Element = typename T::Element; + using Delta = typename T::Delta; + using Shape = typename T::Shape; + static_assert( + cutlass::platform:: + is_same::value, + "only RowMajor is supported"); + + template + CUTLASS_DEVICE static bool reduceSameRow(int lane_id, DT& myValue, F fn) { + CUTLASS_PRAGMA_UNROLL + for (int bit = 1; bit < Policy::WarpShape::kColumn; bit *= 2) { + auto otherV = __shfl_xor_sync(0xffffffff, myValue, bit); + myValue = fn(myValue, otherV); + } + return (lane_id & (Policy::WarpShape::kColumn - 1)) == 0; + } + + template + CUTLASS_DEVICE static void iterateRows( + cutlass::MatrixCoord& lane_offset, + FA beginRow, + FB op, + FC endRow) { + CUTLASS_PRAGMA_UNROLL + for (int mma_m = 0; mma_m < Iterations::kRow; ++mma_m) { + CUTLASS_PRAGMA_UNROLL + for (int m = 0; m < Policy::LaneMmaShape::kM; ++m) { + int accum_m = mma_m * Delta::kRow + m + lane_offset.row(); + beginRow(accum_m); + + CUTLASS_PRAGMA_UNROLL + for (int mma_n = 0; mma_n < Iterations::kColumn; ++mma_n) { + int accum_n = + mma_n * Policy::WarpShape::kColumn * Policy::LaneMmaShape::kN + + lane_offset.column(); + CUTLASS_PRAGMA_UNROLL + for (int n = 0; n < Policy::LaneMmaShape::kN; ++n) { + int idx = n + + Policy::LaneMmaShape::kN * + (mma_n + + Iterations::kColumn * + (m + mma_m * Policy::LaneMmaShape::kM)); + op(accum_m, accum_n + n, idx); + } + } + endRow(accum_m); + } + } + } + + static cutlass::MatrixCoord CUTLASS_DEVICE get_lane_offset( + int8_t lane_id, + int8_t warp_id, + typename T::TensorCoord const& tile_offset) { + static_assert( + cutlass::platform::is_same< + typename Policy::LaneLayout, + cutlass::layout::RowMajorInterleaved<1>>::value, + ""); + typename Policy::LaneLayout lane_layout = Policy::get_lane_layout(); + + cutlass::MatrixCoord lane_offset = lane_layout.inverse(lane_id) * + cutlass::MatrixCoord(Policy::LaneMmaShape::kM, + Policy::LaneMmaShape::kN); + return lane_offset + + tile_offset * cutlass::MatrixCoord(Shape::kRow, 
Shape::kColumn); + } +}; + +template +struct DefaultMmaAccumLambdaIterator; + +// Simt +template +struct DefaultMmaAccumLambdaIterator< + cutlass::gemm::warp::MmaSimtTileIterator< + S, + cutlass::gemm::Operand::kC, + accum_t, + cutlass::layout::RowMajor, + P, + 1, + 1>, + accum_t, + kWarpSize> { + using WarpIterator = typename cutlass::gemm::warp::MmaSimtTileIterator< + S, + cutlass::gemm::Operand::kC, + accum_t, + cutlass::layout::RowMajor, + P, + 1, + 1>; + using Iterator = AccumLambdaIteratorSimt; +}; + +// TensorOp - Volta +template +struct DefaultMmaAccumLambdaIterator< + cutlass::gemm::warp::MmaVoltaTensorOpAccumulatorTileIterator< + S1, + accum_t, + cutlass::layout::RowMajor, + S2, + cutlass::MatrixShape<1, 1>>, + accum_t, + kWarpSize> { + using WarpIterator = + typename cutlass::gemm::warp::MmaVoltaTensorOpAccumulatorTileIterator< + S1, + accum_t, + cutlass::layout::RowMajor, + S2, + cutlass::MatrixShape<1, 1>>; + using Iterator = AccumLambdaIteratorSm70; +}; + +// TensorOp - Sm75+ +template < + typename S1, + typename S2, + typename S3, + typename accum_t, + int kWarpSize> +struct DefaultMmaAccumLambdaIterator< + cutlass::gemm::warp::MmaTensorOpAccumulatorTileIterator< + S1, + accum_t, + cutlass::layout::RowMajor, + S2, + S3>, + accum_t, + kWarpSize> { + using WarpIterator = + typename cutlass::gemm::warp::MmaTensorOpAccumulatorTileIterator< + S1, + accum_t, + cutlass::layout::RowMajor, + S2, + S3>; + using Iterator = AccumLambdaIteratorSm80; +}; + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/transformers/cuda/mem_eff_attention/gemm/mma_from_smem.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/transformers/cuda/mem_eff_attention/gemm/mma_from_smem.h new file mode 100644 index 0000000000000000000000000000000000000000..294d37d510f4339fb4bb794a203cc148a6402aa6 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/transformers/cuda/mem_eff_attention/gemm/mma_from_smem.h @@ -0,0 +1,1953 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +/*************************************************************************************************** + * Copyright (c) 2017 - 2022 NVIDIA CORPORATION & AFFILIATES. All rights + *reserved. SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + *this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + *ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + *LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + *CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + *SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + *INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + *CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + *ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + *POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Template for a double-buffered threadblock-scoped GEMM kernel. +*/ + +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include +#include +#include +#include + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace gemm { +namespace threadblock { + +/// Shared storage object needed by accumulator +/// From 13_two_tensor_op_fusion/threadblock/b2b_mma_base_smem_accumulator.h +template < + typename Shape_, + typename Element_, + typename Layout_, + typename Padding_> +class AccumulatorSharedStorage { + public: + // + // Type definitions + // + using Shape = Shape_; + using Element = Element_; + using Layout = Layout_; + using Padding = Padding_; + + /// Tensor reference to the accumulator + using TensorRefAccum = cutlass::TensorRef; + + /// Shape of the accumulator matrix in shared memory + using ShapeAccum = cutlass:: + MatrixShape; + + public: + // + // Data members + // + + /// Buffer for accumulator + cutlass::AlignedBuffer accum; + + public: + // + // Methods + // + + /// Returns a layout object for the Accum matrix + 
CUTLASS_DEVICE + static Layout LayoutAccum() { + return Layout::packed({ShapeAccum::kRow, ShapeAccum::kColumn}); + } + + /// Returns a TensorRef to the Accumulator + CUTLASS_HOST_DEVICE + TensorRefAccum accum_ref() { + return TensorRefAccum{accum.data(), LayoutAccum()}; + } +}; + +//////////////////////////////////////////////////////////////////////////////// +// Taken from +// https://github.com/NVIDIA/cutlass/blob/master/examples/13_two_tensor_op_fusion/threadblock/b2b_mma_base_smem_accumulator.h +//////////////////////////////////////////////////////////////////////////////// + +/// Structure to compute the matrix product targeting CUDA cores and SIMT math +/// instructions. +template < + /// Size of the Gemm problem - concept: gemm::GemmShape<> + typename Shape_, + // Maximum K dimension - also the dimension of the shared-memory + // holding `OperandA` + int kMaxK_, + /// Policy describing tuning details (concept: MmaPolicy) + typename Policy_, + /// Number of stages, + int Stages, + /// Layout in shared-memory of operand A + typename SmemLayoutA, + /// Used for partial specialization + typename Enable = bool> +class MmaBaseFromSharedMemory { + public: + ///< Size of the Gemm problem - concept: gemm::GemmShape<> + using Shape = Shape_; + static constexpr int kMaxK = kMaxK_; + + ///< Policy describing tuning details + using Policy = Policy_; + + // + // Dependent types + // + + /// Warp-level Mma + using Operator = typename Policy::Operator; + + /// Shape describing the overall GEMM computed from shared memory + /// by each warp. 
+ using WarpGemm = typename Policy::Operator::Shape; + + /// Shape describing the number of warps filling the CTA + using WarpCount = GemmShape< + Shape::kM / WarpGemm::kM, + Shape::kN / WarpGemm::kN, + Shape::kK / WarpGemm::kK>; + using WarpCount1 = WarpCount; + + /// Number of warp-level GEMM operations + static int const kWarpGemmIterations = + (WarpGemm::kK / Operator::Policy::MmaShape::kK); + static int const kWarpGemmIterations1 = kWarpGemmIterations; + + /// Number of stages + static int const kStages = Stages; + + /// If this is true, we fill the entire shmem buffer at start + /// and don't need to iterate through it in a circular fashion + static bool const kSmemContainsEntireB = kMaxK <= Shape::kK * kStages; + + /// Tensor reference to the A operand + using TensorRefA = TensorRef; + + /// Tensor reference to the B operand + using TensorRefB = + TensorRef; + + // + // Nested structs + // + + /// Shared storage object needed by threadblock-scoped GEMM + class SharedStorage { + public: + // + // Type definitions + // + + /// Shape of the B matrix operand in shared memory + using ShapeB = MatrixShape< + Shape::kK * kStages + Policy::SmemPaddingB::kRow, + Shape::kN + Policy::SmemPaddingB::kColumn>; + + public: + // + // Data members + // + + /// Buffer for B operand + AlignedBuffer operand_B; + + public: + // + // Methods + // + + /// Returns a layout object for the B matrix + CUTLASS_HOST_DEVICE + static typename Operator::LayoutB LayoutB() { + return Operator::LayoutB::packed({ShapeB::kRow, ShapeB::kColumn}); + } + + /// Returns a TensorRef to the B operand + CUTLASS_HOST_DEVICE + TensorRefB operand_B_ref() { + return TensorRefB{operand_B.data(), LayoutB()}; + } + }; + + protected: + // + // Data members + // + + // /// Iterator to load a warp-scoped tile of A operand from shared memory + // typename Operator::IteratorA warp_tile_iterator_A_; + + /// Iterator to load a warp-scoped tile of B operand from shared memory + typename Operator::IteratorB 
warp_tile_iterator_B_; + + public: + /// Construct from tensor references + CUTLASS_DEVICE + MmaBaseFromSharedMemory( + ///< Shared storage needed for internal use by threadblock-scoped GEMM + TensorRefB& b_tile, + ///< ID within the threadblock + int thread_idx, + ///< ID of warp + int warp_idx, + ///< ID of each thread within a warp + int lane_idx) + : warp_tile_iterator_B_(b_tile, lane_idx) {} +}; + +namespace { + +// has necessary trait compliance with WarpIteratorFromSmem but doesn't do +// anything, can be default initialized, and uses fragment that takes up +// (almost) no space. this warp iterator is selected at compile time when +// elementwise on-the-fly scaling for operand A is disabled, in which case +// operations related to loading scale factors for operand A get wiped out by +// the compiler. +template +class NoOpWarpIteratorScale { + public: + // in pipelined+multistage MMA implementations we keep an array of fragments. + // if we aren't using scaling we don't want to waste registers on fragments + // of scale elements, so ideally this would be sized 0. + // Since arrays of zero-sized objects are not allowed, using size as 1. + // The compiler will most likely wipe it out anyways. + using Fragment = cutlass::Array; + + CUTLASS_HOST_DEVICE + NoOpWarpIteratorScale() {} + + CUTLASS_HOST_DEVICE + NoOpWarpIteratorScale(TensorRef const&, int) {} + + CUTLASS_HOST_DEVICE + NoOpWarpIteratorScale& add_tile_offset( + typename TensorRef::TensorCoord const&) { + return *this; + } + + CUTLASS_HOST_DEVICE + NoOpWarpIteratorScale& operator++() { + return *this; + } + + CUTLASS_DEVICE + void load(Fragment&) const {} +}; + +// if scaling is enabled, performs fragment elementwise multiplication between +// fragment and its scaling factor. +template +class FragmentElementwiseScaler; + +// specialization for scaling being enabled. 
+template +class FragmentElementwiseScaler { + public: + // cast scale_frag to correct type then apply elementwise to fragment + CUTLASS_DEVICE + static Fragment apply(Fragment frag, FragmentScale const& scale_frag) { + Fragment converted_scale_frag = cutlass::NumericArrayConverter< + typename Fragment::Element, + typename FragmentScale::Element, + FragmentScale::kElements>()(scale_frag); + return cutlass::multiplies()(frag, converted_scale_frag); + } +}; + +// specialization for scaling being disabled. doesn't do anything and should +// just get wiped out by the compiler. +template +class FragmentElementwiseScaler { + public: + CUTLASS_DEVICE + static Fragment apply(Fragment frag, FragmentScale const&) { + return frag; + } +}; +} // namespace + +//////////////////////////////////////////////////////////////////////////////// +// Taken from +// https://github.com/NVIDIA/cutlass/blob/master/examples/13_two_tensor_op_fusion/threadblock/b2b_mma_pipelined_smem_accumulator.h +//////////////////////////////////////////////////////////////////////////////// + +/// Structure to compute the matrix product targeting CUDA cores and SIMT math +/// instructions. 
+template < + /// Size of the Gemm problem - concept: gemm::GemmShape<> + typename Shape_, + // BEGIN smem + /// Iterates over the intermediate accumulator tile in shared memory + typename WarpIteratorA_, + /// whether or not to perform elementwise multiplication of A + // by another matrix (A_scale) that is also kept in shared memory prior + // to matmul A @ B + bool ScaleOperandA_, + /// Max GEMM problem size in K dimension + int MaxK, + /// Iterates over tiles of B operand in global memory + // (concept: ReadableTileIterator | ForwardTileIterator | + // MaskedTileIterator) + typename IteratorB_, + /// Iterates over tiles of B operand in shared memory + /// (concept: WriteableTileIterator | RandomAccessTileIterator) + typename SmemIteratorB_, + /// Data type of accumulator matrix + typename ElementC_, + /// Data type of accumulator matrix + typename LayoutC_, + /// Policy describing tuning details (concept: MmaPolicy) + typename Policy_, + /// Transformation applied to B operand + typename TransformB_ = NumericArrayConverter< + typename SmemIteratorB_::Element, + typename IteratorB_::Element, + IteratorB_::Fragment::kElements>, + /// Used for partial specialization + typename Enable = bool> +class MmaPipelinedFromSharedMemory : public MmaBaseFromSharedMemory< + Shape_, + MaxK, + Policy_, + 2, + typename WarpIteratorA_::Layout> { + public: + ///< Base class + using Base = MmaBaseFromSharedMemory< + Shape_, + MaxK, + Policy_, + 2, + typename WarpIteratorA_::Layout>; + + using Shape = + Shape_; ///< Size of the Gemm problem - concept: gemm::GemmShape<> + static constexpr bool ScaleOperandA = ScaleOperandA_; + + using WarpIteratorA = WarpIteratorA_; + ///< loads fragments of A_scale from shared memory if operand A scaling is + ///< enabled. otherwise no-op. 
+ using WarpIteratorAScale = typename cutlass::platform::conditional< + ScaleOperandA, + WarpIteratorA, + NoOpWarpIteratorScale>::type; + + using IteratorB = + IteratorB_; ///< Iterates over tiles of B operand in global memory + using ElementC = ElementC_; ///< Data type of accumulator matrix + using LayoutC = LayoutC_; ///< Layout of accumulator matrix + using Policy = Policy_; ///< Policy describing tuning details + + using SmemIteratorB = SmemIteratorB_; + + using TransformB = TransformB_; + + // + // Dependent types + // + + /// Fragment of operand B loaded from global memory + using FragmentB = typename IteratorB::Fragment; + + /// Fragment of accumulator tile + using FragmentC = typename Policy::Operator::FragmentC; + + /// Warp-level Mma + using Operator = typename Policy::Operator; + + /// Obtain the arch tag from the warp-level operator + using ArchTag = typename Policy::Operator::ArchTag; + + /// Complex transform on B operand + static ComplexTransform const kTransformB = Operator::kTransformB; + + // statically assert kStages for MmaPipelined is two (Double-buffered pipeline) + static_assert( + (Base::kStages == 2), + "MmaPipelined requires kStages set to value 2"); + + private: + using WarpFragmentA = typename Operator::FragmentA; + + /// fragment type of OperandA elementwise scaling matrix. (almost) empty + /// if operand A scaling is disabled. + using WarpFragmentAScale = typename WarpIteratorAScale::Fragment; + + using WarpFragmentB = typename Operator::FragmentB; + + /// applies scaling factor to operand A fragment if operand A scaling is + /// enabled. otherwise no-op. 
+ using FragmentAScaler = FragmentElementwiseScaler< + WarpFragmentA, + WarpFragmentAScale, + ScaleOperandA>; + + protected: + // /// Iterator to write threadblock-scoped tile of A operand to shared memory + // SmemIteratorA smem_iterator_A_; + + /// Iterator to write threadblock-scoped tile of B operand to shared memory + SmemIteratorB smem_iterator_B_; + + /// Iterator to load a warp-scoped tile of A operand from intermediate + /// accumulator tile + WarpIteratorA warp_tile_iterator_A_; + + /// Iterator to load a warp-scoped tile of A_scale from intermediate + /// accumulator tile (only used if ScaleOperandA_ is true) + WarpIteratorAScale warp_tile_iterator_A_scale_; + + public: + /// constructor for MMA with operand A scaling enabled. + CUTLASS_DEVICE + MmaPipelinedFromSharedMemory( + typename Base::TensorRefA a, // Operand A in shared memory + typename Base::TensorRefA a_scale, // Operand A_scale in shared memory + typename Base::TensorRefB + b_staging, // staging memory for loading tiles of B + int thread_idx, + int warp_idx, + int lane_idx) + : Base(b_staging, thread_idx, warp_idx, lane_idx), + warp_tile_iterator_A_(a, lane_idx), + warp_tile_iterator_A_scale_(a_scale, lane_idx), + smem_iterator_B_(b_staging, thread_idx) { + // Compute warp location within threadblock tile by mapping the warp_id to + // three coordinates: + // _m: the warp's position within the threadblock along the M dimension + // _n: the warp's position within the threadblock along the N dimension + // _k: the warp's position within the threadblock along the K dimension + int warp_idx_mn = warp_idx % (Base::WarpCount::kM * Base::WarpCount::kN); + int warp_idx_k = warp_idx / (Base::WarpCount::kM * Base::WarpCount::kN); + int warp_idx_m = warp_idx_mn % Base::WarpCount::kM; + int warp_idx_n = warp_idx_mn / Base::WarpCount::kM; + + // Add per-warp offsets in units of warp-level tiles + this->warp_tile_iterator_A_.add_tile_offset( + {warp_idx_m, Base::kWarpGemmIterations * warp_idx_k}); + 
this->warp_tile_iterator_A_scale_.add_tile_offset( + {warp_idx_m, Base::kWarpGemmIterations * warp_idx_k}); + this->warp_tile_iterator_B_.add_tile_offset( + {Base::kWarpGemmIterations * warp_idx_k, warp_idx_n}); + } + + /// Construct from tensor references + CUTLASS_DEVICE + MmaPipelinedFromSharedMemory( + typename Base::TensorRefA a, ///< Operand A in shared memory + typename Base::TensorRefB b_staging, ///< staging memory for loading B + int thread_idx, ///< ID within the threadblock + int warp_idx, ///< ID of warp + int lane_idx) ///< ID of each thread within a warp + : Base(b_staging, thread_idx, warp_idx, lane_idx), + warp_tile_iterator_A_(a, lane_idx), + smem_iterator_B_(b_staging, thread_idx) { + // Compute warp location within threadblock tile by mapping the warp_id to + // three coordinates: + // _m: the warp's position within the threadblock along the M dimension + // _n: the warp's position within the threadblock along the N dimension + // _k: the warp's position within the threadblock along the K dimension + + int warp_idx_mn = warp_idx % (Base::WarpCount::kM * Base::WarpCount::kN); + int warp_idx_k = warp_idx / (Base::WarpCount::kM * Base::WarpCount::kN); + + int warp_idx_m = warp_idx_mn % Base::WarpCount::kM; + int warp_idx_n = warp_idx_mn / Base::WarpCount::kM; + + // Add per-warp offsets in units of warp-level tiles + this->warp_tile_iterator_A_.add_tile_offset( + {warp_idx_m, Base::kWarpGemmIterations * warp_idx_k}); + this->warp_tile_iterator_B_.add_tile_offset( + {Base::kWarpGemmIterations * warp_idx_k, warp_idx_n}); + } + + // For API compatibility with MmaMultistageFromSharedMemory + // but not supported as it worsens perf: older gpus < sm80 don't + // support async transfers and have to waste registers + CUTLASS_DEVICE + void set_prologue_done(bool value) {} + CUTLASS_DEVICE + static void prologue( + typename Base::SharedStorage& shared_storage, + IteratorB iterator_B1, + int thread_idx, + int problem_size_0_n) {} + + /// Perform a 
threadblock-scoped matrix multiply-accumulate + CUTLASS_DEVICE + void operator()( + int gemm_k_iterations, ///< number of iterations of the mainloop + FragmentC& accum, ///< destination accumulator tile + // IteratorA iterator_A, ///< iterator over A + // operand in global memory + IteratorB iterator_B, ///< iterator over B operand in global memory + FragmentC const& src_accum, ///< source accumulator tile + // TransformA transform_A = TransformA(), ///< transformation + // applied to A fragment + TransformB transform_B = + TransformB()) { ///< transformation applied to B fragment + + // + // Prologue + // + + // Perform accumulation in the 'd' output operand + accum = src_accum; + + FragmentB tb_frag_B; + + tb_frag_B.clear(); + + // The last kblock is loaded in the prolog + iterator_B.set_residual_tile(gemm_k_iterations == 1); + iterator_B.load(tb_frag_B); + + ++iterator_B; + + this->smem_iterator_B_.store(transform_B(tb_frag_B)); + + ++this->smem_iterator_B_; + + __syncthreads(); + + // remember that WarpFragmentAScale and WarpIteratorAScale are empty/no-op + // if scaling is disabled. 
+ + // Pair of fragments used to overlap shared memory loads and math + // instructions + WarpFragmentA warp_frag_A[2]; + WarpFragmentAScale warp_frag_A_scale[2]; + WarpFragmentB warp_frag_B[2]; + warp_frag_A[0].clear(); + warp_frag_A_scale[0].clear(); + warp_frag_B[0].clear(); + + this->warp_tile_iterator_B_.set_kgroup_index(0); + + this->warp_tile_iterator_A_.load(warp_frag_A[0]); + this->warp_tile_iterator_A_scale_.load(warp_frag_A_scale[0]); + this->warp_tile_iterator_B_.load(warp_frag_B[0]); + + ++this->warp_tile_iterator_A_; + ++this->warp_tile_iterator_A_scale_; + ++this->warp_tile_iterator_B_; + + Operator warp_mma; + + int smem_write_stage_idx = 1; + + // Avoid reading out of bounds + iterator_B.set_residual_tile(gemm_k_iterations == 2); + iterator_B.clear_mask(gemm_k_iterations <= 1); + + // Issue loads during the first warp-level matrix multiply-add *AFTER* + // issuing shared memory loads (which have the tightest latency + // requirement). + + // + // Mainloop + // + + // Note: The main loop does not support Base::kWarpGemmIterations == 2. + CUTLASS_GEMM_LOOP + for (; gemm_k_iterations > 0; --gemm_k_iterations) { + // + // Loop over GEMM K dimension + // + + CUTLASS_PRAGMA_UNROLL + for (int warp_mma_k = 0; warp_mma_k < Base::kWarpGemmIterations; + ++warp_mma_k) { + // Load warp-level tiles from shared memory, wrapping to k offset if + // this is the last group as the case may be. 
+ bool hasNext = true; + + if (warp_mma_k == Base::kWarpGemmIterations - 1) { + if (gemm_k_iterations > 1) { + // Write fragments to shared memory + this->smem_iterator_B_.store(transform_B(tb_frag_B)); + } + + __syncthreads(); + + ++this->smem_iterator_B_; + + // Add negative offsets to return iterators to the 'start' of the + // circular buffer in shared memory SMEM: Don't reset iterator A, as + // we are continuing our iteration at this point + if (smem_write_stage_idx == 1) { + this->smem_iterator_B_.add_tile_offset({-Base::kStages, 0}); + } else { + this->warp_tile_iterator_B_.add_tile_offset( + {-Base::kStages * Policy::kPartitionsK * + Base::kWarpGemmIterations, + 0}); + } + + smem_write_stage_idx ^= 1; + hasNext = gemm_k_iterations > 1; + } + + // Only read the next if we need to + if (hasNext) { + this->warp_tile_iterator_B_.set_kgroup_index( + (warp_mma_k + 1) % Base::kWarpGemmIterations); + + this->warp_tile_iterator_A_.load(warp_frag_A[(warp_mma_k + 1) % 2]); + this->warp_tile_iterator_A_scale_.load( + warp_frag_A_scale[(warp_mma_k + 1) % 2]); + this->warp_tile_iterator_B_.load(warp_frag_B[(warp_mma_k + 1) % 2]); + + ++this->warp_tile_iterator_A_; + ++this->warp_tile_iterator_A_scale_; + ++this->warp_tile_iterator_B_; + + if (warp_mma_k == 0) { + iterator_B.load(tb_frag_B); + + ++iterator_B; + + // Avoid reading out of bounds if this was the last loop iteration + iterator_B.set_residual_tile(gemm_k_iterations == 3); + iterator_B.clear_mask(gemm_k_iterations <= 2); + } + } + + warp_mma( + accum, + FragmentAScaler::apply( + warp_frag_A[warp_mma_k % 2], warp_frag_A_scale[warp_mma_k % 2]), + warp_frag_B[warp_mma_k % 2], + accum); + } + } + } +}; + +//////////////////////////////////////////////////////////////////////////////// +// Taken from +// https://github.com/NVIDIA/cutlass/blob/master/examples/13_two_tensor_op_fusion/threadblock/b2b_mma_multistage_smem_accumulator.h +//////////////////////////////////////////////////////////////////////////////// + 
+/// Structure to compute the matrix product targeting CUDA cores and SIMT math +/// instructions. +template < + /// Size of the Gemm problem - concept: gemm::GemmShape<> + typename Shape1_, + /// Iterates over the intermediate accumulator tile in shared memory + typename WarpIteratorA1_, + /// whether or not to perform elementwise multiplication of A + // by another matrix (A_scale) that is also kept in shared memory prior + // to matmul A @ B + bool ScaleOperandA_, + /// Iterates over tiles of B operand in global memory + // (concept: ReadableTileIterator | ForwardTileIterator | + // MaskedTileIterator) + typename IteratorB1_, + /// Iterates over tiles of B operand in shared memory + /// (concept: WriteableTileIterator | RandomAccessTileIterator) + typename SmemIteratorB1_, + /// Cache operation for operand B + cutlass::arch::CacheOperation::Kind CacheOpB1, + /// Data type of accumulator matrix + typename ElementC_, + /// Data type of accumulator matrix + typename LayoutC_, + /// Policy describing tuning details (concept: MmaPolicy) + typename Policy1_, + /// Number of stages, + int Stages_, + int kMaxK_, + /// Used for partial specialization + typename Enable = bool> +class MmaMultistageFromSharedMemory : public MmaBaseFromSharedMemory< + Shape1_, + kMaxK_, + Policy1_, + Stages_, + typename WarpIteratorA1_::Layout> { + public: + ///< Base class + using Base = MmaBaseFromSharedMemory< + Shape1_, + kMaxK_, + Policy1_, + Stages_, + typename WarpIteratorA1_::Layout>; + + ///< Size of the Gemm problem - concept: gemm::GemmShape<> + using Shape1 = Shape1_; + ///< Iterates over tiles of B operand in global memory + using IteratorB1 = IteratorB1_; + using IteratorB = IteratorB1; + ///< Policy describing tuning details + using Policy1 = Policy1_; + + using SmemIteratorB1 = SmemIteratorB1_; + using WarpIteratorA1 = WarpIteratorA1_; ///< Iterates over the intermediate + ///< accumulator tile in shared memory + static constexpr bool ScaleOperandA = ScaleOperandA_; + + ///< 
warp level iterator over A_scale matrix tile kept in shared memory. + ///< if elementwise A scaling is disabled then everything this does is no-op. + using WarpIteratorAScale = typename cutlass::platform::conditional< + ScaleOperandA, + WarpIteratorA1, + NoOpWarpIteratorScale>::type; + ///< Data type of accumulator matrix + using ElementC = ElementC_; + ///< Layout of accumulator matrix + using LayoutC = LayoutC_; + + static cutlass::arch::CacheOperation::Kind const kCacheOpB1 = CacheOpB1; + static constexpr bool kSmemContainsEntireB = Base::kSmemContainsEntireB; + + // + // Dependent types + // + + /// Fragment of accumulator tile + using FragmentC1 = typename Policy1::Operator::FragmentC; + using FragmentC = FragmentC1; + + /// Warp-level Mma + using Operator1 = typename Policy1::Operator; + + /// Minimum architecture is Sm80 to support cp.async + using ArchTag = arch::Sm80; + + /// Complex transform on B operand + static ComplexTransform const kTransformB1 = Operator1::kTransformB; + + /// Internal structure exposed for introspection. + struct Detail { + static_assert( + Base::kWarpGemmIterations1 > 1, + "The pipelined structure requires at least two warp-level " + "GEMM operations."); + + /// Number of cp.async instructions to load one stage of operand B + static int const TBLoadIterationsB1 = + IteratorB1::ThreadMap::Iterations::kCount; + + /// Number of cp.async instructions to load on group of operand B + static int const kAccessesPerGroupB1 = + (TBLoadIterationsB1 + Base::kWarpGemmIterations1 - 1) / + Base::kWarpGemmIterations1; + }; + + static constexpr int kNumStagesConcurrentLoad = + kSmemContainsEntireB ? Base::kStages : Base::kStages - 1; + + private: + using WarpLoadedFragmentA1 = typename Operator1::FragmentA; + /// fragment of OperandA scale matrix. if operand A scaling is disabled this + /// is (almost) empty. 
+ using WarpLoadedFragmentA1Scale = typename WarpIteratorAScale::Fragment; + using WarpLoadedFragmentB1 = typename Operator1::FragmentB; + using WarpTransformedFragmentA1 = typename Operator1::TransformedFragmentA; + using WarpTransformedFragmentB1 = typename Operator1::TransformedFragmentB; + + /// applies elementwise scaling to fragment of A. if operand A scaling is + /// disabled this is a no-op. + using FragmentAScaler = FragmentElementwiseScaler< + WarpLoadedFragmentA1, + WarpLoadedFragmentA1Scale, + ScaleOperandA>; + + private: + // + // Data members + // + + /// Iterator to load a warp-scoped tile of A1 operand from intermediate + /// accumulator tile + WarpIteratorA1 warp_tile_iterator_A1_; + + /// Iterator to load a warp-scoped tile of A1_scale operand from shared memory + /// if operand A scaling is disabled everything this does is a no-op. + WarpIteratorAScale warp_tile_iterator_A1_scale_; + + /// Iterator to write threadblock-scoped tile of B operand to shared memory + SmemIteratorB1 smem_iterator_B1_; + + bool prologue_done_; + + public: + /// constructor for MMA with operand A scaling enabled. 
+ CUTLASS_DEVICE + MmaMultistageFromSharedMemory( + typename Base::TensorRefA a, + typename Base::TensorRefA a_scale, + typename Base::TensorRefB b_tile, + int thread_idx, + int warp_idx, + int lane_idx) + : Base(b_tile, thread_idx, warp_idx, lane_idx), + warp_tile_iterator_A1_(a, lane_idx), + warp_tile_iterator_A1_scale_(a_scale, lane_idx), + smem_iterator_B1_(b_tile, thread_idx), + prologue_done_(false) { + // Compute warp location within threadblock tile by mapping the warp_id to + // three coordinates: + // _m: the warp's position within the threadblock along the M dimension + // _n: the warp's position within the threadblock along the N dimension + // _k: the warp's position within the threadblock along the K dimension + int warp_idx_mn_1 = + warp_idx % (Base::WarpCount1::kM * Base::WarpCount1::kN); + int warp_idx_k_1 = warp_idx / (Base::WarpCount1::kM * Base::WarpCount1::kN); + int warp_idx_m_1 = warp_idx_mn_1 % Base::WarpCount1::kM; + int warp_idx_n_1 = warp_idx_mn_1 / Base::WarpCount1::kM; + + // Add per-warp offsets in units of warp-level tiles + warp_tile_iterator_A1_.add_tile_offset( + {warp_idx_m_1, Base::kWarpGemmIterations1 * warp_idx_k_1}); + warp_tile_iterator_A1_scale_.add_tile_offset( + {warp_idx_m_1, Base::kWarpGemmIterations1 * warp_idx_k_1}); + this->warp_tile_iterator_B_.add_tile_offset( + {Base::kWarpGemmIterations1 * warp_idx_k_1, warp_idx_n_1}); + } + + /// Construct from tensor references + CUTLASS_DEVICE + MmaMultistageFromSharedMemory( + typename Base::TensorRefA a, + typename Base::TensorRefB b_tile, + ///< ID within the threadblock + int thread_idx, + ///< ID of warp + int warp_idx, + ///< ID of each thread within a warp + int lane_idx) + : Base(b_tile, thread_idx, warp_idx, lane_idx), + warp_tile_iterator_A1_(a, lane_idx), + smem_iterator_B1_(b_tile, thread_idx), + prologue_done_(false) { + // Compute warp location within threadblock tile by mapping the warp_id to + // three coordinates: + // _m: the warp's position within the 
threadblock along the M dimension + // _n: the warp's position within the threadblock along the N dimension + // _k: the warp's position within the threadblock along the K dimension + + int warp_idx_mn_1 = + warp_idx % (Base::WarpCount1::kM * Base::WarpCount1::kN); + int warp_idx_k_1 = warp_idx / (Base::WarpCount1::kM * Base::WarpCount1::kN); + + int warp_idx_m_1 = warp_idx_mn_1 % Base::WarpCount1::kM; + int warp_idx_n_1 = warp_idx_mn_1 / Base::WarpCount1::kM; + + // Add per-warp offsets in units of warp-level tiles + warp_tile_iterator_A1_.add_tile_offset( + {warp_idx_m_1, Base::kWarpGemmIterations1 * warp_idx_k_1}); + this->warp_tile_iterator_B_.add_tile_offset( + {Base::kWarpGemmIterations1 * warp_idx_k_1, warp_idx_n_1}); + } + + CUTLASS_DEVICE + void set_prologue_done(bool value) { + prologue_done_ = value; + } + + CUTLASS_DEVICE + static void prologue( + typename Base::SharedStorage& shared_storage, + IteratorB iterator_B1, + int thread_idx, + int problem_size_0_n) { + SmemIteratorB1 smem_iterator_B1(shared_storage.operand_B_ref(), thread_idx); + _prologue( + iterator_B1, + (problem_size_0_n + Base::Shape::kK - 1) / Base::Shape::kK, + smem_iterator_B1); + } + + CUTLASS_DEVICE + void copy_tiles_and_advance_1( + IteratorB1& iterator_B1, + int group_start_B1 = 0) { + iterator_B1.set_iteration_index( + group_start_B1 * IteratorB1::kAccessesPerVector); + this->smem_iterator_B1_.set_iteration_index(group_start_B1); + + // Load for operand B + CUTLASS_PRAGMA_UNROLL + for (int j = 0; j < Detail::kAccessesPerGroupB1; ++j) { + if (group_start_B1 + j < Detail::TBLoadIterationsB1) { + typename IteratorB1::AccessType* dst_ptr = + reinterpret_cast( + this->smem_iterator_B1_.get()); + + int const kSrcBytes = sizeof_bits::value * + IteratorB1::ThreadMap::kElementsPerAccess / + IteratorB1::kAccessesPerVector / 8; + + CUTLASS_PRAGMA_UNROLL + for (int v = 0; v < IteratorB1::kAccessesPerVector; ++v) { + auto gmem_ptr = iterator_B1.get(); + + cutlass::arch::cp_async_zfill( + 
dst_ptr + v, gmem_ptr, iterator_B1.valid()); + + ++iterator_B1; + } + ++this->smem_iterator_B1_; + } + } + } + + CUTLASS_DEVICE + static void _prologue( + IteratorB& iterator_B1, + int32_t gemm_k_iterations_1, + SmemIteratorB1& smem_iterator_B1_) { + // Issue several complete stages + CUTLASS_PRAGMA_UNROLL + for (int stage = 0; stage < kNumStagesConcurrentLoad; + ++stage, --gemm_k_iterations_1) { + iterator_B1.set_residual_tile(gemm_k_iterations_1 == 1); + iterator_B1.clear_mask(gemm_k_iterations_1 == 0); + + iterator_B1.set_iteration_index(0); + smem_iterator_B1_.set_iteration_index(0); + + // Load for operand B + CUTLASS_PRAGMA_UNROLL + for (int j = 0; j < Detail::TBLoadIterationsB1; ++j) { + typename IteratorB1::AccessType* dst_ptr = + reinterpret_cast( + smem_iterator_B1_.get()); + + CUTLASS_PRAGMA_UNROLL + for (int v = 0; v < IteratorB1::kAccessesPerVector; ++v) { + int const kSrcBytes = + sizeof_bits::value * + IteratorB1::ThreadMap::kElementsPerAccess / + IteratorB1::kAccessesPerVector / 8; + + cutlass::arch::cp_async_zfill( + dst_ptr + v, iterator_B1.get(), iterator_B1.valid()); + + ++iterator_B1; + } + + ++smem_iterator_B1_; + } + + // Move to the next stage + iterator_B1.add_tile_offset({1, 0}); + + smem_iterator_B1_.add_tile_offset({1, 0}); + + // Defines the boundary of a stage of cp.async. 
+ cutlass::arch::cp_async_fence(); + } + iterator_B1.set_residual_tile(gemm_k_iterations_1 == 1); + iterator_B1.clear_mask(gemm_k_iterations_1 == 0); + } + + /// Perform a threadblock-scoped matrix multiply-accumulate + CUTLASS_DEVICE + void operator()( + ///< problem size of GEMM + int gemm_k_iterations_1_, + ///< destination accumulator tile + FragmentC1& accum, + ///< iterator over B1 operand in global memory + IteratorB1 iterator_B1, + ///< initial value of accumulator + FragmentC1 const& src_accum) { + // 2nd Gemm + + // + // Prologue + // + // Perform accumulation in the 'd' output operand + accum = src_accum; + + if (!prologue_done_) { + _prologue(iterator_B1, gemm_k_iterations_1_, smem_iterator_B1_); + } else if (!kSmemContainsEntireB) { + // Restore the iterators increments + + int gemm_k_iterations_1 = gemm_k_iterations_1_; + // Issue several complete stages + CUTLASS_PRAGMA_UNROLL + for (int stage = 0; stage < kNumStagesConcurrentLoad; + ++stage, --gemm_k_iterations_1) { + iterator_B1.set_iteration_index(0); + this->smem_iterator_B1_.set_iteration_index(0); + + // Load for operand B + CUTLASS_PRAGMA_UNROLL + for (int j = 0; j < Detail::TBLoadIterationsB1; ++j) { + CUTLASS_PRAGMA_UNROLL + for (int v = 0; v < IteratorB1::kAccessesPerVector; ++v) { + ++iterator_B1; + } + ++this->smem_iterator_B1_; + } + iterator_B1.add_tile_offset({1, 0}); + this->smem_iterator_B1_.add_tile_offset({1, 0}); + } + iterator_B1.set_residual_tile(gemm_k_iterations_1 <= 1); + iterator_B1.clear_mask(gemm_k_iterations_1 <= 0); + } + + // DEPBAR+SYNC + cutlass::arch::cp_async_wait(); + __syncthreads(); + + // remember that WarpFragmentAScale and WarpIteratorAScale are no-op/empty + // if scaling is disabled. 
+ + // Pair of fragments used to overlap shared memory loads and math + // instructions + WarpLoadedFragmentA1 warp_loaded_frag_A1[2]; + WarpLoadedFragmentA1Scale warp_loaded_frag_A1_scale[2]; + WarpLoadedFragmentB1 warp_loaded_frag_B1[2]; + WarpTransformedFragmentA1 warp_transformed_frag_A1[2]; + WarpTransformedFragmentB1 warp_transformed_frag_B1[2]; + + Operator1 warp_mma1; + + warp_tile_iterator_A1_.load(warp_loaded_frag_A1[0]); + ++warp_tile_iterator_A1_; + + warp_tile_iterator_A1_scale_.load(warp_loaded_frag_A1_scale[0]); + ++warp_tile_iterator_A1_scale_; + + this->warp_tile_iterator_B_.set_kgroup_index(0); + this->warp_tile_iterator_B_.load(warp_loaded_frag_B1[0]); + ++this->warp_tile_iterator_B_; + + int smem_write_stage_idx = Base::kStages - 1; + int smem_read_stage_idx = 0; + + warp_mma1.transform( + warp_transformed_frag_A1[0], + warp_transformed_frag_B1[0], + FragmentAScaler::apply( + warp_loaded_frag_A1[0], warp_loaded_frag_A1_scale[0]), + warp_loaded_frag_B1[0]); + + // tf32x3 kernels use staging accumulation. warp_mma uses a temporary + // accumulator and this temporary accumulator is added to the final + // accumulator once in every mainloop iteration. 
+ plus plus_accum; + + FragmentC1 tmp_accum; + + if (platform::is_same< + typename Operator1::MathOperator, + arch::OpMultiplyAddFastF32>::value || + platform::is_same< + typename Operator1::MathOperator, + arch::OpMultiplyAddComplexFastF32>::value) { + tmp_accum.clear(); + } + + // + // Mainloop + // + + CUTLASS_PRAGMA_UNROLL + for (int gemm_k_iterations_1 = gemm_k_iterations_1_ - (Base::kStages - 1); + gemm_k_iterations_1 > (-Base::kStages + 1); + gemm_k_iterations_1--) { + // + // Loop over GEMM K dimension + // + + // Computes a warp-level GEMM on data held in shared memory + // Each "warp_mma_k" refers to a warp-level matrix multiply-accumulate + CUTLASS_PRAGMA_UNROLL + for (int warp_mma_k = 0; warp_mma_k < Base::kWarpGemmIterations1; + ++warp_mma_k) { + // Load warp-level tile from accumulator fragment (A) + // or shared memory (operand B) + this->warp_tile_iterator_B_.set_kgroup_index( + (warp_mma_k + 1) % Base::kWarpGemmIterations1); + // skip warp tile loading for the last kgroup (we are out of the buf) + if (gemm_k_iterations_1 > (-Base::kStages + 2) || + warp_mma_k < Base::kWarpGemmIterations1 - 1) { + warp_tile_iterator_A1_.load( + warp_loaded_frag_A1[(warp_mma_k + 1) % 2]); + warp_tile_iterator_A1_scale_.load( + warp_loaded_frag_A1_scale[(warp_mma_k + 1) % 2]); + this->warp_tile_iterator_B_.load( + warp_loaded_frag_B1[(warp_mma_k + 1) % 2]); + } + ++warp_tile_iterator_A1_; + ++warp_tile_iterator_A1_scale_; + ++this->warp_tile_iterator_B_; + + if (warp_mma_k > 0) + warp_mma1.transform( + warp_transformed_frag_A1[warp_mma_k % 2], + warp_transformed_frag_B1[warp_mma_k % 2], + FragmentAScaler::apply( + warp_loaded_frag_A1[warp_mma_k % 2], + warp_loaded_frag_A1_scale[warp_mma_k % 2]), + warp_loaded_frag_B1[warp_mma_k % 2]); + + if (platform::is_same< + typename Operator1::MathOperator, + arch::OpMultiplyAddFastF32>::value || + platform::is_same< + typename Operator1::MathOperator, + arch::OpMultiplyAddComplexFastF32>::value) { + warp_mma1( + tmp_accum, + 
warp_transformed_frag_A1[warp_mma_k % 2], + warp_transformed_frag_B1[warp_mma_k % 2], + tmp_accum); + + if (warp_mma_k == 0) { + accum = plus_accum(accum, tmp_accum); + tmp_accum.clear(); + } + } else { + warp_mma1( + accum, + warp_transformed_frag_A1[warp_mma_k % 2], + warp_transformed_frag_B1[warp_mma_k % 2], + accum); + } + + // Issue global->shared copies for the this stage + if (warp_mma_k < Base::kWarpGemmIterations1 - 1) { + int group_start_iteration_B1; + + group_start_iteration_B1 = warp_mma_k * Detail::kAccessesPerGroupB1; + + if (!kSmemContainsEntireB) { + copy_tiles_and_advance_1(iterator_B1, group_start_iteration_B1); + } + } + + if (warp_mma_k + 2 == Base::kWarpGemmIterations1) { + int group_start_iteration_B1; + group_start_iteration_B1 = + (warp_mma_k + 1) * Detail::kAccessesPerGroupB1; + + if (!kSmemContainsEntireB) { + copy_tiles_and_advance_1(iterator_B1, group_start_iteration_B1); + } + + // Inserts a memory fence between stages of cp.async instructions. + cutlass::arch::cp_async_fence(); + + // Waits until kStages-2 stages have committed. 
+ arch::cp_async_wait(); + __syncthreads(); + + // Move to the next stage + iterator_B1.add_tile_offset({1, 0}); + + this->smem_iterator_B1_.add_tile_offset({1, 0}); + + // Add negative offsets to return iterators to the 'start' of the + // circular buffer in shared memory + if (!kSmemContainsEntireB) { + if (smem_write_stage_idx == (Base::kStages - 1)) { + this->smem_iterator_B1_.add_tile_offset({-Base::kStages, 0}); + smem_write_stage_idx = 0; + } else { + ++smem_write_stage_idx; + } + + if (smem_read_stage_idx == (Base::kStages - 1)) { + this->warp_tile_iterator_B_.add_tile_offset( + {-Base::kStages * Policy1::kPartitionsK * + Base::kWarpGemmIterations1, + 0}); + smem_read_stage_idx = 0; + } else { + ++smem_read_stage_idx; + } + } + + iterator_B1.set_residual_tile(gemm_k_iterations_1 == 2); + iterator_B1.clear_mask(gemm_k_iterations_1 == 1); + } + + // Do any conversions feeding the first stage at the end of the loop so + // we can start right away on mma instructions + if (warp_mma_k + 1 == Base::kWarpGemmIterations1) + warp_mma1.transform( + warp_transformed_frag_A1[(warp_mma_k + 1) % 2], + warp_transformed_frag_B1[(warp_mma_k + 1) % 2], + FragmentAScaler::apply( + warp_loaded_frag_A1[(warp_mma_k + 1) % 2], + warp_loaded_frag_A1_scale[(warp_mma_k + 1) % 2]), + warp_loaded_frag_B1[(warp_mma_k + 1) % 2]); + } + } + + if (platform::is_same< + typename Operator1::MathOperator, + arch::OpMultiplyAddFastF32>::value || + platform::is_same< + typename Operator1::MathOperator, + arch::OpMultiplyAddComplexFastF32>::value) { + accum = plus_accum(accum, tmp_accum); + } + } +}; + +// Converts a "regular" Mma into their counterpart from shared memory +template < + typename Mma_, + int kMaxK, + typename WarpIteratorA_, + /// whether or not to apply elementwise multiplication of operand A by + /// another matrix in shared memory before usage in A @ B + bool kScaleOperandA, + bool kTransposeA = false> +struct DefaultMmaFromSharedMemory; + +// Mma pipelined +template < + /// 
Size of the Gemm problem - concept: gemm::GemmShape<> + typename Shape_, + /// Iterates over tiles of A operand in global memory + // (concept: ReadableTileIterator | ForwardTileIterator | + // MaskedTileIterator) + typename IteratorA_, + /// Iterates over tiles of A operand in shared memory + /// (concept: WriteableTileIterator | RandomAccessTileIterator) + typename SmemIteratorA_, + typename WarpIteratorA_, + /// Iterates over tiles of B operand in global memory + // (concept: ReadableTileIterator | ForwardTileIterator | + // MaskedTileIterator) + typename IteratorB_, + /// Iterates over tiles of B operand in shared memory + /// (concept: WriteableTileIterator | RandomAccessTileIterator) + typename SmemIteratorB_, + /// Data type of accumulator matrix + typename ElementC_, + /// Data type of accumulator matrix + typename LayoutC_, + /// Policy describing tuning details (concept: MmaPolicy) + typename Policy_, + /// Transformation applied to A operand + typename TransformA_, + /// Transformation applied to B operand + typename TransformB_, + // Max MMA problem size K + int kMaxK, + /// whether or not to apply elementwise multiplication of operand A by + /// another matrix in shared memory before usage in A @ B + bool kScaleOperandA, + bool kTransposeA> +struct DefaultMmaFromSharedMemory< + MmaPipelined< + Shape_, + IteratorA_, + SmemIteratorA_, + IteratorB_, + SmemIteratorB_, + ElementC_, + LayoutC_, + Policy_, + TransformA_, + TransformB_>, + kMaxK, + WarpIteratorA_, + kScaleOperandA, + kTransposeA> { + using RegularMma = MmaPipelined< + Shape_, + IteratorA_, + SmemIteratorA_, + IteratorB_, + SmemIteratorB_, + ElementC_, + LayoutC_, + Policy_, + TransformA_, + TransformB_>; + + using WarpShape = typename Policy_::Operator::Shape; + using InstructionShape = typename Policy_::Operator::InstructionShape; + using ArchMmaOperator = typename Policy_::Operator; + + static constexpr bool kIsTransposedA = false; + using WarpIteratorA = WarpIteratorA_; + using IteratorB = 
+ typename cutlass::transform::threadblock::MakeIteratorResidualLast< + IteratorB_>::Iterator; + + using Mma = typename cutlass::gemm::threadblock::MmaPipelinedFromSharedMemory< + Shape_, + WarpIteratorA, + kScaleOperandA, + kMaxK, + IteratorB, + SmemIteratorB_, + ElementC_, + LayoutC_, + Policy_>; +}; + +template < + /// Size of the Gemm problem - concept: gemm::GemmShape<> + typename Shape_, + /// Iterates over tiles of A operand in global memory + // (concept: ReadableTileIterator | ForwardTileIterator | + // MaskedTileIterator) + typename IteratorA_, + /// Iterates over tiles of A operand in shared memory + /// (concept: WriteableTileIterator | RandomAccessTileIterator) + typename SmemIteratorA_, + typename WarpIteratorA_, + /// Cache operation for operand A + cutlass::arch::CacheOperation::Kind CacheOpA, + /// Iterates over tiles of B operand in global memory + // (concept: ReadableTileIterator | ForwardTileIterator | + // MaskedTileIterator) + typename IteratorB_, + /// Iterates over tiles of B operand in shared memory + /// (concept: WriteableTileIterator | RandomAccessTileIterator) + typename SmemIteratorB_, + /// Cache operation for operand B + cutlass::arch::CacheOperation::Kind CacheOpB, + /// Data type of accumulator matrix + typename ElementC_, + /// Data type of accumulator matrix + typename LayoutC_, + /// Policy describing tuning details (concept: MmaPolicy) + typename Policy_, + /// Number of stages, + int Stages, + /// Use zfill or predicate for out-of-bound cp.async + SharedMemoryClearOption SharedMemoryClear, + int kMaxK, + /// whether or not to apply elementwise multiplication of operand A by + /// another matrix in shared memory before usage in A @ B + bool kScaleOperandA, + bool kTransposeA> +struct DefaultMmaFromSharedMemory< + MmaMultistage< + Shape_, + IteratorA_, + SmemIteratorA_, + CacheOpA, + IteratorB_, + SmemIteratorB_, + CacheOpB, + ElementC_, + LayoutC_, + Policy_, + Stages, + SharedMemoryClear>, + kMaxK, + WarpIteratorA_, + 
kScaleOperandA, + kTransposeA> { + using RegularMma = MmaMultistage< + Shape_, + IteratorA_, + SmemIteratorA_, + CacheOpA, + IteratorB_, + SmemIteratorB_, + CacheOpB, + ElementC_, + LayoutC_, + Policy_, + Stages, + SharedMemoryClear>; + + using WarpShape = typename Policy_::Operator::Shape; + using InstructionShape = typename Policy_::Operator::InstructionShape; + using WarpIteratorTranspose = TransposeWarpIterator; + static constexpr bool kIsTransposedA = + WarpIteratorTranspose::kSupportsTranspose && kTransposeA; + using WarpIteratorA = typename platform::conditional< + kIsTransposedA, + typename WarpIteratorTranspose::Iterator, + WarpIteratorA_>::type; + + // Reduce the number of stages if we don't need that many + static int constexpr kStagesMax = + (kMaxK + int(Shape_::kK) - 1) / int(Shape_::kK); + static int constexpr kStages = cutlass::const_min(Stages, kStagesMax); + + using IteratorB = + typename cutlass::transform::threadblock::MakeIteratorResidualLast< + IteratorB_>::Iterator; + using Mma = + typename cutlass::gemm::threadblock::MmaMultistageFromSharedMemory< + Shape_, + WarpIteratorA, + kScaleOperandA, + IteratorB, + SmemIteratorB_, + RegularMma::kCacheOpB, + ElementC_, + LayoutC_, + Policy_, + kStages, + kMaxK>; +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +template < + typename IteratorC, + typename Operator, + typename scalar_t, + typename WarpShape_, + typename ThreadblockShape_> +struct B2bGemm; + +// Tensor Cores >= Sm75 specialization (Ampere ...) 
+template < /// Size of the matrix to load (concept: MatrixShape) + typename Shape_, + /// Element type + typename Element_, + /// Layout of operand in memory + typename Layout_, + /// Shape of one matrix product operation (concept: MatrixShape) + typename InstructionShape_, + /// Interval between adjacent *MMA instructions (in units of MMA + /// instructions, concept: MatrixShape) + typename OpDelta_, + typename Operator, + typename scalar_t, + typename WarpShape_, + typename ThreadblockShape_> +struct B2bGemm< + cutlass::gemm::warp::MmaTensorOpAccumulatorTileIterator< + Shape_, + Element_, + Layout_, + InstructionShape_, + OpDelta_>, + Operator, + scalar_t, + WarpShape_, + ThreadblockShape_> { + using IteratorC = + typename cutlass::gemm::warp::MmaTensorOpAccumulatorTileIterator< + Shape_, + Element_, + Layout_, + InstructionShape_, + OpDelta_>; + using FragmentC = typename IteratorC::Fragment; + using InstructionShape = InstructionShape_; + using WarpShape = WarpShape_; + using ThreadblockShape = ThreadblockShape_; + using accum_t = Element_; + using lse_scalar_t = float; + + using SmemAccumulatorLayout = cutlass::layout::RowMajor; + + // Iterator to load accumulators (results of matmul in registers) + using FragmentIteratorAccumulator = + cutlass::epilogue::warp::FragmentIteratorTensorOp< + WarpShape, + InstructionShape, + accum_t, + typename Operator::Policy::Operator::FragmentC, + cutlass::layout::RowMajor>; + + // Iterator to store to shared-memory + using SmemIteratorD0 = typename cutlass::epilogue::warp::TileIteratorTensorOp< + WarpShape, + InstructionShape, + scalar_t, // accum_t, + SmemAccumulatorLayout>; + using AccumulatorSharedStorage = + cutlass::gemm::threadblock::AccumulatorSharedStorage< + ThreadblockShape, + typename SmemIteratorD0::Element, + typename SmemIteratorD0::TensorLayout, + typename SmemIteratorD0::Padding>; + // We need to provide an operation for the epilogue. 
Let's create an + // operation that does nothing (ScaleType::Nothing), just converts + // from accum_t (float) -> scalar_t (can be half) + using OutputOpNoOp = cutlass::epilogue::thread::LinearCombination< + typename SmemIteratorD0::Element, // ElementOutput + FragmentIteratorAccumulator::Fragment::kElements, + accum_t, // ElementAccumulator + typename SmemIteratorD0::Element, // ElementCompute + cutlass::epilogue::thread::ScaleType::Nothing>; + using Epilogue = cutlass::epilogue::threadblock::EpilogueSmemAccumulator< + SmemIteratorD0, + FragmentIteratorAccumulator, + SmemIteratorD0, // ScaleBiasIterator - not used + OutputOpNoOp>; + + // Epilogue 2: with LSE (for backwards pass) + static int const kElementsPerAccess = 2; // TODO: Why 2? + using IteratorAccumulatorLSE = + cutlass::transform::threadblock::VectorIterator< + cutlass::transform::threadblock::PredicatedVectorAccessIterator< + // Shape + cutlass::MatrixShape, + // WarpShape + cutlass::MatrixShape, + lse_scalar_t, + cutlass::layout::RowMajor, + kElementsPerAccess>>; + using EpilogueOpApplyLSE = cutlass::epilogue::thread::ApplyLogSumExp< + scalar_t, // ElementOutput_ + lse_scalar_t, // ElementLSE_ + accum_t, // ElementAccumulator_ + accum_t, // ElementCompute_ + 128 / cutlass::sizeof_bits::value + // FragmentIteratorAccumulator::Fragment::kElements + // InstructionShape::kM * InstructionShape::kN / 32 + >; + using EpilogueWithLSE = + cutlass::epilogue::threadblock::EpilogueSmemAccumulator< + SmemIteratorD0, + FragmentIteratorAccumulator, + IteratorAccumulatorLSE, + EpilogueOpApplyLSE>; + + static void CUTLASS_DEVICE accumToSmem( + AccumulatorSharedStorage& shared_storage, + FragmentC const& accum, + int lane_id, + cutlass::MatrixCoord const& tile_coords) { + SmemIteratorD0 smem_iterator_attn(shared_storage.accum_ref(), lane_id); + smem_iterator_attn.add_tile_offset( + tile_coords * + cutlass::MatrixCoord{ + SmemIteratorD0::TileIterations::kRow, + SmemIteratorD0::TileIterations::kColumn}); + Epilogue 
epilogue; + epilogue(OutputOpNoOp({}), smem_iterator_attn, accum); + } + + static void CUTLASS_DEVICE accumApplyLSEToSmem( + AccumulatorSharedStorage& shared_storage, + FragmentC& accum, + lse_scalar_t const* lse, + int32_t lse_extents, + int thread_id, + int warp_id, + int lane_id, + cutlass::MatrixCoord const& tile_coords) { + constexpr int32_t kAlignLSE = 32; + IteratorAccumulatorLSE iterator_lse( + lse, + {(int32_t)0, (int32_t)ceil_div(lse_extents, kAlignLSE) * kAlignLSE}, + thread_id, + warp_id, + cutlass::MatrixCoord{0, 0} // offset + ); + + SmemIteratorD0 smem_iterator_attn(shared_storage.accum_ref(), lane_id); + smem_iterator_attn.add_tile_offset( + tile_coords * + cutlass::MatrixCoord{ + SmemIteratorD0::TileIterations::kRow, + SmemIteratorD0::TileIterations::kColumn}); + EpilogueWithLSE epilogue; + EpilogueOpApplyLSE minus_lse_exp({}); + epilogue( + minus_lse_exp, + smem_iterator_attn, + accum, + // scale - unused + iterator_lse, + // bias + iterator_lse); + } +}; + +// Volta Specialization +// only supported for f16 +template +struct B2bGemm< + cutlass::gemm::warp::MmaVoltaTensorOpAccumulatorTileIterator< + cutlass::MatrixShape<32, 32>, + float, + cutlass::layout::RowMajor, + cutlass::gemm::GemmShape<16, 16, 4>, + cutlass::MatrixShape<1, 1>>, + Operator, + cutlass::half_t, + WarpShape_, + ThreadblockShape_> { + using IteratorC = + cutlass::gemm::warp::MmaVoltaTensorOpAccumulatorTileIterator< + cutlass::MatrixShape<32, 32>, + float, + cutlass::layout::RowMajor, + cutlass::gemm::GemmShape<16, 16, 4>, + cutlass::MatrixShape<1, 1>>; + using scalar_t = cutlass::half_t; + using accum_t = IteratorC::Element; + using WarpShape = WarpShape_; + using ThreadblockShape = ThreadblockShape_; + using FragmentC = IteratorC::Fragment; + using lse_scalar_t = float; + + // Storage in shared-memory for Q.Kt + using SmemAccumulatorLayout = + cutlass::layout::RowMajorVoltaTensorOpMultiplicandCrosswise<16, 32>; + using AccumulatorSharedStorage = + 
cutlass::gemm::threadblock::AccumulatorSharedStorage< + ThreadblockShape, + scalar_t, + SmemAccumulatorLayout, + cutlass::MatrixShape<0, 0> // Padding + >; + using TensorRef = cutlass::TensorRef; + using Policy = typename IteratorC::Policy; + using Element = accum_t; + // Those are MmaVoltaTensorOpAccumulatorTileIterator private fields + // Let's copy their values + static int const kElementsPerPartial = 4; + using EleShapePerPatial = typename cutlass::platform::conditional< + cutlass::platform::is_same::value, + cutlass::MatrixShape<2, 2>, + cutlass::MatrixShape<1, 4>>::type; + static int const kElementsPerMma = 8; + static int const kAccumulatorPatials = 2; + using QuadShapePerPatialMma = cutlass::MatrixShape<4, 4>; + + static void CUTLASS_DEVICE accumToSmem( + AccumulatorSharedStorage& shared_storage, + FragmentC const& accum, + int lane_id, + cutlass::MatrixCoord const& tile_coords) { + // ctor - from MmaVoltaTensorOpAccumulatorTileIterator + TensorRef ref_(shared_storage.accum_ref()); + int quad = (lane_id >> 2); + int lane_in_quad = (lane_id & 3); + int accum_m, accum_n; + + if (cutlass::platform::is_same::value) { + // (quad[2],quad[0])+lane_in_quad[0] + accum_m = (((quad & 0x4) >> 1) + (quad & 0x1)) * 8 + (lane_in_quad & 1); + // (quad[1])+lane_in_quad[1] + accum_n = + ((quad >> 1) & 0x1) * kElementsPerPartial * kAccumulatorPatials + + (lane_in_quad & 2); + } else { + accum_m = (((quad & 0x4) >> 1) + (quad & 0x1)) * 8 + + lane_in_quad; // (quad[2],quad[0]) + accum_n = ((quad >> 1) & 0x1) * kElementsPerPartial * kAccumulatorPatials; + } + cutlass::MatrixCoord lane_offset(accum_m, accum_n); + + // Tile offset + ref_.add_coord_offset( + tile_coords * + cutlass::MatrixCoord( + {IteratorC::Shape::kRow, IteratorC::Shape::kColumn})); + + using AccessType = cutlass::Array; + + // store - from MmaVoltaTensorOpAccumulatorTileIterator + CUTLASS_PRAGMA_UNROLL + for (int tile_n = 0; tile_n < Policy::TileIterations::kColumn; ++tile_n) { + CUTLASS_PRAGMA_UNROLL + for (int 
tile_m = 0; tile_m < Policy::TileIterations::kRow; ++tile_m) { + CUTLASS_PRAGMA_UNROLL + for (int mma_n = 0; mma_n < Policy::MmaIterations::kColumn; ++mma_n) { + CUTLASS_PRAGMA_UNROLL + for (int mma_m = 0; mma_m < Policy::MmaIterations::kRow; ++mma_m) { + int mma_accum_start = + (((tile_n * Policy::TileIterations::kRow + tile_m) * + Policy::MmaIterations::kColumn + + mma_n) * + Policy::MmaIterations::kRow + + mma_m) * + kElementsPerMma; + + CUTLASS_PRAGMA_UNROLL + for (int p = 0; p < kAccumulatorPatials; ++p) { + CUTLASS_PRAGMA_UNROLL + for (int m = 0; m < EleShapePerPatial::kRow; ++m) { + int accum_m = tile_m * Policy::InterleavedTile::kRow + + mma_m * QuadShapePerPatialMma::kRow + m * 2; + int accum_n = tile_n * Policy::InterleavedTile::kColumn + + mma_n * QuadShapePerPatialMma::kColumn + + p * Policy::InterleavedTile::kColumn / 2; + int r = (accum_m + lane_offset.row()); + AccessType to_store; + CUTLASS_PRAGMA_UNROLL + for (int n = 0; n < EleShapePerPatial::kColumn; ++n) { + int idx = mma_accum_start + p * kElementsPerPartial + + m * EleShapePerPatial::kColumn + n; + int c = (accum_n + n + lane_offset.column()); + to_store[n] = scalar_t(accum[idx]); + } + int c = (accum_n + lane_offset.column()); + assert(r < 32); + assert(c < 32); + *reinterpret_cast( + ref_.data() + ref_.offset({r, c})) = to_store; + } + } + } + } + } + } + } + + static void CUTLASS_DEVICE accumApplyLSEToSmem( + AccumulatorSharedStorage& shared_storage, + typename IteratorC::Fragment& accum, + lse_scalar_t const* lse, + int lse_extent, + int thread_id, + int warp_id, + int lane_id, + cutlass::MatrixCoord const& tile_coords) { + // Non-optimized way to apply LSE to registers + // NOTE: accum is attn.T + // TODO: Optimize for each architecture + static constexpr int WarpSize = 32; + using AccumLambdaIterator = + typename DefaultMmaAccumLambdaIterator:: + Iterator; + auto lane_offset = + AccumLambdaIterator::get_lane_offset(lane_id, warp_id, tile_coords); + + cutlass::Array lse_prefetched; + 
lse_prefetched.clear(); + int rowIdx = 0; + int colIdx = 0; + AccumLambdaIterator::iterateRows( + lane_offset, + [&](int accum_m) { + ++rowIdx; + colIdx = 0; + }, + [&](int accum_m, int accum_n, int idx) { + if (rowIdx == 1) { + lse_prefetched[colIdx] = accum_n < lse_extent + ? lse[accum_n] + : platform::numeric_limits::infinity(); + } + accum[idx] = expf(accum[idx] - lse_prefetched[colIdx]); + ++colIdx; + }, + [&](int accum_m) {}); + accumToSmem(shared_storage, accum, lane_id, tile_coords); + } +}; + +// Simt Specialization +// for f32 on Sm70-Sm75 and f16/f32 below + +template < + typename Operator, + typename OperatorPolicy, + typename scalar_t, + typename WarpShape_, + typename ThreadblockShape_> +struct B2bGemm< + cutlass::gemm::warp::MmaSimtTileIterator< + cutlass::MatrixShape<32, 32>, + cutlass::gemm::Operand::kC, + float, + cutlass::layout::RowMajor, + OperatorPolicy, + 1, + 1>, + Operator, + scalar_t, + WarpShape_, + ThreadblockShape_> { + using IteratorC = cutlass::gemm::warp::MmaSimtTileIterator< + cutlass::MatrixShape<32, 32>, + cutlass::gemm::Operand::kC, + float, + cutlass::layout::RowMajor, + OperatorPolicy, + 1, + 1>; + using accum_t = typename IteratorC::Element; + using WarpShape = WarpShape_; + using ThreadblockShape = ThreadblockShape_; + using FragmentC = typename IteratorC::Fragment; + using lse_scalar_t = float; + + // Storage in shared-memory for Q.Kt + using AccumulatorSharedStorage = + cutlass::gemm::threadblock::AccumulatorSharedStorage< + ThreadblockShape, + scalar_t, + cutlass::layout::ColumnMajor, + cutlass::MatrixShape<0, 0> // Padding + >; + + static void CUTLASS_DEVICE accumToSmem( + AccumulatorSharedStorage& shared_storage, + FragmentC const& accum, + int lane_id, + cutlass::MatrixCoord const& tile_coords) { + using Policy = typename IteratorC::Policy; + using Element = typename IteratorC::Element; + using Iterations = typename IteratorC::Iterations; + using Delta = typename IteratorC::Delta; + + auto ref_ = 
shared_storage.accum_ref(); + // ctor - MmaSimtTileIterator + // compute offset based on thread ID and lane layout + typename Policy::LaneLayout lane_layout = Policy::get_lane_layout(); + + MatrixCoord lane_offset = lane_layout.inverse(lane_id) * + MatrixCoord(Policy::LaneMmaShape::kM, Policy::LaneMmaShape::kN); + + ref_.add_coord_offset(lane_offset); + + // Tile offset + ref_.add_coord_offset( + tile_coords * + cutlass::MatrixCoord( + {IteratorC::Shape::kRow, IteratorC::Shape::kColumn})); + + // store - MmaSimtTileIterator + CUTLASS_PRAGMA_UNROLL + for (int mma_n = 0; mma_n < Iterations::kColumn; ++mma_n) { + CUTLASS_PRAGMA_UNROLL + for (int n = 0; n < Policy::LaneMmaShape::kN; ++n) { + CUTLASS_PRAGMA_UNROLL + for (int mma_m = 0; mma_m < Iterations::kRow; ++mma_m) { + CUTLASS_PRAGMA_UNROLL + for (int m = 0; m < Policy::LaneMmaShape::kM; ++m) { + int r = + Policy::LaneMmaShape::kM * (mma_m * Policy::WarpShape::kRow) + + m; + int c = mma_n * Delta::kColumn + n; + int idx = n + + Policy::LaneMmaShape::kN * + (mma_n + + Iterations::kColumn * + (m + mma_m * Policy::LaneMmaShape::kM)); + ref_.at({r, c}) = scalar_t(accum[idx]); + } + } + } + } + } + + static void CUTLASS_DEVICE accumApplyLSEToSmem( + AccumulatorSharedStorage& shared_storage, + typename IteratorC::Fragment& accum, + lse_scalar_t const* lse, + int lse_extent, + int thread_id, + int warp_id, + int lane_id, + cutlass::MatrixCoord const& tile_coords) { + // Non-optimized way to apply LSE to registers + // NOTE: accum is attn.T + // TODO: Optimize for each architecture + static constexpr int WarpSize = 32; + using AccumLambdaIterator = + typename DefaultMmaAccumLambdaIterator:: + Iterator; + auto lane_offset = + AccumLambdaIterator::get_lane_offset(lane_id, warp_id, tile_coords); + + cutlass::Array lse_prefetched; + lse_prefetched.clear(); + int rowIdx = 0; + int colIdx = 0; + AccumLambdaIterator::iterateRows( + lane_offset, + [&](int accum_m) { + ++rowIdx; + colIdx = 0; + }, + [&](int accum_m, int accum_n, 
int idx) { + if (rowIdx == 1) { + lse_prefetched[colIdx] = accum_n < lse_extent + ? lse[accum_n] + : platform::numeric_limits::infinity(); + } + accum[idx] = expf(accum[idx] - lse_prefetched[colIdx]); + ++colIdx; + }, + [&](int accum_m) {}); + accumToSmem(shared_storage, accum, lane_id, tile_coords); + } +}; + +} // namespace threadblock +} // namespace gemm +} // namespace cutlass + +///////////////////////////////////////////////////////////////////////////////////////////////// + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/transformers/cuda/mem_eff_attention/gemm_kernel_utils.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/transformers/cuda/mem_eff_attention/gemm_kernel_utils.h new file mode 100644 index 0000000000000000000000000000000000000000..7d5068ca56480f6e99198b0003aa7d87af68eaab --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/transformers/cuda/mem_eff_attention/gemm_kernel_utils.h @@ -0,0 +1,214 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ +#pragma once + +#include + +//////////////////////////////////////////////////////////////////////////////// +// Some helper functions +//////////////////////////////////////////////////////////////////////////////// +#define DISPATCH_TYPES(tensor, func) \ + { \ + if (query.scalar_type() == at::ScalarType::Float) { \ + using scalar_t = float; \ + func(); \ + } else if (query.scalar_type() == at::ScalarType::Half) { \ + using scalar_t = cutlass::half_t; \ + func(); \ + } else if (query.scalar_type() == at::ScalarType::BFloat16) { \ + using scalar_t = cutlass::bfloat16_t; \ + func(); \ + } else { \ + TORCH_CHECK(false, "Only fp32, half & bf16 supported at the moment"); \ + } \ + } + +#define DISPATCH_BOOL(BOOL_V, BOOL_NAME, F) \ + { \ + if (BOOL_V) { \ + constexpr bool BOOL_NAME = true; \ + F(); \ + } else { \ + constexpr bool BOOL_NAME = false; \ + F(); \ + } \ + } +#define DISPATCH_ARCHTAG(CC, func) \ + { \ + if (CC >= 80) { \ + using ArchTag = cutlass::arch::Sm80; \ + func(); \ + } else if (CC >= 75) { \ + using ArchTag = cutlass::arch::Sm75; \ + func(); \ + } else if (CC >= 70) { \ + using ArchTag = cutlass::arch::Sm70; \ + func(); \ + } else if (CC >= 50) { \ + using ArchTag = cutlass::arch::Sm50; \ + func(); \ + } else { \ + TORCH_CHECK( \ + false, \ + "Your device is too old. 
We require compute capability >= 50"); \ + } \ + } + +#define CHECK_NOSPARSE_CONTIGUOUS_CUDA(TENSOR) \ + TORCH_CHECK(TENSOR.is_cuda(), #TENSOR " must be a CUDA tensor"); \ + TORCH_CHECK(!TENSOR.is_sparse(), #TENSOR " must be a dense tensor"); \ + TORCH_CHECK(TENSOR.is_contiguous()); + +#define CHECK_NOSPARSE_LASTCONTIGUOUS_CUDA(TENSOR) \ + TORCH_CHECK(TENSOR.is_cuda(), #TENSOR " must be a CUDA tensor"); \ + TORCH_CHECK(!TENSOR.is_sparse(), #TENSOR " must be a dense tensor"); \ + TORCH_CHECK( \ + TENSOR.stride(-1) == 1, #TENSOR ": last dimension must be contiguous"); + +#define CHECK_ALIGNED_PTR(PTR, ALIGNMENT) \ + TORCH_CHECK( \ + uint64_t(PTR) % ALIGNMENT == 0, #PTR " is not correctly aligned") + +#define ASSIGN_CHECK_OVERFLOW(A, B) \ + { \ + A = B; \ + TORCH_CHECK( \ + B < std::numeric_limits::max(), #B " overflows"); \ + } + +namespace gemm_kernel_utils { + +template +constexpr CUTLASS_HOST_DEVICE integer ceil_div(integer n, integer m) { + return (n + m - 1) / m; +} + +template +constexpr CUTLASS_HOST_DEVICE integer align_up(integer n, integer m) { + return ((n + m - 1) / m) * m; +} + +//////////////////////////////////////////////////////////////////////////////// +// Determine the type of GEMM we do (TensorCores or not, Shapes ...) 
+// TODO: Maybe we could rely on Cutlass's DefaultGemm templates +//////////////////////////////////////////////////////////////////////////////// + +// Fallback to Simt (FMA on cuda cores) if not in a special case below +template +struct DefaultGemmType { + static constexpr int ThreadK = 8; + static constexpr int WarpK = 8; + static constexpr int kMinimumAlignment = 1; + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + using OpClass = cutlass::arch::OpClassSimt; + using Operator = cutlass::arch::OpMultiplyAdd; +}; + +// Specialization for tensorcores with f32 +template +struct DefaultGemmType< + ArchTag, + float, + typename cutlass::platform::enable_if< + ArchTag::kMinComputeCapability >= 80>::type> { + static constexpr int ThreadK = 32; + static constexpr int WarpK = 32; + static constexpr int kMinimumAlignment = 4; + using OpClass = cutlass::arch::OpClassTensorOp; + using InstructionShape = cutlass::gemm::GemmShape<16, 8, 8>; + using Operator = cutlass::arch::OpMultiplyAddFastF32; +}; + +// Specialization for tensorcores with f16/bf16 - Sm75+ +template +struct DefaultGemmType< + ArchTag, + scalar_t, + typename cutlass::platform::enable_if< + ArchTag::kMinComputeCapability >= 75 && + cutlass::sizeof_bits::value == 16>::type> { + static constexpr int ThreadK = 32; + static constexpr int WarpK = 32; + static constexpr int kMinimumAlignment = 4; + using OpClass = cutlass::arch::OpClassTensorOp; + using InstructionShape = cutlass::gemm::GemmShape<16, 8, 8>; + using Operator = cutlass::arch::OpMultiplyAdd; +}; + +// Specialization for tensorcores with f16 - Volta +template <> +struct DefaultGemmType { + static constexpr int ThreadK = 32; + static constexpr int WarpK = 32; + static constexpr int kMinimumAlignment = 2; + using OpClass = cutlass::arch::OpClassTensorOp; + using InstructionShape = cutlass::gemm::GemmShape<8, 8, 4>; + using Operator = cutlass::arch::OpMultiplyAdd; +}; + +// Enables to do +// `auto x = kCondition ? 
fa(arg) : fb(arg)` +// when `fa` and `fb` have different types +template +struct call_conditional; + +template +struct call_conditional { + template + static CUTLASS_HOST_DEVICE auto apply(TA ta, TB tb, Arg arg) + -> decltype(ta(arg)) { + return ta(arg); + } +}; + +template +struct call_conditional { + template + static CUTLASS_HOST_DEVICE auto apply(TA ta, TB tb, Arg arg) + -> decltype(tb(arg)) { + return tb(arg); + } +}; + +//////////////////////////////////////////////////////////////////////////////// +// Mark a variable as warp-uniform - enables some compiler optimizations +// The cheapest way to do it is just to broadcast it from lane 0 +//////////////////////////////////////////////////////////////////////////////// + +template +CUTLASS_DEVICE T warp_uniform(T value) { + struct { + union { + T value; + uint32_t asInt; + }; + } p; + p.value = value; + p.asInt = __shfl_sync(0xffffffff, (unsigned)p.asInt, 0); + return p.value; +} + +template +CUTLASS_DEVICE T* warp_uniform(T* ptr) { + struct { + union { + T* ptr; + uint32_t asInt[2]; + }; + } p; + p.ptr = ptr; + p.asInt[0] = warp_uniform(p.asInt[0]); + p.asInt[1] = warp_uniform(p.asInt[1]); + return p.ptr; +} +} // namespace gemm_kernel_utils + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/transformers/cuda/mem_eff_attention/iterators/default_warp_iterator_from_smem.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/transformers/cuda/mem_eff_attention/iterators/default_warp_iterator_from_smem.h new file mode 100644 index 0000000000000000000000000000000000000000..e0892db259b2f8aadec4ee4bb5615b6224ff5907 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/transformers/cuda/mem_eff_attention/iterators/default_warp_iterator_from_smem.h @@ -0,0 +1,148 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +/*************************************************************************************************** + * Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights + *reserved. SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + *this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + *ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + *LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + *CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + *SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + *INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + *CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + *ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + *POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Instantiates the right WarpIterator to read from shared memory + The class `DefaultWarpIteratorAFromSharedMemory` is useful when reading + data dumped with `B2bGemm::accumToSmem`. 
+*/ + +#pragma once + +#include +#include +#include + +#include + +namespace cutlass { +namespace gemm { +namespace threadblock { + +template < + typename WarpShape, + typename InstructionShape, + typename RegularWarpIterator, + typename Policy, + typename Enable = void> +struct DefaultWarpIteratorAFromSharedMemory {}; + +// TensorOp - Ampere half +template +struct DefaultWarpIteratorAFromSharedMemory< + cutlass::gemm::GemmShape<32, 32, 32>, + cutlass::gemm::GemmShape<16, 8, kInstrK>, + RegularWarpIterator, + Policy, + typename platform::enable_if<( + sizeof_bits::value == 16 && + Policy::Operator::Policy::OpDelta::kRow == 1)>::type> { + using OpDelta = typename Policy::Operator::Policy::OpDelta; + using WarpShape = cutlass::MatrixShape<32, 32>; + using InstructionShape = cutlass::gemm::GemmShape<16, 8, kInstrK>; + + using WarpIterator = cutlass::gemm::warp::WarpIteratorFromSmem< + cutlass::gemm::Operand::kA, + typename RegularWarpIterator::Element, + cutlass::MatrixShape>; +}; + +// TensorOp - Ampere f32 +template +struct DefaultWarpIteratorAFromSharedMemory< + WarpShape, + cutlass::gemm::GemmShape<16, 8, 8>, + RegularWarpIterator, + Policy, + typename platform::enable_if<( + sizeof_bits::value != 16 || + Policy::Operator::Policy::OpDelta::kRow != 1)>::type> { + using InstructionShape = cutlass::gemm::GemmShape<16, 8, 8>; + static constexpr auto kWarpSize = 32; + using OpDelta = typename Policy::Operator::Policy::OpDelta; + + using WarpIterator = + cutlass::gemm::warp::MmaTensorOpMultiplicandTileAccessIterator< + cutlass::MatrixShape, + cutlass::gemm::Operand::kA, + typename RegularWarpIterator::Element, + cutlass::layout::RowMajor, + cutlass::MatrixShape, + OpDelta::kRow, + kWarpSize>; +}; + +// TensorOp - Volta +template +struct DefaultWarpIteratorAFromSharedMemory< + WarpShape, + cutlass::gemm::GemmShape<16, 16, 4>, + RegularWarpIterator, + Policy> { + using InstructionShape = cutlass::gemm::GemmShape<16, 16, 4>; + static constexpr auto kWarpSize = 32; + using 
OpDelta = typename Policy::Operator::Policy::OpDelta; + + using WarpIterator = + cutlass::gemm::warp::MmaVoltaTensorOpMultiplicandTileIterator< + cutlass::MatrixShape<32, 32>, // MatrixShape, + cutlass::gemm::Operand::kA, + typename RegularWarpIterator::Element, + cutlass::layout::RowMajorVoltaTensorOpMultiplicandCrosswise<16, 32>, + cutlass::MatrixShape<16, 4>, + OpDelta::kRow, + kWarpSize>; +}; + +// Simt +template +struct DefaultWarpIteratorAFromSharedMemory< + WarpShape, + cutlass::gemm::GemmShape<1, 1, 1>, + RegularWarpIterator, + Policy> { + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; + static constexpr auto kWarpSize = 32; + + // We just use the same iterator, as we reproduced the same shared-memory + // schema. Just modify it to handle non-complete tiles. + using WarpIterator = RegularWarpIterator; +}; + +} // namespace threadblock +} // namespace gemm +} // namespace cutlass + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/transformers/cuda/mem_eff_attention/iterators/epilogue_predicated_tile_iterator.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/transformers/cuda/mem_eff_attention/iterators/epilogue_predicated_tile_iterator.h new file mode 100644 index 0000000000000000000000000000000000000000..ff985595ca81354f02bf3c9681ebf99887f3e26d --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/transformers/cuda/mem_eff_attention/iterators/epilogue_predicated_tile_iterator.h @@ -0,0 +1,757 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +/*************************************************************************************************** + * Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. 
All rights + *reserved. SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + *this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + *ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + *LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + *CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + *SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + *INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + *CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + *ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + *POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! 
\file + \brief Epilogue iterator that supports prefetching + + Mostly copied from +*/ + +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +//////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { + +//////////////////////////////////////////////////////////////////////////////// + +namespace epilogue { +namespace threadblock { + +//////////////////////////////////////////////////////////////////////////////// + +/// Tile iterator used to load and store output tile from global memory in +/// epilogue. +/// +/// Satisfies: ReadableTileIterator | PredicatedTileIterator | +/// ForwardTileIterator +/// +template < + typename ThreadMap_, ///< Thread map (concept: OutputTileThreadMap) + typename Element_, ///< Element data type + bool ScatterD = false, ///< Scatter D operand or not + bool UseCUDAStore = false> +class PredicatedTileIteratorPrefetch { + public: + using ThreadMap = ThreadMap_; + using Shape = typename ThreadMap::Shape; + + using Element = Element_; + + using Layout = layout::RowMajor; + using TensorRef = TensorRef; + using ConstTensorRef = typename TensorRef::ConstTensorRef; + + using Index = typename Layout::Index; + using LongIndex = typename Layout::LongIndex; + using TensorCoord = MatrixCoord; + + static int const kElementsPerAccess = ThreadMap::kElementsPerAccess; + static int const kThreads = ThreadMap::kThreads; + static int const kIterations = ThreadMap::Count::kTile; + + static_assert( + ThreadMap::Iterations::kRow > 0, + "ThreadMap::Iterations::kRow must be > 0"); + static_assert( + ThreadMap::Iterations::kGroup > 0, + "ThreadMap::Iterations::kGroup must be > 0"); + static_assert( + ThreadMap::Iterations::kCluster > 0, + "ThreadMap::Iterations::kCluster must be > 0"); + static_assert( + ThreadMap::Iterations::kColumn > 0, + "ThreadMap::Iterations::kColumn must be > 0"); + + /// Fragment object + using Fragment = 
Array< + Element, + ThreadMap::Iterations::kColumn * ThreadMap::Iterations::kRow * + ThreadMap::Iterations::kGroup * ThreadMap::Iterations::kCluster * + ThreadMap::kElementsPerAccess>; + + /// Memory access size + using AccessType = AlignedArray; + + // + // Parameters struct + // + + /// Uses a non-template class + struct Params : PredicatedTileIteratorParams { + using Base = PredicatedTileIteratorParams; + + CUTLASS_HOST_DEVICE + Params() {} + + CUTLASS_HOST_DEVICE + Params(Layout const& layout) + : PredicatedTileIteratorParams( + layout.stride(0) * int(sizeof(AccessType)) / kElementsPerAccess, + make_OutputTileThreadMapDesc()) {} + + CUTLASS_HOST_DEVICE + Params(Base const& base) : Base(base) {} + }; + + /// Mask object + struct Mask { + static int const kCount = ThreadMap::Iterations::kColumn; + + /// Predicate state + bool predicates[kCount]; + + // + // Mask + // + CUTLASS_HOST_DEVICE + Mask() { + enable(); + } + + ///< Efficiently disables all accesses guarded by mask + CUTLASS_HOST_DEVICE void clear() { + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < kCount; ++i) { + predicates[i] = false; + } + } + + ///< Efficiently enables all accesses guarded by mask + // NOTE(review): enable() is CUTLASS_DEVICE, yet the CUTLASS_HOST_DEVICE Mask() + // constructor above calls it -- confirm the narrower qualifier is intended. + CUTLASS_DEVICE void enable() { + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < kCount; ++i) { + predicates[i] = true; + } + } + }; + + private: + // + // Data members + // + + /// Parameters structure containing reference and precomputed state.
+ PredicatedTileIteratorParams params_; + + /// Byte-level pointer + uint8_t* byte_pointer_; + + /// Array of boolean values to contain steady-state predicates + Mask mask_; + + /// Extent of the matrix tile in rows + Index extent_row_; + + /// Extent of the matrix tile in columns + Index extent_column_; + + /// A thread's starting row position (assuming steady-state predicates have + /// been computed) + Index thread_start_row_; + + /// A thread's starting column + Index thread_start_column_; + + /// Internal state counter + int state_[3]; + + /// Scatter indices + int const* indices_; + + // + // Static asserts about internal strides + // + + static_assert(sizeof(extent_row_) == 4, "Expected 32b extents"); + static_assert(sizeof(thread_start_row_) == 4, "Expected 32b extents"); + static_assert( + sizeof(PredicatedTileIteratorParams::stride) == 8, + "Expected 64b strides"); + + private: + // + // Methods + // + + public: + // + // Methods + // + + /// Constructor + CUTLASS_DEVICE + PredicatedTileIteratorPrefetch( + PredicatedTileIteratorParams const& params, + Element* pointer, + TensorCoord extent, + int thread_idx, + TensorCoord threadblock_offset = TensorCoord(), + int const* indices = nullptr) + : params_(params), indices_(indices) { + TensorCoord thread_offset = + ThreadMap::initial_offset(thread_idx) + threadblock_offset; + + extent_row_ = extent.row(); + extent_column_ = extent.column(); + + thread_start_row_ = thread_offset.row(); + thread_start_column_ = thread_offset.column(); + + // Initialize predicates + CUTLASS_PRAGMA_UNROLL + for (int c = 0; c < ThreadMap::Iterations::kColumn; ++c) { + mask_.predicates[c] = + ((thread_offset.column() + ThreadMap::Delta::kColumn * c) < + extent.column()); + } + + // Null pointer performs no accesses + if (!pointer) { + mask_.clear(); + } + + if (ScatterD && !indices) { + mask_.clear(); + } + + // Initialize pointer + byte_pointer_ = reinterpret_cast(pointer) + + LongIndex(thread_offset.row()) * LongIndex(params_.stride)
+ + LongIndex(thread_offset.column()) * sizeof(AccessType) / + kElementsPerAccess; + + if (ScatterD) { + byte_pointer_ = reinterpret_cast(pointer) + + LongIndex(thread_offset.column()) * sizeof(AccessType) / + kElementsPerAccess; + } + + // Initialize internal state counter + state_[0] = state_[1] = state_[2] = 0; + } + + /// Adds a pointer offset in units of Element + CUTLASS_HOST_DEVICE + void add_pointer_offset(LongIndex pointer_offset) { + byte_pointer_ += pointer_offset * sizeof_bits::value / 8; + } + + CUTLASS_DEVICE + void prefetch_all() { + CUTLASS_PRAGMA_UNROLL + for (int iter = 0; iter < kIterations; ++iter) { + prefetch(); + ++(*this); + } + } + + CUTLASS_DEVICE + void prefetch() { + uint8_t* byte_pointer = byte_pointer_; + + CUTLASS_PRAGMA_UNROLL + for (int cluster = 0; cluster < ThreadMap::Iterations::kCluster; + ++cluster) { + CUTLASS_PRAGMA_UNROLL + for (int group = 0; group < ThreadMap::Iterations::kGroup; ++group) { + CUTLASS_PRAGMA_UNROLL + for (int row = 0; row < ThreadMap::Iterations::kRow; ++row) { + int row_offset = row * ThreadMap::Delta::kRow + + group * ThreadMap::Delta::kGroup + + cluster * ThreadMap::Delta::kCluster; + + AccessType* memory_pointer = + reinterpret_cast(byte_pointer); + + CUTLASS_PRAGMA_UNROLL + for (int column = 0; column < ThreadMap::Iterations::kColumn; + ++column) { + // on windows using unsigned long here gives the error + // error: asm operand type size(4) does not match + // type/size implied by constraint 'l' + uint64_t addr = (uint64_t)((void*)&memory_pointer + [column * ThreadMap::Delta::kColumn / + kElementsPerAccess]); + asm volatile("prefetch.global.L1 [ %1 ];" : "=l"(addr) : "l"(addr)); + } + + if (row + 1 < ThreadMap::Iterations::kRow) { + if (!ScatterD) { + byte_pointer += params_.increment_row; + } + } + } + + if (group + 1 < ThreadMap::Iterations::kGroup) { + byte_pointer += params_.increment_group; + } + } + + if (cluster + 1 < ThreadMap::Iterations::kCluster) { + byte_pointer += 
params_.increment_cluster; + } + } + } + + /// Loads a fragment from memory + CUTLASS_DEVICE + void load_with_byte_offset(Fragment& frag, int64_t byte_offset) const { + uint8_t* byte_pointer = byte_pointer_; + AccessType* frag_ptr = reinterpret_cast(&frag); + + CUTLASS_PRAGMA_UNROLL + for (int cluster = 0; cluster < ThreadMap::Iterations::kCluster; + ++cluster) { + CUTLASS_PRAGMA_UNROLL + for (int group = 0; group < ThreadMap::Iterations::kGroup; ++group) { + CUTLASS_PRAGMA_UNROLL + for (int row = 0; row < ThreadMap::Iterations::kRow; ++row) { + int frag_row_idx = + (row + + ThreadMap::Iterations::kRow * + (group + ThreadMap::Iterations::kGroup * cluster)); + + int row_offset = row * ThreadMap::Delta::kRow + + group * ThreadMap::Delta::kGroup + + cluster * ThreadMap::Delta::kCluster; + + bool row_guard = ((row_offset + thread_start_row_) < extent_row_); + + AccessType* memory_pointer = + reinterpret_cast(byte_pointer + byte_offset); + + if (ScatterD && row_guard) { + assert(indices_); + + memory_pointer = reinterpret_cast( + byte_pointer + byte_offset + + LongIndex(indices_[row_offset + thread_start_row_]) * + LongIndex(params_.stride)); + } + + CUTLASS_PRAGMA_UNROLL + for (int column = 0; column < ThreadMap::Iterations::kColumn; + ++column) { + bool guard = row_guard && mask_.predicates[column]; + + cutlass::arch::global_load( + frag_ptr + [frag_row_idx * ThreadMap::Iterations::kColumn + column], + (void*)&memory_pointer + [column * ThreadMap::Delta::kColumn / kElementsPerAccess], + guard); + } + + if (row + 1 < ThreadMap::Iterations::kRow) { + if (!ScatterD) { + byte_pointer += params_.increment_row; + } + } + } + + if (group + 1 < ThreadMap::Iterations::kGroup) { + byte_pointer += params_.increment_group; + } + } + + if (cluster + 1 < ThreadMap::Iterations::kCluster) { + byte_pointer += params_.increment_cluster; + } + } + } + + /// Loads a fragment from memory + CUTLASS_DEVICE + void load(Fragment& frag) const { + load_with_byte_offset(frag, 0); + } + + /// 
Stores a fragment to memory + CUTLASS_DEVICE + void store_with_byte_offset(Fragment const& frag, int64_t byte_offset) const { + uint8_t* byte_pointer = byte_pointer_; + AccessType const* frag_ptr = reinterpret_cast(&frag); + + CUTLASS_PRAGMA_UNROLL + for (int cluster = 0; cluster < ThreadMap::Iterations::kCluster; + ++cluster) { + CUTLASS_PRAGMA_UNROLL + for (int group = 0; group < ThreadMap::Iterations::kGroup; ++group) { + CUTLASS_PRAGMA_UNROLL + for (int row = 0; row < ThreadMap::Iterations::kRow; ++row) { + int frag_row_idx = + (row + + ThreadMap::Iterations::kRow * + (group + ThreadMap::Iterations::kGroup * cluster)); + + int row_offset = row * ThreadMap::Delta::kRow + + group * ThreadMap::Delta::kGroup + + cluster * ThreadMap::Delta::kCluster; + + bool row_guard = ((row_offset + thread_start_row_) < extent_row_); + + AccessType* memory_pointer = + reinterpret_cast(byte_pointer + byte_offset); + + if (ScatterD && row_guard) { + assert(indices_); + + memory_pointer = reinterpret_cast( + byte_pointer + byte_offset + + LongIndex(indices_[row_offset + thread_start_row_]) * + LongIndex(params_.stride)); + } + + CUTLASS_PRAGMA_UNROLL + for (int column = 0; column < ThreadMap::Iterations::kColumn; + ++column) { + bool guard = row_guard && mask_.predicates[column]; + + if (UseCUDAStore) { + if (guard) { + memory_pointer + [column * ThreadMap::Delta::kColumn / kElementsPerAccess] = + frag_ptr + [frag_row_idx * ThreadMap::Iterations::kColumn + + column]; + } + } else { + cutlass::arch::global_store( + frag_ptr + [frag_row_idx * ThreadMap::Iterations::kColumn + column], + (void*)&memory_pointer + [column * ThreadMap::Delta::kColumn / kElementsPerAccess], + guard); + } + } + + if (row + 1 < ThreadMap::Iterations::kRow) { + if (!ScatterD) { + byte_pointer += params_.increment_row; + } + } + } + + if (group + 1 < ThreadMap::Iterations::kGroup) { + byte_pointer += params_.increment_group; + } + } + + if (cluster + 1 < ThreadMap::Iterations::kCluster) { + byte_pointer += 
params_.increment_cluster; + } + } + } + + /// Stores a fragment to memory + CUTLASS_DEVICE + void store(Fragment const& frag) const { + store_with_byte_offset(frag, 0); + } + + /// Loads a fragment from memory, reading each output row (n, p, q) from input + /// row (n, 2 * p + add_P, 2 * q + add_Q) of a (2 * convolution_P) x + /// (2 * convolution_Q) grid. + /// NOTE(review): the `byte_offset` parameter is shadowed by a per-row local of + /// the same name below, so the caller's value is never used; the per-row + /// offset also hard-codes sizeof(float) as the element size. + CUTLASS_DEVICE + void downsample_load_with_byte_offset( + Fragment& frag, + int64_t byte_offset, + int convolution_P, + int convolution_Q, + int add_P, + int add_Q, + int problem_N) const { + uint8_t* byte_pointer = byte_pointer_; + AccessType* frag_ptr = reinterpret_cast(&frag); + + CUTLASS_PRAGMA_UNROLL + for (int cluster = 0; cluster < ThreadMap::Iterations::kCluster; + ++cluster) { + CUTLASS_PRAGMA_UNROLL + for (int group = 0; group < ThreadMap::Iterations::kGroup; ++group) { + CUTLASS_PRAGMA_UNROLL + for (int row = 0; row < ThreadMap::Iterations::kRow; ++row) { + int frag_row_idx = + (row + + ThreadMap::Iterations::kRow * + (group + ThreadMap::Iterations::kGroup * cluster)); + + int row_offset = row * ThreadMap::Delta::kRow + + group * ThreadMap::Delta::kGroup + + cluster * ThreadMap::Delta::kCluster; + + bool row_guard = ((row_offset + thread_start_row_) < extent_row_); + + int output_row = row_offset + thread_start_row_; + int output_N = output_row / (convolution_P * convolution_Q); + int output_PQ = output_row % (convolution_P * convolution_Q); + int output_P = output_PQ / convolution_Q; + int output_Q = output_PQ % convolution_Q; + + int input_row = output_N * 2 * convolution_P * 2 * convolution_Q + + (2 * output_P + add_P) * 2 * convolution_Q + 2 * output_Q + add_Q; + + int64_t byte_offset = + (input_row - output_row) * problem_N * sizeof(float); + + AccessType* memory_pointer = + reinterpret_cast(byte_pointer + byte_offset); + + CUTLASS_PRAGMA_UNROLL + for (int column = 0; column < ThreadMap::Iterations::kColumn; + ++column) { + bool guard = row_guard && mask_.predicates[column]; + + cutlass::arch::global_load( + frag_ptr + [frag_row_idx * ThreadMap::Iterations::kColumn + column], + (void*)&memory_pointer + [column * ThreadMap::Delta::kColumn /
kElementsPerAccess], + guard); + } + + if (row + 1 < ThreadMap::Iterations::kRow) { + byte_pointer += params_.increment_row; + } + } + + if (group + 1 < ThreadMap::Iterations::kGroup) { + byte_pointer += params_.increment_group; + } + } + + if (cluster + 1 < ThreadMap::Iterations::kCluster) { + byte_pointer += params_.increment_cluster; + } + } + } + + /// Loads a fragment from memory, reading each output row (n, p, q) from input + /// row (n, (p + add_P) / 2, (q + add_Q) / 2) of a (convolution_P / 2) x + /// (convolution_Q / 2) grid; add_P / add_Q are dropped when + /// p > convolution_P - 2 / q > convolution_Q - 2. + /// NOTE(review): the `byte_offset` parameter is shadowed by a per-row local of + /// the same name below, so the caller's value is never used; the per-row + /// offset also hard-codes sizeof(float) as the element size. + CUTLASS_DEVICE + void upsample_load_with_byte_offset( + Fragment& frag, + int64_t byte_offset, + int convolution_P, + int convolution_Q, + int add_P, + int add_Q, + int problem_N) const { + uint8_t* byte_pointer = byte_pointer_; + AccessType* frag_ptr = reinterpret_cast(&frag); + + CUTLASS_PRAGMA_UNROLL + for (int cluster = 0; cluster < ThreadMap::Iterations::kCluster; + ++cluster) { + CUTLASS_PRAGMA_UNROLL + for (int group = 0; group < ThreadMap::Iterations::kGroup; ++group) { + CUTLASS_PRAGMA_UNROLL + for (int row = 0; row < ThreadMap::Iterations::kRow; ++row) { + int frag_row_idx = + (row + + ThreadMap::Iterations::kRow * + (group + ThreadMap::Iterations::kGroup * cluster)); + + int row_offset = row * ThreadMap::Delta::kRow + + group * ThreadMap::Delta::kGroup + + cluster * ThreadMap::Delta::kCluster; + + bool row_guard = ((row_offset + thread_start_row_) < extent_row_); + + int output_row = row_offset + thread_start_row_; + int output_N = output_row / (convolution_P * convolution_Q); + int output_PQ = output_row % (convolution_P * convolution_Q); + int output_P = output_PQ / convolution_Q; + int output_Q = output_PQ % convolution_Q; + int row_add_P = add_P; + int row_add_Q = add_Q; + if (output_P > convolution_P - 2) + row_add_P = 0; + if (output_Q > convolution_Q - 2) + row_add_Q = 0; + + int input_row = output_N * (convolution_P / 2) * (convolution_Q / 2) + + ((output_P + row_add_P) / 2) * (convolution_Q / 2) + + (output_Q + row_add_Q) / 2; + + int64_t byte_offset = + (input_row - output_row) * problem_N * sizeof(float); + + AccessType* memory_pointer = +
reinterpret_cast(byte_pointer + byte_offset); + + CUTLASS_PRAGMA_UNROLL + for (int column = 0; column < ThreadMap::Iterations::kColumn; + ++column) { + bool guard = row_guard && mask_.predicates[column]; + + cutlass::arch::global_load( + frag_ptr + [frag_row_idx * ThreadMap::Iterations::kColumn + column], + (void*)&memory_pointer + [column * ThreadMap::Delta::kColumn / kElementsPerAccess], + guard); + } + + if (row + 1 < ThreadMap::Iterations::kRow) { + byte_pointer += params_.increment_row; + } + } + + if (group + 1 < ThreadMap::Iterations::kGroup) { + byte_pointer += params_.increment_group; + } + } + + if (cluster + 1 < ThreadMap::Iterations::kCluster) { + byte_pointer += params_.increment_cluster; + } + } + } + + CUTLASS_DEVICE + MatrixCoord thread_start() const { + return MatrixCoord(thread_start_row_, thread_start_column_); + } + + /// Need to get the thread start row from the tile iterator + CUTLASS_DEVICE + int32_t thread_start_row() const { + return thread_start_row_; + } + + /// Need to get the thread start column from the tile iterator + CUTLASS_DEVICE + int32_t thread_start_column() const { + return thread_start_column_; + } + + /// Extent of the matrix in rows + CUTLASS_DEVICE + Index extent_row() const { + return extent_row_; + } + + /// Extent of the matrix in columns + CUTLASS_DEVICE + Index extent_column() const { + return extent_column_; + } + + /// Advances to the next position to load or store + CUTLASS_HOST_DEVICE + PredicatedTileIteratorPrefetch& operator++() { + ++state_[0]; + + if (!ScatterD) { + byte_pointer_ += params_.advance_row; + } + + thread_start_row_ += ThreadMap::Shape::kRow; + + if (state_[0] == ThreadMap::Count::kRow) { + state_[0] = 0; + ++state_[1]; + byte_pointer_ += params_.advance_group; + + thread_start_row_ += (ThreadMap::Shape::kGroup - 1) * + ThreadMap::Shape::kRow * ThreadMap::Count::kRow; + + if (state_[1] == ThreadMap::Count::kGroup) { + state_[1] = 0; + ++state_[2]; + byte_pointer_ += params_.advance_cluster; + +
thread_start_row_ += ThreadMap::Count::kGroup * + ThreadMap::Shape::kGroup * ThreadMap::Count::kRow * + ThreadMap::Shape::kRow; + + if (state_[2] == ThreadMap::Count::kCluster) { + state_[2] = 0; + byte_pointer_ += params_.advance_tile; + } + } + } + + return *this; + } + + ///< Efficiently disables all accesses guarded by mask + CUTLASS_DEVICE void clear_mask() { + mask_.clear(); + } + + ///< Efficiently enables all accesses guarded by mask + CUTLASS_DEVICE void enable_mask() { + mask_.enable(); + } + + ///< Gets the mask (copies the iterator's current mask into `mask`) + CUTLASS_DEVICE void get_mask(Mask& mask) const { + mask = mask_; + } + + ///< Sets the mask + CUTLASS_DEVICE void set_mask(Mask const& mask) { + mask_ = mask; + } +}; + +/// Maps an iterator type IT to the PredicatedTileIteratorPrefetch built from +/// IT's ThreadMap and Element (remaining template arguments at their defaults). +template +struct MakePrefetchableIterator { + using Iterator = PredicatedTileIteratorPrefetch< + typename IT::ThreadMap, + typename IT::Element>; +}; + +/////////////////////////////////////////////////////////////////////////////// + +} // namespace threadblock +} // namespace epilogue +} // namespace cutlass + +//////////////////////////////////////////////////////////////////////////////// + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/transformers/cuda/mem_eff_attention/iterators/make_residual_last.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/transformers/cuda/mem_eff_attention/iterators/make_residual_last.h new file mode 100644 index 0000000000000000000000000000000000000000..5c8e84bbf52ae598f607309077ed369cea5e1327 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/transformers/cuda/mem_eff_attention/iterators/make_residual_last.h @@ -0,0 +1,79 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ +#pragma once + +#include +#include + + +namespace cutlass { +namespace transform { +namespace threadblock { + +template +struct MakeIteratorResidualLast; + +template < + typename Shape, + typename Element, + typename Layout, + int AdvanceRank, + typename ThreadMap, + int AccessSize, + bool Gather> +struct MakeIteratorResidualLast> { + using Iterator = PredicatedTileIteratorResidualLast< + Shape, + Element, + Layout, + AdvanceRank, + ThreadMap, + AccessSize, + Gather>; +}; + +template < + typename Shape, + typename Element, + typename Layout, + int AdvanceRank, + typename ThreadMap, + typename AccessType, + bool Gather> +struct MakeIteratorResidualLast> { + using Iterator = PredicatedTileAccessIteratorResidualLast< + Shape, + Element, + Layout, + AdvanceRank, + ThreadMap, + AccessType, + Gather>; +}; +} // namespace threadblock +} // namespace transform +} // namespace cutlass + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/transformers/cuda/mem_eff_attention/iterators/predicated_tile_access_iterator_residual_last.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/transformers/cuda/mem_eff_attention/iterators/predicated_tile_access_iterator_residual_last.h new file mode 100644 index 0000000000000000000000000000000000000000..1b1f9f6917b428f129a668591804a73b9f22592c --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/transformers/cuda/mem_eff_attention/iterators/predicated_tile_access_iterator_residual_last.h @@ -0,0 +1,2120 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +/*************************************************************************************************** + * Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights + *reserved. SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + *this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + *ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + *LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + *CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + *SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + *INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + *CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + *ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + *POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Templates calculating the address and predicates to the load of tiles + from pitch-linear rank=2 tensors. + + This iterator uses masks to guard out-of-bounds accesses. The first tile + this iterator visits maybe partial, then the remaining tiles are complete. + So, we only need to compute the predicates twice, once before the first tile + and once for the remaining full tiles which can share the same predicates. + + A precomputed "Params" object minimizes the amount of state that must be + stored in registers, and integer addition is used to advance the pointer + through memory. 
+*/ + +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +//////////////////////////////////////////////////////////////////////////////// + +//////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace transform { +namespace threadblock { + +//////////////////////////////////////////////////////////////////////////////// + +/// PredicatedTileAccessIteratorResidualLast +/// +template < + typename Shape, + typename Element, + typename Layout, + int AdvanceRank, + typename ThreadMap, + typename AccessType, + bool Gather = false> +class PredicatedTileAccessIteratorResidualLast; + +//////////////////////////////////////////////////////////////////////////////// + +/// Specialization of PredicatedTileAccessIteratorResidualLast for pitch-linear +/// data. +/// +template < + typename Shape_, + typename Element_, + int AdvanceRank, + typename ThreadMap_, + typename AccessType_, + bool Gather> +class PredicatedTileAccessIteratorResidualLast< + Shape_, + Element_, + layout::PitchLinear, + AdvanceRank, + ThreadMap_, + AccessType_, + Gather> { + public: + static_assert( + AdvanceRank == 0 || AdvanceRank == 1, + "Specialization for pitch-linear iterator may along advance along the " + "contiguous(rank=0) or strided(rank=1) dimension."); + + using Shape = Shape_; + using Element = Element_; + using Layout = layout::PitchLinear; + static int const kAdvanceRank = AdvanceRank; + using ThreadMap = ThreadMap_; + using AccessType = AccessType_; + + using Index = typename Layout::Index; + using LongIndex = typename Layout::LongIndex; + + using TensorRef = TensorRef; + using TensorView = TensorView; + using TensorCoord = typename Layout::TensorCoord; + + using Pointer = Element*; + using NonConstPointer = typename platform::remove_const::type*; + + using UnderlyingPredicates = PredicatedTileAccessIteratorPredicates< + Shape, + Element, + Layout, + AdvanceRank, + 
ThreadMap, + AccessType>; + + static int const kAccessesPerVector = + ThreadMap::kElementsPerAccess / AccessType::kElements; + + static_assert( + !(ThreadMap::kElementsPerAccess % AccessType::kElements), + "Vectors implied by the thread map must be divisible by the access type."); + + using Mask = typename UnderlyingPredicates::Mask; + + /// Uses a non-template class + struct Params : PredicatedTileAccessIteratorParams { + using Base = PredicatedTileAccessIteratorParams; + + // Default ctor + CUTLASS_HOST_DEVICE + Params() {} + + /// Construct the Params object given a pitch-linear tensor's layout + CUTLASS_HOST_DEVICE + Params(Layout const& layout) + : Base( + layout.stride(0), + MakePredicatedTileAccessIteratorDesc< + Shape, + Element, + Layout, + kAdvanceRank, + ThreadMap>()()) {} + + CUTLASS_HOST_DEVICE + Params(Base const& base) : Base(base) {} + }; + + private: + /// Internal pointer type permits fast address arithmetic + using BytePointer = char*; + + private: + // + // Data members + // + + UnderlyingPredicates the_predicates; + Mask residual_tile_mask; + + /// Parameters object with precomputed internal state + Params params_; + + /// Internal pointer to first access of tile + BytePointer pointer_; + + /// Below is used when Gather is turned on. We need to record strided_offset + /// and contiguous_offset separated to compute the offset by using + /// + /// offset = contiguous_offset + indices[strided_offset] + /// + + /// Gather indices + int const* indices_; + + Index gather_offset_strided; + + private: + /// Computes predicates based on internally tracked per-thread offset. 
+ CUTLASS_DEVICE + void compute_predicates_( + /// Extent of the matrix window + TensorCoord extent, + /// optionally, simplify predicate calculation during 'steady state' phase + bool is_steady_state = false) { + the_predicates.compute_predicates_(extent, is_steady_state); + } + + public: + /// Constructs a TileIterator from its precomputed state, threadblock offset, + /// and thread ID + CUTLASS_HOST_DEVICE + PredicatedTileAccessIteratorResidualLast( + /// Precomputed parameters object + Params const& params, + /// Pointer to start of tensor + Pointer pointer, + /// Extent of tensor + TensorCoord extent, + /// ID of each participating thread + int thread_id, + /// Initial offset of threadblock + TensorCoord const& threadblock_offset, + /// Gather indices + int const* indices = nullptr) + : params_(params), + pointer_(reinterpret_cast( + const_cast(pointer))), + the_predicates(extent), + indices_(indices) { + the_predicates.set_predicates(thread_id, threadblock_offset); + the_predicates.get_mask(residual_tile_mask); + + // Working around a weird compiler bug happening on P100 for the backward. + // I've seen together: the_predicates.predicates_[0] = 14 (instead of 15) + // residual_tile_mask[0] = 15 (correct) + // + // Adding prints when the value is calculated (in `compute_predicates_`) + // sometimes removes the bug. The consequence is that we skip some + // element of a tensor, leading to wrong results + // Setting `compute_predicates_`'s second argument (`is_steady_state`) to + // true also seems to get rid of the bug - at the cost of twice as many + // comparisons. 
+#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 700) + constexpr bool kWorkAroundCompilerBug = false; +#else + constexpr bool kWorkAroundCompilerBug = true; +#endif + the_predicates.compute_predicates_(extent, true && !kWorkAroundCompilerBug); + + // update internal pointers + Layout layout(params_.stride_); + + if (!Gather) { + add_pointer_offset(layout(the_predicates.thread_offset_)); + } else { + gather_offset_strided = the_predicates.thread_offset_.strided(); + add_pointer_offset( + layout(make_Coord(the_predicates.thread_offset_.contiguous(), 0))); + } + } + + /// Construct a PredicatedTileAccessIteratorResidualLast with zero threadblock + /// offset + CUTLASS_HOST_DEVICE + PredicatedTileAccessIteratorResidualLast( + /// Precomputed parameters object + Params const& params, + /// Pointer to start of tensor + Pointer pointer, + /// Extent of tensor + TensorCoord extent, + ///< ID of each participating thread + int thread_id) + : PredicatedTileAccessIteratorResidualLast( + params, + pointer, + extent, + thread_id, + make_Coord(0, 0)) {} + + /// Overrides the internal iteration index + CUTLASS_HOST_DEVICE + void set_iteration_index(int index) { + the_predicates.set_iteration_index(index); + } + + CUTLASS_HOST_DEVICE + void set_residual_tile(bool is_residual_tile) { + if (is_residual_tile) { + the_predicates.set_mask(residual_tile_mask); + } + } + + /// Adds a pointer offset in units of Element + CUTLASS_HOST_DEVICE + void add_pointer_offset(LongIndex pointer_offset) { + pointer_ += sizeof_bits::value * pointer_offset / 8; + } + + /// Advances an iterator along logical dimensions of matrix in units of whole + /// tiles + CUTLASS_DEVICE + void add_tile_offset(TensorCoord const& tile_offset) { + if (!Gather) { + if (kAdvanceRank) { + pointer_ += params_.inc_advance_ * LongIndex(tile_offset.strided()); + pointer_ += Shape::kContiguous * tile_offset.contiguous(); + } else { + pointer_ += params_.inc_advance_ * LongIndex(tile_offset.contiguous()); + pointer_ += 
Shape::kStrided * tile_offset.strided(); + } + } else { + add_pointer_offset(Shape::kContiguous * tile_offset.contiguous()); + gather_offset_strided += Shape::kStrided * tile_offset.strided(); + } + } + + /// Returns a pointer + CUTLASS_HOST_DEVICE + AccessType* get() const { + if (Gather) { + assert(indices_); + + if (!valid()) { + return nullptr; + } + + LongIndex contiguous_offset = the_predicates.iteration_contiguous_ * + (ThreadMap::Delta::kContiguous * sizeof_bits::value / + 8) + + the_predicates.iteration_vector_; + int strided_index = gather_offset_strided + + the_predicates.iteration_strided_ * ThreadMap::Delta::kStrided; + + LongIndex strided_offset = indices_[strided_index] * + LongIndex(params_.stride_) * sizeof_bits::value / 8; + + return reinterpret_cast( + pointer_ + contiguous_offset + strided_offset); + } + + return reinterpret_cast( + pointer_ + + the_predicates.iteration_contiguous_ * + (ThreadMap::Delta::kContiguous * + sizeof_bits::value) / + 8) + + the_predicates.iteration_vector_; + } + + /// Increment and return an instance to self. + CUTLASS_HOST_DEVICE + PredicatedTileAccessIteratorResidualLast& operator++() { + the_predicates.operator++(); + + ++the_predicates.iteration_vector_; + if (the_predicates.iteration_vector_ < kAccessesPerVector) { + return *this; + } + + the_predicates.iteration_vector_ = 0; + ++the_predicates.iteration_contiguous_; + + if (the_predicates.iteration_contiguous_ < + ThreadMap::Iterations::kContiguous) { + return *this; + } + + // Enter here only if (iteration_contiguous_ == + // ThreadMap::Iteration::kContiguous) + the_predicates.iteration_contiguous_ = 0; + ++the_predicates.iteration_strided_; + + if (the_predicates.iteration_strided_ < ThreadMap::Iterations::kStrided) { + if (!Gather) { + pointer_ += params_.inc_strided_; + } + + return *this; + } + + // Enter here only if (iteration_stride_ == ThreadMap::Iteration::kStrided) + // which means we enter the next tile. 
+ the_predicates.iteration_strided_ = 0; + + if (!Gather) { + // advance to next tile + pointer_ += params_.inc_next_; + + // now return to start tile - if the iterator is subsequently advanced, + // this subtraction as well as the subsequent integer addition are both + // elided by the compiler. + pointer_ -= params_.inc_advance_; + } + + return *this; + } + + /// Increment and return an instance to self. + CUTLASS_HOST_DEVICE + PredicatedTileAccessIteratorResidualLast operator++(int) { + PredicatedTileAccessIteratorResidualLast self(*this); + operator++(); + return self; + } + + /// Clears the predicate set efficiently + CUTLASS_HOST_DEVICE + void clear_mask(bool enable = true) { + the_predicates.clear_mask(enable); + } + + /// Clears the predicate set efficiently + CUTLASS_HOST_DEVICE + void enable_mask() { + the_predicates.enable_mask(); + } + + /// Sets the predicate mask, overriding value stored in predicate iterator + CUTLASS_HOST_DEVICE + void set_mask(Mask const& mask) { + the_predicates.set_mask(mask); + } + + /// Gets the mask + CUTLASS_HOST_DEVICE + void get_mask(Mask& mask) { + the_predicates.get_mask(mask); + } + + /// Returns whether access is valid or not + CUTLASS_HOST_DEVICE + bool valid() const { + return the_predicates.valid(); + } +}; + +//////////////////////////////////////////////////////////////////////////////// + +/// Specialization of PredicatedTileAccessIteratorResidualLast for column-major +/// data. 
+/// +/// Satisfies: ForwardTileIteratorConcept | +/// ReadableContiguousTileIteratorConcept | +/// WriteableContiguousTileIteratorConcept | +/// MaskedTileIteratorConcept +/// +template < + typename Shape_, + typename Element_, + int AdvanceRank, + typename ThreadMap_, + typename AccessType_, + bool Gather> +class PredicatedTileAccessIteratorResidualLast< + Shape_, + Element_, + layout::ColumnMajor, + AdvanceRank, + ThreadMap_, + AccessType_, + Gather> { + public: + static_assert( + AdvanceRank == 0 || AdvanceRank == 1, + "Specialization for pitch-linear iterator may along advance along the " + "contiguous(rank=0) or strided(rank=1) dimension."); + + using Shape = Shape_; + using Element = Element_; + using Layout = layout::ColumnMajor; + static int const kAdvanceRank = AdvanceRank; + using ThreadMap = ThreadMap_; + using AccessType = AccessType_; + + using Index = typename Layout::Index; + using LongIndex = typename Layout::LongIndex; + + using TensorRef = TensorRef; + using TensorView = TensorView; + using TensorCoord = typename Layout::TensorCoord; + + using Pointer = Element*; + using NonConstPointer = typename platform::remove_const::type*; + + using UnderlyingIterator = PredicatedTileAccessIteratorResidualLast< + layout::PitchLinearShape, + Element, + layout::PitchLinear, + (kAdvanceRank == 0 ? 
0 : 1), + ThreadMap, + AccessType, + Gather>; + + /// Predicate vector stores mask to guard accesses + using Mask = typename UnderlyingIterator::Mask; + + static int const kAccessesPerVector = UnderlyingIterator::kAccessesPerVector; + + /// Parameters object is precomputed state and is host-constructible + class Params { + private: + friend PredicatedTileAccessIteratorResidualLast; + + /// Parameters object + typename UnderlyingIterator::Params params_; + + public: + /// Default ctor + CUTLASS_HOST_DEVICE + Params() {} + + /// Construct the Params object given a pitch-linear tensor's layout + CUTLASS_HOST_DEVICE + Params(Layout const& layout) + : params_(layout::PitchLinear(layout.stride(0))){}; + + /// Construct the Params object given a pitch-linear tensor's layout + CUTLASS_HOST_DEVICE + Params(typename UnderlyingIterator::Params::Base const& base) + : params_(base) {} + }; + + private: + // + // Data members + // + + /// Underlying pitch-linear tile iterator + UnderlyingIterator iterator_; + + public: + /// Constructs a TileIterator from its precomputed state, threadblock offset, + /// and thread ID + CUTLASS_HOST_DEVICE + PredicatedTileAccessIteratorResidualLast( + ///< Precomputed parameters object + Params const& params, + ///< Pointer to start of tensor + Pointer pointer, + ///< Extent of tensor + TensorCoord extent, + ///< ID of each participating thread + int thread_id, + ///< Initial offset of threadblock + TensorCoord const& threadblock_offset, + int const* indices = + nullptr ///< gather/scatter indices, note no support for + ///< gather/scatter at this specialization + ) + : iterator_( + params.params_, + pointer, + layout::PitchLinearCoord(extent.row(), extent.column()), + thread_id, + layout::PitchLinearCoord( + threadblock_offset.row(), + threadblock_offset.column()), + indices) {} + + /// Construct a PredicatedTileAccessIteratorResidualLast with zero threadblock + /// offset + CUTLASS_HOST_DEVICE + PredicatedTileAccessIteratorResidualLast( + 
Params const& params, ///< Precomputed parameters object + Pointer pointer, ///< Pointer to start of tensor + TensorCoord extent, ///< Extent of tensor + int thread_id ///< ID of each participating thread + ) + : PredicatedTileAccessIteratorResidualLast( + params, + pointer, + extent, + thread_id, + make_Coord(0, 0)) {} + + /// Overrides the internal iteration index + CUTLASS_HOST_DEVICE + void set_iteration_index(int index) { + iterator_.set_iteration_index(index); + } + + CUTLASS_HOST_DEVICE + void set_residual_tile(bool enable) { + iterator_.set_residual_tile(enable); + } + + /// Adds a pointer offset in units of Element + CUTLASS_HOST_DEVICE + void add_pointer_offset(LongIndex pointer_offset) { + iterator_.add_pointer_offset(pointer_offset); + } + + /// Advances an iterator along logical dimensions of matrix in units of whole + /// tiles + CUTLASS_HOST_DEVICE + void add_tile_offset(TensorCoord const& tile_offset) { + iterator_.add_tile_offset({tile_offset.row(), tile_offset.column()}); + } + + /// Returns a pointer + CUTLASS_HOST_DEVICE + AccessType* get() const { + return reinterpret_cast(iterator_.get()); + } + + /// Advances to the next tile in memory. + /// + /// The first time this method is called, predicates are updated, and the + /// iterator's internal pointer is reverted to the first "steady state" tile. + /// Subsequent calls are lightweight and must only update the internal + /// pointer. + CUTLASS_HOST_DEVICE + PredicatedTileAccessIteratorResidualLast& operator++() { + ++iterator_; + return *this; + } + + /// Advances to the next tile in memory. + /// + /// The first time this method is called, predicates are updated, and the + /// iterator's internal pointer is reverted to the first "steady state" tile. + /// Subsequent calls are lightweight and must only update the internal + /// pointer. 
+ CUTLASS_HOST_DEVICE + PredicatedTileAccessIteratorResidualLast operator++(int) { + PredicatedTileAccessIteratorResidualLast self(*this); + operator++(); + return self; + } + + /// Clears the predicate set efficiently + CUTLASS_HOST_DEVICE + void clear_mask(bool enable = true) { + iterator_.clear_mask(enable); + } + + /// Clears the predicate set efficiently + CUTLASS_HOST_DEVICE + void enable_mask() { + iterator_.enable_mask(); + } + + /// Sets the predicate mask, overriding value stored in predicate iterator + CUTLASS_HOST_DEVICE + void set_mask(Mask const& mask) { + iterator_.set_mask(mask); + } + + /// Gets the mask + CUTLASS_HOST_DEVICE + void get_mask(Mask& mask) { + iterator_.get_mask(mask); + } + + /// Returns whether access is valid or not + CUTLASS_HOST_DEVICE + bool valid() { + return iterator_.valid(); + } +}; + +//////////////////////////////////////////////////////////////////////////////// + +/// Specialization of PredicatedTileAccessIteratorResidualLast for row-major +/// data. 
+/// +/// Satisfies: ForwardTileIteratorConcept | +/// ReadableContiguousTileIteratorConcept | +/// WriteableContiguousTileIteratorConcept | +/// MaskedTileIteratorConcept +/// +template < + typename Shape_, + typename Element_, + int AdvanceRank, + typename ThreadMap_, + typename AccessType_, + bool Gather> +class PredicatedTileAccessIteratorResidualLast< + Shape_, + Element_, + layout::RowMajor, + AdvanceRank, + ThreadMap_, + AccessType_, + Gather> { + public: + static_assert( + AdvanceRank == 0 || AdvanceRank == 1, + "Specialization for pitch-linear iterator may along advance along the " + "contiguous(rank=0) or strided(rank=1) dimension."); + + using Shape = Shape_; + using Element = Element_; + using Layout = layout::RowMajor; + static int const kAdvanceRank = AdvanceRank; + using ThreadMap = ThreadMap_; + using AccessType = AccessType_; + + using Index = typename Layout::Index; + using LongIndex = typename Layout::LongIndex; + + using TensorRef = TensorRef; + using TensorView = TensorView; + using TensorCoord = typename Layout::TensorCoord; + + using Pointer = Element*; + using NonConstPointer = typename platform::remove_const::type*; + + using UnderlyingIterator = PredicatedTileAccessIteratorResidualLast< + layout::PitchLinearShape, + Element, + layout::PitchLinear, + (kAdvanceRank == 0 ? 
1 : 0), + ThreadMap, + AccessType, + Gather>; + + static int const kAccessesPerVector = UnderlyingIterator::kAccessesPerVector; + + /// Predicate vector stores mask to guard accesses + using Mask = typename UnderlyingIterator::Mask; + + /// Parameters object is precomputed state and is host-constructible + class Params { + private: + friend PredicatedTileAccessIteratorResidualLast; + + /// Parameters object + typename UnderlyingIterator::Params params_; + + public: + /// Default ctor + CUTLASS_HOST_DEVICE + Params() {} + + /// Construct the Params object given a pitch-linear tensor's layout + CUTLASS_HOST_DEVICE + Params(Layout const& layout) + : params_(layout::PitchLinear(layout.stride(0))){}; + + /// Construct the Params object given a pitch-linear tensor's layout + CUTLASS_HOST_DEVICE + Params(typename UnderlyingIterator::Params::Base const& base) + : params_(base) {} + }; + + private: + // + // Data members + // + + /// Underlying pitch-linear tile iterator + UnderlyingIterator iterator_; + + public: + /// Constructs a TileIterator from its precomputed state, threadblock offset, + /// and thread ID + CUTLASS_HOST_DEVICE + PredicatedTileAccessIteratorResidualLast( + ///< Precomputed parameters object + Params const& params, + ///< Pointer to start of tensor + Pointer pointer, + ///< Extent of tensor + TensorCoord extent, + ///< ID of each participating thread + int thread_id, + ///< Initial offset of threadblock + TensorCoord const& threadblock_offset, + /// Gather indices + int const* indices = nullptr) + : iterator_( + params.params_, + pointer, + layout::PitchLinearCoord(extent.column(), extent.row()), + thread_id, + layout::PitchLinearCoord( + threadblock_offset.column(), + threadblock_offset.row()), + indices) {} + + /// Construct a PredicatedTileAccessIteratorResidualLast with zero threadblock + /// offset + CUTLASS_HOST_DEVICE + PredicatedTileAccessIteratorResidualLast( + Params const& params, ///< Precomputed parameters object + Pointer pointer, ///< 
Pointer to start of tensor + TensorCoord extent, ///< Extent of tensor + int thread_id ///< ID of each participating thread + ) + : PredicatedTileAccessIteratorResidualLast( + params, + pointer, + extent, + thread_id, + make_Coord(0, 0)) {} + + /// Overrides the internal iteration index + CUTLASS_HOST_DEVICE + void set_iteration_index(int index) { + iterator_.set_iteration_index(index); + } + + CUTLASS_HOST_DEVICE + void set_residual_tile(bool enable) { + iterator_.set_residual_tile(enable); + } + + /// Adds a pointer offset in units of Element + CUTLASS_HOST_DEVICE + void add_pointer_offset(LongIndex pointer_offset) { + iterator_.add_pointer_offset(pointer_offset); + } + + /// Advances an iterator along logical dimensions of matrix in units of whole + /// tiles + CUTLASS_HOST_DEVICE + void add_tile_offset(TensorCoord const& tile_offset) { + iterator_.add_tile_offset({tile_offset.column(), tile_offset.row()}); + } + + /// Returns a pointer + CUTLASS_HOST_DEVICE + AccessType* get() const { + return reinterpret_cast(iterator_.get()); + } + + /// Advances to the next tile in memory. + /// + /// The first time this method is called, predicates are updated, and the + /// iterator's internal pointer is reverted to the first "steady state" tile. + /// Subsequent calls are lightweight and must only update the internal + /// pointer. + CUTLASS_HOST_DEVICE + PredicatedTileAccessIteratorResidualLast& operator++() { + ++iterator_; + return *this; + } + + /// Advances to the next tile in memory. + /// + /// The first time this method is called, predicates are updated, and the + /// iterator's internal pointer is reverted to the first "steady state" tile. + /// Subsequent calls are lightweight and must only update the internal + /// pointer. 
+ CUTLASS_HOST_DEVICE + PredicatedTileAccessIteratorResidualLast operator++(int) { + PredicatedTileAccessIteratorResidualLast self(*this); + operator++(); + return self; + } + + /// Clears the predicate set efficiently + CUTLASS_HOST_DEVICE + void clear_mask(bool enable = true) { + iterator_.clear_mask(enable); + } + + /// Clears the predicate set efficiently + CUTLASS_HOST_DEVICE + void enable_mask() { + iterator_.enable_mask(); + } + + /// Sets the predicate mask, overriding value stored in predicate iterator + CUTLASS_HOST_DEVICE + void set_mask(Mask const& mask) { + iterator_.set_mask(mask); + } + + /// Gets the mask + CUTLASS_HOST_DEVICE + void get_mask(Mask& mask) { + iterator_.get_mask(mask); + } + + /// Returns whether access is valid or not + CUTLASS_HOST_DEVICE + bool valid() { + return iterator_.valid(); + } +}; + +//////////////////////////////////////////////////////////////////////////////// + +/// Specialization of PredicatedTileAccessIteratorResidualLast for affine rank 2 +/// data. 
+/// +/// Satisfies: ForwardTileIteratorConcept | +/// ReadableContiguousTileIteratorConcept | +/// WriteableContiguousTileIteratorConcept | +/// MaskedTileIteratorConcept +/// +template < + typename Shape_, + typename Element_, + int AdvanceRank, + typename ThreadMap_, + typename AccessType_> +class PredicatedTileAccessIteratorResidualLast< + Shape_, + Element_, + layout::AffineRankN<2>, + AdvanceRank, + ThreadMap_, + AccessType_, + false> { + public: + static_assert( + AdvanceRank == 0 || AdvanceRank == 1, + "Specialization for pitch-linear iterator may along advance along the " + "contiguous(rank=0) or strided(rank=1) dimension."); + + using Shape = Shape_; + using Element = Element_; + using Layout = layout::AffineRankN<2>; + static int const kAdvanceRank = AdvanceRank; + using ThreadMap = ThreadMap_; + using AccessType = AccessType_; + + using Index = typename Layout::Index; + using LongIndex = typename Layout::LongIndex; + + using TensorRef = TensorRef; + using TensorView = TensorView; + using TensorCoord = typename Layout::TensorCoord; + + using Pointer = Element*; + using NonConstPointer = typename platform::remove_const::type*; + + using UnderlyingPredicates = PredicatedTileAccessIteratorPredicates< + Shape, + Element, + layout::PitchLinear, + AdvanceRank, + ThreadMap, + AccessType>; + + static int const kAccessesPerVector = + ThreadMap::kElementsPerAccess / AccessType::kElements; + + static_assert( + !(ThreadMap::kElementsPerAccess % AccessType::kElements), + "Vectors implied by the thread map must be divisible by the access type."); + + /// Predicate vector stores mask to guard accesses + using Mask = typename UnderlyingPredicates::Mask; + + /// Parameters object is precomputed state and is host-constructible + class Params { + public: + friend PredicatedTileAccessIteratorResidualLast; + + private: + /// stride of pitch-linear layout (units of Element) + Coord stride_; + /// amount (in byte) to increment pointer to move to next access along + /// 
contiguous dimension + LongIndex inc_contiguous_; + /// amount (in byte) to increment pointer from first access of current + /// contiguous dimension to first access of next one. + LongIndex inc_strided_; + /// amount (in byte) to increment pointer from last access of current + /// contiguous dimension to first access of next one. + LongIndex inc_next_strided_; + /// amount (in byte) to increment pointer from last access to first access + /// of next tile + LongIndex inc_next_; + /// amount (in byte) to increment pointer from first access of current tile + /// to first access of next tile + LongIndex inc_advance_; + + public: + // Default ctor + CUTLASS_HOST_DEVICE + Params() + : stride_(0), + inc_contiguous_(0), + inc_strided_(0), + inc_next_(0), + inc_advance_(0) {} + + /// Construct the Params object given a pitch-linear tensor's layout + CUTLASS_HOST_DEVICE + Params(Layout const& layout) + : stride_({layout.stride(0), layout.stride(1)}) { + inc_contiguous_ = + (LongIndex(stride_[0]) * ThreadMap::Delta::kContiguous) * + sizeof_bits::value / 8; + + inc_strided_ = (LongIndex(stride_[1]) * ThreadMap::Delta::kStrided) * + sizeof_bits::value / 8; + + inc_next_strided_ = inc_strided_ - + LongIndex(ThreadMap::Iterations::kContiguous - 1) * inc_contiguous_; + + if (kAdvanceRank) { + // advance along strided dimension + inc_advance_ = Shape::kStrided * LongIndex(stride_[1]) * + sizeof_bits::value / 8; + } else { + // advance along contiguous dimension + inc_advance_ = + Shape::kContiguous * stride_[0] * sizeof_bits::value / 8; + } + + inc_next_ = inc_advance_ - + LongIndex(ThreadMap::Iterations::kContiguous - 1) * inc_contiguous_ - + LongIndex(ThreadMap::Iterations::kStrided - 1) * inc_strided_; + }; + }; + + private: + /// Internal pointer type permits fast address arithmetic + using BytePointer = char*; + + // + // Data members + // + + /// Parameters object with precomputed internal state + Params params_; + + /// Internal pointer to first access of tile + BytePointer 
pointer_; + + UnderlyingPredicates the_predicates; + Mask residual_tile_mask; + + private: + /// Computes predicates based on internally tracked per-thread offset. + CUTLASS_DEVICE + void compute_predicates_( + /// Extent of the matrix window + TensorCoord extent, + /// optionally, simplify predicate calculation during 'steady state' phase + bool is_steady_state = false) { + the_predicates.compute_predicates_(extent, is_steady_state); + } + + public: + /// Constructs a TileIterator from its precomputed state, threadblock offset, + /// and thread ID + CUTLASS_HOST_DEVICE + PredicatedTileAccessIteratorResidualLast( + ///< Precomputed parameters object + Params const& params, + ///< Pointer to start of tensor + Pointer pointer, + ///< Extent of tensor + TensorCoord extent, + ///< ID of each participating thread + int thread_id, + ///< Initial offset of threadblock + TensorCoord const& threadblock_offset, + int const* indices = + nullptr ///< gather/scatter indices, note no support for + ///< gather/scatter at this specialization + ) + : params_(params), + pointer_(reinterpret_cast( + const_cast(pointer))), + the_predicates(extent) { + the_predicates.set_predicates(thread_id, threadblock_offset); + + // update internal pointers + Layout layout(params_.stride_); + add_pointer_offset(layout(the_predicates.thread_offset_)); + } + + /// Construct a PredicatedTileAccessIteratorResidualLast with zero threadblock + /// offset + CUTLASS_HOST_DEVICE + PredicatedTileAccessIteratorResidualLast( + Params const& params, ///< Precomputed parameters object + Pointer pointer, ///< Pointer to start of tensor + TensorCoord extent, ///< Extent of tensor + int thread_id ///< ID of each participating thread + ) + : PredicatedTileAccessIteratorResidualLast( + params, + pointer, + extent, + thread_id, + make_Coord(0, 0)) {} + + /// Overrides the internal iteration index + CUTLASS_HOST_DEVICE + void set_iteration_index(int index) { + the_predicates.set_iteration_index(index); + } + + 
CUTLASS_HOST_DEVICE + void set_residual_tile(bool is_residual_tile) { + if (is_residual_tile) { + the_predicates.set_mask(residual_tile_mask); + } + } + + /// Adds a pointer offset in units of Element + CUTLASS_HOST_DEVICE + void add_pointer_offset(LongIndex pointer_offset) { + pointer_ += sizeof_bits::value * pointer_offset / 8; + } + + /// Advances an iterator along logical dimensions of matrix in units of whole + /// tiles + CUTLASS_HOST_DEVICE + void add_tile_offset(TensorCoord const& tile_offset) { + if (kAdvanceRank) { + pointer_ += params_.inc_advance_ * LongIndex(tile_offset[1]); + pointer_ += Shape::kContiguous * tile_offset[0]; + } else { + pointer_ += params_.inc_advance_ * LongIndex(tile_offset[0]); + pointer_ += Shape::kStrided * tile_offset[1]; + } + } + + /// Returns a pointer + CUTLASS_HOST_DEVICE + AccessType* get() const { + return reinterpret_cast(pointer_) + + the_predicates.iteration_vector_; + } + + /// Advances to the next tile in memory. + /// + /// The first time this method is called, predicates are updated, and the + /// iterator's internal pointer is reverted to the first "steady state" tile. + /// Subsequent calls are lightweight and must only update the internal + /// pointer. 
+ CUTLASS_HOST_DEVICE + PredicatedTileAccessIteratorResidualLast& operator++() { + the_predicates.operator++(); + ++the_predicates.iteration_vector_; + if (the_predicates.iteration_vector_ < kAccessesPerVector) { + return *this; + } + + the_predicates.iteration_vector_ = 0; + ++the_predicates.iteration_contiguous_; + + if (the_predicates.iteration_contiguous_ < + ThreadMap::Iterations::kContiguous) { + pointer_ += params_.inc_contiguous_; + return *this; + } + + // Enter here only if (iteration_contiguous_ == + // ThreadMap::Iteration::kContiguous) + the_predicates.iteration_contiguous_ = 0; + ++the_predicates.iteration_strided_; + + if (the_predicates.iteration_strided_ < ThreadMap::Iterations::kStrided) { + pointer_ += params_.inc_next_strided_; + return *this; + } + + // Enter here only if (iteration_stride_ == ThreadMap::Iteration::kStrided) + // which means we enter the next tile. + the_predicates.iteration_strided_ = 0; + + // advance to next tile + pointer_ += params_.inc_next_; + + // now return to start tile - if the iterator is subsequently advanced, this + // subtraction as well as the subsequent integer addition are both elided by + // the compiler. + pointer_ -= params_.inc_advance_; + + return *this; + } + + /// Advances to the next tile in memory. + /// + /// The first time this method is called, predicates are updated, and the + /// iterator's internal pointer is reverted to the first "steady state" tile. + /// Subsequent calls are lightweight and must only update the internal + /// pointer. 
+ CUTLASS_HOST_DEVICE + PredicatedTileAccessIteratorResidualLast operator++(int) { + PredicatedTileAccessIteratorResidualLast self(*this); + operator++(); + return self; + } + + /// Clears the predicate set efficiently + CUTLASS_HOST_DEVICE + void clear_mask(bool enable = true) { + the_predicates.clear_mask(enable); + } + + /// Clears the predicate set efficiently + CUTLASS_HOST_DEVICE + void enable_mask() { + the_predicates.enable_mask(); + } + + /// Sets the predicate mask, overriding value stored in predicate iterator + CUTLASS_HOST_DEVICE + void set_mask(Mask const& mask) { + the_predicates.set_mask(mask); + } + + /// Gets the mask + CUTLASS_HOST_DEVICE + void get_mask(Mask& mask) { + the_predicates.get_mask(mask); + } + + /// Returns whether access is valid or not + CUTLASS_HOST_DEVICE + bool valid() { + return the_predicates.valid(); + } +}; + +//////////////////////////////////////////////////////////////////////////////// + +/// Specialization of PredicatedTileAccessIteratorResidualLast for affine rank 2 +/// column-major data. 
+/// +/// Satisfies: ForwardTileIteratorConcept | +/// ReadableContiguousTileIteratorConcept | +/// WriteableContiguousTileIteratorConcept | +/// MaskedTileIteratorConcept +/// +template < + typename Shape_, + typename Element_, + int AdvanceRank, + typename ThreadMap_, + typename AccessType_> +class PredicatedTileAccessIteratorResidualLast< + Shape_, + Element_, + layout::AffineRank2ColumnMajor, + AdvanceRank, + ThreadMap_, + AccessType_, + false> { + public: + static_assert( + AdvanceRank == 0 || AdvanceRank == 1, + "Specialization for pitch-linear iterator may along advance along the " + "contiguous(rank=0) or strided(rank=1) dimension."); + + using Shape = Shape_; + using Element = Element_; + using Layout = layout::AffineRank2ColumnMajor; + static int const kAdvanceRank = AdvanceRank; + using ThreadMap = ThreadMap_; + using AccessType = AccessType_; + + using Index = typename Layout::Index; + using LongIndex = typename Layout::LongIndex; + + using TensorRef = TensorRef; + using TensorView = TensorView; + using TensorCoord = typename Layout::TensorCoord; + + using Pointer = Element*; + using NonConstPointer = typename platform::remove_const::type*; + + // Map to the underlying AffineRankN<2> layout + using UnderlyingIterator = PredicatedTileAccessIteratorResidualLast< + layout::PitchLinearShape, + Element, + layout::AffineRankN<2>, + (kAdvanceRank == 0 ? 
0 : 1), + ThreadMap, + AccessType>; + + static int const kAccessesPerVector = UnderlyingIterator::kAccessesPerVector; + + /// Predicate vector stores mask to guard accesses + using Mask = typename UnderlyingIterator::Mask; + + /// Parameters object is precomputed state and is host-constructible + class Params { + private: + friend PredicatedTileAccessIteratorResidualLast; + + /// Parameters object + typename UnderlyingIterator::Params params_; + + public: + /// Default ctor + CUTLASS_HOST_DEVICE + Params() {} + + /// Construct the Params object given an AffineRankN<2> tensor's layout + CUTLASS_HOST_DEVICE + Params(Layout const& layout) + : params_(layout::AffineRankN<2>(layout.stride(0), layout.stride(1))){}; + }; + + private: + // + // Data members + // + + /// Underlying AffineRankN<2> tile iterator + UnderlyingIterator iterator_; + + public: + /// Constructs a TileIterator from its precomputed state, threadblock offset, + /// and thread ID + CUTLASS_HOST_DEVICE + PredicatedTileAccessIteratorResidualLast( + ///< Precomputed parameters object + Params const& params, + ///< Pointer to start of tensor + Pointer pointer, + ///< Extent of tensor + TensorCoord extent, + ///< ID of each participating thread + int thread_id, + ///< Initial offset of threadblock + TensorCoord const& threadblock_offset, + int const* indices = + nullptr ///< gather/scatter indices, note no support for + ///< gather/scatter at this specialization + ) + : iterator_( + params.params_, + pointer, + layout::PitchLinearCoord(extent.row(), extent.column()), + thread_id, + layout::PitchLinearCoord( + threadblock_offset.row(), + threadblock_offset.column())) {} + + /// Construct a PredicatedTileAccessIteratorResidualLast with zero threadblock + /// offset + CUTLASS_HOST_DEVICE + PredicatedTileAccessIteratorResidualLast( + Params const& params, ///< Precomputed parameters object + Pointer pointer, ///< Pointer to start of tensor + TensorCoord extent, ///< Extent of tensor + int thread_id ///< ID of 
each participating thread + ) + : PredicatedTileAccessIteratorResidualLast( + params, + pointer, + extent, + thread_id, + make_Coord(0, 0)) {} + + /// Overrides the internal iteration index + CUTLASS_HOST_DEVICE + void set_iteration_index(int index) { + iterator_.set_iteration_index(index); + } + + CUTLASS_HOST_DEVICE + void set_residual_tile(bool enable) { + iterator_.set_residual_tile(enable); + } + + /// Adds a pointer offset in units of Element + CUTLASS_HOST_DEVICE + void add_pointer_offset(LongIndex pointer_offset) { + iterator_.add_pointer_offset(pointer_offset); + } + + /// Advances an iterator along logical dimensions of matrix in units of whole + /// tiles + CUTLASS_HOST_DEVICE + void add_tile_offset(TensorCoord const& tile_offset) { + iterator_.add_tile_offset( + make_Coord(tile_offset.row(), tile_offset.column())); + } + + /// Returns a pointer + CUTLASS_HOST_DEVICE + AccessType* get() const { + return reinterpret_cast(iterator_.get()); + } + + /// Advances to the next tile in memory. + /// + /// The first time this method is called, predicates are updated, and the + /// iterator's internal pointer is reverted to the first "steady state" tile. + /// Subsequent calls are lightweight and must only update the internal + /// pointer. + CUTLASS_HOST_DEVICE + PredicatedTileAccessIteratorResidualLast& operator++() { + ++iterator_; + return *this; + } + + /// Advances to the next tile in memory. + /// + /// The first time this method is called, predicates are updated, and the + /// iterator's internal pointer is reverted to the first "steady state" tile. + /// Subsequent calls are lightweight and must only update the internal + /// pointer. 
+ CUTLASS_HOST_DEVICE + PredicatedTileAccessIteratorResidualLast operator++(int) { + PredicatedTileAccessIteratorResidualLast self(*this); + operator++(); + return self; + } + + /// Clears the predicate set efficiently + CUTLASS_HOST_DEVICE + void clear_mask(bool enable = true) { + iterator_.clear_mask(enable); + } + + /// Clears the predicate set efficiently + CUTLASS_HOST_DEVICE + void enable_mask() { + iterator_.enable_mask(); + } + + /// Sets the predicate mask, overriding value stored in predicate iterator + CUTLASS_HOST_DEVICE + void set_mask(Mask const& mask) { + iterator_.set_mask(mask); + } + + /// Gets the mask + CUTLASS_HOST_DEVICE + void get_mask(Mask& mask) { + iterator_.get_mask(mask); + } + + /// Returns whether access is valid or not + CUTLASS_HOST_DEVICE + bool valid() { + return iterator_.valid(); + } +}; + +//////////////////////////////////////////////////////////////////////////////// + +/// Specialization of PredicatedTileAccessIteratorResidualLast for affine rank-2 +/// row-major data. 
+/// +/// Satisfies: ForwardTileIteratorConcept | +/// ReadableContiguousTileIteratorConcept | +/// WriteableContiguousTileIteratorConcept | +/// MaskedTileIteratorConcept +/// +template < + typename Shape_, + typename Element_, + int AdvanceRank, + typename ThreadMap_, + typename AccessType_> +class PredicatedTileAccessIteratorResidualLast< + Shape_, + Element_, + layout::AffineRank2RowMajor, + AdvanceRank, + ThreadMap_, + AccessType_, + false> { + public: + static_assert( + AdvanceRank == 0 || AdvanceRank == 1, + "Specialization for pitch-linear iterator may along advance along the " + "contiguous(rank=0) or strided(rank=1) dimension."); + + using Shape = Shape_; + using Element = Element_; + using Layout = layout::AffineRank2RowMajor; + static int const kAdvanceRank = AdvanceRank; + using ThreadMap = ThreadMap_; + using AccessType = AccessType_; + + using Index = typename Layout::Index; + using LongIndex = typename Layout::LongIndex; + + using TensorRef = TensorRef; + using TensorView = TensorView; + using TensorCoord = typename Layout::TensorCoord; + + using Pointer = Element*; + using NonConstPointer = typename platform::remove_const::type*; + + // Map to the underlying AffineRankN<2> layout + using UnderlyingIterator = PredicatedTileAccessIteratorResidualLast< + layout::PitchLinearShape, + Element, + layout::AffineRankN<2>, + (kAdvanceRank == 0 ? 
1 : 0), + ThreadMap, + AccessType>; + + static int const kAccessesPerVector = UnderlyingIterator::kAccessesPerVector; + + /// Predicate vector stores mask to guard accesses + using Mask = typename UnderlyingIterator::Mask; + + /// Parameters object is precomputed state and is host-constructible + class Params { + private: + friend PredicatedTileAccessIteratorResidualLast; + + /// Parameters object + typename UnderlyingIterator::Params params_; + + public: + /// Default ctor + CUTLASS_HOST_DEVICE + Params() {} + + /// Construct the Params object given an AffineRankN<2> tensor's layout + CUTLASS_HOST_DEVICE + Params(Layout const& layout) + : params_(layout::AffineRankN<2>(layout.stride(1), layout.stride(0))){}; + }; + + private: + // + // Data members + // + + /// Underlying AffineRankN<2> tile iterator + UnderlyingIterator iterator_; + + public: + /// Constructs a TileIterator from its precomputed state, threadblock offset, + /// and thread ID + CUTLASS_HOST_DEVICE + PredicatedTileAccessIteratorResidualLast( + ///< Precomputed parameters object + Params const& params, + ///< Pointer to start of tensor + Pointer pointer, + ///< Extent of tensor + TensorCoord extent, + ///< ID of each participating thread + int thread_id, + ///< Initial offset of threadblock + TensorCoord const& threadblock_offset, + int const* indices = + nullptr ///< gather/scatter indices, note no support for + ///< gather/scatter at this specialization + ) + : iterator_( + params.params_, + pointer, + layout::PitchLinearCoord(extent.column(), extent.row()), + thread_id, + layout::PitchLinearCoord( + threadblock_offset.column(), + threadblock_offset.row())) {} + + /// Construct a PredicatedTileAccessIteratorResidualLast with zero threadblock + /// offset + CUTLASS_HOST_DEVICE + PredicatedTileAccessIteratorResidualLast( + Params const& params, ///< Precomputed parameters object + Pointer pointer, ///< Pointer to start of tensor + TensorCoord extent, ///< Extent of tensor + int thread_id ///< ID of 
each participating thread + ) + : PredicatedTileAccessIteratorResidualLast( + params, + pointer, + extent, + thread_id, + make_Coord(0, 0)) {} + + /// Overrides the internal iteration index + CUTLASS_HOST_DEVICE + void set_iteration_index(int index) { + iterator_.set_iteration_index(index); + } + + CUTLASS_HOST_DEVICE + void set_residual_tile(bool enable) { + iterator_.set_residual_tile(enable); + } + + /// Adds a pointer offset in units of Element + CUTLASS_HOST_DEVICE + void add_pointer_offset(LongIndex pointer_offset) { + iterator_.add_pointer_offset(pointer_offset); + } + + /// Advances an iterator along logical dimensions of matrix in units of whole + /// tiles + CUTLASS_HOST_DEVICE + void add_tile_offset(TensorCoord const& tile_offset) { + iterator_.add_tile_offset( + make_Coord(tile_offset.column(), tile_offset.row())); + } + + /// Returns a pointer + CUTLASS_HOST_DEVICE + AccessType* get() const { + return reinterpret_cast(iterator_.get()); + } + + /// Advances to the next tile in memory. + /// + /// The first time this method is called, predicates are updated, and the + /// iterator's internal pointer is reverted to the first "steady state" tile. + /// Subsequent calls are lightweight and must only update the internal + /// pointer. + CUTLASS_HOST_DEVICE + PredicatedTileAccessIteratorResidualLast& operator++() { + ++iterator_; + return *this; + } + + /// Advances to the next tile in memory. + /// + /// The first time this method is called, predicates are updated, and the + /// iterator's internal pointer is reverted to the first "steady state" tile. + /// Subsequent calls are lightweight and must only update the internal + /// pointer. 
+ CUTLASS_HOST_DEVICE + PredicatedTileAccessIteratorResidualLast operator++(int) { + PredicatedTileAccessIteratorResidualLast self(*this); + operator++(); + return self; + } + + /// Clears the predicate set efficiently + CUTLASS_HOST_DEVICE + void clear_mask(bool enable = true) { + iterator_.clear_mask(enable); + } + + /// Clears the predicate set efficiently + CUTLASS_HOST_DEVICE + void enable_mask() { + iterator_.enable_mask(); + } + + /// Sets the predicate mask, overriding value stored in predicate iterator + CUTLASS_HOST_DEVICE + void set_mask(Mask const& mask) { + iterator_.set_mask(mask); + } + + /// Gets the mask + CUTLASS_HOST_DEVICE + void get_mask(Mask& mask) { + iterator_.get_mask(mask); + } + + /// Returns whether access is valid or not + CUTLASS_HOST_DEVICE + bool valid() { + return iterator_.valid(); + } +}; + +//////////////////////////////////////////////////////////////////////////////// + +/// Specialization of PredicatedTileAccessIteratorResidualLast for column-major +/// interleaved data. It is mapped to the congruous layout. 
+/// +/// Satisfies: ForwardTileIteratorConcept | +/// ReadableContiguousTileIteratorConcept | +/// WriteableContiguousTileIteratorConcept | +/// MaskedTileIteratorConcept +/// + +template < + typename Shape_, + typename Element_, + int AdvanceRank, + typename ThreadMap_, + typename AccessType_, + int InterleavedK> +class PredicatedTileAccessIteratorResidualLast< + Shape_, + Element_, + layout::ColumnMajorInterleaved, + AdvanceRank, + ThreadMap_, + AccessType_, + false> { + public: + static_assert( + AdvanceRank == 0 || AdvanceRank == 1, + "Specialization for pitch-linear iterator may along advance along the " + "contiguous(rank=0) or strided(rank=1) dimension."); + + using Shape = Shape_; + using Element = Element_; + static int const kInterleavedK = InterleavedK; + using Layout = layout::ColumnMajorInterleaved; + static int const kAdvanceRank = AdvanceRank; + using ThreadMap = ThreadMap_; + using AccessType = AccessType_; + + using Index = typename Layout::Index; + using LongIndex = typename Layout::LongIndex; + + using TensorRef = TensorRef; + using TensorView = TensorView; + using TensorCoord = typename Layout::TensorCoord; + + using Pointer = Element*; + using NonConstPointer = typename platform::remove_const::type*; + + using UnderlyingIterator = PredicatedTileAccessIteratorResidualLast< + layout::PitchLinearShape< + Shape::kRow * kInterleavedK, + Shape::kColumn / kInterleavedK>, + Element, + layout::PitchLinear, + (kAdvanceRank == 0 ? 
0 : 1), + ThreadMap, + AccessType>; + + static int const kAccessesPerVector = UnderlyingIterator::kAccessesPerVector; + + /// Predicate vector stores mask to guard accesses + using Mask = typename UnderlyingIterator::Mask; + + /// Parameters object is precomputed state and is host-constructible + class Params { + private: + friend PredicatedTileAccessIteratorResidualLast; + + /// Parameters object + typename UnderlyingIterator::Params params_; + + public: + CUTLASS_HOST_DEVICE + Params() {} + + /// Construct the Params object given a pitch-linear tensor's layout + CUTLASS_HOST_DEVICE + Params(Layout const& layout) + : params_(layout::PitchLinear(layout.stride(0))) {} + + CUTLASS_HOST_DEVICE + Params(typename UnderlyingIterator::Params::Base const& base) + : params_(base) {} + }; + + private: + // + // Data members + // + + /// Underlying pitch-linear tile iterator + UnderlyingIterator iterator_; + + public: + /// Constructs a TileIterator from its precomputed state, threadblock offset, + /// and thread ID + CUTLASS_HOST_DEVICE + PredicatedTileAccessIteratorResidualLast( + /// Precomputed parameters object + Params const& params, + /// Pointer to start of tensor + Pointer pointer, + /// Extent of tensor + TensorCoord extent, + /// ID of each participating thread + int thread_id, + /// Initial offset of threadblock + TensorCoord const& threadblock_offset, + int const* indices = + nullptr ///< gather/scatter indices, note no support for + ///< gather/scatter at this specialization + ) + : iterator_( + params.params_, + pointer, + layout::PitchLinearCoord( + extent.row() * kInterleavedK, + extent.column() / kInterleavedK), + thread_id, + layout::PitchLinearCoord( + threadblock_offset.row() * kInterleavedK, + threadblock_offset.column() / kInterleavedK)) {} + + /// Construct a PredicatedTileAccessIteratorResidualLast with zero threadblock + /// offset + CUTLASS_HOST_DEVICE + PredicatedTileAccessIteratorResidualLast( + Params const& params, ///< Precomputed parameters 
object + Pointer pointer, ///< Pointer to start of tensor + TensorCoord extent, ///< Extent of tensor + int thread_id ///< ID of each participating thread + ) + : PredicatedTileAccessIteratorResidualLast( + params, + pointer, + extent, + thread_id, + make_Coord(0, 0)) {} + + /// Overrides the internal iteration index + CUTLASS_HOST_DEVICE + void set_iteration_index(int index) { + iterator_.set_iteration_index(index); + } + + CUTLASS_HOST_DEVICE + void set_residual_tile(bool enable) { + iterator_.set_residual_tile(enable); + } + + /// Adds a pointer offset in units of Element + CUTLASS_HOST_DEVICE + void add_pointer_offset(LongIndex pointer_offset) { + iterator_.add_pointer_offset(pointer_offset); + } + + /// Advances an iterator along logical dimensions of matrix in units of whole + /// tiles + CUTLASS_HOST_DEVICE + void add_tile_offset(TensorCoord const& tile_offset) { + iterator_.add_tile_offset({tile_offset.row(), tile_offset.column()}); + } + + /// Returns a pointer + CUTLASS_HOST_DEVICE + AccessType* get() const { + return reinterpret_cast(iterator_.get()); + } + + /// Advances to the next tile in memory. + /// + /// The first time this method is called, predicates are updated, and the + /// iterator's internal pointer is reverted to the first "steady state" tile. + /// Subsequent calls are lightweight and must only update the internal + /// pointer. + CUTLASS_HOST_DEVICE + PredicatedTileAccessIteratorResidualLast& operator++() { + ++iterator_; + return *this; + } + + /// Advances to the next tile in memory. + /// + /// The first time this method is called, predicates are updated, and the + /// iterator's internal pointer is reverted to the first "steady state" tile. + /// Subsequent calls are lightweight and must only update the internal + /// pointer. 
+ CUTLASS_HOST_DEVICE + PredicatedTileAccessIteratorResidualLast operator++(int) { + PredicatedTileAccessIteratorResidualLast self(*this); + operator++(); + return self; + } + + /// Clears the predicate set efficiently + CUTLASS_HOST_DEVICE + void clear_mask(bool enable = true) { + iterator_.clear_mask(enable); + } + + /// Clears the predicate set efficiently + CUTLASS_HOST_DEVICE + void enable_mask() { + iterator_.enable_mask(); + } + + /// Sets the predicate mask, overriding value stored in predicate iterator + CUTLASS_HOST_DEVICE + void set_mask(Mask const& mask) { + iterator_.set_mask(mask); + } + + /// Gets the mask + CUTLASS_HOST_DEVICE + void get_mask(Mask& mask) { + iterator_.get_mask(mask); + } + + /// Returns whether access is valid or not + CUTLASS_HOST_DEVICE + bool valid() { + return iterator_.valid(); + } +}; + +//////////////////////////////////////////////////////////////////////////////// + +/// Specialization of PredicatedTileAccessIteratorResidualLast for row-major +/// interleaved data. +// It is mapped to the congruous layout. 
+/// +/// Satisfies: ForwardTileIteratorConcept | +/// ReadableContiguousTileIteratorConcept | +/// WriteableContiguousTileIteratorConcept | +/// MaskedTileIteratorConcept +/// +template < + typename Shape_, + typename Element_, + int AdvanceRank, + typename ThreadMap_, + typename AccessType_, + int InterleavedK> +class PredicatedTileAccessIteratorResidualLast< + Shape_, + Element_, + layout::RowMajorInterleaved, + AdvanceRank, + ThreadMap_, + AccessType_, + false> { + public: + static_assert( + AdvanceRank == 0 || AdvanceRank == 1, + "Specialization for pitch-linear iterator may along advance along the " + "contiguous(rank=0) or strided(rank=1) dimension."); + + using Shape = Shape_; + using Element = Element_; + static int const kInterleavedK = InterleavedK; + using Layout = layout::RowMajorInterleaved; + static int const kAdvanceRank = AdvanceRank; + using ThreadMap = ThreadMap_; + using AccessType = AccessType_; + + using Index = typename Layout::Index; + using LongIndex = typename Layout::LongIndex; + + using TensorRef = TensorRef; + using TensorView = TensorView; + using TensorCoord = typename Layout::TensorCoord; + + using Pointer = Element*; + using NonConstPointer = typename platform::remove_const::type*; + + using UnderlyingIterator = PredicatedTileAccessIteratorResidualLast< + layout::PitchLinearShape< + Shape::kColumn * kInterleavedK, + Shape::kRow / kInterleavedK>, + Element, + layout::PitchLinear, + (kAdvanceRank == 0 ? 
1 : 0), + ThreadMap, + AccessType>; + + static int const kAccessesPerVector = UnderlyingIterator::kAccessesPerVector; + + /// Predicate vector stores mask to guard accesses + using Mask = typename UnderlyingIterator::Mask; + + /// Parameters object is precomputed state and is host-constructible + class Params { + private: + friend PredicatedTileAccessIteratorResidualLast; + + /// Parameters object + typename UnderlyingIterator::Params params_; + + public: + CUTLASS_HOST_DEVICE + Params() {} + + /// Construct the Params object given a pitch-linear tensor's layout + CUTLASS_HOST_DEVICE + Params(Layout const& layout) + : params_(layout::PitchLinear(layout.stride(0))) {} + + CUTLASS_HOST_DEVICE + Params(typename UnderlyingIterator::Params::Base const& base) + : params_(base) {} + }; + + private: + // + // Data members + // + + /// Underlying pitch-linear tile iterator + UnderlyingIterator iterator_; + + public: + /// Constructs a TileIterator from its precomputed state, threadblock offset, + /// and thread ID + CUTLASS_HOST_DEVICE + PredicatedTileAccessIteratorResidualLast( + /// Precomputed parameters object + Params const& params, + /// Pointer to start of tensor + Pointer pointer, + /// Extent of tensor + TensorCoord extent, + /// ID of each participating thread + int thread_id, + /// Initial offset of threadblock + TensorCoord const& threadblock_offset, + int const* indices = + nullptr ///< gather/scatter indices, note no support for + ///< gather/scatter at this specialization + ) + : iterator_( + params.params_, + pointer, + layout::PitchLinearCoord( + extent.column() * kInterleavedK, + extent.row() / kInterleavedK), + thread_id, + layout::PitchLinearCoord( + threadblock_offset.column() * kInterleavedK, + threadblock_offset.row() / kInterleavedK)) {} + + /// Construct a PredicatedTileAccessIteratorResidualLast with zero threadblock + /// offset + CUTLASS_HOST_DEVICE + PredicatedTileAccessIteratorResidualLast( + Params const& params, ///< Precomputed parameters 
object + Pointer pointer, ///< Pointer to start of tensor + TensorCoord extent, ///< Extent of tensor + int thread_id ///< ID of each participating thread + ) + : PredicatedTileAccessIteratorResidualLast( + params, + pointer, + extent, + thread_id, + make_Coord(0, 0)) {} + + /// Overrides the internal iteration index + CUTLASS_HOST_DEVICE + void set_iteration_index(int index) { + iterator_.set_iteration_index(index); + } + + CUTLASS_HOST_DEVICE + void set_residual_tile(bool enable) { + iterator_.set_residual_tile(enable); + } + + /// Adds a pointer offset in units of Element + CUTLASS_HOST_DEVICE + void add_pointer_offset(LongIndex pointer_offset) { + iterator_.add_pointer_offset(pointer_offset); + } + + /// Advances an iterator along logical dimensions of matrix in units of whole + /// tiles + CUTLASS_HOST_DEVICE + void add_tile_offset(TensorCoord const& tile_offset) { + iterator_.add_tile_offset({tile_offset.column(), tile_offset.row()}); + } + + /// Returns a pointer + CUTLASS_HOST_DEVICE + AccessType* get() const { + return reinterpret_cast(iterator_.get()); + } + + /// Advances to the next tile in memory. + /// + /// The first time this method is called, predicates are updated, and the + /// iterator's internal pointer is reverted to the first "steady state" tile. + /// Subsequent calls are lightweight and must only update the internal + /// pointer. + CUTLASS_HOST_DEVICE + PredicatedTileAccessIteratorResidualLast& operator++() { + ++iterator_; + return *this; + } + + /// Advances to the next tile in memory. + /// + /// The first time this method is called, predicates are updated, and the + /// iterator's internal pointer is reverted to the first "steady state" tile. + /// Subsequent calls are lightweight and must only update the internal + /// pointer. 
+ CUTLASS_HOST_DEVICE + PredicatedTileAccessIteratorResidualLast operator++(int) { + PredicatedTileAccessIteratorResidualLast self(*this); + operator++(); + return self; + } + + /// Clears the predicate set efficiently + CUTLASS_HOST_DEVICE + void clear_mask(bool enable = true) { + iterator_.clear_mask(enable); + } + + /// Clears the predicate set efficiently + CUTLASS_HOST_DEVICE + void enable_mask() { + iterator_.enable_mask(); + } + + /// Sets the predicate mask, overriding value stored in predicate iterator + CUTLASS_HOST_DEVICE + void set_mask(Mask const& mask) { + iterator_.set_mask(mask); + } + + /// Gets the mask + CUTLASS_HOST_DEVICE + void get_mask(Mask& mask) { + iterator_.get_mask(mask); + } + + /// Returns whether access is valid or not + CUTLASS_HOST_DEVICE + bool valid() { + return iterator_.valid(); + } +}; + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace threadblock +} // namespace transform +} // namespace cutlass + +//////////////////////////////////////////////////////////////////////////////// + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/transformers/cuda/mem_eff_attention/iterators/predicated_tile_iterator_residual_last.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/transformers/cuda/mem_eff_attention/iterators/predicated_tile_iterator_residual_last.h new file mode 100644 index 0000000000000000000000000000000000000000..0f1f5a2f63f12a0537ced07872ea81121315f9d4 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/transformers/cuda/mem_eff_attention/iterators/predicated_tile_iterator_residual_last.h @@ -0,0 +1,2125 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +/*************************************************************************************************** + * Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights + *reserved. SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + *this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + *ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + *LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + *CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + *SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + *INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + *CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + *ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + *POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Templates implementing loading of tiles from pitch-linear rank=2 + tensors. + + This iterator uses masks to guard out-of-bounds accesses. The first tile + this iterator visits maybe partial, then the remaining tiles are complete. + So, we only need to compute the predicates twice, once before the first tile + and once for the remaining full tiles which can share the same predicates. + + A precomputed "Params" object minimizes the amount of state that must be + stored in registers, and integer addition is used to advance the pointer + through memory. 
+*/ + +#pragma once + +#include +#include + +//////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace transform { +namespace threadblock { + +//////////////////////////////////////////////////////////////////////////////// + +/// PredicatedTileIteratorResidualLast +/// +/// Satisfies: ForwardTileIteratorConcept | +/// ReadableContiguousTileIteratorConcept | +/// WriteableContiguousTileIteratorConcept | +/// MaskedTileIteratorConcept +/// +/// Regular tile iterator using a precomputed control structure to minimize +/// register liveness and integer arithmetic. +/// +/// Layout is assumed to be invariant at the time the precomputed "Params" +/// object is constructed. +/// +/// Base pointer and tensor extents may be specified at the time the iterator is +/// constructed. Subsequently, they are assumed to be immutable. +/// +/// Adding a logical coordinate offset may be performed at the time the iterator +/// is constructed. Subsequent additions to logical coordinate offset may be +/// performed but are relatively expensive. +/// +/// Visitation order is intended to first visit a "residual" tile that may be +/// partially full in both the advance dimension and the steady-state dimension. +/// This is assumed to be the last tile in the iteration sequence. Advancing an +/// iterator that has just been constructed moves to the first tile that is full +/// in the advance dimension and recomputes predicates. Subsequent accesses may +/// be performed without updating internal predicates and are efficient in terms +/// of live register state and pointer arithmetic instructions. +/// +/// To be efficient, this assumes the iterator will be dereferenced and advanced +/// at least once outside any looping structure to minimize integer arithmetic. +/// +/// Access out of bounds are safe so long as `clear_mask()` is called prior to +/// dereferencing the iterator. 
+/// +/// +/// Example: +/// +/// An efficient pipeline structure may be constructed as follows: +/// +// template +// __global__ void kernel( +// typename Iterator::Params params, +// typename Iterator::Element *ptr, +// TensorCoord extent) { +// +// typename Iterator::Fragment fragment; +// +// TensorCoord threadblock_offset(0, 0); +// +// Iterator iter(params, ptr, extent, threadIdx.x, threadblock_offsets); +// +// +// fragment = *iter; // load "residue" tile first +// ++iter; // advance to first "steady state" tile and update +// internal masks +// +// +// #pragma unroll +// for (int i = Remaining - 1; i >= 0; --i) { +// +// f(fragment); +// +// if (!i) { +// iter.clear_mask(); // light-weight operation to clear masks - +// subsequent loads become NO-OPs. +// } +// +// fragment = *iter; // load tile during "steady state" phase +// ++iter; // advance to next tile - lightweight due to +// steady-state masks +// } +// } +// +// void host(TensorView view) { +// +// using Iterator = +// transform::threadblock::PredicatedTileIteratorResidualLast; +// +// typename Iterator::Params params(view.layout()); +// +// kernel(params, view.data()); +// } +/// +/// +template < + typename Shape, + typename Element, + typename Layout, + int AdvanceRank, + typename ThreadMap, + int AccessSize = ThreadMap::kElementsPerAccess, + bool Gather = false> +class PredicatedTileIteratorResidualLast; + +//////////////////////////////////////////////////////////////////////////////// + +/// Specialization of PredicatedTileIteratorResidualLast for pitch-linear data. 
+/// +/// Satisfies: ForwardTileIteratorConcept | +/// ReadableContiguousTileIteratorConcept | +/// WriteableContiguousTileIteratorConcept | +/// MaskedTileIteratorConcept +/// +template < + typename Shape_, + typename Element_, + int AdvanceRank, + typename ThreadMap_, + int AccessSize, + bool Gather> +class PredicatedTileIteratorResidualLast< + Shape_, + Element_, + layout::PitchLinear, + AdvanceRank, + ThreadMap_, + AccessSize, + Gather> { + public: + static_assert( + AdvanceRank == 0 || AdvanceRank == 1, + "Specialization for pitch-linear iterator may advance along the " + "contiguous(rank=0) or strided(rank=1) dimension."); + + using Shape = Shape_; + using Element = Element_; + using Layout = layout::PitchLinear; + static int const kAdvanceRank = AdvanceRank; + using ThreadMap = ThreadMap_; + + using Index = typename Layout::Index; + using LongIndex = typename Layout::LongIndex; + + using TensorRef = TensorRef; + using TensorView = TensorView; + using TensorCoord = typename Layout::TensorCoord; + + using Pointer = Element*; + using NonConstPointer = typename platform::remove_const::type*; + + /// Type used for internal memory accesses + using AccessType = AlignedArray< + Element, + AccessSize, + (AccessSize * sizeof_bits::value / 8)>; + + /// Underlying iterator to compute the addresses + using TileAccessIterator = PredicatedTileAccessIteratorResidualLast< + Shape, + Element, + Layout, + kAdvanceRank, + ThreadMap, + AccessType, + Gather>; + + static int const kAccessesPerVector = TileAccessIterator::kAccessesPerVector; + + /// Fragment object to be loaded or stored + using Fragment = cutlass::Array< + Element, + ThreadMap::Iterations::kCount * ThreadMap::kElementsPerAccess>; + + /// Predicate vector stores mask to guard accesses + using Mask = typename TileAccessIterator::Mask; + + /// Parameters object is precomputed state and is host-constructible + class Params { + public: + using Base = typename TileAccessIterator::Params::Base; + + friend 
PredicatedTileIteratorResidualLast; + + private: + /// Parameters object + typename TileAccessIterator::Params params_; + + public: + /// Construct the Params object given a pitch-linear tensor's layout + CUTLASS_HOST_DEVICE + Params(Layout const& layout) : params_(layout) {} + + CUTLASS_HOST_DEVICE + Params() {} + + CUTLASS_HOST_DEVICE + Params(Base const& base) : params_(base) {} + }; + + private: + /// Internal pointer type permits fast address arithmetic + using BytePointer = char*; + + private: + // + // Data members + // + + /// Data member to the tile access iterator + TileAccessIterator address_iterator_; + + public: + /// Constructs a TileIterator from its precomputed state, threadblock offset, + /// and thread ID + CUTLASS_HOST_DEVICE + PredicatedTileIteratorResidualLast( + /// Precomputed parameters object + Params const& params, + /// Pointer to start of tensor + Pointer pointer, + /// Extent of tensor + TensorCoord extent, + /// ID of each participating thread + int thread_id, + /// Initial offset of threadblock + TensorCoord const& threadblock_offset, + /// Gather indices + int const* indices = nullptr) + : address_iterator_( + params.params_, + pointer, + extent, + thread_id, + threadblock_offset, + indices) {} + + /// Construct a PredicatedTileIteratorResidualLast with zero threadblock + /// offset + CUTLASS_HOST_DEVICE + PredicatedTileIteratorResidualLast( + Params const& params, ///< Precomputed parameters object + Pointer pointer, ///< Pointer to start of tensor + TensorCoord extent, ///< Extent of tensor + int thread_id ///< ID of each participating thread + ) + : PredicatedTileIteratorResidualLast( + params, + pointer, + extent, + thread_id, + make_Coord(0, 0)) {} + + /// Adds a pointer offset in units of Element + CUTLASS_HOST_DEVICE + void add_pointer_offset(LongIndex pointer_offset) { + address_iterator_.add_pointer_offset(pointer_offset); + } + + /// Advances to the next tile in memory. 
+ /// + /// The first time this method is called, predicates are updated, and the + /// iterator's internal pointer is reverted to the first "steady state" tile. + /// Subsequent calls are lightweight and must only update the internal + /// pointer. + CUTLASS_HOST_DEVICE + PredicatedTileIteratorResidualLast& operator++() { + if (kAdvanceRank) + address_iterator_.add_tile_offset({0, 1}); + else + address_iterator_.add_tile_offset({1, 0}); + + return *this; + } + + /// Advances to the next tile in memory. + /// + /// The first time this method is called, predicates are updated, and the + /// iterator's internal pointer is reverted to the first "steady state" tile. + /// Subsequent calls are lightweight and must only update the internal + /// pointer. + CUTLASS_HOST_DEVICE + PredicatedTileIteratorResidualLast operator++(int) { + PredicatedTileIteratorResidualLast self(*this); + operator++(); + return self; + } + + /// Clears the predicate set efficiently + CUTLASS_HOST_DEVICE + void clear_mask(bool enable = true) { + address_iterator_.clear_mask(enable); + } + + CUTLASS_HOST_DEVICE + void set_residual_tile(bool enable) { + address_iterator_.set_residual_tile(enable); + } + + /// Clears the predicate set efficiently + CUTLASS_HOST_DEVICE + void enable_mask() { + address_iterator_.enable_mask(); + } + + /// Sets the predicate mask, overriding value stored in predicate iterator + CUTLASS_HOST_DEVICE + void set_mask(Mask const& mask) { + address_iterator_.set_mask(mask); + } + + /// Gets the mask + CUTLASS_HOST_DEVICE + void get_mask(Mask& mask) { + address_iterator_.get_mask(mask); + } + + CUTLASS_DEVICE + void load_with_pointer_offset(Fragment& frag, Index pointer_offset) { + load_with_byte_offset( + frag, pointer_offset * sizeof_bits::value / 8); + } + + CUTLASS_DEVICE + void load_with_byte_offset(Fragment& frag, LongIndex byte_offset) { + AccessType* frag_ptr = reinterpret_cast(&frag); + + CUTLASS_PRAGMA_UNROLL + for (int s = 0; s < ThreadMap::Iterations::kStrided; 
++s) { + CUTLASS_PRAGMA_UNROLL + for (int c = 0; c < ThreadMap::Iterations::kContiguous; ++c) { + CUTLASS_PRAGMA_UNROLL + for (int v = 0; v < kAccessesPerVector; ++v) { + int idx = v + + kAccessesPerVector * (c + s * ThreadMap::Iterations::kContiguous); + + address_iterator_.set_iteration_index(idx); + char const* byte_ptr = + reinterpret_cast(address_iterator_.get()) + + byte_offset; + + AccessType const* access_ptr = + reinterpret_cast(byte_ptr); + + cutlass::arch::global_load( + frag_ptr[idx], access_ptr, address_iterator_.valid()); + + ++address_iterator_; + } + } + } + } + + /// Loads a fragment from memory + CUTLASS_DEVICE + void load(Fragment& frag) { + load_with_byte_offset(frag, 0); + } + + /// Store a fragment to memory + CUTLASS_DEVICE + void store_with_pointer_offset(Fragment const& frag, Index pointer_offset) { + store_with_byte_offset( + frag, pointer_offset * sizeof_bits::value / 8); + } + + /// Store a fragment to memory + CUTLASS_DEVICE + void store_with_byte_offset(Fragment const& frag, LongIndex byte_offset) { + address_iterator_.set_iteration_index(0); + AccessType const* frag_ptr = reinterpret_cast(&frag); + + CUTLASS_PRAGMA_UNROLL + for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) { + CUTLASS_PRAGMA_UNROLL + for (int c = 0; c < ThreadMap::Iterations::kContiguous; ++c) { + CUTLASS_PRAGMA_UNROLL + for (int v = 0; v < kAccessesPerVector; ++v) { + int idx = v + + kAccessesPerVector * (c + s * ThreadMap::Iterations::kContiguous); + + char* byte_ptr = + reinterpret_cast(address_iterator_.get()) + byte_offset; + AccessType* access_ptr = reinterpret_cast(byte_ptr); + + if (address_iterator_.valid()) { + *access_ptr = frag_ptr[idx]; + } + ++address_iterator_; + } + } + } + } + + /// Store a fragment to memory + CUTLASS_DEVICE + void store(Fragment const& frag) { + store_with_byte_offset(frag, 0); + } +}; + +//////////////////////////////////////////////////////////////////////////////// + +/// Specialization of 
PredicatedTileIteratorResidualLast for pitch-linear data. +/// +/// Satisfies: ForwardTileIteratorConcept | +/// ReadableContiguousTileIteratorConcept | +/// WriteableContiguousTileIteratorConcept | +/// MaskedTileIteratorConcept +/// +template < + typename Shape_, + typename Element_, + int AdvanceRank, + typename ThreadMap_, + int AccessSize, + bool Gather> +class PredicatedTileIteratorResidualLast< + Shape_, + Element_, + layout::ColumnMajor, + AdvanceRank, + ThreadMap_, + AccessSize, + Gather> { + public: + static_assert( + AdvanceRank == 0 || AdvanceRank == 1, + "Specialization for pitch-linear iterator may along advance along the " + "contiguous(rank=0) or strided(rank=1) dimension."); + + using Shape = Shape_; + using Element = Element_; + using Layout = layout::ColumnMajor; + static int const kAdvanceRank = AdvanceRank; + using ThreadMap = ThreadMap_; + + using Index = typename Layout::Index; + using LongIndex = typename Layout::LongIndex; + + using TensorRef = TensorRef; + using TensorView = TensorView; + using TensorCoord = typename Layout::TensorCoord; + + using Pointer = Element*; + using NonConstPointer = typename platform::remove_const::type*; + + using UnderlyingIterator = PredicatedTileIteratorResidualLast< + layout::PitchLinearShape, + Element, + layout::PitchLinear, + (kAdvanceRank == 0 ? 
0 : 1), + ThreadMap, + AccessSize, + Gather>; + + using AccessType = typename UnderlyingIterator::AccessType; + + /// Fragment object to be loaded or stored + using Fragment = cutlass::Array< + Element, + ThreadMap::Iterations::kCount * ThreadMap::kElementsPerAccess>; + + /// Predicate vector stores mask to guard accesses + using Mask = typename UnderlyingIterator::Mask; + + /// Parameters object is precomputed state and is host-constructible + class Params { + private: + friend PredicatedTileIteratorResidualLast; + + /// Parameters object + typename UnderlyingIterator::Params params_; + + public: + CUTLASS_HOST_DEVICE + Params() {} + + /// Construct the Params object given a pitch-linear tensor's layout + CUTLASS_HOST_DEVICE + Params(Layout const& layout) + : params_(layout::PitchLinear(layout.stride(0))) {} + + CUTLASS_HOST_DEVICE + Params(typename UnderlyingIterator::Params::Base const& base) + : params_(base) {} + }; + + private: + // + // Data members + // + + /// Underlying pitch-linear tile iterator + UnderlyingIterator iterator_; + + public: + /// Constructs a TileIterator from its precomputed state, threadblock offset, + /// and thread ID + CUTLASS_HOST_DEVICE + PredicatedTileIteratorResidualLast( + Params const& params, ///< Precomputed parameters object + Pointer pointer, ///< Pointer to start of tensor + TensorCoord extent, ///< Extent of tensor + int thread_id, ///< ID of each participating thread + TensorCoord const& threadblock_offset, ///< Initial offset of threadblock + int const* indices = + nullptr ///< gather/scatter indices, note no support for + ///< gather/scatter at this specialization + ) + : iterator_( + params.params_, + pointer, + layout::PitchLinearCoord(extent.row(), extent.column()), + thread_id, + layout::PitchLinearCoord( + threadblock_offset.row(), + threadblock_offset.column()), + indices) {} + + /// Construct a PredicatedTileIteratorResidualLast with zero threadblock + /// offset + CUTLASS_HOST_DEVICE + 
PredicatedTileIteratorResidualLast( + Params const& params, ///< Precomputed parameters object + Pointer pointer, ///< Pointer to start of tensor + TensorCoord extent, ///< Extent of tensor + int thread_id ///< ID of each participating thread + ) + : PredicatedTileIteratorResidualLast( + params, + pointer, + extent, + thread_id, + make_Coord(0, 0)) {} + + /// Adds a pointer offset in units of Element + CUTLASS_HOST_DEVICE + void add_pointer_offset(LongIndex pointer_offset) { + iterator_.add_pointer_offset(pointer_offset); + } + + /// Advances to the next tile in memory. + /// + /// The first time this method is called, predicates are updated, and the + /// iterator's internal pointer is reverted to the first "steady state" tile. + /// Subsequent calls are lightweight and must only update the internal + /// pointer. + CUTLASS_HOST_DEVICE + PredicatedTileIteratorResidualLast& operator++() { + ++iterator_; + return *this; + } + + /// Advances to the next tile in memory. + /// + /// The first time this method is called, predicates are updated, and the + /// iterator's internal pointer is reverted to the first "steady state" tile. + /// Subsequent calls are lightweight and must only update the internal + /// pointer. 
+ CUTLASS_HOST_DEVICE + PredicatedTileIteratorResidualLast operator++(int) { + PredicatedTileIteratorResidualLast self(*this); + operator++(); + return self; + } + + /// Clears the predicate set efficiently + CUTLASS_HOST_DEVICE + void clear_mask(bool enable = true) { + iterator_.clear_mask(enable); + } + + CUTLASS_HOST_DEVICE + void set_residual_tile(bool enable) { + iterator_.set_residual_tile(enable); + } + + /// Clears the predicate set efficiently + CUTLASS_HOST_DEVICE + void enable_mask() { + iterator_.enable_mask(); + } + + /// Sets the predicate mask, overriding value stored in predicate iterator + CUTLASS_HOST_DEVICE + void set_mask(Mask const& mask) { + iterator_.set_mask(mask); + } + + /// Gets the mask + CUTLASS_HOST_DEVICE + void get_mask(Mask& mask) { + iterator_.get_mask(mask); + } + + /// Loads a fragment from memory + CUTLASS_DEVICE + void load_with_pointer_offset(Fragment& frag, Index pointer_offset) { + iterator_.load_with_pointer_offset(frag, pointer_offset); + } + + /// Loads a fragment from memory + CUTLASS_DEVICE + void load_with_byte_offset(Fragment& frag, LongIndex byte_offset) { + iterator_.load_with_byte_offset(frag, byte_offset); + } + + /// Loads a fragment from memory + CUTLASS_DEVICE + void load(Fragment& frag) { + load_with_pointer_offset(frag, 0); + } + + /// Store a fragment to memory + CUTLASS_DEVICE + void store_with_pointer_offset(Fragment const& frag, Index pointer_offset) { + iterator_.store_with_pointer_offset(frag, pointer_offset); + } + + /// Store a fragment to memory + CUTLASS_DEVICE + void store_with_byte_offset(Fragment const& frag, LongIndex byte_offset) { + iterator_.store_with_byte_offset(frag, byte_offset); + } + + /// Store a fragment to memory + CUTLASS_DEVICE + void store(Fragment const& frag) { + store_with_pointer_offset(frag, 0); + } +}; + +//////////////////////////////////////////////////////////////////////////////// + +/// Specialization of PredicatedTileIteratorResidualLast for pitch-linear data. 
+/// +/// Satisfies: ForwardTileIteratorConcept | +/// ReadableContiguousTileIteratorConcept | +/// WriteableContiguousTileIteratorConcept | +/// MaskedTileIteratorConcept +/// +template < + typename Shape_, + typename Element_, + int AdvanceRank, + typename ThreadMap_, + int AccessSize, + bool Gather> +class PredicatedTileIteratorResidualLast< + Shape_, + Element_, + layout::RowMajor, + AdvanceRank, + ThreadMap_, + AccessSize, + Gather> { + public: + static_assert( + AdvanceRank == 0 || AdvanceRank == 1, + "Specialization for pitch-linear iterator may along advance along the " + "contiguous(rank=0) or strided(rank=1) dimension."); + + using Shape = Shape_; + using Element = Element_; + using Layout = layout::RowMajor; + static int const kAdvanceRank = AdvanceRank; + using ThreadMap = ThreadMap_; + + using Index = typename Layout::Index; + using LongIndex = typename Layout::LongIndex; + + using TensorRef = TensorRef; + using TensorView = TensorView; + using TensorCoord = typename Layout::TensorCoord; + + using Pointer = Element*; + using NonConstPointer = typename platform::remove_const::type*; + + using UnderlyingIterator = PredicatedTileIteratorResidualLast< + layout::PitchLinearShape, + Element, + layout::PitchLinear, + (kAdvanceRank == 0 ? 
1 : 0), + ThreadMap, + AccessSize, + Gather>; + + using AccessType = typename UnderlyingIterator::AccessType; + + /// Fragment object to be loaded or stored + using Fragment = cutlass::Array< + Element, + ThreadMap::Iterations::kCount * ThreadMap::kElementsPerAccess>; + + /// Predicate vector stores mask to guard accesses + using Mask = typename UnderlyingIterator::Mask; + + /// Parameters object is precomputed state and is host-constructible + class Params { + private: + friend PredicatedTileIteratorResidualLast; + + /// Parameters object + typename UnderlyingIterator::Params params_; + + public: + CUTLASS_HOST_DEVICE + Params() {} + + /// Construct the Params object given a pitch-linear tensor's layout + CUTLASS_HOST_DEVICE + Params(Layout const& layout) + : params_(layout::PitchLinear(layout.stride(0))) {} + + CUTLASS_HOST_DEVICE + Params(typename UnderlyingIterator::Params::Base const& base) + : params_(base) {} + }; + + private: + // + // Data members + // + + /// Underlying pitch-linear tile iterator + UnderlyingIterator iterator_; + + public: + /// Constructs a TileIterator from its precomputed state, threadblock offset, + /// and thread ID + CUTLASS_HOST_DEVICE + PredicatedTileIteratorResidualLast( + Params const& params, ///< Precomputed parameters object + Pointer pointer, ///< Pointer to start of tensor + TensorCoord extent, ///< Extent of tensor + int thread_id, ///< ID of each participating thread + TensorCoord const& threadblock_offset, ///< Initial offset of threadblock + int const* indices = nullptr ///< Gather indices + ) + : iterator_( + params.params_, + pointer, + layout::PitchLinearCoord(extent.column(), extent.row()), + thread_id, + layout::PitchLinearCoord( + threadblock_offset.column(), + threadblock_offset.row()), + indices) {} + + /// Construct a PredicatedTileIteratorResidualLast with zero threadblock + /// offset + CUTLASS_HOST_DEVICE + PredicatedTileIteratorResidualLast( + Params const& params, ///< Precomputed parameters object + 
Pointer pointer, ///< Pointer to start of tensor + TensorCoord extent, ///< Extent of tensor + int thread_id ///< ID of each participating thread + ) + : PredicatedTileIteratorResidualLast( + params, + pointer, + extent, + thread_id, + make_Coord(0, 0)) {} + + /// Adds a pointer offset in units of Element + CUTLASS_HOST_DEVICE + void add_pointer_offset(LongIndex pointer_offset) { + iterator_.add_pointer_offset(pointer_offset); + } + + /// Advances to the next tile in memory. + /// + /// The first time this method is called, predicates are updated, and the + /// iterator's internal pointer is reverted to the first "steady state" tile. + /// Subsequent calls are lightweight and must only update the internal + /// pointer. + CUTLASS_HOST_DEVICE + PredicatedTileIteratorResidualLast& operator++() { + ++iterator_; + return *this; + } + + /// Advances to the next tile in memory. + /// + /// The first time this method is called, predicates are updated, and the + /// iterator's internal pointer is reverted to the first "steady state" tile. + /// Subsequent calls are lightweight and must only update the internal + /// pointer. 
+ CUTLASS_HOST_DEVICE + PredicatedTileIteratorResidualLast operator++(int) { + PredicatedTileIteratorResidualLast self(*this); + operator++(); + return self; + } + + /// Clears the predicate set efficiently + CUTLASS_HOST_DEVICE + void clear_mask(bool enable = true) { + iterator_.clear_mask(enable); + } + + CUTLASS_HOST_DEVICE + void set_residual_tile(bool enable) { + iterator_.set_residual_tile(enable); + } + + /// Clears the predicate set efficiently + CUTLASS_HOST_DEVICE + void enable_mask() { + iterator_.enable_mask(); + } + + /// Sets the predicate mask, overriding value stored in predicate iterator + CUTLASS_HOST_DEVICE + void set_mask(Mask const& mask) { + iterator_.set_mask(mask); + } + + /// Gets the mask + CUTLASS_HOST_DEVICE + void get_mask(Mask& mask) { + iterator_.get_mask(mask); + } + + /// Loads a fragment from memory + CUTLASS_DEVICE + void load_with_pointer_offset(Fragment& frag, Index pointer_offset) { + iterator_.load_with_pointer_offset(frag, pointer_offset); + } + + /// Loads a fragment from memory + CUTLASS_DEVICE + void load_with_byte_offset(Fragment& frag, LongIndex byte_offset) { + iterator_.load_with_byte_offset(frag, byte_offset); + } + + /// Loads a fragment from memory + CUTLASS_DEVICE + void load(Fragment& frag) { + load_with_pointer_offset(frag, 0); + } + + /// Store a fragment to memory + CUTLASS_DEVICE + void store_with_pointer_offset(Fragment const& frag, Index pointer_offset) { + iterator_.store_with_pointer_offset(frag, pointer_offset); + } + + /// Store a fragment to memory + CUTLASS_DEVICE + void store_with_byte_offset(Fragment const& frag, LongIndex byte_offset) { + iterator_.store_with_byte_offset(frag, byte_offset); + } + + /// Store a fragment to memory + CUTLASS_DEVICE + void store(Fragment const& frag) { + store_with_pointer_offset(frag, 0); + } +}; + +//////////////////////////////////////////////////////////////////////////////// + +/// Specialization of PredicatedTileIteratorResidualLast for affine rank-2 data. 
+/// +/// Satisfies: ForwardTileIteratorConcept | +/// ReadableContiguousTileIteratorConcept | +/// WriteableContiguousTileIteratorConcept | +/// MaskedTileIteratorConcept +/// +template < + typename Shape_, + typename Element_, + int AdvanceRank, + typename ThreadMap_, + int AccessSize> +class PredicatedTileIteratorResidualLast< + Shape_, + Element_, + layout::AffineRankN<2>, + AdvanceRank, + ThreadMap_, + AccessSize, + false> { + public: + static_assert( + AdvanceRank == 0 || AdvanceRank == 1, + "Specialization for pitch-linear iterator may advance along the " + "contiguous(rank=0) or strided(rank=1) dimension."); + + using Shape = Shape_; + using Element = Element_; + using Layout = layout::AffineRankN<2>; + static int const kAdvanceRank = AdvanceRank; + using ThreadMap = ThreadMap_; + + using Index = typename Layout::Index; + using LongIndex = typename Layout::LongIndex; + + using TensorRef = TensorRef; + using TensorView = TensorView; + using TensorCoord = typename Layout::TensorCoord; + + using Pointer = Element*; + using NonConstPointer = typename platform::remove_const::type*; + + /// Type used for internal memory accesses + using AccessType = AlignedArray< + Element, + AccessSize, + (AccessSize * sizeof_bits::value / 8)>; + + /// Underlying iterator to compute the addresses + using TileAccessIterator = PredicatedTileAccessIteratorResidualLast< + Shape, + Element, + Layout, + kAdvanceRank, + ThreadMap, + AccessType>; + + static int const kAccessesPerVector = TileAccessIterator::kAccessesPerVector; + + /// Fragment object to be loaded or stored + using Fragment = cutlass::Array< + Element, + ThreadMap::Iterations::kCount * ThreadMap::kElementsPerAccess>; + + /// Predicate vector stores mask to guard accesses + using Mask = typename TileAccessIterator::Mask; + + /// Parameters object is precomputed state and is host-constructible + class Params { + public: + friend PredicatedTileIteratorResidualLast; + + private: + /// Parameters object + typename 
TileAccessIterator::Params params_; + + public: + /// Construct the Params object given a pitch-linear tensor's layout + CUTLASS_HOST_DEVICE + Params(Layout const& layout) : params_(layout) {} + + CUTLASS_HOST_DEVICE + Params() {} + }; + + private: + /// Internal pointer type permits fast address arithmetic + using BytePointer = char*; + + private: + // + // Data members + // + + /// Data member to the tile access iterator + TileAccessIterator address_iterator_; + + public: + /// Constructs a TileIterator from its precomputed state, threadblock offset, + /// and thread ID + CUTLASS_HOST_DEVICE + PredicatedTileIteratorResidualLast( + /// Precomputed parameters object + Params const& params, + /// Pointer to start of tensor + Pointer pointer, + /// Extent of tensor + TensorCoord extent, + /// ID of each participating thread + int thread_id, + /// Initial offset of threadblock + TensorCoord const& threadblock_offset, + int const* indices = + nullptr ///< gather/scatter indices, note no support for + ///< gather/scatter at this specialization + ) + : address_iterator_( + params.params_, + pointer, + extent, + thread_id, + threadblock_offset) {} + + /// Construct a PredicatedTileIteratorResidualLast with zero threadblock + /// offset + CUTLASS_HOST_DEVICE + PredicatedTileIteratorResidualLast( + Params const& params, ///< Precomputed parameters object + Pointer pointer, ///< Pointer to start of tensor + TensorCoord extent, ///< Extent of tensor + int thread_id ///< ID of each participating thread + ) + : PredicatedTileIteratorResidualLast( + params, + pointer, + extent, + thread_id, + make_Coord(0, 0)) {} + + /// Adds a pointer offset in units of Element + CUTLASS_HOST_DEVICE + void add_pointer_offset(LongIndex pointer_offset) { + address_iterator_.add_pointer_offset(pointer_offset); + } + + /// Advances to the next tile in memory. 
+ /// + /// The first time this method is called, predicates are updated, and the + /// iterator's internal pointer is reverted to the first "steady state" tile. + /// Subsequent calls are lightweight and must only update the internal + /// pointer. + CUTLASS_HOST_DEVICE + PredicatedTileIteratorResidualLast& operator++() { + if (kAdvanceRank) + address_iterator_.add_tile_offset(make_Coord(0, 1)); + else + address_iterator_.add_tile_offset(make_Coord(1, 0)); + + return *this; + } + + /// Advances to the next tile in memory. + /// + /// The first time this method is called, predicates are updated, and the + /// iterator's internal pointer is reverted to the first "steady state" tile. + /// Subsequent calls are lightweight and must only update the internal + /// pointer. + CUTLASS_HOST_DEVICE + PredicatedTileIteratorResidualLast operator++(int) { + PredicatedTileIteratorResidualLast self(*this); + operator++(); + return self; + } + + /// Clears the predicate set efficiently + CUTLASS_HOST_DEVICE + void clear_mask(bool enable = true) { + address_iterator_.clear_mask(enable); + } + + CUTLASS_HOST_DEVICE + void set_residual_tile(bool enable) { + address_iterator_.set_residual_tile(enable); + } + + /// Clears the predicate set efficiently + CUTLASS_HOST_DEVICE + void enable_mask() { + address_iterator_.enable_mask(); + } + + /// Sets the predicate mask, overriding value stored in predicate iterator + CUTLASS_HOST_DEVICE + void set_mask(Mask const& mask) { + address_iterator_.set_mask(mask); + } + + /// Gets the mask + CUTLASS_HOST_DEVICE + void get_mask(Mask& mask) { + address_iterator_.get_mask(mask); + } + + CUTLASS_DEVICE + void load_with_pointer_offset(Fragment& frag, Index pointer_offset) { + load_with_byte_offset( + frag, pointer_offset * sizeof_bits::value / 8); + } + + CUTLASS_DEVICE + void load_with_byte_offset(Fragment& frag, LongIndex byte_offset) { + AccessType* frag_ptr = reinterpret_cast(&frag); + + CUTLASS_PRAGMA_UNROLL + for (int s = 0; s < 
ThreadMap::Iterations::kStrided; ++s) { + CUTLASS_PRAGMA_UNROLL + for (int c = 0; c < ThreadMap::Iterations::kContiguous; ++c) { + CUTLASS_PRAGMA_UNROLL + for (int v = 0; v < kAccessesPerVector; ++v) { + int idx = v + + kAccessesPerVector * (c + s * ThreadMap::Iterations::kContiguous); + + address_iterator_.set_iteration_index(idx); + char const* byte_ptr = + reinterpret_cast(address_iterator_.get()) + + byte_offset; + + AccessType const* access_ptr = + reinterpret_cast(byte_ptr); + + cutlass::arch::global_load( + frag_ptr[idx], access_ptr, address_iterator_.valid()); + + ++address_iterator_; + } + } + } + } + + /// Loads a fragment from memory + CUTLASS_DEVICE + void load(Fragment& frag) { + load_with_byte_offset(frag, 0); + } + + /// Store a fragment to memory + CUTLASS_DEVICE + void store_with_pointer_offset(Fragment const& frag, Index pointer_offset) { + store_with_byte_offset( + frag, pointer_offset * sizeof_bits::value / 8); + } + + /// Store a fragment to memory + CUTLASS_DEVICE + void store_with_byte_offset(Fragment const& frag, LongIndex byte_offset) { + address_iterator_.set_iteration_index(0); + AccessType const* frag_ptr = reinterpret_cast(&frag); + + CUTLASS_PRAGMA_UNROLL + for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) { + CUTLASS_PRAGMA_UNROLL + for (int c = 0; c < ThreadMap::Iterations::kContiguous; ++c) { + CUTLASS_PRAGMA_UNROLL + for (int v = 0; v < kAccessesPerVector; ++v) { + int idx = v + + kAccessesPerVector * (c + s * ThreadMap::Iterations::kContiguous); + + char* byte_ptr = + reinterpret_cast(address_iterator_.get()) + byte_offset; + AccessType* access_ptr = reinterpret_cast(byte_ptr); + + if (address_iterator_.valid()) { + *access_ptr = frag_ptr[idx]; + } + ++address_iterator_; + } + } + } + } + + /// Store a fragment to memory + CUTLASS_DEVICE + void store(Fragment const& frag) { + store_with_byte_offset(frag, 0); + } +}; + +//////////////////////////////////////////////////////////////////////////////// + +/// Specialization of 
PredicatedTileIteratorResidualLast for affine rank 2 +/// column-major data. +/// +/// Satisfies: ForwardTileIteratorConcept | +/// ReadableContiguousTileIteratorConcept | +/// WriteableContiguousTileIteratorConcept | +/// MaskedTileIteratorConcept +/// +template < + typename Shape_, + typename Element_, + int AdvanceRank, + typename ThreadMap_, + int AccessSize> +class PredicatedTileIteratorResidualLast< + Shape_, + Element_, + layout::AffineRank2ColumnMajor, + AdvanceRank, + ThreadMap_, + AccessSize, + false> { + public: + static_assert( + AdvanceRank == 0 || AdvanceRank == 1, + "Specialization for pitch-linear iterator may along advance along the " + "contiguous(rank=0) or strided(rank=1) dimension."); + + using Shape = Shape_; + using Element = Element_; + using Layout = layout::AffineRank2ColumnMajor; + static int const kAdvanceRank = AdvanceRank; + using ThreadMap = ThreadMap_; + + using Index = typename Layout::Index; + using LongIndex = typename Layout::LongIndex; + + using TensorRef = TensorRef; + using TensorView = TensorView; + using TensorCoord = typename Layout::TensorCoord; + + using Pointer = Element*; + using NonConstPointer = typename platform::remove_const::type*; + + // Map to the underlying AffineRankN<2> layout + using UnderlyingIterator = PredicatedTileIteratorResidualLast< + layout::PitchLinearShape, + Element, + layout::AffineRankN<2>, + (kAdvanceRank == 0 ? 
0 : 1), + ThreadMap, + AccessSize>; + + using AccessType = typename UnderlyingIterator::AccessType; + + /// Fragment object to be loaded or stored + using Fragment = cutlass::Array< + Element, + ThreadMap::Iterations::kCount * ThreadMap::kElementsPerAccess>; + + /// Predicate vector stores mask to guard accesses + using Mask = typename UnderlyingIterator::Mask; + + /// Parameters object is precomputed state and is host-constructible + class Params { + private: + friend PredicatedTileIteratorResidualLast; + + /// Parameters object + typename UnderlyingIterator::Params params_; + + public: + CUTLASS_HOST_DEVICE + Params() {} + + /// Construct the Params object given an AffineRankN<2> tensor's layout + CUTLASS_HOST_DEVICE + Params(Layout const& layout) + : params_(layout::AffineRankN<2>(layout.stride(0), layout.stride(1))) {} + }; + + private: + // + // Data members + // + + /// Underlying AffineRankN<2> tile iterator + UnderlyingIterator iterator_; + + public: + /// Constructs a TileIterator from its precomputed state, threadblock offset, + /// and thread ID + CUTLASS_HOST_DEVICE + PredicatedTileIteratorResidualLast( + Params const& params, ///< Precomputed parameters object + Pointer pointer, ///< Pointer to start of tensor + TensorCoord extent, ///< Extent of tensor + int thread_id, ///< ID of each participating thread + TensorCoord const& threadblock_offset, ///< Initial offset of threadblock + int const* indices = + nullptr ///< gather/scatter indices, note no support for + ///< gather/scatter at this specialization + ) + : iterator_( + params.params_, + pointer, + layout::PitchLinearCoord(extent.row(), extent.column()), + thread_id, + layout::PitchLinearCoord( + threadblock_offset.row(), + threadblock_offset.column())) {} + + /// Construct a PredicatedTileIteratorResidualLast with zero threadblock + /// offset + CUTLASS_HOST_DEVICE + PredicatedTileIteratorResidualLast( + Params const& params, ///< Precomputed parameters object + Pointer pointer, ///< Pointer to 
start of tensor + TensorCoord extent, ///< Extent of tensor + int thread_id ///< ID of each participating thread + ) + : PredicatedTileIteratorResidualLast( + params, + pointer, + extent, + thread_id, + make_Coord(0, 0)) {} + + /// Adds a pointer offset in units of Element + CUTLASS_HOST_DEVICE + void add_pointer_offset(LongIndex pointer_offset) { + iterator_.add_pointer_offset(pointer_offset); + } + + /// Advances to the next tile in memory. + /// + /// The first time this method is called, predicates are updated, and the + /// iterator's internal pointer is reverted to the first "steady state" tile. + /// Subsequent calls are lightweight and must only update the internal + /// pointer. + CUTLASS_HOST_DEVICE + PredicatedTileIteratorResidualLast& operator++() { + ++iterator_; + return *this; + } + + /// Advances to the next tile in memory. + /// + /// The first time this method is called, predicates are updated, and the + /// iterator's internal pointer is reverted to the first "steady state" tile. + /// Subsequent calls are lightweight and must only update the internal + /// pointer. 
+ CUTLASS_HOST_DEVICE + PredicatedTileIteratorResidualLast operator++(int) { + PredicatedTileIteratorResidualLast self(*this); + operator++(); + return self; + } + + /// Clears the predicate set efficiently + CUTLASS_HOST_DEVICE + void clear_mask(bool enable = true) { + iterator_.clear_mask(enable); + } + + CUTLASS_HOST_DEVICE + void set_residual_tile(bool enable) { + iterator_.set_residual_tile(enable); + } + + /// Clears the predicate set efficiently + CUTLASS_HOST_DEVICE + void enable_mask() { + iterator_.enable_mask(); + } + + /// Sets the predicate mask, overriding value stored in predicate iterator + CUTLASS_HOST_DEVICE + void set_mask(Mask const& mask) { + iterator_.set_mask(mask); + } + + /// Gets the mask + CUTLASS_HOST_DEVICE + void get_mask(Mask& mask) { + iterator_.get_mask(mask); + } + + /// Loads a fragment from memory + CUTLASS_DEVICE + void load_with_pointer_offset(Fragment& frag, Index pointer_offset) { + iterator_.load_with_pointer_offset(frag, pointer_offset); + } + + /// Loads a fragment from memory + CUTLASS_DEVICE + void load_with_byte_offset(Fragment& frag, LongIndex byte_offset) { + iterator_.load_with_byte_offset(frag, byte_offset); + } + + /// Loads a fragment from memory + CUTLASS_DEVICE + void load(Fragment& frag) { + load_with_pointer_offset(frag, 0); + } + + /// Store a fragment to memory + CUTLASS_DEVICE + void store_with_pointer_offset(Fragment const& frag, Index pointer_offset) { + iterator_.store_with_pointer_offset(frag, pointer_offset); + } + + /// Store a fragment to memory + CUTLASS_DEVICE + void store_with_byte_offset(Fragment const& frag, LongIndex byte_offset) { + iterator_.store_with_byte_offset(frag, byte_offset); + } + + /// Store a fragment to memory + CUTLASS_DEVICE + void store(Fragment const& frag) { + store_with_pointer_offset(frag, 0); + } +}; + +//////////////////////////////////////////////////////////////////////////////// + +/// Specialization of PredicatedTileIteratorResidualLast for affine rank 2 +/// 
row-major data. +/// +/// Satisfies: ForwardTileIteratorConcept | +/// ReadableContiguousTileIteratorConcept | +/// WriteableContiguousTileIteratorConcept | +/// MaskedTileIteratorConcept +/// +template < + typename Shape_, + typename Element_, + int AdvanceRank, + typename ThreadMap_, + int AccessSize> +class PredicatedTileIteratorResidualLast< + Shape_, + Element_, + layout::AffineRank2RowMajor, + AdvanceRank, + ThreadMap_, + AccessSize, + false> { + public: + static_assert( + AdvanceRank == 0 || AdvanceRank == 1, + "Specialization for pitch-linear iterator may along advance along the " + "contiguous(rank=0) or strided(rank=1) dimension."); + + using Shape = Shape_; + using Element = Element_; + using Layout = layout::AffineRank2RowMajor; + static int const kAdvanceRank = AdvanceRank; + using ThreadMap = ThreadMap_; + + using Index = typename Layout::Index; + using LongIndex = typename Layout::LongIndex; + + using TensorRef = TensorRef; + using TensorView = TensorView; + using TensorCoord = typename Layout::TensorCoord; + + using Pointer = Element*; + using NonConstPointer = typename platform::remove_const::type*; + + // Map to the underlying AffineRankN<2> layout + using UnderlyingIterator = PredicatedTileIteratorResidualLast< + layout::PitchLinearShape, + Element, + layout::AffineRankN<2>, + (kAdvanceRank == 0 ? 
1 : 0), + ThreadMap, + AccessSize>; + + using AccessType = typename UnderlyingIterator::AccessType; + + /// Fragment object to be loaded or stored + using Fragment = cutlass::Array< + Element, + ThreadMap::Iterations::kCount * ThreadMap::kElementsPerAccess>; + + /// Predicate vector stores mask to guard accesses + using Mask = typename UnderlyingIterator::Mask; + + /// Parameters object is precomputed state and is host-constructible + class Params { + private: + friend PredicatedTileIteratorResidualLast; + + /// Parameters object + typename UnderlyingIterator::Params params_; + + public: + CUTLASS_HOST_DEVICE + Params() {} + + /// Construct the Params object given an AffineRankN<2> tensor's layout + CUTLASS_HOST_DEVICE + Params(Layout const& layout) + : params_(layout::AffineRankN<2>(layout.stride(1), layout.stride(0))) {} + }; + + private: + // + // Data members + // + + /// Underlying AffineRankN<2> tile iterator + UnderlyingIterator iterator_; + + public: + /// Constructs a TileIterator from its precomputed state, threadblock offset, + /// and thread ID + CUTLASS_HOST_DEVICE + PredicatedTileIteratorResidualLast( + Params const& params, ///< Precomputed parameters object + Pointer pointer, ///< Pointer to start of tensor + TensorCoord extent, ///< Extent of tensor + int thread_id, ///< ID of each participating thread + TensorCoord const& threadblock_offset, ///< Initial offset of threadblock + int const* indices = + nullptr ///< gather/scatter indices, note no support for + ///< gather/scatter at this specialization + ) + : iterator_( + params.params_, + pointer, + layout::PitchLinearCoord(extent.column(), extent.row()), + thread_id, + layout::PitchLinearCoord( + threadblock_offset.column(), + threadblock_offset.row())) {} + + /// Construct a PredicatedTileIteratorResidualLast with zero threadblock + /// offset + CUTLASS_HOST_DEVICE + PredicatedTileIteratorResidualLast( + Params const& params, ///< Precomputed parameters object + Pointer pointer, ///< Pointer to 
start of tensor + TensorCoord extent, ///< Extent of tensor + int thread_id ///< ID of each participating thread + ) + : PredicatedTileIteratorResidualLast( + params, + pointer, + extent, + thread_id, + make_Coord(0, 0)) {} + + /// Adds a pointer offset in units of Element + CUTLASS_HOST_DEVICE + void add_pointer_offset(LongIndex pointer_offset) { + iterator_.add_pointer_offset(pointer_offset); + } + + /// Advances to the next tile in memory. + /// + /// The first time this method is called, predicates are updated, and the + /// iterator's internal pointer is reverted to the first "steady state" tile. + /// Subsequent calls are lightweight and must only update the internal + /// pointer. + CUTLASS_HOST_DEVICE + PredicatedTileIteratorResidualLast& operator++() { + ++iterator_; + return *this; + } + + /// Advances to the next tile in memory. + /// + /// The first time this method is called, predicates are updated, and the + /// iterator's internal pointer is reverted to the first "steady state" tile. + /// Subsequent calls are lightweight and must only update the internal + /// pointer. 
+ CUTLASS_HOST_DEVICE + PredicatedTileIteratorResidualLast operator++(int) { + PredicatedTileIteratorResidualLast self(*this); + operator++(); + return self; + } + + /// Clears the predicate set efficiently + CUTLASS_HOST_DEVICE + void clear_mask(bool enable = true) { + iterator_.clear_mask(enable); + } + + CUTLASS_HOST_DEVICE + void set_residual_tile(bool enable) { + iterator_.set_residual_tile(enable); + } + + /// Clears the predicate set efficiently + CUTLASS_HOST_DEVICE + void enable_mask() { + iterator_.enable_mask(); + } + + /// Sets the predicate mask, overriding value stored in predicate iterator + CUTLASS_HOST_DEVICE + void set_mask(Mask const& mask) { + iterator_.set_mask(mask); + } + + /// Gets the mask + CUTLASS_HOST_DEVICE + void get_mask(Mask& mask) { + iterator_.get_mask(mask); + } + + /// Loads a fragment from memory + CUTLASS_DEVICE + void load_with_pointer_offset(Fragment& frag, Index pointer_offset) { + iterator_.load_with_pointer_offset(frag, pointer_offset); + } + + /// Loads a fragment from memory + CUTLASS_DEVICE + void load_with_byte_offset(Fragment& frag, LongIndex byte_offset) { + iterator_.load_with_byte_offset(frag, byte_offset); + } + + /// Loads a fragment from memory + CUTLASS_DEVICE + void load(Fragment& frag) { + load_with_pointer_offset(frag, 0); + } + + /// Store a fragment to memory + CUTLASS_DEVICE + void store_with_pointer_offset(Fragment const& frag, Index pointer_offset) { + iterator_.store_with_pointer_offset(frag, pointer_offset); + } + + /// Store a fragment to memory + CUTLASS_DEVICE + void store_with_byte_offset(Fragment const& frag, LongIndex byte_offset) { + iterator_.store_with_byte_offset(frag, byte_offset); + } + + /// Store a fragment to memory + CUTLASS_DEVICE + void store(Fragment const& frag) { + store_with_pointer_offset(frag, 0); + } +}; + +//////////////////////////////////////////////////////////////////////////////// + +/// Specialization of PredicatedTileIteratorResidualLast for interleaved data. 
+/// It is mapped to the congruous layout. +/// +/// Satisfies: ForwardTileIteratorConcept | +/// ReadableContiguousTileIteratorConcept | +/// WriteableContiguousTileIteratorConcept | +/// MaskedTileIteratorConcept +/// + +template < + typename Shape_, + typename Element_, + int AdvanceRank, + typename ThreadMap_, + int AccessSize, + int InterleavedK> +class PredicatedTileIteratorResidualLast< + Shape_, + Element_, + layout::ColumnMajorInterleaved, + AdvanceRank, + ThreadMap_, + AccessSize, + false> { + public: + static_assert( + AdvanceRank == 0 || AdvanceRank == 1, + "Specialization for pitch-linear iterator may along advance along the " + "contiguous(rank=0) or strided(rank=1) dimension."); + + using Shape = Shape_; + using Element = Element_; + static int const kInterleavedK = InterleavedK; + using Layout = layout::ColumnMajorInterleaved; + static int const kAdvanceRank = AdvanceRank; + using ThreadMap = ThreadMap_; + + using Index = typename Layout::Index; + using LongIndex = typename Layout::LongIndex; + + using TensorRef = TensorRef; + using TensorView = TensorView; + using TensorCoord = typename Layout::TensorCoord; + + using Pointer = Element*; + using NonConstPointer = typename platform::remove_const::type*; + + using UnderlyingIterator = PredicatedTileIteratorResidualLast< + layout::PitchLinearShape< + Shape::kRow * kInterleavedK, + Shape::kColumn / kInterleavedK>, + Element, + layout::PitchLinear, + (kAdvanceRank == 0 ? 
0 : 1), + ThreadMap, + AccessSize>; + + using AccessType = typename UnderlyingIterator::AccessType; + + /// Fragment object to be loaded or stored + using Fragment = cutlass::Array< + Element, + ThreadMap::Iterations::kCount * ThreadMap::kElementsPerAccess>; + + /// Predicate vector stores mask to guard accesses + using Mask = typename UnderlyingIterator::Mask; + + /// Parameters object is precomputed state and is host-constructible + class Params { + private: + friend PredicatedTileIteratorResidualLast; + + /// Parameters object + typename UnderlyingIterator::Params params_; + + public: + CUTLASS_HOST_DEVICE + Params() {} + + /// Construct the Params object given a pitch-linear tensor's layout + CUTLASS_HOST_DEVICE + Params(Layout const& layout) + : params_(layout::PitchLinear(layout.stride(0))) {} + + CUTLASS_HOST_DEVICE + Params(typename UnderlyingIterator::Params::Base const& base) + : params_(base) {} + }; + + private: + // + // Data members + // + + /// Underlying pitch-linear tile iterator + UnderlyingIterator iterator_; + + public: + /// Constructs a TileIterator from its precomputed state, threadblock offset, + /// and thread ID + CUTLASS_HOST_DEVICE + PredicatedTileIteratorResidualLast( + /// Precomputed parameters object + Params const& params, + /// Pointer to start of tensor + Pointer pointer, + /// Extent of tensor + TensorCoord extent, + /// ID of each participating thread + int thread_id, + /// Initial offset of threadblock + TensorCoord const& threadblock_offset, + int const* indices = + nullptr ///< gather/scatter indices, note no support for + ///< gather/scatter at this specialization + ) + : iterator_( + params.params_, + pointer, + layout::PitchLinearCoord( + extent.row() * kInterleavedK, + extent.column() / kInterleavedK), + thread_id, + layout::PitchLinearCoord( + threadblock_offset.row() * kInterleavedK, + threadblock_offset.column() / kInterleavedK)) {} + + /// Construct a PredicatedTileIteratorResidualLast with zero threadblock + /// 
offset + CUTLASS_HOST_DEVICE + PredicatedTileIteratorResidualLast( + Params const& params, ///< Precomputed parameters object + Pointer pointer, ///< Pointer to start of tensor + TensorCoord extent, ///< Extent of tensor + int thread_id ///< ID of each participating thread + ) + : PredicatedTileIteratorResidualLast( + params, + pointer, + extent, + thread_id, + make_Coord(0, 0)) {} + + /// Adds a pointer offset in units of Element + CUTLASS_HOST_DEVICE + void add_pointer_offset(LongIndex pointer_offset) { + iterator_.add_pointer_offset(pointer_offset); + } + + /// Advances to the next tile in memory. + /// + /// The first time this method is called, predicates are updated, and the + /// iterator's internal pointer is reverted to the first "steady state" tile. + /// Subsequent calls are lightweight and must only update the internal + /// pointer. + CUTLASS_HOST_DEVICE + PredicatedTileIteratorResidualLast& operator++() { + ++iterator_; + return *this; + } + + /// Advances to the next tile in memory. + /// + /// The first time this method is called, predicates are updated, and the + /// iterator's internal pointer is reverted to the first "steady state" tile. + /// Subsequent calls are lightweight and must only update the internal + /// pointer. 
+ CUTLASS_HOST_DEVICE + PredicatedTileIteratorResidualLast operator++(int) { + PredicatedTileIteratorResidualLast self(*this); + operator++(); + return self; + } + + /// Clears the predicate set efficiently + CUTLASS_HOST_DEVICE + void clear_mask(bool enable = true) { + iterator_.clear_mask(enable); + } + + CUTLASS_HOST_DEVICE + void set_residual_tile(bool enable) { + iterator_.set_residual_tile(enable); + } + + /// Clears the predicate set efficiently + CUTLASS_HOST_DEVICE + void enable_mask() { + iterator_.enable_mask(); + } + + /// Sets the predicate mask, overriding value stored in predicate iterator + CUTLASS_HOST_DEVICE + void set_mask(Mask const& mask) { + iterator_.set_mask(mask); + } + + /// Gets the mask + CUTLASS_HOST_DEVICE + void get_mask(Mask& mask) { + iterator_.get_mask(mask); + } + + /// Loads a fragment from memory + CUTLASS_DEVICE + void load_with_pointer_offset(Fragment& frag, Index pointer_offset) { + iterator_.load_with_pointer_offset(frag, pointer_offset); + } + + /// Loads a fragment from memory + CUTLASS_DEVICE + void load(Fragment& frag) { + load_with_pointer_offset(frag, 0); + } + + /// Store a fragment to memory + CUTLASS_DEVICE + void store_with_pointer_offset(Fragment const& frag, Index pointer_offset) { + iterator_.store_with_pointer_offset(frag, pointer_offset); + } + + /// Store a fragment to memory + CUTLASS_DEVICE + void store(Fragment const& frag) { + store_with_pointer_offset(frag, 0); + } +}; + +//////////////////////////////////////////////////////////////////////////////// + +/// Specialization of PredicatedTileIteratorResidualLast for interleaved-32 +/// data. It is mapped to the congruous layout. 
+/// +/// Satisfies: ForwardTileIteratorConcept | +/// ReadableContiguousTileIteratorConcept | +/// WriteableContiguousTileIteratorConcept | +/// MaskedTileIteratorConcept +/// +template < + typename Shape_, + typename Element_, + int AdvanceRank, + typename ThreadMap_, + int AccessSize, + int InterleavedK> +class PredicatedTileIteratorResidualLast< + Shape_, + Element_, + layout::RowMajorInterleaved, + AdvanceRank, + ThreadMap_, + AccessSize, + false> { + public: + static_assert( + AdvanceRank == 0 || AdvanceRank == 1, + "Specialization for pitch-linear iterator may along advance along the " + "contiguous(rank=0) or strided(rank=1) dimension."); + + using Shape = Shape_; + using Element = Element_; + static int const kInterleavedK = InterleavedK; + using Layout = layout::RowMajorInterleaved; + static int const kAdvanceRank = AdvanceRank; + using ThreadMap = ThreadMap_; + + using Index = typename Layout::Index; + using LongIndex = typename Layout::LongIndex; + + using TensorRef = TensorRef; + using TensorView = TensorView; + using TensorCoord = typename Layout::TensorCoord; + + using Pointer = Element*; + using NonConstPointer = typename platform::remove_const::type*; + + using UnderlyingIterator = PredicatedTileIteratorResidualLast< + layout::PitchLinearShape< + Shape::kColumn * kInterleavedK, + Shape::kRow / kInterleavedK>, + Element, + layout::PitchLinear, + (kAdvanceRank == 0 ? 
1 : 0), + ThreadMap, + AccessSize>; + + using AccessType = typename UnderlyingIterator::AccessType; + + /// Fragment object to be loaded or stored + using Fragment = cutlass::Array< + Element, + ThreadMap::Iterations::kCount * ThreadMap::kElementsPerAccess>; + + /// Predicate vector stores mask to guard accesses + using Mask = typename UnderlyingIterator::Mask; + + /// Parameters object is precomputed state and is host-constructible + class Params { + private: + friend PredicatedTileIteratorResidualLast; + + /// Parameters object + typename UnderlyingIterator::Params params_; + + public: + CUTLASS_HOST_DEVICE + Params() {} + + /// Construct the Params object given a pitch-linear tensor's layout + CUTLASS_HOST_DEVICE + Params(Layout const& layout) + : params_(layout::PitchLinear(layout.stride(0))) {} + + CUTLASS_HOST_DEVICE + Params(typename UnderlyingIterator::Params::Base const& base) + : params_(base) {} + }; + + private: + // + // Data members + // + + /// Underlying pitch-linear tile iterator + UnderlyingIterator iterator_; + + public: + /// Constructs a TileIterator from its precomputed state, threadblock offset, + /// and thread ID + CUTLASS_HOST_DEVICE + PredicatedTileIteratorResidualLast( + /// Precomputed parameters object + Params const& params, + /// Pointer to start of tensor + Pointer pointer, + /// Extent of tensor + TensorCoord extent, + /// ID of each participating thread + int thread_id, + /// Initial offset of threadblock + TensorCoord const& threadblock_offset, + int const* indices = + nullptr ///< gather/scatter indices, note no support for + ///< gather/scatter at this specialization + ) + : iterator_( + params.params_, + pointer, + layout::PitchLinearCoord( + extent.column() * kInterleavedK, + extent.row() / kInterleavedK), + thread_id, + layout::PitchLinearCoord( + threadblock_offset.column() * kInterleavedK, + threadblock_offset.row() / kInterleavedK)) {} + + /// Construct a PredicatedTileIteratorResidualLast with zero threadblock + /// 
offset + CUTLASS_HOST_DEVICE + PredicatedTileIteratorResidualLast( + Params const& params, ///< Precomputed parameters object + Pointer pointer, ///< Pointer to start of tensor + TensorCoord extent, ///< Extent of tensor + int thread_id ///< ID of each participating thread + ) + : PredicatedTileIteratorResidualLast( + params, + pointer, + extent, + thread_id, + make_Coord(0, 0)) {} + + /// Adds a pointer offset in units of Element + CUTLASS_HOST_DEVICE + void add_pointer_offset(LongIndex pointer_offset) { + iterator_.add_pointer_offset(pointer_offset); + } + + /// Advances to the next tile in memory. + /// + /// The first time this method is called, predicates are updated, and the + /// iterator's internal pointer is reverted to the first "steady state" tile. + /// Subsequent calls are lightweight and must only update the internal + /// pointer. + CUTLASS_HOST_DEVICE + PredicatedTileIteratorResidualLast& operator++() { + ++iterator_; + return *this; + } + + /// Advances to the next tile in memory. + /// + /// The first time this method is called, predicates are updated, and the + /// iterator's internal pointer is reverted to the first "steady state" tile. + /// Subsequent calls are lightweight and must only update the internal + /// pointer. 
+ CUTLASS_HOST_DEVICE + PredicatedTileIteratorResidualLast operator++(int) { + PredicatedTileIteratorResidualLast self(*this); + operator++(); + return self; + } + + /// Clears the predicate set efficiently + CUTLASS_HOST_DEVICE + void clear_mask(bool enable = true) { + iterator_.clear_mask(enable); + } + + CUTLASS_HOST_DEVICE + void set_residual_tile(bool enable) { + iterator_.set_residual_tile(enable); + } + + /// Clears the predicate set efficiently + CUTLASS_HOST_DEVICE + void enable_mask() { + iterator_.enable_mask(); + } + + /// Sets the predicate mask, overriding value stored in predicate iterator + CUTLASS_HOST_DEVICE + void set_mask(Mask const& mask) { + iterator_.set_mask(mask); + } + + /// Gets the mask + CUTLASS_HOST_DEVICE + void get_mask(Mask& mask) { + iterator_.get_mask(mask); + } + + /// Loads a fragment from memory + CUTLASS_DEVICE + void load_with_pointer_offset(Fragment& frag, Index pointer_offset) { + iterator_.load_with_pointer_offset(frag, pointer_offset); + } + + /// Loads a fragment from memory + CUTLASS_DEVICE + void load(Fragment& frag) { + load_with_pointer_offset(frag, 0); + } + + /// Store a fragment to memory + CUTLASS_DEVICE + void store_with_pointer_offset(Fragment const& frag, Index pointer_offset) { + iterator_.store_with_pointer_offset(frag, pointer_offset); + } + + /// Store a fragment to memory + CUTLASS_DEVICE + void store(Fragment const& frag) { + store_with_pointer_offset(frag, 0); + } +}; + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace threadblock +} // namespace transform +} // namespace cutlass + +//////////////////////////////////////////////////////////////////////////////// + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/transformers/cuda/mem_eff_attention/iterators/transpose_warp_iterator.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/transformers/cuda/mem_eff_attention/iterators/transpose_warp_iterator.h new file mode 100644 index 0000000000000000000000000000000000000000..916d436d7e86a8af36167a780ff726ef5b410ac8 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/transformers/cuda/mem_eff_attention/iterators/transpose_warp_iterator.h @@ -0,0 +1,36 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ +#pragma once + +#include + +template +struct TransposeWarpIterator { + using Iterator = char; + static bool constexpr kSupportsTranspose = false; +}; + +template < + /// Operand identity + cutlass::gemm::Operand Operand, + /// Data type of A elements + typename Element, + typename InstructionShape, + bool kTranspose> +struct TransposeWarpIterator< + cutlass::gemm::warp:: + WarpIteratorFromSmem> { + using Iterator = cutlass::gemm::warp:: + WarpIteratorFromSmem; + static bool constexpr kSupportsTranspose = true; +}; + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/transformers/cuda/mem_eff_attention/iterators/warp_iterator_from_smem.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/transformers/cuda/mem_eff_attention/iterators/warp_iterator_from_smem.h new file mode 100644 index 0000000000000000000000000000000000000000..000ad3f97ac2d3f9918cb37dd3246ee5482dded8 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/transformers/cuda/mem_eff_attention/iterators/warp_iterator_from_smem.h @@ -0,0 +1,289 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +/*************************************************************************************************** + * Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights + *reserved. SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + *this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + *ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + *LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + *CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + *SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + *INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + *CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + *ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + *POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Inspired from + "cutlass/gemm/warp/mma_tensor_op_tile_access_iterator.h" Loads tiles of GEMM + operands from a RowMajor shared-memory layout into registers to use by A100 + TensorCores. + + The difference with "mma_tensor_op_tile_access_iterator.h" is that: + (1) We use "ldmatrix" to load tiles, rather than manual loads (slightly + faster) (2) We support to transpose the operand (eg read `A.transpose()` when + the shared memory holds `A`) + + This is only implemented for the specific shapes. 
+*/ +#pragma once + +#include + +//////////////////////////////////////////////////////////////////////////////// +namespace cutlass { +namespace gemm { +namespace warp { + +template < + /// Operand identity + Operand Operand_, + /// Data type of A elements + typename Element_, + typename InstructionShape_, + bool kTranspose = false> +class WarpIteratorFromSmem { + public: + /// Shape of tile to load (concept: MatrixShape) + using Shape = cutlass::MatrixShape<32, 32>; + + /// Operand tag + static Operand const kOperand = Operand_; + static_assert( + kOperand == Operand::kA, + "No support for OperandB at the moment"); + + /// Basic check + static_assert( + kOperand == Operand::kA || kOperand == Operand::kB, + "WarpIteratorFromSmem may only be instantiated for A or B operands to warp-level Mma."); + + /// Element type + using Element = Element_; + static_assert(sizeof_bits::value == 16, "Only supported for half"); + + /// Layout of source tile + using Layout = cutlass::layout::RowMajor; + + /// Shape of one matrix product operation (concept: MatrixShape) + using InstructionShape = InstructionShape_; + static_assert(InstructionShape::kRow == 16, "Only supports 16x8x8 / 16x8x16"); + static_assert( + InstructionShape::kColumn == 8 || InstructionShape::kColumn == 16, + "Only supports 16x8x8 / 16x8x16"); + + /// Delta between *MMA operations (in units of *MMA operations, concept: + /// MatrixShape) + static int const kOpDelta = 1; + + /// Number of participating threads + static int const kThreads = 32; + + /// TensorRef type for loading element from a tensor + using TensorRef = TensorRef; + + /// Index type + using Index = typename TensorRef::Index; + + /// Long Index type + using LongIndex = typename TensorRef::LongIndex; + + /// Coordinate for an element in the tensor + using TensorCoord = typename TensorRef::TensorCoord; + + /// Number of elements accessed per Shared Memory load + static int const kElementsPerAccess = + (sizeof_bits::value >= 32 ? 
1 + : 32 / sizeof_bits::value); + + using InstructionCount = MatrixShape< + Shape::kRow / InstructionShape::kRow, + Shape::kColumn / InstructionShape::kColumn>; + + static int const kIterations = (kOperand == Operand::kA) + ? InstructionCount::kColumn + : InstructionCount::kRow; + + public: + // + // Derived quantities + // + + /// Fragment object holding a thread's part of a tile + using Fragment = Array< + Element, + (kOperand == Operand::kA) + ? (Shape::kRow* InstructionShape::kColumn / kThreads) + : (Shape::kColumn* InstructionShape::kRow / kThreads)>; + + /// Memory access type + // using AccessType = AlignedArray; + using AccessType = Array; + + static int constexpr kWarpShapeDivisibleInner = + (kOperand == Operand::kA ? InstructionShape::kColumn + : InstructionShape::kRow); + static int constexpr kAccessesInner = + (kWarpShapeDivisibleInner / kElementsPerAccess) / 4; + // Number of 32bits tiles to load per `ldmatrix` + static int const kTilesPerInstruction = InstructionShape::kRow / 8; + static_assert(kTilesPerInstruction == 2, "Only supports 16x8x16 and 16x8x8"); + + private: + /// Underlying tensor reference + TensorRef ref_; + + /// Origin + MatrixCoord origin_; + + /// Iterations in a tile + int iterations_; + + public: + /// Constructor from TensorRef + CUTLASS_HOST_DEVICE + WarpIteratorFromSmem(TensorRef const& ref, int lane_id) + : WarpIteratorFromSmem(ref, {Shape::kRow, Shape::kColumn}, lane_id) {} + CUTLASS_HOST_DEVICE + WarpIteratorFromSmem(TensorRef const& ref, TensorCoord extent, int lane_id) + : ref_(ref), iterations_(0) { + // See also: + // https://docs.nvidia.com/cuda/archive/11.7.1/parallel-thread-execution/index.html#warp-level-matrix-fragment-mma-1688 + // 16x8x8: kAccessesInner = 1 (1 ldmatrix.x4) + // 16x8x16: kAccessesInner = 2 (2 ldmatrix.x4) + int ldsm_vec_num = (lane_id >> 3); + if (kOperand == Operand::kA) { + origin_ = MatrixCoord(lane_id % 8, 0); + static_assert( + InstructionCount::kRow * kTilesPerInstruction == 4, + "can't use 
ldmatrix.x4"); + int access_m_idx = ldsm_vec_num % kTilesPerInstruction; + int inner_idx = (ldsm_vec_num / kTilesPerInstruction) % kAccessesInner; + int inst_m_idx = ldsm_vec_num / (kTilesPerInstruction * kAccessesInner); + MatrixCoord offset( + access_m_idx * 8 + inst_m_idx * InstructionShape::kRow, + inner_idx * 4 * kElementsPerAccess); + if (kTranspose) { + offset = MatrixCoord(offset.column(), offset.row()); + } + origin_ += offset; + } else { + // XXX: This is not tested or used + origin_ = MatrixCoord(0, lane_id % 8); + static_assert(InstructionCount::kColumn * kAccessesInner == 4, ""); + CUTLASS_PRAGMA_UNROLL + for (int inst_n_idx = 0; inst_n_idx < InstructionCount::kColumn; + ++inst_n_idx) { + CUTLASS_PRAGMA_UNROLL + for (int inner_idx = 0; inner_idx < kAccessesInner; ++inner_idx) { + int access_idx = inner_idx + kAccessesInner * inst_n_idx; + + MatrixCoord offset( + inner_idx * 4 * kElementsPerAccess, inst_n_idx * 8); + + if (access_idx == ldsm_vec_num) { + if (kTranspose) { + offset = MatrixCoord(offset.column(), offset.row()); + } + origin_ += offset; + } + } + } + } + + ref_.add_coord_offset(origin_); + } + + /// Advances an iterator along logical dimensions of matrix in units of whole + /// tiles + CUTLASS_HOST_DEVICE + WarpIteratorFromSmem& add_tile_offset(TensorCoord const& tile_offset) { + TensorCoord coord_offset( + tile_offset.row() * Shape::kRow, tile_offset.column() * Shape::kColumn); + if (kTranspose) { + coord_offset = TensorCoord{coord_offset.column(), coord_offset.row()}; + } + origin_ += coord_offset; + + ref_.add_coord_offset(coord_offset); + + return *this; + } + + /// Advances the iterator along the advance dimension + CUTLASS_DEVICE + void advance() { + if (kOperand == Operand::kA) { + add_tile_offset({0, 1}); + } else { + add_tile_offset({1, 0}); + } + + iterations_ = 0; + } + + /// increase iterations in a tile + CUTLASS_HOST_DEVICE + WarpIteratorFromSmem& operator++() { + iterations_++; + + if (iterations_ >= kIterations) + 
advance(); + + return *this; + } + + /// Loads a fragment from memory at the location pointed to by the iterator. + CUTLASS_DEVICE + void load(Fragment& frag) const { + AccessType* access_ptr = reinterpret_cast(&frag); + using LoadLayout = typename platform:: + conditional::type; + + CUTLASS_PRAGMA_UNROLL + for (int access_m_idx = 0; access_m_idx < + (InstructionCount::kRow * kTilesPerInstruction * kAccessesInner) / 4; + ++access_m_idx) { + MatrixCoord offset; + if (kOperand == Operand::kA) { + offset = MatrixCoord( + access_m_idx * 16, iterations_ * InstructionShape::kColumn); + } else { + offset = MatrixCoord(iterations_ * InstructionShape::kRow, 0); + } + if (kTranspose) { + offset = MatrixCoord(offset.column(), offset.row()); + } + cutlass::arch::ldsm( + access_ptr[access_m_idx], ref_.data() + ref_.offset(offset)); + } + } +}; + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace warp +} // namespace gemm +} // namespace cutlass +//////////////////////////////////////////////////////////////////////////////// + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/transformers/cuda/mem_eff_attention/kernel_backward.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/transformers/cuda/mem_eff_attention/kernel_backward.h new file mode 100644 index 0000000000000000000000000000000000000000..6fb3a6a1dc8297ee1e3ba547ee2df92c3ce6d8fe --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/transformers/cuda/mem_eff_attention/kernel_backward.h @@ -0,0 +1,2619 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. 
+ * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ +#pragma once + +#include +#include +#include + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include +#include + +#include +#include +#include + +#include +#include + +using namespace gemm_kernel_utils; + +namespace PyTorchMemEffAttention { +namespace { + +template +struct GmemTile { + /* + Helper functions to efficient store/load RF to gmem + + GEMM accumulators have a particular format on A100, and + it takes some compute/shared-memory to rearrange them to + a RowMajor or ColumnMajor format in global memory through + an Epilogue. The same complexity goes for loading into RF. + + This class loads/stores RF as they are, and can be used for + efficient accumulation across gemms for instance: + + ``` + GmemTile tile; + for (int i = 0; i < N; ++i) { + // ... + + Fragment accum; + if (i == 0) { + accum.clear(); + } else { + tile.load(accum); + } + mma(accum, ...); + if (i < N-1) { + // Store for next GEMM + tile.store(accum); + } else { + // Store in tensor (eg RowMajor) + epilogue(accum); + } + + // ... 
+ } + ``` + */ + + // 128bits per thread + using AccessType = cutlass::Array; + static constexpr int32_t kBytes = sizeof(AccessType); + static constexpr int32_t kStride = kNumThreads * AccessType::kElements; + static constexpr int32_t kNumIters = + FragmentType::kElements / AccessType::kElements; + static constexpr int32_t kElementsStored = + kNumThreads * FragmentType::kElements; + static_assert( + FragmentType::kElements % AccessType::kElements == 0, + "fragment not aligned on 128 bits"); + + float* ptr; + + CUTLASS_DEVICE void load(FragmentType& fragment, int thread_id) { + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < kNumIters; ++i) { + AccessType* __restrict__ gmem_ptr = reinterpret_cast( + ptr + thread_id * AccessType::kElements + i * kStride); + AccessType sub_fragment; + cutlass::arch::global_load( + sub_fragment, gmem_ptr, true); + CUTLASS_PRAGMA_UNROLL + for (int j = 0; j < AccessType::kElements; ++j) { + fragment[i * AccessType::kElements + j] = sub_fragment[j]; + } + } + } + + CUTLASS_DEVICE void store(FragmentType const& fragment, int thread_id) { + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < kNumIters; ++i) { + AccessType* __restrict__ gmem_ptr = reinterpret_cast( + ptr + thread_id * AccessType::kElements + i * kStride); + AccessType sub_fragment; + CUTLASS_PRAGMA_UNROLL + for (int j = 0; j < AccessType::kElements; ++j) { + sub_fragment[j] = fragment[i * AccessType::kElements + j]; + } + cutlass::arch::global_store( + sub_fragment, gmem_ptr, true); + } + } + + CUTLASS_DEVICE void storeAtomicAdd( + FragmentType const& fragment, + int thread_id) { + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < kNumIters; ++i) { + float* gmem_ptr = ptr + thread_id * AccessType::kElements + i * kStride; + CUTLASS_PRAGMA_UNROLL + for (int j = 0; j < AccessType::kElements; ++j) { + float val = fragment[i * AccessType::kElements + j]; + float* ptr = gmem_ptr + j; + atomicAdd(ptr, val); + } + } + } +}; + +struct AtomicLock { + CUTLASS_DEVICE static void acquire( + int32_t* 
lock, + int set_val, + int thread_id) { + if (thread_id == 0) { + while (atomicCAS(lock, 0 /*cmp*/, set_val /*setval*/) != set_val) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700 + __nanosleep(40); +#endif + } + } + __syncthreads(); + } + CUTLASS_DEVICE static void release(int32_t* lock, int thread_id) { + if (thread_id == 0) { + int status = 0; +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700 + asm volatile("st.global.release.gpu.b32 [%0], %1;\n" + : + : "l"(lock), "r"(status)); +#else + asm volatile("st.global.cg.b32 [%0], %1;\n" : : "l"(lock), "r"(status)); +#endif + } + } +}; + +template +constexpr int getWarpsPerSmBw() { + bool is_half = !cutlass::platform::is_same::value; + if (Arch::kMinComputeCapability >= 80) { + return is_half ? 12 : 8; + } + return 8; +} +} // namespace + +template < + // which arch we target (eg `cutlass::arch::Sm80`) + typename ArchTag_, + // input/output type + typename scalar_t_, + // run optimized kernel because memory accesses will be aligned + bool kIsAligned_, + // use dropout if enabled + bool kApplyDropout_, + // when doing a GEMM, preload the next one (uses more shmem) + bool kPreload_, + // block dimensions + int kBlockSizeI_, + int kBlockSizeJ_, + // upperbound on `max(value.shape[-1], query.shape[-1])` + int kMaxK_ = (int)cutlass::platform::numeric_limits::max(), + // assumes that `cu_seqlen` is None, and + // (1) `num_queries % kBlockSizeI == 0` + // (2) `num_keys % kBlockSizeJ == 0` + bool kKeysQueriesAlignedToBlockSize_ = false> +struct AttentionBackwardKernel { + enum CustomMaskType { + NoCustomMask = 0, + CausalFromTopLeft = 1, + CausalFromBottomRight = 2, + NumCustomMaskTypes, + }; + using scalar_t = scalar_t_; + using output_t = scalar_t; + using output_accum_t = float; + using lse_scalar_t = float; + using accum_t = float; + using ArchTag = ArchTag_; + static constexpr bool kIsAligned = kIsAligned_; + static constexpr bool kApplyDropout = kApplyDropout_; + static constexpr bool kPreload = kPreload_; + 
static constexpr int kBlockSizeI = kBlockSizeI_; + static constexpr int kBlockSizeJ = kBlockSizeJ_; + static constexpr int kMaxK = kMaxK_; + static constexpr bool kKeysQueriesAlignedToBlockSize = + kKeysQueriesAlignedToBlockSize_; + + static constexpr int64_t kWarpSize = 32; + + // If this is true, we store and accumulate dK/dV in RF + // rather than going back to gmem every time + static constexpr bool kIsHalf = cutlass::sizeof_bits::value <= 16; + static constexpr bool kOutputInRF = kIsHalf && kMaxK <= kBlockSizeI; + static_assert( + !kPreload || + (kIsHalf && ArchTag::kMinComputeCapability >= 80 && kOutputInRF), + "preload MMA not supported"); + static constexpr bool kPrologueQK = kPreload; + static constexpr bool kPrologueGV = kPreload; + static constexpr bool kPrologueDOV = kPreload; + static constexpr bool kPrologueGQ = kPreload; + static constexpr bool kPrologueGK = kPreload; + + static constexpr int64_t kNumWarpsPerBlock = + (kBlockSizeI * kBlockSizeJ) / (32 * 32); + + // Compute delta for the f16 kernels + // TODO: Figure out why it's slower on the f32 kernels + // (something due to RF pressure?) + // TODO: Remove condition on `kOutputInRF` - this is needed to work + // around a compiler bug on V100, not exactly sure why but I spent + // too much time on this already. 
Reproducible with + // (B, Mq, Mkv, K) = (1, 1, 1, 136) for instance + static constexpr bool kKernelComputesDelta = + kIsHalf && (kOutputInRF || ArchTag::kMinComputeCapability != 70); + + // Launch bounds + static constexpr int64_t kNumThreads = kWarpSize * kNumWarpsPerBlock; + static constexpr int64_t kMinBlocksPerSm = + getWarpsPerSmBw() / kNumWarpsPerBlock; + + using GemmType = DefaultGemmType; + using DefaultConfig = + typename cutlass::gemm::device::DefaultGemmConfiguration< + typename GemmType::OpClass, + ArchTag, + scalar_t, + scalar_t, + scalar_t, // ElementC + accum_t // ElementAccumulator + >; + static constexpr auto kOptimalAlignement = cutlass::platform::max( + DefaultConfig::kAlignmentA, + DefaultConfig::kAlignmentB); + static constexpr auto kMinimumAlignment = GemmType::kMinimumAlignment; + + struct MatmulQK { + /* + attn_T = k_j @ q_i.transpose(-2, -1) # matmul + attn_T = (attn_T - logsumexp[i_start:i_end].unsqueeze(1).transpose(-2, + -1)).exp() # epilogue + + with attn_T.shape = (kBlockSizeJ, kBlockSizeI) + */ + using ThreadblockShape = + cutlass::gemm::GemmShape; + using WarpShape = cutlass::gemm::GemmShape<32, 32, GemmType::WarpK>; + using DefaultMma = typename cutlass::gemm::threadblock::DefaultMma< + scalar_t, // ElementA + cutlass::layout::RowMajor, // LayoutA + kIsAligned ? DefaultConfig::kAlignmentA : GemmType::kMinimumAlignment, + scalar_t, // ElementB + cutlass::layout::ColumnMajor, // LayoutB + kIsAligned ? 
DefaultConfig::kAlignmentB : GemmType::kMinimumAlignment, + accum_t, // ElementC + cutlass::layout::RowMajor, // LayoutC + typename GemmType::OpClass, + ArchTag, + ThreadblockShape, + WarpShape, + typename GemmType::InstructionShape, + DefaultConfig::kStages, + typename GemmType::Operator, + false, // AccumulatorsInRowMajor = false, + cutlass::gemm::SharedMemoryClearOption::kNone>; + using MmaCore = typename DefaultMma::MmaCore; + using Mma = + typename MakeCustomMma::Mma; + + // used for efficient load of bias tile (Bij) from global memory to shared + // memory + using BiasLoader = TileSmemLoader< + scalar_t, + // Bij is applied to transposed attn matrix tile (Pij.T). Bij is loaded + // row-major but needs to have transposed shape so we get the same + // elements. + cutlass::MatrixShape, + MmaCore::kThreads, + // input restriction: kv_len has to be a multiple of this value + 128 / cutlass::sizeof_bits::value>; + + // Epilogue to store to shared-memory in a format that we can use later for + // the second matmul + using B2bGemm = typename cutlass::gemm::threadblock::B2bGemm< + typename Mma::Operator::IteratorC, + typename Mma::Operator, + scalar_t, + WarpShape, + ThreadblockShape>; + using AccumLambdaIterator = typename DefaultMmaAccumLambdaIterator< + typename Mma::Operator::IteratorC, + accum_t, + kWarpSize>::Iterator; + using AccumulatorSharedStorage = typename B2bGemm::AccumulatorSharedStorage; + }; + + struct MatmulGradV { + /* + grad_v[j_start:j_end] += attn_T @ do_i # matmul + + Dimensions: (kBlockSizeJ * kNumWarpsPerBlock, kBlockSizeI, K) + (we might need to iterate multiple times on K) + */ + using ThreadblockShape = + cutlass::gemm::GemmShape; + using WarpShape = cutlass::gemm::GemmShape<32, 32, GemmType::WarpK>; + using InstructionShape = typename GemmType::InstructionShape; + + using DefaultGemm = cutlass::gemm::kernel::DefaultGemm< + scalar_t, // ElementA, + cutlass::layout::RowMajor, // LayoutA, + DefaultConfig::kAlignmentA, + scalar_t, // ElementB, + 
cutlass::layout::RowMajor, // LayoutB, + kIsAligned ? DefaultConfig::kAlignmentB : GemmType::kMinimumAlignment, + output_t, + cutlass::layout::RowMajor, // LayoutC, + accum_t, + typename GemmType::OpClass, + ArchTag, + ThreadblockShape, + WarpShape, + typename GemmType::InstructionShape, + typename DefaultConfig::EpilogueOutputOp, + void, // ThreadblockSwizzle - not used + DefaultConfig::kStages, + false, // SplitKSerial + typename GemmType::Operator>; + + // if dropout: + // for computing dVj += (Pij.T * Zij) @ dOi + // Pij_dropped.T = Pij.T * Zij is computed on the fly as fragments of + // Pij.T are loaded in. The reason we do it this way is because Pij.T and + // Zij are reused in later steps, while Pij_dropped.T is only needed in + // this step. computing Pij_dropped.T on the fly allows us to avoid + // keeping all 3 of Pij_dropped.T, Pij.T, and Zij in shared memory at the + // same time. + // if no dropout: + // for computing dVj += Pij.T @ dOi + using WarpIteratorA = typename cutlass::gemm::threadblock:: + DefaultWarpIteratorAFromSharedMemory< + typename DefaultGemm::Mma::Operator::Shape, // WarpShape + typename DefaultGemm::Mma::Operator:: + InstructionShape, // InstructionShape + typename DefaultGemm::Mma::Operator:: + IteratorA, // RegularWarpIterator + typename DefaultGemm::Mma::Policy // Policy + >::WarpIterator; + using DefaultMmaFromSmem = + typename cutlass::gemm::threadblock::DefaultMmaFromSharedMemory< + typename DefaultGemm::Mma, + MatmulQK::AccumulatorSharedStorage::Shape::kN, + WarpIteratorA, + kApplyDropout>; // kScaleOperandA + + using Mma = typename DefaultMmaFromSmem::Mma; + using IteratorB = typename Mma::IteratorB; + using WarpCount = typename Mma::WarpCount; + + // Epilogue + using DefaultOutputOp = typename DefaultConfig::EpilogueOutputOp; + using DefaultEpilogue = typename DefaultGemm::Epilogue; + using OutputTileIterator = + typename cutlass::epilogue::threadblock::MakePrefetchableIterator< + typename 
DefaultEpilogue::OutputTileIterator>::Iterator; + using AccumTileGmem = GmemTile; + }; + + struct MatmulDOIVJ { + /* + doi_t_vj = do_i @ v_j.transpose(-2, -1) # matmul + tmp = (doi_t_vj - Di.unsqueeze(1)) * attn # inplace / epilogue? + */ + using ThreadblockShape = + cutlass::gemm::GemmShape; + using WarpShape = cutlass::gemm::GemmShape<32, 32, GemmType::WarpK>; + + using ElementC = output_t; + using ElementAccum = accum_t; + + // no-op output op - epilogue just stores result to global memory + using BiasGradEpilogueOutputOp = + typename cutlass::epilogue::thread::LinearCombination< + ElementC, + DefaultConfig::EpilogueOutputOp::kCount, + typename DefaultConfig::EpilogueOutputOp::ElementAccumulator, + typename DefaultConfig::EpilogueOutputOp::ElementCompute, + cutlass::epilogue::thread::ScaleType::Nothing>; + + using DefaultGemm = typename cutlass::gemm::kernel::DefaultGemm< + scalar_t, // ElementA + cutlass::layout::RowMajor, // LayoutA + kIsAligned ? DefaultConfig::kAlignmentA : GemmType::kMinimumAlignment, + scalar_t, // ElementB + cutlass::layout::ColumnMajor, // LayoutB + kIsAligned ? DefaultConfig::kAlignmentB : GemmType::kMinimumAlignment, + ElementC, // ElementC + cutlass::layout::RowMajor, // LayoutC + ElementAccum, // ElementAccumulator + typename GemmType::OpClass, + ArchTag, + ThreadblockShape, + WarpShape, + typename GemmType::InstructionShape, + BiasGradEpilogueOutputOp, // EpilogueOutputOp + void, // ThreadblockSwizzle (not used) + // multiple preloads, dropout Zij tile, and 3 stages push us over shared + // memory capacity on A100. set a ceiling on number of stages to save + // shared memory if dropout is in use. + kPreload && kApplyDropout && (kBlockSizeI * kBlockSizeJ > 64 * 64) + ? 
cutlass::const_min(2, DefaultConfig::kStages) + : DefaultConfig::kStages, // Stages + false, // SplitKSerial + typename GemmType::Operator, + cutlass::gemm::SharedMemoryClearOption::kNone>; + using Mma = typename MakeCustomMma::Mma; + using AccumLambdaIterator = typename DefaultMmaAccumLambdaIterator< + typename Mma::Operator::IteratorC, + ElementAccum, + kWarpSize>::Iterator; + + // epilogue used to write bias gradient, which is just the output of this + // matmul with some operations applied to the fragment + using BiasGradEpilogue = typename DefaultGemm::Epilogue; + + // Epilogue to store to shared-memory in a format that we can use later for + // the second matmul + using B2bGemm = typename cutlass::gemm::threadblock::B2bGemm< + typename DefaultGemm::Mma::Operator::IteratorC, + typename DefaultGemm::Mma::Operator, + scalar_t, + WarpShape, + ThreadblockShape>; + using AccumulatorSharedStorage = typename B2bGemm::AccumulatorSharedStorage; + }; + + struct MatmulGradQ { + // grad_q <- tmp @ k_j + using ThreadblockShape = + cutlass::gemm::GemmShape; + using WarpShape = cutlass::gemm::GemmShape<32, 32, GemmType::WarpK>; + using InstructionShape = typename GemmType::InstructionShape; + + using DefaultGemm = cutlass::gemm::kernel::DefaultGemm< + scalar_t, // ElementA, + cutlass::layout::RowMajor, // LayoutA, + DefaultConfig::kAlignmentA, + scalar_t, // ElementB, + cutlass::layout::RowMajor, // LayoutB, + kIsAligned ? 
DefaultConfig::kAlignmentB : GemmType::kMinimumAlignment, + output_t, + cutlass::layout::RowMajor, // LayoutC, + accum_t, + typename GemmType::OpClass, + ArchTag, + ThreadblockShape, + WarpShape, + typename GemmType::InstructionShape, + typename DefaultConfig::EpilogueOutputOp, + void, // ThreadblockSwizzle - not used + DefaultConfig::kStages, + false, // SplitKSerial + typename GemmType::Operator>; + + using WarpIteratorA = typename cutlass::gemm::threadblock:: + DefaultWarpIteratorAFromSharedMemory< + typename DefaultGemm::Mma::Operator::Shape, + typename DefaultGemm::Mma::Operator::InstructionShape, + typename DefaultGemm::Mma::Operator::IteratorA, + typename DefaultGemm::Mma::Policy>::WarpIterator; + using DefaultMmaFromSmem = + typename cutlass::gemm::threadblock::DefaultMmaFromSharedMemory< + typename DefaultGemm::Mma, + MatmulDOIVJ::AccumulatorSharedStorage::Shape::kN, + WarpIteratorA, + false>; // kScaleOperandA + using Mma = typename DefaultMmaFromSmem::Mma; + using IteratorB = typename Mma::IteratorB; + using WarpCount = typename Mma::WarpCount; + + // Epilogue + using DefaultOutputOp = typename DefaultConfig::EpilogueOutputOp; + using DefaultEpilogue = typename DefaultGemm::Epilogue; + using OutputTileIterator = + typename cutlass::epilogue::threadblock::MakePrefetchableIterator< + typename DefaultEpilogue::OutputTileIterator>::Iterator; + using AccumTileGmem = GmemTile; + }; + struct MatmulGradK { + // grad_k <- tmp.transpose(-2, -1) @ q_i + using ThreadblockShape = + cutlass::gemm::GemmShape; + using WarpShape = cutlass::gemm::GemmShape<32, 32, GemmType::WarpK>; + using InstructionShape = typename GemmType::InstructionShape; + + using DefaultGemm = cutlass::gemm::kernel::DefaultGemm< + scalar_t, // ElementA, + cutlass::layout::RowMajor, // LayoutA, + DefaultConfig::kAlignmentA, + scalar_t, // ElementB, + cutlass::layout::RowMajor, // LayoutB, + kIsAligned ? 
DefaultConfig::kAlignmentB : GemmType::kMinimumAlignment, + output_t, + cutlass::layout::RowMajor, // LayoutC, + accum_t, + typename GemmType::OpClass, + ArchTag, + ThreadblockShape, + WarpShape, + typename GemmType::InstructionShape, + typename DefaultConfig::EpilogueOutputOp, + void, // ThreadblockSwizzle - not used + DefaultConfig::kStages, + false, // SplitKSerial + typename GemmType::Operator>; + + using WarpIteratorA = typename cutlass::gemm::threadblock:: + DefaultWarpIteratorAFromSharedMemory< + typename DefaultGemm::Mma::Operator::Shape, + typename DefaultGemm::Mma::Operator::InstructionShape, + typename DefaultGemm::Mma::Operator::IteratorA, + typename DefaultGemm::Mma::Policy>::WarpIterator; + using DefaultMmaFromSmemN = + typename cutlass::gemm::threadblock::DefaultMmaFromSharedMemory< + typename DefaultGemm::Mma, + MatmulQK::AccumulatorSharedStorage::Shape::kN, // kMaxK + WarpIteratorA, + false>; // kScaleOperandA + using DefaultMmaFromSmemT = + typename cutlass::gemm::threadblock::DefaultMmaFromSharedMemory< + typename DefaultGemm::Mma, + MatmulDOIVJ::AccumulatorSharedStorage::Shape::kM, // kMaxK + WarpIteratorA, + false, // kScaleOperandA + kPreload>; // kTransposeA + using DefaultMmaFromSmem = typename cutlass::platform::conditional< + DefaultMmaFromSmemT::kIsTransposedA, + DefaultMmaFromSmemT, + DefaultMmaFromSmemN>::type; + using Mma = typename DefaultMmaFromSmem::Mma; + using IteratorB = typename Mma::IteratorB; + using WarpCount = typename Mma::WarpCount; + + // Epilogue + using DefaultOutputOp = typename DefaultConfig::EpilogueOutputOp; + using DefaultEpilogue = typename DefaultGemm::Epilogue; + using OutputTileIterator = + typename cutlass::epilogue::threadblock::MakePrefetchableIterator< + typename DefaultEpilogue::OutputTileIterator>::Iterator; + using AccumTileGmem = GmemTile; + }; + + // NOTE: nvcc 12.4 has correctness errors with this on M60 (sm52) + // when there is an attention bias. Let's just disable it for now. 
+ static constexpr auto kMinSm = ArchTag::kMinComputeCapability; + static constexpr bool kEnableSplitKeys = kMinSm >= 70; + + static constexpr bool kNeedsAccumGradQ = kEnableSplitKeys || + !cutlass::platform::is_same::value; + static constexpr bool kNeedsAccumGradK = !kOutputInRF && + !cutlass::platform::is_same::value; + static constexpr bool kNeedsAccumGradV = !kOutputInRF && + !cutlass::platform::is_same::value; + + struct GradQTempStorage { + int32_t lock; + int32_t counter; + int32_t pad[2]; // pad to 128bits + output_accum_t buffer[MatmulGradQ::AccumTileGmem::kElementsStored]; + }; + + struct Params { + // Input tensors + const scalar_t* query_ptr = nullptr; // [Mq, nH, K] + const scalar_t* key_ptr = nullptr; // [Mk, nH, K] + const scalar_t* value_ptr = nullptr; // [Mk, nH, Kv] + const scalar_t* bias_ptr = nullptr; + const lse_scalar_t* logsumexp_ptr = nullptr; // [nH, Mq] + const scalar_t* output_ptr = nullptr; // [Mq, nH, Kv] + const scalar_t* grad_output_ptr = nullptr; // [Mq, nH, Kv] + accum_t* delta_ptr = nullptr; // [nH, Mq] + const int32_t* cu_seqlens_q_ptr = nullptr; + const int32_t* cu_seqlens_k_ptr = nullptr; + + // Output tensors + output_t* grad_query_ptr = nullptr; // [Mq, nH, K] + output_t* grad_key_ptr = nullptr; // [Mk, nH, K] + output_t* grad_value_ptr = nullptr; // [Mk, nH, Kv] + output_t* grad_bias_ptr = nullptr; + + // Accumulators + output_accum_t* workspace = nullptr; // [Mq, Kq] + [Mkv, Kq] + [Mkv, Kv] + output_accum_t* workspace_gv = + nullptr; // (will be calculated by the kernel) + GradQTempStorage* workspace_gq = + nullptr; // (will be calculated by the kernel) + + // Sliding window. 
ignored if == 0 + int32_t window_size = 0; + + // Scale + accum_t scale = 1.0f; + + // Dimensions/strides + int32_t head_dim = -1; + int32_t head_dim_value = -1; + int32_t num_queries = -1; + int32_t num_keys = -1; + int32_t num_heads = -1; + uint8_t custom_mask_type = NoCustomMask; + + int64_t q_strideM = -1; + int64_t k_strideM = -1; + int64_t v_strideM = -1; + int64_t bias_strideM = 0; + int64_t gO_strideM = -1; + int64_t gB_strideM = -1; + int8_t gQKV_strideM_multiplier = 1; // 3 for packed, 1 otherwise + + at::PhiloxCudaState rng_engine_inputs = {0, 0}; + + // RNG sequence offset based on batch_id and head_id + unsigned long long dropout_batch_head_rng_offset = 0; + float dropout_prob = 0.0f; + + CUTLASS_HOST_DEVICE int64_t o_strideM() const { + return head_dim_value * num_heads; + } + CUTLASS_HOST_DEVICE int64_t gQ_strideM() const { + return gQKV_strideM_multiplier * num_heads * head_dim; + } + CUTLASS_HOST_DEVICE int64_t gK_strideM() const { + return gQKV_strideM_multiplier * num_heads * head_dim; + } + CUTLASS_HOST_DEVICE int64_t gV_strideM() const { + return gQKV_strideM_multiplier * num_heads * head_dim_value; + } + + // Everything below is only used in `advance_to_block` + // and shouldn't use registers + int64_t o_strideH = -1; + int32_t q_strideH = -1; + int32_t k_strideH = -1; + int32_t v_strideH = -1; + int64_t bias_strideH = 0; + int64_t o_strideB = -1; + int64_t q_strideB = -1; + int64_t k_strideB = -1; + int64_t v_strideB = -1; + int64_t bias_strideB = 0; + int64_t lse_strideB = -1; + int64_t lse_strideH = -1; + int64_t delta_strideB = -1; + int64_t delta_strideH = -1; + int32_t num_batches = -1; + int16_t num_splits_key = 1; // We use `gridDim.x` inside kernel + + int64_t gO_strideB = 0; + int64_t gQ_strideB = 0; + int64_t gK_strideB = 0; + int64_t gV_strideB = 0; + int64_t gB_strideB = 0; + int64_t gO_strideH = 0; + int64_t gQ_strideH = 0; + int64_t gK_strideH = 0; + int64_t gV_strideH = 0; + int64_t gB_strideH = 0; + + CUTLASS_HOST_DEVICE 
int16_t num_splits_key_device() const { +#ifdef __CUDA_ARCH__ + return kEnableSplitKeys ? gridDim.x : 1; +#else + return num_splits_key; // for host-side tests +#endif + } + CUTLASS_HOST_DEVICE int16_t split_key_device() const { +#ifdef __CUDA_ARCH__ + return kEnableSplitKeys ? blockIdx.x : 0; +#else + return 0; // for host-side tests +#endif + } + + CUTLASS_DEVICE bool advance_to_block() { + int64_t batch_id = blockIdx.z; + int32_t head_id = blockIdx.y; + + if (kNeedsAccumGradQ || kNeedsAccumGradK || kNeedsAccumGradV) { + assert(workspace_size() == 0 || workspace != nullptr); + + workspace += (batch_id * num_heads + head_id) * workspace_strideBH(); + workspace = warp_uniform(workspace); + workspace_gv = workspace + workspace_elements_gk(); + workspace_gq = + (GradQTempStorage*)(workspace_gv + workspace_elements_gv()); + if (kEnableSplitKeys) { + workspace_gv += workspace_elements_gv() * split_key_device() / + num_splits_key_device(); + workspace += workspace_elements_gk() * split_key_device() / + num_splits_key_device(); + } + } else { + workspace = nullptr; + } + + // Advance pointers that depend on the total concatenated + // number of queries, as `num_queries` is modified in the block + // below + dropout_batch_head_rng_offset = + batch_id * (num_heads * num_queries * num_keys) + + head_id * (num_queries * num_keys); + logsumexp_ptr += batch_id * lse_strideB + head_id * lse_strideH; + + if (cu_seqlens_q_ptr != nullptr) { + assert(cu_seqlens_k_ptr != nullptr); + cu_seqlens_q_ptr += batch_id; + cu_seqlens_k_ptr += batch_id; + int32_t q_start = cu_seqlens_q_ptr[0]; + int32_t k_start = cu_seqlens_k_ptr[0]; + int64_t q_next_start = cu_seqlens_q_ptr[1]; + int64_t k_next_start = cu_seqlens_k_ptr[1]; + assert(q_next_start - q_start <= num_queries); + assert(k_next_start - k_start <= num_keys); + num_queries = q_next_start - q_start; + num_keys = k_next_start - k_start; + + // Jump manually + batch_id = 0; + + query_ptr += q_start * q_strideM; + key_ptr += k_start * 
k_strideM; + value_ptr += k_start * v_strideM; + assert(bias_ptr == nullptr); + assert(grad_bias_ptr == nullptr); + output_ptr += q_start * o_strideM(); + grad_output_ptr += q_start * gO_strideM; + delta_ptr += q_start; + + grad_query_ptr += q_start * gQ_strideM(); + grad_key_ptr += k_start * gK_strideM(); + grad_value_ptr += k_start * gV_strideM(); + } + + query_ptr += batch_id * q_strideB + head_id * q_strideH; + key_ptr += batch_id * k_strideB + head_id * k_strideH; + value_ptr += batch_id * v_strideB + head_id * v_strideH; + if (bias_ptr != nullptr) { + bias_ptr += batch_id * bias_strideB + head_id * bias_strideH; + } + output_ptr += batch_id * o_strideB + head_id * o_strideH; + grad_output_ptr += batch_id * gO_strideB + head_id * gO_strideH; + delta_ptr += batch_id * delta_strideB + head_id * delta_strideH; + + grad_query_ptr += batch_id * gQ_strideB + head_id * gQ_strideH; + grad_key_ptr += batch_id * gK_strideB + head_id * gK_strideH; + grad_value_ptr += batch_id * gV_strideB + head_id * gV_strideH; + if (grad_bias_ptr != nullptr) { + grad_bias_ptr += batch_id * gB_strideB + head_id * gB_strideH; + } + + // Some values are modified above + // Signal to the compiler that they are the same in all threads + // and can be stored in warp-uniform registers (Sm75+) + num_queries = warp_uniform(num_queries); + num_keys = warp_uniform(num_keys); + custom_mask_type = warp_uniform(custom_mask_type); + + query_ptr = warp_uniform(query_ptr); + key_ptr = warp_uniform(key_ptr); + value_ptr = warp_uniform(value_ptr); + bias_ptr = warp_uniform(bias_ptr); + logsumexp_ptr = warp_uniform(logsumexp_ptr); + output_ptr = warp_uniform(output_ptr); + grad_output_ptr = warp_uniform(grad_output_ptr); + delta_ptr = warp_uniform(delta_ptr); + + grad_query_ptr = warp_uniform(grad_query_ptr); + grad_key_ptr = warp_uniform(grad_key_ptr); + grad_value_ptr = warp_uniform(grad_value_ptr); + grad_bias_ptr = warp_uniform(grad_bias_ptr); + +#if 0 + PRINT_T0("[b:%d h:%d] dp[0]:%f Q:%f K:%f V:%f 
LSE:%f", + int(blockIdx.z), int(blockIdx.y), + float(delta_ptr[0]), + float(query_ptr[0]), float(key_ptr[0]), float(value_ptr[0]), + float(logsumexp_ptr[0]) + ) +#endif + return true; + } + + __host__ dim3 getBlocksGrid() const { + return dim3(num_splits_key, num_heads, num_batches); + } + __host__ dim3 getThreadsGrid() const { + return dim3(kWarpSize * kNumWarpsPerBlock, 1, 1); + } + CUTLASS_HOST_DEVICE int64_t workspace_elements_gk() const { + if (!kNeedsAccumGradK) { + return 0; + } + return num_splits_key * kBlockSizeJ * + align_up(head_dim, kBlockSizeI); + } + CUTLASS_HOST_DEVICE int64_t workspace_elements_gv() const { + if (!kNeedsAccumGradV) { + return 0; + } + return num_splits_key * kBlockSizeJ * + align_up(head_dim_value, kBlockSizeI); + } + CUTLASS_HOST_DEVICE int64_t workspace_elements_gq() const { + if (!kNeedsAccumGradQ) { + return 0; + } + int num_blocks = ceil_div(num_queries, kBlockSizeI); + int num_cols = ceil_div(head_dim, MatmulGradQ::ThreadblockShape::kN); + return num_blocks * num_cols * sizeof(GradQTempStorage) / + sizeof(output_accum_t); + } + CUTLASS_HOST_DEVICE int64_t workspace_strideBH() const { + // Aligned on 128bits + return align_up( + workspace_elements_gk() + workspace_elements_gv() + + workspace_elements_gq(), + int64_t(4)); + } + CUTLASS_HOST_DEVICE int64_t workspace_size() const { + // Returns size of buffer we need to run this kernel + return num_batches * num_heads * workspace_strideBH() * sizeof(float); + } + CUTLASS_HOST_DEVICE bool should_zero_workspace() const { + return num_splits_key > 1 || window_size > 0; + } + }; + + // shared storage for keeping Zij matrix. not needed if we aren't using + // dropout, in which case we use an empty array to save shared memory + using ZijSharedStorage = typename cutlass::platform::conditional< + kApplyDropout, + typename MatmulQK::AccumulatorSharedStorage, + // dummy shared storage object that takes up no space. 
+ typename cutlass::gemm::threadblock::AccumulatorSharedStorage< +#ifdef _WIN32 + // windows builds throw the error: + // "type containing an unknown-size array is not allowed" + // if we try to make Zij shared storage zero-sized. + // To get around this just make it sized 1 on windows. + typename cutlass::gemm::GemmShape<1, 1, 0>, +#else + typename cutlass::gemm::GemmShape<0, 0, 0>, +#endif + typename MatmulQK::AccumulatorSharedStorage::Element, + typename MatmulQK::AccumulatorSharedStorage::Layout, + typename cutlass::MatrixShape<0, 0>>>::type; + + struct SharedStoragePrologue { + struct { + cutlass::Array di; // (do_i * o_i).sum(-1) + typename MatmulQK::Mma::SharedStorageA mm_qk_k; + } persistent; + union { + struct { + // part1 - after Q.K / dV / dO.V + union { + // 1. efficient load of bias tile Bij, which is then applied to Pij + typename MatmulQK::BiasLoader::SmemTile bias; + // 4. store Pij. it is needed: + // - in dVj += (Pij.T * Zij) @ dOi + // - in dSij = Pij * (dPij - Di) + // 6. dVj += (Pij.T * Zij) @ dOi + // 10. write to fragment + typename MatmulQK::AccumulatorSharedStorage attn_shared_storage; + }; + // 5. store Zij. it is needed in dVj += (Pij.T * Zij) @ dOi + ZijSharedStorage zij; + + union { + // 2. prologue for dVj + // 6. workspace for dVj += (Pij.T * Zij) @ dOi + typename MatmulGradV::Mma::SharedStorage mm_gradV; + // 7. dVj epilogue + typename MatmulGradV::DefaultEpilogue::SharedStorage gradV_epilogue; + }; + + // 3. prologue for dPij_dropped + // 8. 
used in dPij_dropped = dOi @ Vj.T + typename MatmulDOIVJ::Mma::SharedStorage mm_doivj; + } part1; + + struct { + // part2 - dQ + union { + typename MatmulQK::AccumulatorSharedStorage + tmpT_shared_storage; // (from part1) + typename MatmulDOIVJ::AccumulatorSharedStorage tmp_shared_storage; + }; + typename MatmulGradK::Mma::SharedStorage mm_gradK; // (preload) + typename MatmulGradQ::Mma::SharedStorage mm_gradQ; // (preload) + union { + // store dB = dSij to global memory + typename MatmulDOIVJ::BiasGradEpilogue::SharedStorage gradB_epilogue; + typename MatmulGradQ::DefaultEpilogue::SharedStorage gradQ_epilogue; + }; + + } part2; + + struct { + // part3 - after last iteration on dQ's epilogue / dK + union { + typename MatmulQK::AccumulatorSharedStorage + tmpT_shared_storage; // (from part1) + typename MatmulDOIVJ::AccumulatorSharedStorage tmp_shared_storage; + }; + typename MatmulGradK::Mma::SharedStorage mm_gradK; // (preload) + typename MatmulGradQ::DefaultEpilogue::SharedStorage + gradQ_epilogue_lastIter; + + typename MatmulGradK::DefaultEpilogue::SharedStorage gradK_epilogue; + } part3; + + struct { + // part4 - after last iteration on dK's epilogue / preload next K.Q_t + typename MatmulQK::Mma::SharedStorageB mm_qk_q; + + // If we reach end of current key, dump RF->gmem with "final" epilogues + typename MatmulGradK::DefaultEpilogue::SharedStorage + gradK_epilogue_final; + typename MatmulGradV::DefaultEpilogue::SharedStorage + gradV_epilogue_final; + } part4; + }; + static void print_size() { + // Field size +#define FSZ(f) int((sizeof(((SharedStoragePrologue*)0)->f))) + + printf("Total smem: %d bytes\n", int(sizeof(SharedStoragePrologue))); + printf(" persistent: %db\n", FSZ(persistent)); + printf(" mm_qk_k: %db\n", FSZ(persistent.mm_qk_k)); + printf(" part1: %db\n", FSZ(part1)); + printf(" bias: %db\n", FSZ(part1.bias)); + printf(" attn_shared_storage: %db\n", FSZ(part1.attn_shared_storage)); + printf(" zij: %db\n", FSZ(part1.zij)); + printf(" mm_gradV: 
%db\n", FSZ(part1.mm_gradV)); + printf(" gradV_epilogue: %db\n", FSZ(part1.gradV_epilogue)); + printf(" mm_doivj: %db\n", FSZ(part1.mm_doivj)); + printf(" part2: %db\n", FSZ(part2)); + printf(" tmpT_shared_storage: %db\n", FSZ(part2.tmpT_shared_storage)); + printf(" tmp_shared_storage: %db\n", FSZ(part2.tmp_shared_storage)); + printf(" mm_gradK: %db\n", FSZ(part2.mm_gradK)); + printf(" mm_gradQ: %db\n", FSZ(part2.mm_gradQ)); + printf(" gradB_epilogue: %db\n", FSZ(part2.gradB_epilogue)); + printf(" gradQ_epilogue: %db\n", FSZ(part2.gradQ_epilogue)); + printf(" part3: %db\n", FSZ(part3)); + printf(" tmpT_shared_storage: %db\n", FSZ(part3.tmpT_shared_storage)); + printf(" part4: %db\n", FSZ(part4)); + printf(" mm_qk_q: %db\n", FSZ(part4.mm_qk_q)); + printf( + " gradK_epilogue_final: %db\n", FSZ(part4.gradK_epilogue_final)); + printf( + " gradV_epilogue_final: %db\n", FSZ(part4.gradV_epilogue_final)); + } +// =========================================== +#define FIELD(INSIDE_STRUCT, FIELDNAME) \ + CUTLASS_DEVICE auto& FIELDNAME() { \ + return INSIDE_STRUCT.FIELDNAME; \ + } + + FIELD(persistent, di) + FIELD(persistent, mm_qk_k) + FIELD(part1, bias) + FIELD(part1, attn_shared_storage) + FIELD(part1, zij) + FIELD(part1, mm_gradV) + FIELD(part1, gradV_epilogue) + FIELD(part1, mm_doivj) + FIELD(part2, mm_gradK) + FIELD(part2, mm_gradQ) + FIELD(part2, gradB_epilogue) + FIELD(part2, gradQ_epilogue) + FIELD(part2, tmp_shared_storage) + FIELD(part3, tmpT_shared_storage) + FIELD(part3, gradQ_epilogue_lastIter) + FIELD(part3, gradK_epilogue) + FIELD(part4, mm_qk_q) + FIELD(part4, gradK_epilogue_final) + FIELD(part4, gradV_epilogue_final) + }; + + struct SharedStorageNoPrologue { + struct { + cutlass::Array di; // (do_i * o_i).sum(-1) + } persistent; + union { + struct { + // part1 - Q.K matmul + typename MatmulQK::Mma::SharedStorageA mm_qk_k; + typename MatmulQK::Mma::SharedStorageB mm_qk_q; + } part1; + + struct { + // part2 - compute gradV + union { + // 1. 
efficient load of bias tile Bij, which is then applied to Pij + typename MatmulQK::BiasLoader::SmemTile bias; + // 2. store Pij to shared memory. it is needed: + // - in this step, where it is used in dVj += (Pij.T * Zij) @ dOi + // - in next step where it is used in dSij = Pij * (dPij - Di) + typename MatmulQK::AccumulatorSharedStorage attn_shared_storage; + }; + // 3. store Zij. it is needed in this step, where it is used + // to compute Pij_dropped = Pij * Zij on the fly as fragments of Pij are + // loaded for the computation of dVj. + ZijSharedStorage zij; + + union { + typename MatmulGradV::Mma::SharedStorage mm_gradV; + typename MatmulGradV::DefaultEpilogue::SharedStorage gradV_epilogue; + }; + } part2; + + struct { + // part3 - DO.V matmul + union { + // first compute dPij = (dOi @ Vj.T) * Zij + // and dSij = Pij * (dPij - Di) + struct { + // (from part2) - Pij for computing dSij = Pij * (dPij - Di) + typename MatmulQK::AccumulatorSharedStorage attn_shared_storage; + // matmul to compute dOiVj + typename MatmulDOIVJ::Mma::SharedStorage mm_doivj; + }; + // then store dB = dSij to global memory + typename MatmulDOIVJ::BiasGradEpilogue::SharedStorage gradB_epilogue; + }; + } part3; + + struct { + // part4 - compute gradQ + typename MatmulQK::AccumulatorSharedStorage + tmpT_shared_storage; // (from part2) + typename MatmulDOIVJ::AccumulatorSharedStorage tmp_shared_storage; + union { + typename MatmulGradQ::Mma::SharedStorage mm_gradQ; + typename MatmulGradQ::DefaultEpilogue::SharedStorage gradQ_epilogue; + typename MatmulGradQ::DefaultEpilogue::SharedStorage + gradQ_epilogue_lastIter; + }; + } part4; + + struct { + // part5 - compute gradK + typename MatmulQK::AccumulatorSharedStorage + tmpT_shared_storage; // (from part2) + typename MatmulDOIVJ::AccumulatorSharedStorage tmp_shared_storage; + union { + typename MatmulGradK::Mma::SharedStorage mm_gradK; + typename MatmulGradK::DefaultEpilogue::SharedStorage gradK_epilogue; + }; + } part5; + + struct { + // part6 
- store RF accumulated into gmem + typename MatmulGradK::DefaultEpilogue::SharedStorage + gradK_epilogue_final; + typename MatmulGradV::DefaultEpilogue::SharedStorage + gradV_epilogue_final; + } part6; + }; + static void print_size() { +#define FIELD_SIZEOF(f) int((sizeof(((SharedStorageNoPrologue*)0)->f))) + printf("Total smem: %d bytes\n", int(sizeof(SharedStorageNoPrologue))); + printf(" persistent: %db\n", FIELD_SIZEOF(persistent)); + printf(" part1: %db\n", FIELD_SIZEOF(part1)); + printf(" part2: %db\n", FIELD_SIZEOF(part2)); + printf(" part3: %db\n", FIELD_SIZEOF(part3)); + printf(" part4: %db\n", FIELD_SIZEOF(part4)); + printf(" part5: %db\n", FIELD_SIZEOF(part5)); + printf(" part6: %db\n", FIELD_SIZEOF(part6)); + } +// =========================================== +#define FIELD(INSIDE_STRUCT, FIELDNAME) \ + CUTLASS_DEVICE auto& FIELDNAME() { \ + return INSIDE_STRUCT.FIELDNAME; \ + } + + FIELD(persistent, di) + FIELD(part1, mm_qk_k) + FIELD(part1, mm_qk_q) + FIELD(part2, bias) + FIELD(part2, attn_shared_storage) + FIELD(part2, zij) + FIELD(part2, mm_gradV) + FIELD(part2, gradV_epilogue) + FIELD(part3, mm_doivj) + FIELD(part3, gradB_epilogue) + FIELD(part4, tmpT_shared_storage) + FIELD(part4, tmp_shared_storage) + FIELD(part4, mm_gradQ) + FIELD(part4, gradQ_epilogue) + FIELD(part4, gradQ_epilogue_lastIter) + FIELD(part5, mm_gradK) + FIELD(part5, gradK_epilogue) + FIELD(part6, gradK_epilogue_final) + FIELD(part6, gradV_epilogue_final) + }; + + using SharedStorage = typename cutlass::platform::conditional< + kPreload, + SharedStoragePrologue, + SharedStorageNoPrologue>::type; + + struct OutputFragments { + typename MatmulGradV::Mma::FragmentC gradV; + typename MatmulGradK::Mma::FragmentC gradK; + + CUTLASS_DEVICE void clear() { + gradV.clear(); + gradK.clear(); + } + }; + + static bool __host__ check_supported(Params const& p) { + CHECK_ALIGNED_PTR(p.query_ptr, kMinimumAlignment); + CHECK_ALIGNED_PTR(p.key_ptr, kMinimumAlignment); + 
CHECK_ALIGNED_PTR(p.value_ptr, kMinimumAlignment); + CHECK_ALIGNED_PTR(p.output_ptr, kMinimumAlignment); + CHECK_ALIGNED_PTR(p.grad_output_ptr, kMinimumAlignment); + CHECK_ALIGNED_PTR(p.bias_ptr, kMinimumAlignment); + TORCH_CHECK( + p.num_heads <= 1 || p.lse_strideH % 8 == 0, + "LSE is not correctly aligned (strideH)"); + TORCH_CHECK( + p.num_batches <= 1 || p.lse_strideB % 8 == 0, + "LSE is not correctly aligned (strideB)"); + TORCH_CHECK( + p.num_heads <= 1 || p.q_strideH % kMinimumAlignment == 0, + "query is not correctly aligned (strideH)"); + TORCH_CHECK( + p.num_heads <= 1 || p.k_strideH % kMinimumAlignment == 0, + "key is not correctly aligned (strideH)"); + TORCH_CHECK( + p.num_heads <= 1 || p.v_strideH % kMinimumAlignment == 0, + "value is not correctly aligned (strideH)"); + TORCH_CHECK( + p.num_batches <= 1 || p.q_strideB % kMinimumAlignment == 0, + "query is not correctly aligned (strideB)"); + TORCH_CHECK( + p.num_batches <= 1 || p.k_strideB % kMinimumAlignment == 0, + "key is not correctly aligned (strideB)"); + TORCH_CHECK( + p.num_batches <= 1 || p.v_strideB % kMinimumAlignment == 0, + "value is not correctly aligned (strideB)"); + TORCH_CHECK( + p.q_strideM % kMinimumAlignment == 0, + "query is not correctly aligned (strideM)"); + TORCH_CHECK( + p.k_strideM % kMinimumAlignment == 0, + "key is not correctly aligned (strideM)"); + TORCH_CHECK( + p.v_strideM % kMinimumAlignment == 0, + "value is not correctly aligned (strideM)"); + if (p.bias_ptr) { + TORCH_CHECK( + p.num_batches <= 1 || p.bias_strideB % kMinimumAlignment == 0, + "attn_bias is not correctly aligned (strideB). ", + "attn_bias.stride(0) = ", p.bias_strideB, ", and should be a " + "multiple of ", kMinimumAlignment, "."); + TORCH_CHECK( + p.num_heads <= 1 || p.bias_strideH % kMinimumAlignment == 0, + "attn_bias is not correctly aligned (strideH) ." 
+ "attn_bias.stride(1) = ", p.bias_strideH, ", and should be a " + "multiple of ", kMinimumAlignment, "."); + TORCH_CHECK( + p.num_queries <= 1 || p.bias_strideM % kMinimumAlignment == 0, + "attn_bias is not correctly aligned (strideM). " + "attn_bias.stride(2) = ", p.bias_strideM, ", and should be a ", + "multiple of ", kMinimumAlignment, "."); + } + if (p.grad_bias_ptr) { + TORCH_CHECK( + p.num_batches <= 1 || p.gB_strideB % kMinimumAlignment == 0, + "attn_bias.grad is not correctly aligned (strideB)"); + TORCH_CHECK( + p.num_heads <= 1 || p.gB_strideH % kMinimumAlignment == 0, + "attn_bias.grad is not correctly aligned (strideH)"); + TORCH_CHECK( + p.gB_strideM % kMinimumAlignment == 0, + "attn_bias.grad is not correctly aligned (strideM)"); + } + TORCH_CHECK( + !(p.cu_seqlens_q_ptr && p.bias_ptr), + "CuSeqlen + bias not implemented yet"); + TORCH_CHECK( + p.custom_mask_type < NumCustomMaskTypes, + "Invalid value for `custom_mask_type`"); + TORCH_CHECK( + p.dropout_prob <= 1.0f && p.dropout_prob >= 0.0f, + "Invalid value for `dropout_prob`"); + TORCH_CHECK( + kApplyDropout || p.dropout_prob == 0.0f, + "Set `kApplyDropout`=True to support `dropout_prob > 0`"); + TORCH_CHECK(p.head_dim > 0, "Invalid value for `head_dim`"); + TORCH_CHECK(p.head_dim_value > 0, "Invalid value for `head_dim_value`"); + TORCH_CHECK(p.num_queries > 0, "Invalid value for `num_queries`"); + TORCH_CHECK(p.num_keys > 0, "Invalid value for `num_keys`"); + TORCH_CHECK(p.num_heads > 0, "Invalid value for `num_heads`"); + TORCH_CHECK(p.num_batches > 0, "Invalid value for `num_batches`"); + TORCH_CHECK(p.head_dim <= kMaxK, "kMaxK: Expected `head_dim < kMaxK`"); + TORCH_CHECK( + p.head_dim_value <= kMaxK, "kMaxK: Expected `head_dim_value < kMaxK`"); + if (kKeysQueriesAlignedToBlockSize) { + TORCH_CHECK( + p.cu_seqlens_k_ptr == nullptr, + "This kernel does not support cu_seqlen"); + TORCH_CHECK( + p.cu_seqlens_q_ptr == nullptr, + "This kernel does not support cu_seqlen"); + TORCH_CHECK( + 
p.num_queries % kBlockSizeI == 0, + "kKeysQueriesAlignedToBlockSize condition not respected"); + TORCH_CHECK( + p.num_keys % kBlockSizeJ == 0, + "kKeysQueriesAlignedToBlockSize condition not respected"); + } + TORCH_CHECK( + kEnableSplitKeys || p.num_splits_key == 1, "SplitKeys is disabled"); + TORCH_CHECK( + p.num_splits_key > 0, "Invalid `num_splits_key` (expected >0)"); + TORCH_CHECK( + p.num_splits_key <= cutlass::ceil_div(p.num_keys, kBlockSizeJ), + "Invalid `num_splits_key` (", + p.num_splits_key, + ") - too large for `num_keys` = ", + p.num_keys); + if (p.window_size != 0) { + TORCH_CHECK( + p.custom_mask_type != NoCustomMask, + "LocalAttention only supported in causal mode"); + } + return true; + } + + static CUTLASS_DEVICE void attention_kernel(Params p) { + extern __shared__ char smem_buffer[]; + SharedStorage& shared_storage = *((SharedStorage*)smem_buffer); + + uint16_t thread_id = threadIdx.x; + uint8_t warp_id = warp_uniform(thread_id / 32); + uint8_t lane_id = thread_id % 32; + + int64_t key_start = p.split_key_device() * kBlockSizeJ; + if (key_start >= p.num_keys) { + return; + } + if (kPrologueQK) { + int64_t query_start = getQueryStart(p, key_start); + prologueQkNextIteration( + shared_storage, p, query_start, key_start, warp_id, lane_id); + } + + // Computes (dO*out).sum(-1) and writes it to `p.delta_ptr` + if (kKernelComputesDelta) { + constexpr int kOptimalElements = + 128 / cutlass::sizeof_bits::value; + if (p.head_dim_value % kOptimalElements == 0) { + for (int query_start = 0; query_start < p.num_queries; + query_start += kBlockSizeI) { + computeDelta(p, query_start, warp_id, lane_id); + } + } else { + for (int query_start = 0; query_start < p.num_queries; + query_start += kBlockSizeI) { + computeDelta<1>(p, query_start, warp_id, lane_id); + } + } + __syncthreads(); + } + + OutputFragments output_frags; + + curandStatePhilox4_32_10_t rng_state_init; + + if (kApplyDropout) { + // See Note [Seed and Offset Device] + auto const [seed, offset] = 
at::cuda::philox::unpack(p.rng_engine_inputs); + // each element of the attention matrix P with shape + // (batch_sz, n_heads, n_queries, n_keys) is associated with a single + // offset in RNG sequence. we initialize the RNG state with offset that + // starts at the beginning of a (n_queries, n_keys) matrix for this + // block's batch_id and head_id + // initializing rng state is very expensive, so we run once per kernel, + // rather than once per iteration. each iteration takes a copy of the + // initialized RNG state and offsets it as needed. + curand_init( + seed, + 0, + offset + p.dropout_batch_head_rng_offset, + &rng_state_init); + } + + CUTLASS_PRAGMA_UNROLL + for (; key_start < p.num_keys; + key_start += p.num_splits_key_device() * kBlockSizeJ) { + output_frags.clear(); + + int64_t next_key = key_start; + int64_t query_start = getQueryStart(p, key_start); + while (next_key == key_start && query_start < p.num_queries) { + // This line here + // vvvvvvvvvvvvvv + warp_id = warp_uniform(warp_id); + // ^^^^^^^^^^^^^^ + // ... makes everything use less RF and be 10% faster. Why? + // I don't know. My theory is that it forces `nvcc` to + // re-compute indices, offsets etc... and not keep them + // from the previous iteration, which prevents MASSIVE + // register spilling. 
+ + processBlockIJ( + shared_storage, + output_frags, + p, + query_start, + key_start, + rng_state_init, + warp_id, + lane_id); + + int64_t next_query; + incrIteration(p, query_start, key_start, next_query, next_key); + query_start = next_query; + } + if (kOutputInRF) { + writeFragsToGmem( + shared_storage, output_frags, p, key_start, warp_id, lane_id); + } else if (getQueryStart(p, key_start) >= p.num_queries) { + zfillGradKV( + p, key_start, warp_id, lane_id); + } + __syncthreads(); + } + } + + template + static CUTLASS_DEVICE void zfillGradKV( + Params const& p, + int32_t key_start, + uint8_t warp_id, + uint8_t lane_id) { + constexpr int kThreadsPerKey = 8; + constexpr int kParallelKeys = kNumThreads / kThreadsPerKey; + static_assert(kBlockSizeJ % kParallelKeys == 0, ""); + // This function is not really optimized, but should rarely be used + // It's only used when some keys are "useless" and don't attend to + // any query, due to causal masking + + int thread_id = 32 * warp_id + lane_id; + int k_shift = lane_id % kThreadsPerKey; + + CUTLASS_PRAGMA_UNROLL + for (int j = 0; j < kBlockSizeJ; j += kParallelKeys) { + int key = key_start + j + (thread_id / kThreadsPerKey); + if (!skipBoundsChecks && key >= p.num_keys) { + continue; + } + auto gv_ptr = p.grad_value_ptr + key * p.gV_strideM(); + auto gk_ptr = p.grad_key_ptr + key * p.gK_strideM(); + + for (int k = k_shift; k < p.head_dim_value; k += kThreadsPerKey) { + gv_ptr[k] = scalar_t(0); + } + for (int k = k_shift; k < p.head_dim; k += kThreadsPerKey) { + gk_ptr[k] = scalar_t(0); + } + } + } + + template + static CUTLASS_DEVICE void processBlockIJ( + SharedStorage& shared_storage, + OutputFragments& output_frags, + Params& p, + int64_t query_start, + int64_t key_start, + const curandStatePhilox4_32_10_t& curand_state_init, + uint8_t warp_id, + uint8_t lane_id) { + cutlass::Array + dropout_keep_mask_doivj; + dropout_keep_mask_doivj.fill(cutlass::uint1b_t{1}); + const float dropout_scale = + kApplyDropout ? 
1.0 / (1.0 - p.dropout_prob) : 1.0f; + + cutlass::MatrixCoord no_offset{0, 0}; + accum_t scale = p.scale; + int16_t thread_id = 32 * warp_id + lane_id; + + auto rematerializeThreadIds = [&]() { + // Prevents `nvcc` from keeping values deduced from + // `thread_id`, `warp_id`, ... in RF - to reduce register pressure + warp_id = warp_uniform(thread_id / 32); + lane_id = thread_id % 32; + thread_id = 32 * warp_id + lane_id; + }; + + bool isFirstQuery = (query_start == getQueryStart(p, key_start)); + int64_t next_query, next_key; + incrIteration(p, query_start, key_start, next_query, next_key); + bool isLastQuery = next_key != key_start; + + accum_t di_rf = accum_t(0); + if (thread_id < kBlockSizeI) { + if (query_start + thread_id < p.num_queries) { + di_rf = p.delta_ptr[query_start + thread_id]; + } + shared_storage.di()[thread_id] = di_rf; + } + + int32_t num_queries_in_block = skipBoundsChecks + ? MatmulQK::Mma::Shape::kN + : warp_uniform(cutlass::fast_min( + MatmulQK::Mma::Shape::kN, (int32_t)(p.num_queries - query_start))); + int32_t num_keys_in_block = skipBoundsChecks + ? 
MatmulQK::Mma::Shape::kM + : warp_uniform(cutlass::fast_min( + MatmulQK::Mma::Shape::kM, (int32_t)(p.num_keys - key_start))); + + auto prologueGradV = [&](int64_t col) { + typename MatmulGradV::Mma::IteratorB iterator_dO( + {int32_t(p.gO_strideM)}, + const_cast(p.grad_output_ptr + query_start * p.gO_strideM + col), + {num_queries_in_block, (int32_t)(p.head_dim_value - col)}, + thread_id, + no_offset); + MatmulGradV::Mma::prologue( + shared_storage.mm_gradV(), + iterator_dO, + thread_id, + num_queries_in_block); + }; + auto prologueGradQ = [&](int col) { + typename MatmulGradQ::Mma::IteratorB iterator_K( + {int32_t(p.k_strideM)}, + const_cast(p.key_ptr + key_start * p.k_strideM + col), + {num_keys_in_block, p.head_dim - col}, + thread_id, + no_offset); + MatmulGradQ::Mma::prologue( + shared_storage.mm_gradQ(), iterator_K, thread_id, num_keys_in_block); + }; + auto prologueGradK = [&](int col) { + typename MatmulGradK::Mma::IteratorB iterator_Q( + {int32_t(p.q_strideM)}, + const_cast(p.query_ptr + query_start * p.q_strideM + col), + {num_queries_in_block, p.head_dim - col}, + thread_id, + no_offset); + MatmulGradK::Mma::prologue( + shared_storage.mm_gradK(), + iterator_Q, + thread_id, + num_queries_in_block); + }; + auto prologueDOV = [&]() { + typename MatmulDOIVJ::Mma::IteratorA iterator_A( + {int32_t(p.gO_strideM)}, + const_cast(p.grad_output_ptr + query_start * p.gO_strideM), + {num_queries_in_block, p.head_dim_value}, + thread_id, + no_offset); + typename MatmulDOIVJ::Mma::IteratorB iterator_B( + {int32_t(p.v_strideM)}, + const_cast(p.value_ptr + key_start * p.v_strideM), + {p.head_dim_value, num_keys_in_block}, + thread_id, + no_offset); + MatmulDOIVJ::Mma::prologue( + shared_storage.mm_doivj(), + iterator_A, + iterator_B, + thread_id, + p.head_dim_value); + }; + + ///////////////////////////////////////////////////////////////////////////////////////////////// + // MatmulQK + 
///////////////////////////////////////////////////////////////////////////////////////////////// + { + using Mma = typename MatmulQK::Mma; + + cutlass::gemm::GemmCoord problem_size( + num_keys_in_block, + num_queries_in_block, + p.head_dim // k + ); + + // k_j + typename Mma::IteratorA iterator_A( + {int32_t(p.k_strideM)}, + const_cast(p.key_ptr + key_start * p.k_strideM), + {problem_size.m(), problem_size.k()}, + thread_id, + no_offset); + + // q_i.transpose(-2, -1) + typename Mma::IteratorB iterator_B( + {int32_t(p.q_strideM)}, + const_cast(p.query_ptr + query_start * p.q_strideM), + {problem_size.k(), problem_size.n()}, + thread_id, + no_offset); + + Mma mma( + shared_storage.mm_qk_k(), + shared_storage.mm_qk_q(), + thread_id, + warp_id, + lane_id); + + typename Mma::FragmentC accum; + + accum.clear(); + + auto gemm_k_iterations = + (problem_size.k() + Mma::Shape::kK - 1) / Mma::Shape::kK; + + // Compute threadblock-scoped matrix multiply-add + mma.set_prologue_done(kPrologueQK); + mma.set_zero_outside_bounds(!skipBoundsChecks); + mma(gemm_k_iterations, accum, iterator_A, iterator_B, accum); + accum = cutlass::multiplies()(scale, accum); + + // Epilogue: add LSE + exp and store that to our shared memory buffer + // shmem <- (matmul_result - + // logsumexp[i_start:i_end].unsqueeze(1)).exp() + int warp_idx_mn_0 = + warp_id % (Mma::Base::WarpCount::kM * Mma::Base::WarpCount::kN); + auto output_tile_coords = cutlass::MatrixCoord{ + warp_idx_mn_0 % Mma::Base::WarpCount::kM, + warp_idx_mn_0 / Mma::Base::WarpCount::kM}; + + // apply bias if applicable + if (p.bias_ptr != nullptr) { + // load bias tile Bij into shared memory + typename MatmulQK::BiasLoader::GmemTileIterator bias_iter( + {cutlass::layout::RowMajor(p.bias_strideM)}, + const_cast(p.bias_ptr + query_start * p.bias_strideM + key_start), + {num_queries_in_block, num_keys_in_block}, + thread_id); + cutlass::TensorRef bias_tensor_ref( + shared_storage.bias().data(), + 
cutlass::layout::RowMajor(MatmulQK::ThreadblockShape::kM)); + typename MatmulQK::BiasLoader::SmemTileIterator smem_tile_iter( + bias_tensor_ref, thread_id); + MatmulQK::BiasLoader::load(bias_iter, smem_tile_iter); + + // Pij += Bij, where Pij is in register fragment and Bij is in shmem + auto lane_offset = MatmulQK::AccumLambdaIterator::get_lane_offset( + lane_id, warp_id, output_tile_coords); + MatmulQK::AccumLambdaIterator::iterateRows( + lane_offset, + [&](int accum_n) {}, + [&](int accum_m, int accum_n, int idx) { + // remember we are transposed + accum[idx] += bias_tensor_ref.at({accum_n, accum_m}); + }, + [&](int accum_n) {}); + } + + // Apply mask + if (p.custom_mask_type == CausalFromTopLeft || + p.custom_mask_type == CausalFromBottomRight) { + auto lane_offset = MatmulQK::AccumLambdaIterator::get_lane_offset( + lane_id, warp_id, output_tile_coords); + int shift = query_start - key_start; + if (p.custom_mask_type == CausalFromBottomRight) { + shift += p.num_keys - p.num_queries; + } + // current_key = key_start + accum_m + // current_query = query_start + accum_n + // mask if: `current_key > current_query` + MatmulQK::AccumLambdaIterator::iterateRows( + lane_offset, + [&](int accum_m) {}, + [&](int accum_m, int accum_n, int idx) { + if (accum_m > accum_n + shift) { + accum[idx] = + -cutlass::platform::numeric_limits::infinity(); + } + }, + [&](int accum_m) {}); + } + if (p.window_size > 0) { + auto lane_offset = MatmulQK::AccumLambdaIterator::get_lane_offset( + lane_id, warp_id, output_tile_coords); + int shift = query_start - key_start - p.window_size; + // current_key = key_start + accum_m + // current_query = query_start + accum_n + // mask if: `current_key < current_query - window_size` + // if accum_m < accum_n + query_start - window_size - key_start + + MatmulQK::AccumLambdaIterator::iterateRows( + lane_offset, + [&](int accum_m) {}, + [&](int accum_m, int accum_n, int idx) { + if (accum_m <= accum_n + shift) { + accum[idx] = + 
-cutlass::platform::numeric_limits::infinity(); + } + }, + [&](int accum_m) {}); + } + __syncthreads(); + if (kPrologueGV) { + prologueGradV(0); + } + if (kPrologueDOV) { + prologueDOV(); + } + + MatmulQK::B2bGemm::accumApplyLSEToSmem( + shared_storage.attn_shared_storage(), + accum, + p.logsumexp_ptr + query_start, + problem_size.n(), + thread_id, + warp_id, + lane_id, + output_tile_coords); +#if 0 + auto accum_ref_attnT = shared_storage.attn_shared_storage().accum_ref(); + PRINT_TENSOR4x4_T0_L0("attn_T", accum_ref_attnT); +#endif + + // if we are using dropout, compute Zij, writing it to shared memory. + // each element of Zij is: + // - 0 with probability dropout_p + // - 1 / (1 - dropout_p) with probability 1 - dropout_p + if (kApplyDropout) { + auto zij = shared_storage.zij().accum_ref(); + // each thread generates a contiguous sequence of elements in Zij, all + // in the same row. the reason they have to come from the same row is + // that sampling random numbers from a contiguous random number sequence + // is much more efficient than jumping around, and the linear offset of + // each element of Z (the global matrix) maps to an offset in a random + // number sequence. for Z, the end of a row and the beginning of the + // next have adjacent offsets, but for Zij (tile of global matrix), this + // is not necessarily the case. 
+ // We must fill the entire `zij` shmem with values (even out of bounds + // on the K-dimension) otherwise we can get NaNs during the GEMM + const int kQueriesPerBlock = kBlockSizeI; + const int threads_per_row = cutlass::fast_min( + kNumThreads / kQueriesPerBlock, (int64_t)num_keys_in_block); + const int elts_per_thread = cutlass::round_nearest( + cutlass::ceil_div(num_keys_in_block, threads_per_row), 4); + + const int thread_i = thread_id / threads_per_row; + const int thread_start_j = + (thread_id % threads_per_row) * elts_per_thread; + + if (thread_i < kQueriesPerBlock && thread_start_j < num_keys_in_block) { + curandStatePhilox4_32_10_t curand_state = curand_state_init; + skipahead( + (query_start + thread_i) * p.num_keys + + (key_start + thread_start_j), + &curand_state); + + // generate elements of Zij, 4 elements at a time + for (int zij_start_col_idx = thread_start_j; zij_start_col_idx < + cutlass::fast_min(thread_start_j + elts_per_thread, + num_keys_in_block); + zij_start_col_idx += 4) { + const float4 rand_uniform_quad = curand_uniform4(&curand_state); + + CUTLASS_PRAGMA_UNROLL + for (int quad_idx = 0; quad_idx < 4; ++quad_idx) { + // we'll write Zij transposed since attention is also transposed + // during the matmul to compute dV. + zij.at({zij_start_col_idx + quad_idx /*k*/, thread_i /*q*/}) = + (&rand_uniform_quad.x)[quad_idx] > p.dropout_prob + ? 
scalar_t(dropout_scale) + : scalar_t(0); + } + } + } + __syncthreads(); +#if 0 + PRINT_TENSOR4x4_T0_L0("zij", zij); + PRINT_TENSOR4x4_T0_L0_START("zij", zij, kBlockSizeJ - 4, kBlockSizeI - 4); +#endif + + // Save mask for later DOIVJ matmul + + int warp_idx_mn_0 = warp_id % + (MatmulDOIVJ::Mma::Base::WarpCount::kM * + MatmulDOIVJ::Mma::Base::WarpCount::kN); + auto output_tile_coords_doivj = cutlass::MatrixCoord{ + warp_idx_mn_0 % MatmulDOIVJ::Mma::Base::WarpCount::kM, + warp_idx_mn_0 / MatmulDOIVJ::Mma::Base::WarpCount::kM}; + auto lane_offset = MatmulDOIVJ::AccumLambdaIterator::get_lane_offset( + lane_id, warp_id, output_tile_coords_doivj); + MatmulDOIVJ::AccumLambdaIterator::iterateRows( + lane_offset, + [&](int accum_m) {}, + [&](int accum_m /*q*/, int accum_n /*k*/, int idx) { + if (zij.at({accum_n, accum_m}) == scalar_t(0)) { + dropout_keep_mask_doivj[idx] = cutlass::uint1b_t{0}; + } + }, + [&](int accum_m) {}); + } + __syncthreads(); + } + rematerializeThreadIds(); + + ///////////////////////////////////////////////////////////////////////////////////////////////// + // GradV matmul + // + // grad_v[j_start:j_end] += attn_T @ do_i + ///////////////////////////////////////////////////////////////////////////////////////////////// + constexpr bool kSingleIterationGradV = + kMaxK <= MatmulGradV::ThreadblockShape::kN; + for (int32_t col = 0; col < (kSingleIterationGradV ? 
1 : p.head_dim_value); + col += MatmulGradV::ThreadblockShape::kN) { + using Mma = typename MatmulGradV::Mma; + using AccumTileGmem = typename MatmulGradQ::AccumTileGmem; + + cutlass::gemm::GemmCoord problem_size( + num_keys_in_block, p.head_dim_value - col, num_queries_in_block); + auto createEpilogueIter = [&]() { + return typename MatmulGradV::OutputTileIterator( + typename MatmulGradV::OutputTileIterator::Params{p.gV_strideM()}, + p.grad_value_ptr + key_start * p.gV_strideM() + col, + {num_keys_in_block, p.head_dim_value - col}, + thread_id); + }; + typename Mma::IteratorB iterator_B( + {int32_t(p.gO_strideM)}, + const_cast(p.grad_output_ptr + query_start * p.gO_strideM + col), + {num_queries_in_block, p.head_dim_value - col}, + thread_id, + no_offset); + + // if dropout: dVj += (Pij.T * Zij) @ dOi + // otherwise: dVj += Pij.T @ dOi + Mma mma( + // operand A: Pij.T + shared_storage.attn_shared_storage().accum_ref(), + // operand A_scale Zij.T: + // if we're using dropout, operand A is Pij_dropped.T = Pij.T * Zij.T + // which is computed on the fly as fragments of Pij.T are loaded in + shared_storage.zij().accum_ref(), + // operand B: dOi - which was loaded into shared memory previously + // when we computed dVj + shared_storage.mm_gradV().operand_B_ref(), + thread_id, + warp_id, + lane_id); + + int storage_id = col / MatmulGradV::ThreadblockShape::kN; + AccumTileGmem gmem_tile{ + p.workspace_gv + storage_id * AccumTileGmem::kElementsStored}; + if (!kOutputInRF) { + if (isFirstQuery || !kNeedsAccumGradV) { + output_frags.gradV.clear(); + } else { + gmem_tile.load(output_frags.gradV, thread_id); + } + } + mma.set_prologue_done(kPrologueGV); + + auto gemm_k_iterations = + (problem_size.k() + Mma::Shape::kK - 1) / Mma::Shape::kK; + + // Compute threadblock-scoped matrix multiply-add + __syncthreads(); + + mma(gemm_k_iterations, + output_frags.gradV, + iterator_B, + output_frags.gradV); + __syncthreads(); + if (kPrologueGV && !kSingleIterationGradV && + col + 
MatmulGradV::ThreadblockShape::kN < p.head_dim_value) { + prologueGradV(col + MatmulGradV::ThreadblockShape::kN); + } + + if (!kOutputInRF) { + if (kNeedsAccumGradV && !isLastQuery) { + gmem_tile.store(output_frags.gradV, thread_id); + } else { + accumulateInGmem( + shared_storage.gradV_epilogue(), + output_frags.gradV, + createEpilogueIter(), + isFirstQuery || kNeedsAccumGradV, + warp_id, + lane_id); + } + } + } + __syncthreads(); + + ///////////////////////////////////////////////////////////////////////////////////////////////// + // MatmulDOIVJ + ///////////////////////////////////////////////////////////////////////////////////////////////// + { + using Mma = typename MatmulDOIVJ::Mma; + // do_i + typename Mma::IteratorA iterator_A( + {int32_t(p.gO_strideM)}, + const_cast(p.grad_output_ptr + query_start * p.gO_strideM), + {num_queries_in_block, p.head_dim_value}, + thread_id, + no_offset); + + // v_j.transpose(-2, -1) + typename Mma::IteratorB iterator_B( + {int32_t(p.v_strideM)}, + const_cast(p.value_ptr + key_start * p.v_strideM), + {p.head_dim_value, num_keys_in_block}, + thread_id, + no_offset); + + Mma mma(shared_storage.mm_doivj(), thread_id, warp_id, lane_id); + mma.set_prologue_done(kPrologueDOV); + mma.set_zero_outside_bounds(!skipBoundsChecks); + + typename Mma::FragmentC accum; + + accum.clear(); + + auto gemm_k_iterations = + (p.head_dim_value + Mma::Shape::kK - 1) / Mma::Shape::kK; + + // Compute threadblock-scoped matrix multiply-add + mma(gemm_k_iterations, accum, iterator_A, iterator_B, accum); + __syncthreads(); + if (kPrologueGQ) { + prologueGradQ(0); + } + if (kPrologueGK) { + prologueGradK(0); + } + + int warp_idx_mn_0 = + warp_id % (Mma::Base::WarpCount::kM * Mma::Base::WarpCount::kN); + auto output_tile_coords = cutlass::MatrixCoord{ + warp_idx_mn_0 % Mma::Base::WarpCount::kM, + warp_idx_mn_0 / Mma::Base::WarpCount::kM}; + // TODO: This must be terribly inefficient. 
There must be a better way + // tmp [RF] <- (accum [RF] - Di [smem] ) * attn_T.T [smem] + // attn_shared_storage [smem] <- tmp.T + // tmp_shared_storage [smem] <- tmp + { + using LambdaIterator = typename MatmulDOIVJ::AccumLambdaIterator; + auto lane_offset = LambdaIterator::get_lane_offset( + lane_id, warp_id, output_tile_coords); + // if dropout was used, compute dPij = dPij_dropped * Zij + if (kApplyDropout) { + LambdaIterator::iterateRows( + lane_offset, + [&](int accum_m) {}, + [&](int accum_m, int accum_n, int idx) { + if (dropout_keep_mask_doivj[idx].get()) { + accum[idx] *= dropout_scale; + } else { + accum[idx] = 0; + } + }, + [&](int accum_m) {}); + } + + auto attn_T = shared_storage.attn_shared_storage().accum_ref(); +#if 0 + PRINT_B0_T0("doivj_dropped"); + print_warp_accum(accum, lane_offset, 4, 4); + PRINT_TENSOR4x4_T0_L0("attn_T", attn_T) +#endif + accum_t current_di; + // dSij = (dPij - Di) * Pij + LambdaIterator::iterateRows( + lane_offset, + [&](int accum_m) { current_di = shared_storage.di()[accum_m]; }, + [&](int accum_m, int accum_n, int idx) { + // TODO: Otherwise we can get nans as we + // might have infs here (only seen on f16 tho) + if (skipBoundsChecks || + (accum_m < num_queries_in_block && + accum_n < num_keys_in_block)) { + accum_t attn = attn_T.at({accum_n, accum_m}); + accum[idx] = (accum[idx] - current_di) * attn; + } else { + accum[idx] = 0; + } + }, + [&](int accum_m) { + + }); + + // store bias gradient tile dBij to global memory, + // where dBij = dSij = Pij * (dPij - Di) + if (p.grad_bias_ptr != nullptr) { + typename MatmulDOIVJ::BiasGradEpilogue::OutputTileIterator + output_iter( + typename MatmulDOIVJ::BiasGradEpilogue::OutputTileIterator:: + Params{p.gB_strideM}, + // grad_bias_ptr is offset to point at beginning of + // matrix of shape (queries, keys) for a given + // (batch_id, head_id) the pointer arithmetic here produces + // a pointer to the start of the current tile within that + // matrix + p.grad_bias_ptr + query_start 
* p.gB_strideM + key_start, + {num_queries_in_block, num_keys_in_block}, + thread_id); + + // no-op epilogue operator - just casting and storing contents of + // accum to global memory + typename MatmulDOIVJ::BiasGradEpilogue::OutputOp output_op({1, 1}); + typename MatmulDOIVJ::BiasGradEpilogue epilogue( + shared_storage.gradB_epilogue(), thread_id, warp_id, lane_id); + epilogue(output_op, output_iter, accum, output_iter); + } + + accum = accum * scale; + +#if 0 + PRINT_B0_T0("(doivj - di) * attn * scale"); + print_warp_accum(accum, lane_offset, 4, 4); +#endif + + __syncthreads(); + if (!MatmulGradK::DefaultMmaFromSmem::kIsTransposedA) { + auto tmpT = shared_storage.tmpT_shared_storage().accum_ref(); + // attn <- attn_T.T + LambdaIterator::iterateRows( + lane_offset, + [&](int accum_m) {}, + [&](int accum_m, int accum_n, int idx) { + tmpT.at({accum_n, accum_m}) = scalar_t(accum[idx]); + }, + [&](int accum_m) {}); + } + } + + MatmulDOIVJ::B2bGemm::accumToSmem( + shared_storage.tmp_shared_storage(), + accum, + lane_id, + output_tile_coords); + __syncthreads(); + } + // Force `nvcc` to recompute values that depend on the variables just below + // to use less RF and prevent some spilling + p.head_dim = warp_uniform(p.head_dim); + p.k_strideM = warp_uniform(p.k_strideM); + rematerializeThreadIds(); + + ///////////////////////////////////////////////////////////////////////////////////////////////// + // GradQ matmul + // + // grad_q[i_start:i_end] += tmp @ k_j + ///////////////////////////////////////////////////////////////////////////////////////////////// + // Skip the loop & associated branches if we know at compile time the number + // of iterations + constexpr bool kSingleIterationGradQ = + kMaxK <= MatmulGradQ::ThreadblockShape::kN; + for (int col = 0; col < (kSingleIterationGradQ ? 
1 : p.head_dim); + col += MatmulGradQ::ThreadblockShape::kN) { + using Mma = typename MatmulGradQ::Mma; + using AccumTileGmem = typename MatmulGradQ::AccumTileGmem; + + cutlass::gemm::GemmCoord problem_size( + num_queries_in_block, + false ? MatmulGradQ::ThreadblockShape::kN : p.head_dim - col, + num_keys_in_block); + + // k_j + typename Mma::IteratorB iterator_B( + {int32_t(p.k_strideM)}, + const_cast(p.key_ptr + key_start * p.k_strideM + col), + {problem_size.k(), problem_size.n()}, + thread_id, + no_offset); + + auto a = shared_storage.tmp_shared_storage().accum_ref(); + Mma mma( + // operand A: dSij + shared_storage.tmp_shared_storage().accum_ref(), + // operand B: Kj + shared_storage.mm_gradQ().operand_B_ref(), + thread_id, + warp_id, + lane_id); + + typename Mma::FragmentC accum; + + int col_id = col / MatmulGradQ::ThreadblockShape::kN; + int num_cols = kSingleIterationGradQ + ? 1 + : ceil_div(p.head_dim, MatmulGradQ::ThreadblockShape::kN); + int storage_id = (col_id + query_start / kBlockSizeI * num_cols); + + if (p.num_splits_key_device() > 1) { + AtomicLock::acquire( + &p.workspace_gq[storage_id].lock, + p.split_key_device() + 1, + thread_id); + // Make sure we can see other block's output + __threadfence(); + } + + AccumTileGmem gmem_tile{&p.workspace_gq[storage_id].buffer[0]}; + if (!kNeedsAccumGradQ || + (p.num_splits_key_device() == 1 && key_start == 0)) { + // if we know we are the first to access it, we know it's only zeros. 
+ // Avoids a load from gmem (and gmem init as well) + accum.clear(); + } else { + gmem_tile.load(accum, thread_id); + } + + auto gemm_k_iterations = + (problem_size.k() + Mma::Shape::kK - 1) / Mma::Shape::kK; + + // Compute threadblock-scoped matrix multiply-add + __syncthreads(); + mma.set_prologue_done(kPrologueGQ); + mma(gemm_k_iterations, accum, iterator_B, accum); + __syncthreads(); + bool isLastColumn = kSingleIterationGradQ || + (col + MatmulGradQ::ThreadblockShape::kN >= p.head_dim); + if (kPrologueGQ && !isLastColumn) { + prologueGradQ(col + MatmulGradQ::ThreadblockShape::kN); + } + + bool isLast = [&]() { + int32_t next_key = key_start + p.num_splits_key_device() * kBlockSizeJ; + if (p.num_keys <= next_key) { + return true; + } + if (query_start < getSmallestQueryForKey(p, next_key)) { + return true; + } + return false; + }(); + // Output results + if (p.num_splits_key_device() > 1) { + int32_t numAddsSoFar = -1; + if (isLast && thread_id == 0) { + numAddsSoFar = atomicAdd(&p.workspace_gq[storage_id].counter, 1) + + 1; // `atomicAdd` returns the old value + } + isLast = __syncthreads_or( + numAddsSoFar == getNumParallelBlocksForQuery(p, query_start)); + assert(numAddsSoFar <= getNumParallelBlocksForQuery(p, query_start)); + } + if (kNeedsAccumGradQ && !isLast) { + gmem_tile.store(accum, thread_id); + if (p.num_splits_key_device() > 1) { + // Make sure everyone wrote before we release the lock + __threadfence(); + __syncthreads(); + AtomicLock::release(&p.workspace_gq[storage_id].lock, thread_id); + } + } else { + // NOTE: We're not releasing the lock because no one is expected + // to come after us (we're the last one to write) + typename MatmulGradQ::OutputTileIterator output_it( + typename MatmulGradQ::OutputTileIterator::Params{p.gQ_strideM()}, + p.grad_query_ptr + query_start * p.gQ_strideM() + col, + {problem_size.m(), problem_size.n()}, + thread_id); + // if `direct_store` is True, we store to gmem (`*gmem = accum`) + // otherwise, we accumulate in 
gmem (`*gmem = *gmem + accum`) + // If we know ahead of time when we will write for the first time + // we can: + // (1) Avoid an additional memory read + // (2) Avoid the cost of initializing memory to 0 + bool direct_store = kNeedsAccumGradQ || key_start == 0 || + (p.num_splits_key_device() > 1); + accumulateInGmem( + isLastColumn ? shared_storage.gradQ_epilogue_lastIter() + : shared_storage.gradQ_epilogue(), + accum, + output_it, + direct_store, + warp_id, + lane_id); + } + } + ///////////////////////////////////////////////////////////////////////////////////////////////// + // GradK matmul + // + // grad_k[i_start:i_end] += tmp.transpose(-2, -1) @ q_i + ///////////////////////////////////////////////////////////////////////////////////////////////// + rematerializeThreadIds(); + + constexpr bool kSingleIterationGradK = + kMaxK <= MatmulGradK::ThreadblockShape::kN; + for (int col = 0; col < (kSingleIterationGradK ? 1 : p.head_dim); + col += MatmulGradK::ThreadblockShape::kN) { + using Mma = typename MatmulGradK::Mma; + using AccumTileGmem = typename MatmulGradQ::AccumTileGmem; + + cutlass::gemm::GemmCoord problem_size( + num_keys_in_block, + false ? MatmulGradK::ThreadblockShape::kN : p.head_dim - col, + num_queries_in_block); + auto createEpilogueIter = [&]() { + return typename MatmulGradK::OutputTileIterator( + typename MatmulGradK::OutputTileIterator::Params{p.gK_strideM()}, + p.grad_key_ptr + key_start * p.gK_strideM() + col, + {num_keys_in_block, + false ? MatmulGradK::ThreadblockShape::kN : p.head_dim - col}, + thread_id); + }; + + // q_i + typename Mma::IteratorB iterator_B( + {int32_t(p.q_strideM)}, + const_cast(p.query_ptr + query_start * p.q_strideM + col), + {problem_size.k(), problem_size.n()}, + thread_id, + no_offset); + + auto getTmp = [&](int) { return &shared_storage.tmp_shared_storage(); }; + auto getTmpT = [&](int) { return &shared_storage.tmpT_shared_storage(); }; + // this is basically: + // opA = kIsTransposedA ? 
getTmp() : getTmpT(); + bool constexpr kIsTransposedA = + MatmulGradK::DefaultMmaFromSmem::kIsTransposedA; + auto& opA = *call_conditional< + kIsTransposedA, + decltype(getTmp), + decltype(getTmpT)>::apply(getTmp, getTmpT, 0); + Mma mma( + // operand A: dSij.T + opA.accum_ref(), + // operand B: Qi + shared_storage.mm_gradK().operand_B_ref(), + thread_id, + warp_id, + lane_id); + + int storage_id = col / MatmulGradK::ThreadblockShape::kN; + AccumTileGmem gmem_tile{ + p.workspace + storage_id * AccumTileGmem::kElementsStored}; + if (!kOutputInRF) { + if (isFirstQuery || !kNeedsAccumGradK) { + output_frags.gradK.clear(); + } else { + gmem_tile.load(output_frags.gradK, thread_id); + } + } + mma.set_prologue_done(kPrologueGK); + + auto gemm_k_iterations = + (problem_size.k() + Mma::Shape::kK - 1) / Mma::Shape::kK; + + // Compute threadblock-scoped matrix multiply-add + __syncthreads(); + + mma(gemm_k_iterations, + output_frags.gradK, + iterator_B, + output_frags.gradK); + __syncthreads(); + bool isLastColumn = kSingleIterationGradK || + col + MatmulGradK::ThreadblockShape::kN >= p.head_dim; + if (kPrologueGK && !isLastColumn) { + prologueGradK(col + MatmulGradK::ThreadblockShape::kN); + } + + if (kPrologueQK && isLastColumn) { + int64_t next_query, next_key; + incrIteration(p, query_start, key_start, next_query, next_key); + DISPATCH_BOOL( + next_key != key_start, kForceReloadK, ([&]() { + prologueQkNextIteration( + shared_storage, p, next_query, next_key, warp_id, lane_id); + })); + } + + // Output results + if (!kOutputInRF) { + if (kNeedsAccumGradK && !isLastQuery) { + gmem_tile.store(output_frags.gradK, thread_id); + } else { + accumulateInGmem( + isLastColumn ? 
shared_storage.gradK_epilogue_final() + : shared_storage.gradK_epilogue(), + output_frags.gradK, + createEpilogueIter(), + isFirstQuery || kNeedsAccumGradK, + warp_id, + lane_id); + __syncthreads(); + } + } + } + } + + static CUTLASS_HOST_DEVICE int64_t getQueryStartShift(Params const& p) { + if (p.custom_mask_type == NoCustomMask && p.num_splits_key_device() > 1) { + return (p.split_key_device() * kBlockSizeI) % getQueryEnd(p); + } + return 0; + } + + // Iteration order logic + static CUTLASS_HOST_DEVICE int64_t + getQueryStart(Params const& p, int64_t key_start) { + return getSmallestQueryForKey(p, key_start) + getQueryStartShift(p); + }; + static CUTLASS_HOST_DEVICE int64_t getQueryEnd(Params const& p) { + return align_up(p.num_queries, kBlockSizeI); + }; + + static CUTLASS_HOST_DEVICE int64_t + getSmallestQueryForKey(Params const& p, int64_t key_start) { + if (p.custom_mask_type == NoCustomMask) { + return 0; + } + int64_t shift = p.custom_mask_type == CausalFromBottomRight + ? p.num_keys - p.num_queries + : 0; + int64_t window_size = + p.window_size == 0 ? 
p.num_queries + p.num_keys : p.window_size; + + auto last_key_for_block = + cutlass::fast_min(key_start + kBlockSizeJ, (int64_t)p.num_keys) - 1; + int first_query = key_start - shift; + int last_query = last_key_for_block - shift + window_size - 1; + if (last_query < 0 || first_query >= p.num_queries) { + return getQueryEnd(p); // nothing to compute in this column + } + first_query = cutlass::fast_max(0, first_query); + return (first_query / kBlockSizeI) * kBlockSizeI; + }; + + // Returns how many kernel blocks will write to a given block in `grad_query` + // This is usually equal to the number of key splits, but can be different + // for instance in the causal case, or varying seqlen + static CUTLASS_HOST_DEVICE int32_t + getNumParallelBlocksForQuery(Params const& p, int32_t query_start) { + int16_t num_key_blocks = ceil_div(p.num_keys, kBlockSizeJ); + if (p.custom_mask_type != NoCustomMask) { + int32_t shift = p.custom_mask_type == CausalFromBottomRight + ? p.num_keys - p.num_queries + : 0; + int32_t last_query_for_block = + cutlass::fast_min(query_start + kBlockSizeI, p.num_queries) - 1; + int32_t last_key_for_block = + cutlass::fast_min(last_query_for_block + shift, p.num_keys - 1); + int32_t first_key_for_block = p.window_size == 0 + ? 
0 + : cutlass::fast_max(query_start - p.window_size + 1 + shift, 0); + + if (p.window_size == 0) { + num_key_blocks = last_key_for_block / kBlockSizeJ + 1; + } else { + num_key_blocks = (last_key_for_block / kBlockSizeJ) - + (first_key_for_block / kBlockSizeJ) + 1; + } + + if (last_key_for_block < 0 || first_key_for_block >= p.num_keys) { + num_key_blocks = 0; + } + } + return cutlass::fast_min(p.num_splits_key_device(), num_key_blocks); + }; + + // Returns the next block to process + static CUTLASS_HOST_DEVICE void incrIteration( + Params const& p, + int64_t query_start, + int64_t key_start, + int64_t& next_query, + int64_t& next_key) { + next_query = query_start + kBlockSizeI; + next_key = key_start; + auto query_shift = getQueryStartShift(p); + // Wrap around + if (query_shift) { + if (next_query >= p.num_queries) { + next_query = getSmallestQueryForKey(p, key_start); + return; + } else if (query_start < query_shift && query_shift <= next_query) { + // jump to next key + } else { + return; + } + } else { + if (p.window_size > 0) { + int32_t shift = p.custom_mask_type == CausalFromBottomRight + ? 
p.num_keys - p.num_queries + : 0; + // last key that is not masked out + int last_key_for_block = + cutlass::fast_min(key_start + kBlockSizeJ, (int64_t)p.num_keys) - 1; + int last_query = last_key_for_block - shift + p.window_size - 1; + if (next_query <= last_query && next_query < p.num_queries) { + return; + } + } else if (next_query < p.num_queries) { + return; + } + // jump to next key + } + // Next key + next_key = key_start + p.num_splits_key_device() * (int64_t)kBlockSizeJ; + next_query = getQueryStart(p, next_key); + } + + template + static CUTLASS_DEVICE void prologueQkNextIteration( + SharedStorage& shared_storage, + Params const& p, + int32_t query_start, + int32_t key_start, + uint8_t warp_id, + uint8_t lane_id) { + if (query_start >= p.num_queries || key_start >= p.num_keys) { + return; + } + + static constexpr bool kReloadK = + kForceReloadK || !MatmulQK::Mma::kSmemContainsEntireMat; + int thread_id = 32 * warp_id + lane_id; + typename MatmulQK::Mma::IteratorA iterator_A( + {int32_t(p.k_strideM)}, + const_cast(p.key_ptr + key_start * p.k_strideM), + {p.num_keys - key_start, p.head_dim}, + thread_id, + cutlass::MatrixCoord{0, 0}); + + typename MatmulQK::Mma::IteratorB iterator_B( + {int32_t(p.q_strideM)}, + const_cast(p.query_ptr + query_start * p.q_strideM), + {p.head_dim, p.num_queries - query_start}, + thread_id, + cutlass::MatrixCoord{0, 0}); + + MatmulQK::Mma::prologue( + shared_storage.mm_qk_k(), + shared_storage.mm_qk_q(), + iterator_A, + iterator_B, + thread_id, + p.head_dim); + } + + template + static CUTLASS_DEVICE void writeFragsToGmem( + SharedStorage& shared_storage, + OutputFragments& output_frags, + Params const& p, + int32_t key_start, + uint8_t warp_id, + uint8_t lane_id) { + uint16_t thread_id = 32 * warp_id + lane_id; + int32_t num_keys_in_block = skipBoundsChecks + ? 
MatmulQK::Mma::Shape::kM + : cutlass::fast_min( + MatmulQK::Mma::Shape::kM, p.num_keys - key_start); + typename MatmulGradV::OutputTileIterator outputV_it( + typename MatmulGradV::OutputTileIterator::Params{p.gV_strideM()}, + p.grad_value_ptr + key_start * p.gV_strideM(), + {num_keys_in_block, p.head_dim_value}, + thread_id); + accumulateInGmem( + shared_storage.gradV_epilogue_final(), + output_frags.gradV, + outputV_it, + true, + warp_id, + lane_id); + + typename MatmulGradK::OutputTileIterator outputK_it( + typename MatmulGradK::OutputTileIterator::Params{p.gK_strideM()}, + p.grad_key_ptr + key_start * p.gK_strideM(), + {num_keys_in_block, + false ? MatmulGradK::ThreadblockShape::kN : p.head_dim}, + thread_id); + accumulateInGmem( + shared_storage.gradK_epilogue_final(), + output_frags.gradK, + outputK_it, + true, + warp_id, + lane_id); + } + + template + static CUTLASS_DEVICE void accumulateInGmem( + typename MatmulT::DefaultEpilogue::SharedStorage& epilogue_smem, + typename MatmulT::Mma::FragmentC const& accum, + typename MatmulT::OutputTileIterator output_it, + bool first, + uint8_t warp_id, + uint8_t lane_id) { + using DefaultEpilogue = typename MatmulT::DefaultEpilogue; + using DefaultOutputOp = typename MatmulT::DefaultOutputOp; + using Mma = typename MatmulT::Mma; + int thread_id = 32 * warp_id + lane_id; + DISPATCH_BOOL( + first, kIsFirst, ([&]() { + static constexpr auto ScaleType = kIsFirst + ? 
cutlass::epilogue::thread::ScaleType::Nothing + : cutlass::epilogue::thread::ScaleType::NoBetaScaling; + using EpilogueOutputOp = + typename cutlass::epilogue::thread::LinearCombination< + typename DefaultOutputOp::ElementOutput, + DefaultOutputOp::kCount, + typename DefaultOutputOp::ElementAccumulator, + typename DefaultOutputOp::ElementCompute, + ScaleType>; + using Epilogue = + typename cutlass::epilogue::threadblock::EpiloguePipelined< + typename DefaultEpilogue::Shape, + typename Mma::Operator, + DefaultEpilogue::kPartitionsK, + typename MatmulT::OutputTileIterator, + typename DefaultEpilogue::AccumulatorFragmentIterator, + typename DefaultEpilogue::WarpTileIterator, + typename DefaultEpilogue::SharedLoadIterator, + EpilogueOutputOp, + typename DefaultEpilogue::Padding, + DefaultEpilogue::kFragmentsPerIteration, + true // IterationsUnroll + >; + EpilogueOutputOp rescale({1, 1}); + Epilogue epilogue(epilogue_smem, thread_id, warp_id, lane_id); + epilogue(rescale, output_it, accum, output_it); + })); + } + + template + static CUTLASS_DEVICE void computeDelta( + Params const& p, + int32_t query_start, + uint8_t warp_id, + uint8_t lane_id) { + // Each thread computes one value for Delta + // Depending on warp configuration, we might have multiple + // threads of the same warp working on the same row + using AccessType = cutlass::Array; + static_assert(kNumThreads >= kBlockSizeI, ""); + static constexpr int kNumThreadsPerLine = kNumThreads / kBlockSizeI; + int16_t thread_id = 32 * warp_id + lane_id; + + int16_t laneFirstCol = kElementsPerAccess * (lane_id % kNumThreadsPerLine); + int16_t laneRow = thread_id / kNumThreadsPerLine; + bool rowPred = (query_start + laneRow) < p.num_queries; + bool pred = rowPred; + + // on windows, previous syntax __restrict__ AccessType* + // resulted in error: "restrict" is not allowed + const AccessType* __restrict__ grad_output_ptr = + reinterpret_cast( + p.grad_output_ptr + (query_start + laneRow) * p.gO_strideM + + laneFirstCol); 
+ const AccessType* __restrict__ output_ptr = + reinterpret_cast( + p.output_ptr + (query_start + laneRow) * p.o_strideM() + + laneFirstCol); + + static constexpr int64_t kMaxIters = + kMaxK / (kElementsPerAccess * kNumThreadsPerLine); + constexpr int kPipelineStages = 2; + accum_t delta_value = accum_t(0); + using GlobalLoad = + cutlass::arch::global_load; + AccessType frag_grad_output[kPipelineStages]; + AccessType frag_output[kPipelineStages]; + + auto loadAndIncrement = [&](int ld_pos, bool is_valid) { + frag_grad_output[ld_pos].clear(); + frag_output[ld_pos].clear(); + GlobalLoad(frag_grad_output[ld_pos], grad_output_ptr, is_valid); + GlobalLoad(frag_output[ld_pos], output_ptr, is_valid); + grad_output_ptr += kNumThreadsPerLine; + output_ptr += kNumThreadsPerLine; + }; + + CUTLASS_PRAGMA_UNROLL + for (int iter = 0; iter < kPipelineStages - 1; ++iter) { + int ld_pos = iter % kPipelineStages; + pred = pred && + (laneFirstCol + iter * kElementsPerAccess * kNumThreadsPerLine) < + p.head_dim_value; + loadAndIncrement(ld_pos, pred); + } + auto columnIteration = [&](int iter) { + // Load for next iter + int ld_pos = (iter + kPipelineStages - 1) % kPipelineStages; + pred = pred && + (laneFirstCol + + (iter + kPipelineStages - 1) * kElementsPerAccess * + kNumThreadsPerLine) < p.head_dim_value; + loadAndIncrement(ld_pos, pred); + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < AccessType::kElements; ++i) { + delta_value += accum_t(frag_output[iter % kPipelineStages][i]) * + accum_t(frag_grad_output[iter % kPipelineStages][i]); + } + }; + + // If we have a small lower-bound for K, we can unroll the loop + if (kMaxK <= 256) { + CUTLASS_PRAGMA_UNROLL + for (int iter = 0; iter < kMaxIters; ++iter) { + columnIteration(iter); + } + } else { + int num_iters = + ceil_div(p.head_dim_value, kElementsPerAccess * kNumThreadsPerLine) * + (kElementsPerAccess * kNumThreadsPerLine); + for (int iter = 0; iter < num_iters; ++iter) { + columnIteration(iter); + } + } + + // Reduce between 
workers + static_assert( + kNumThreadsPerLine == 1 || kNumThreadsPerLine == 2 || + kNumThreadsPerLine == 4, + ""); + CUTLASS_PRAGMA_UNROLL + for (int i = 1; i < kNumThreadsPerLine; i *= 2) { + delta_value = delta_value + __shfl_xor_sync(0xffffffff, delta_value, i); + } + + // Store in gmem + if (rowPred) { + p.delta_ptr[query_start + laneRow] = delta_value; + } + } +}; + +template +__global__ void __launch_bounds__(AK::kNumThreads, AK::kMinBlocksPerSm) + attention_kernel_backward_batched_impl(typename AK::Params p) { + if (!p.advance_to_block()) { + return; + } + AK::attention_kernel(p); +} + +template +__global__ void __launch_bounds__(AK::kNumThreads, AK::kMinBlocksPerSm) + attention_kernel_backward_batched(typename AK::Params params); + +} // namespace PyTorchMemEffAttention + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/transformers/cuda/mem_eff_attention/kernel_forward.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/transformers/cuda/mem_eff_attention/kernel_forward.h new file mode 100644 index 0000000000000000000000000000000000000000..3075286eb0ca590303e37e84429ba977e39fd69e --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/transformers/cuda/mem_eff_attention/kernel_forward.h @@ -0,0 +1,1358 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ +#pragma once + +#include +#include + +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include +#include +#include +#include +#include + +#include + +using namespace gemm_kernel_utils; + +namespace PyTorchMemEffAttention { +namespace { +template +constexpr int getWarpsPerSmFw() { + return ( + Arch::kMinComputeCapability >= 80 && + !cutlass::platform::is_same::value + ? 16 + : 12); +} +static CUTLASS_DEVICE float atomicMaxFloat(float* addr, float value) { + // source: https://stackoverflow.com/a/51549250 + return !signbit(value) + ? __int_as_float(atomicMax((int *)addr, __float_as_int(value))) + : __uint_as_float( + atomicMin((unsigned int *)addr, __float_as_uint(value))); +} +} // namespace + +template < + // The datatype of Q/K/V + typename scalar_t_, + // Architecture we are targeting (eg `cutlass::arch::Sm80`) + typename ArchTag, + // If Q/K/V are correctly aligned in memory and we can run a fast kernel + bool isAligned_, + int kQueriesPerBlock_, + int kKeysPerBlock_, + // upperbound on `max(value.shape[-1], query.shape[-1])` + int kMaxK_ = (int)cutlass::platform::numeric_limits::max(), + // This is quite slower on V100 for some reason + // Set to false if you know at compile-time you will never need dropout + bool kSupportsDropout_ = true, + bool kSupportsBias_ = true> +struct AttentionKernel { + enum CustomMaskType { + NoCustomMask = 0, + CausalFromTopLeft = 1, + CausalFromBottomRight = 2, + NumCustomMaskTypes, + }; + + using scalar_t = scalar_t_; + using accum_t = float; + using lse_scalar_t = float; + using output_t = scalar_t; + // Accumulator between 2 iterations + // Using `accum_t` improves perf on f16 at the cost of + // numerical errors + using output_accum_t = accum_t; + static constexpr bool 
kSupportsDropout = kSupportsDropout_; + static constexpr bool kSupportsBias = kSupportsBias_; + static constexpr int kKeysPerBlock = kKeysPerBlock_; + static constexpr int kQueriesPerBlock = kQueriesPerBlock_; + static constexpr int kMaxK = kMaxK_; + static constexpr bool kIsAligned = isAligned_; + static constexpr bool kSingleValueIteration = kMaxK <= kKeysPerBlock; + static constexpr int32_t kAlignLSE = 32; // block size of backward + static constexpr bool kIsHalf = cutlass::sizeof_bits::value == 16; + static constexpr bool kPreloadV = + ArchTag::kMinComputeCapability >= 80 && kIsHalf; + static constexpr bool kKeepOutputInRF = kSingleValueIteration; + static constexpr bool kNeedsOutputAccumulatorBuffer = !kKeepOutputInRF && + !cutlass::platform::is_same::value; + + static_assert(kQueriesPerBlock % 32 == 0, ""); + static_assert(kKeysPerBlock % 32 == 0, ""); + static constexpr int kNumWarpsPerBlock = + kQueriesPerBlock * kKeysPerBlock / (32 * 32); + static constexpr int kWarpSize = 32; + + // Launch bounds + static constexpr int kNumThreads = kWarpSize * kNumWarpsPerBlock; + static constexpr int kMinBlocksPerSm = + getWarpsPerSmFw() / kNumWarpsPerBlock; + + struct Params { + // Input tensors + const scalar_t* query_ptr = nullptr; // [num_queries, num_heads, head_dim] + const scalar_t* key_ptr = nullptr; // [num_keys, num_heads, head_dim] + const scalar_t* value_ptr = nullptr; // [num_keys, num_heads, head_dim_value] + const scalar_t* attn_bias_ptr = nullptr; // [num_heads, num_queries, num_keys] + const int32_t* seqstart_q_ptr = nullptr; + const int32_t* seqstart_k_ptr = nullptr; + + const int32_t* seqlen_k_ptr = nullptr; + uint32_t causal_diagonal_offset = 0; + + // Output tensors + output_t* output_ptr = nullptr; // [num_queries, num_heads, head_dim_value] + // [num_queries, num_heads, head_dim_value] + output_accum_t* output_accum_ptr = nullptr; + // [num_heads, num_queries] - can be null + lse_scalar_t* logsumexp_ptr = nullptr; + + // Sliding window. 
ignored if == 0 + int32_t window_size = 0; + + // Scale + accum_t scale = 0.0; + + // Dimensions/strides + int32_t head_dim = 0; + int32_t head_dim_value = 0; + int32_t num_queries = 0; + int32_t num_keys = 0; + int32_t num_keys_absolute = 0; + + uint8_t custom_mask_type = NoCustomMask; + + int32_t q_strideM = 0; + int32_t k_strideM = 0; + int32_t v_strideM = 0; + int32_t bias_strideM = 0; + + int32_t o_strideM = 0; + + // Everything below is only used in `advance_to_block` + // and shouldn't use registers + int32_t q_strideH = 0; + int32_t k_strideH = 0; + int32_t v_strideH = 0; + int64_t bias_strideH = 0; + + int64_t q_strideB = 0; + int64_t k_strideB = 0; + int64_t v_strideB = 0; + int64_t bias_strideB = 0; + + int32_t num_batches = 0; + int32_t num_heads = 0; + + // dropout + bool use_dropout = false; + unsigned long long dropout_batch_head_rng_offset = 0; + float dropout_prob = 0.0f; + at::PhiloxCudaState rng_engine_inputs = at::PhiloxCudaState(0, 0); + int64_t* extragraph_offset = nullptr; + int64_t* seed = nullptr; + + // Moves pointers to what we should process + // Returns "false" if there is no work to do + CUTLASS_DEVICE bool advance_to_block() { + auto batch_id = blockIdx.z; + auto head_id = blockIdx.y; + auto query_start = blockIdx.x * kQueriesPerBlock; + + auto lse_dim = ceil_div((int32_t)num_queries, kAlignLSE) * kAlignLSE; + + if (kSupportsDropout) { + dropout_batch_head_rng_offset = + batch_id * num_heads * num_queries * num_keys + + head_id * num_queries * num_keys; + } + + int64_t q_start = 0, k_start = 0; + // Advance to current batch - in case of different sequence lengths + if (seqstart_q_ptr != nullptr) { + assert(seqstart_k_ptr != nullptr); + seqstart_q_ptr += batch_id; + + q_start = seqstart_q_ptr[0]; + int64_t q_next_start = seqstart_q_ptr[1]; + int64_t k_end; + seqstart_k_ptr += batch_id; + + if (seqlen_k_ptr) { + k_start = seqstart_k_ptr[0]; + k_end = k_start + seqlen_k_ptr[batch_id]; + } else { + k_start = seqstart_k_ptr[0]; + k_end = 
seqstart_k_ptr[1]; + } + + num_queries = q_next_start - q_start; + num_keys = k_end - k_start; + + if (query_start >= num_queries) { + return false; + } + } else { + query_ptr += batch_id * q_strideB; + key_ptr += batch_id * k_strideB; + value_ptr += batch_id * v_strideB; + output_ptr += int64_t(batch_id * num_queries) * o_strideM; + if (output_accum_ptr != nullptr) { + output_accum_ptr += + int64_t(batch_id * num_queries) * (head_dim_value * num_heads); + } + q_start = 0; + k_start = 0; + } + + // Advance to the current batch / head / query_start + query_ptr += (q_start + query_start) * q_strideM + head_id * q_strideH; + key_ptr += k_start * k_strideM + head_id * k_strideH; + + value_ptr += k_start * v_strideM + head_id * v_strideH; + output_ptr += + int64_t(q_start + query_start) * o_strideM + head_id * head_dim_value; + + if (kSupportsBias && attn_bias_ptr != nullptr) { + attn_bias_ptr += (batch_id * bias_strideB) + (head_id * bias_strideH); + } + if (output_accum_ptr != nullptr) { + output_accum_ptr += + int64_t(q_start + query_start) * (head_dim_value * num_heads) + + head_id * head_dim_value; + } else { + // Accumulate directly in the destination buffer (eg for f32) + output_accum_ptr = (accum_t*)output_ptr; + } + + if (logsumexp_ptr != nullptr) { + // lse[batch_id, head_id, query_start] + logsumexp_ptr += + batch_id * lse_dim * num_heads + head_id * lse_dim + query_start; + } + + // Custom masking + if (custom_mask_type == CausalFromBottomRight) { + causal_diagonal_offset = num_keys - num_queries; + } + // We use num_keys_absolute to index into the rng_state + // We need this index to match between forward and backwards + num_keys_absolute = num_keys; + if (custom_mask_type == CausalFromTopLeft || + custom_mask_type == CausalFromBottomRight) { + // the bottom row of the current block is query_start + kQueriesPerBlock + // the last active key is then query_start + causal_diagonal_offset + + // kQueriesPerBlock so num_keys is the min between actual num_keys 
and + // this to avoid extra computations + num_keys = cutlass::fast_min( + int32_t(query_start + causal_diagonal_offset + kQueriesPerBlock), + num_keys); + } + + num_queries -= query_start; + num_batches = 0; // no longer used after + + // If num_queries == 1, and there is only one key head we're wasting + // 15/16th of tensor core compute In that case : + // - we only launch kernels for head_id % kQueriesPerBlock == 0 + // - we iterate over heads instead of queries (strideM = strideH) + if (num_queries == 1 && k_strideH == 0 && v_strideH == 0 && + logsumexp_ptr == nullptr && window_size == 0) { + if (head_id % kQueriesPerBlock != 0) { + return false; + } + q_strideM = q_strideH; + bias_strideM = bias_strideH; + num_queries = num_heads; + num_heads = 1; // unused but here for intent + // remove causal since n_query = 1 + // otherwise, offset would change with head ! + custom_mask_type = NoCustomMask; + o_strideM = head_dim_value; + } + + // Make sure the compiler knows these variables are the same on all + // the threads of the warp. + // Only worth doing if they could have been modified above. + query_ptr = warp_uniform(query_ptr); + key_ptr = warp_uniform(key_ptr); + value_ptr = warp_uniform(value_ptr); + if (kSupportsBias) { + attn_bias_ptr = warp_uniform(attn_bias_ptr); + } + output_ptr = warp_uniform(output_ptr); + output_accum_ptr = warp_uniform(output_accum_ptr); + logsumexp_ptr = warp_uniform(logsumexp_ptr); + num_queries = warp_uniform(num_queries); + num_keys = warp_uniform(num_keys); + num_heads = warp_uniform(num_heads); + o_strideM = warp_uniform(o_strideM); + custom_mask_type = warp_uniform(custom_mask_type); + return true; + } + + __host__ dim3 getBlocksGrid() const { + return dim3( + ceil_div(num_queries, (int32_t)kQueriesPerBlock), + num_heads, + num_batches); + } + + __host__ dim3 getThreadsGrid() const { + return dim3(kWarpSize, kNumWarpsPerBlock, 1); + } + }; + + struct MM0 { + /* + In this first matmul, we compute a block of `Q @ K.T`. 
+ While the calculation result is still hot in registers, we update + `mi`, `m_prime`, `s_prime` in shared-memory, and then store this value + into a shared-memory ("AccumulatorSharedStorage") that is used later as + operand A for the second matmul (see MM1) + */ + using GemmType = DefaultGemmType; + + using OpClass = typename GemmType::OpClass; + using DefaultConfig = + typename cutlass::gemm::device::DefaultGemmConfiguration< + OpClass, + ArchTag, + scalar_t, + scalar_t, + scalar_t, // ElementC + accum_t // ElementAccumulator + >; + static constexpr int kAlignmentA = + kIsAligned ? DefaultConfig::kAlignmentA : GemmType::kMinimumAlignment; + static constexpr int kAlignmentB = + kIsAligned ? DefaultConfig::kAlignmentB : GemmType::kMinimumAlignment; + using ThreadblockShape = cutlass::gemm:: + GemmShape; + using WarpShape = cutlass::gemm::GemmShape<32, 32, GemmType::WarpK>; + using DefaultMma = typename cutlass::gemm::threadblock::FindDefaultMma< + scalar_t, // ElementA, + cutlass::layout::RowMajor, // LayoutA, + kAlignmentA, + scalar_t, // ElementB, + cutlass::layout::ColumnMajor, // LayoutB, + kAlignmentB, + accum_t, + cutlass::layout::RowMajor, // LayoutC, + OpClass, + ArchTag, // ArchTag + ThreadblockShape, // ThreadblockShape + WarpShape, // WarpShape + typename GemmType::InstructionShape, // InstructionShape + ArchTag::kMinComputeCapability >= 80 && kIsHalf + ? 
4 + : DefaultConfig::kStages, + typename GemmType::Operator // Operator + >::DefaultMma; + using MmaCore = typename DefaultMma::MmaCore; + using IteratorA = typename DefaultMma::IteratorA; + using IteratorB = typename DefaultMma::IteratorB; + using DefaultThreadblockMma = typename DefaultMma::ThreadblockMma; + using Mma = typename cutlass::platform::conditional< + kSingleValueIteration, + typename MakeCustomMma::Mma, + DefaultThreadblockMma>::type; + using AccumLambdaIterator = typename DefaultMmaAccumLambdaIterator< + typename Mma::Operator::IteratorC, + accum_t, + kWarpSize>::Iterator; + static_assert( + MmaCore::WarpCount::kM * MmaCore::WarpCount::kN * + MmaCore::WarpCount::kK == + kNumWarpsPerBlock, + ""); + + // used for efficient load of bias tile Bij from global to shared memory + using BiasLoader = TileSmemLoader< + scalar_t, + cutlass::MatrixShape, + MmaCore::kThreads, + // input restriction: kv_len has to be a multiple of this value + 128 / cutlass::sizeof_bits::value>; + + // Epilogue to store to shared-memory in a format that we can use later for + // the second matmul + using B2bGemm = typename cutlass::gemm::threadblock::B2bGemm< + typename Mma::Operator::IteratorC, + typename Mma::Operator, + scalar_t, + WarpShape, + ThreadblockShape>; + using AccumulatorSharedStorage = typename B2bGemm::AccumulatorSharedStorage; + }; + + struct MM1 { + /** + Second matmul: perform `attn @ V` where `attn` is the attention (not + normalized) and stored in shared memory + */ + using GemmType = DefaultGemmType; + + using OpClass = typename GemmType::OpClass; + using DefaultConfig = + typename cutlass::gemm::device::DefaultGemmConfiguration< + OpClass, + ArchTag, + scalar_t, + scalar_t, + output_accum_t, // ElementC + accum_t // ElementAccumulator + >; + static constexpr int kAlignmentA = DefaultConfig::kAlignmentA; // from smem + static constexpr int kAlignmentB = + kIsAligned ? 
DefaultConfig::kAlignmentB : GemmType::kMinimumAlignment; + using ThreadblockShape = cutlass::gemm:: + GemmShape; + using WarpShape = cutlass::gemm::GemmShape<32, 32, GemmType::WarpK>; + using InstructionShape = typename GemmType::InstructionShape; + + using LayoutB = cutlass::layout::RowMajor; + using DefaultGemm = cutlass::gemm::kernel::DefaultGemm< + scalar_t, // ElementA, + cutlass::layout::RowMajor, // LayoutA, + kAlignmentA, + scalar_t, // ElementB, + LayoutB, // LayoutB, + kAlignmentB, + output_accum_t, + cutlass::layout::RowMajor, // LayoutC, + accum_t, + OpClass, + ArchTag, + ThreadblockShape, + WarpShape, + typename GemmType::InstructionShape, + typename DefaultConfig::EpilogueOutputOp, + void, // ThreadblockSwizzle - not used + ArchTag::kMinComputeCapability >= 80 && kIsHalf + ? 4 + : DefaultConfig::kStages, + false, // SplitKSerial + typename GemmType::Operator>; + + using WarpIteratorA = typename cutlass::gemm::threadblock:: + DefaultWarpIteratorAFromSharedMemory< + typename DefaultGemm::Mma::Policy::Operator::Shape, // WarpShape + typename DefaultGemm::Mma::Policy::Operator::InstructionShape, + typename DefaultGemm::Mma::Policy::Operator::IteratorA, + typename DefaultGemm::Mma::Policy>::WarpIterator; + using DefaultMmaFromSmem = + typename cutlass::gemm::threadblock::DefaultMmaFromSharedMemory< + typename DefaultGemm::Mma, + MM0::AccumulatorSharedStorage::Shape::kN, // kMaxK + WarpIteratorA, + false>; // kScaleOperandA + using Mma = typename DefaultMmaFromSmem::Mma; + using IteratorB = typename Mma::IteratorB; + using WarpCount = typename Mma::WarpCount; + static_assert( + WarpCount::kM * WarpCount::kN * WarpCount::kK == kNumWarpsPerBlock, + ""); + + using DefaultEpilogue = typename DefaultGemm::Epilogue; + using OutputTileIterator = + typename cutlass::epilogue::threadblock::PredicatedTileIterator< + typename DefaultEpilogue::OutputTileIterator::ThreadMap, + output_t>; + using OutputTileIteratorAccum = + typename 
cutlass::epilogue::threadblock::PredicatedTileIterator< + typename DefaultEpilogue::OutputTileIterator::ThreadMap, + output_accum_t>; + }; + + static constexpr int64_t kAlignmentQ = MM0::kAlignmentA; + static constexpr int64_t kAlignmentK = MM0::kAlignmentB; + static constexpr int64_t kAlignmentV = 1; + + // Shared storage - depends on kernel params + struct ScalingCoefs { + cutlass::Array m_prime; + cutlass::Array s_prime; + cutlass::Array mi; + cutlass::Array out_rescale; + cutlass::Array + addition_storage; + }; + + struct SharedStorageEpilogueAtEnd : ScalingCoefs { + struct SharedStorageAfterMM0 { + // Everything here might be overwritten during MM0 + union { + typename MM0::BiasLoader::SmemTile bias; + typename MM0::AccumulatorSharedStorage si; + }; + typename MM1::Mma::SharedStorage mm1; + }; + + union { + typename MM0::Mma::SharedStorage mm0; + SharedStorageAfterMM0 after_mm0; + typename MM1::DefaultEpilogue::SharedStorage epilogue; + }; + + CUTLASS_DEVICE typename MM1::DefaultEpilogue::SharedStorage& + epilogue_shared_storage() { + return epilogue; + } + }; + + struct SharedStorageEpilogueInLoop : ScalingCoefs { + struct SharedStorageAfterMM0 { + // Everything here might be overwritten during MM0 + union { + typename MM0::BiasLoader::SmemTile bias; + typename MM0::AccumulatorSharedStorage si; + }; + typename MM1::Mma::SharedStorage mm1; + typename MM1::DefaultEpilogue::SharedStorage epilogue; + }; + + union { + typename MM0::Mma::SharedStorage mm0; + SharedStorageAfterMM0 after_mm0; + }; + + CUTLASS_DEVICE typename MM1::DefaultEpilogue::SharedStorage& + epilogue_shared_storage() { + return after_mm0.epilogue; + } + }; + + using SharedStorage = typename cutlass::platform::conditional< + kSingleValueIteration || kKeepOutputInRF, + SharedStorageEpilogueAtEnd, + SharedStorageEpilogueInLoop>::type; + + static bool __host__ check_supported(Params const& p) { + CHECK_ALIGNED_PTR(p.query_ptr, kAlignmentQ); + CHECK_ALIGNED_PTR(p.key_ptr, kAlignmentK); + 
CHECK_ALIGNED_PTR(p.value_ptr, kAlignmentV); + if (kSupportsBias) { + CHECK_ALIGNED_PTR(p.attn_bias_ptr, kAlignmentQ); + TORCH_CHECK( + p.num_batches <= 1 || p.bias_strideB % kAlignmentQ == 0, + "attn_bias is not correctly aligned (strideB). ", + "attn_bias.stride( 0) = ", p.bias_strideB, ", and should be a " + "multiple of ", kAlignmentQ, "."); + TORCH_CHECK( + p.num_heads <= 1 || p.bias_strideH % kAlignmentQ == 0, + "attn_bias is not correctly aligned (strideH). " + "attn_bias.stride(1) = ", p.bias_strideH, ", and should be a " + "multiple of ", kAlignmentQ, "."); + TORCH_CHECK( + p.num_queries <= 1 || p.bias_strideM % kAlignmentQ == 0, + "attn_bias is not correctly aligned (strideM). " + "attn_bias.stride(2) = ", p.bias_strideM, ", and should be a " + "multiple of ", kAlignmentQ, "."); + } + TORCH_CHECK( + p.q_strideM % kAlignmentQ == 0, + "query is not correctly aligned (strideM)"); + TORCH_CHECK( + p.k_strideM % kAlignmentK == 0, + "key is not correctly aligned (strideM)"); + TORCH_CHECK( + p.v_strideM % kAlignmentV == 0, + "value is not correctly aligned (strideM)"); + TORCH_CHECK( + p.num_heads <= 1 || p.q_strideH % kAlignmentQ == 0, + "query is not correctly aligned (strideH)"); + TORCH_CHECK( + p.num_heads <= 1 || p.k_strideH % kAlignmentK == 0, + "key is not correctly aligned (strideH)"); + TORCH_CHECK( + p.num_heads <= 1 || p.v_strideH % kAlignmentV == 0, + "value is not correctly aligned (strideH)"); + TORCH_CHECK( + p.custom_mask_type < NumCustomMaskTypes, + "invalid value for `custom_mask_type`"); + if (p.window_size > 0) { + TORCH_CHECK( + p.custom_mask_type == CausalFromTopLeft || + p.custom_mask_type == CausalFromBottomRight, + "custom_mask_type not supported"); + } + return true; + } + + static void CUTLASS_DEVICE attention_kernel(Params& p) { + // In this block, we will only ever: + // - read query[query_start:query_end, :] + // - write to output[query_start:query_end, :] + + extern __shared__ char smem_buffer[]; + SharedStorage& shared_storage = 
*((SharedStorage*)smem_buffer); + auto& m_prime = shared_storage.m_prime; + auto& s_prime = shared_storage.s_prime; + auto& mi = shared_storage.mi; + auto& out_rescale = shared_storage.out_rescale; + const uint32_t query_start = blockIdx.x * kQueriesPerBlock; + + static_assert(kQueriesPerBlock < kNumWarpsPerBlock * kWarpSize, ""); + if (thread_id() < kQueriesPerBlock) { + s_prime[thread_id()] = accum_t(0); + out_rescale[thread_id()] = accum_t(1.0); + m_prime[thread_id()] = + -cutlass::platform::numeric_limits::infinity(); + mi[thread_id()] = -cutlass::platform::numeric_limits::infinity(); + } + typename MM1::Mma::FragmentC accum_o; + accum_o.clear(); + + auto createOutputIter = [&](int col) -> typename MM1::OutputTileIterator { + using OutputTileIterator = typename MM1::OutputTileIterator; + return OutputTileIterator( + typename OutputTileIterator::Params{(int32_t)p.o_strideM}, + p.output_ptr, + typename OutputTileIterator::TensorCoord{ + p.num_queries, p.head_dim_value}, + thread_id(), + {0, col}); + }; + + auto createOutputAccumIter = [&](int col) -> + typename MM1::OutputTileIteratorAccum { + using OutputTileIteratorAccum = typename MM1::OutputTileIteratorAccum; + return OutputTileIteratorAccum( + typename OutputTileIteratorAccum::Params{ + (int32_t)(p.head_dim_value * p.num_heads)}, + p.output_accum_ptr, + typename OutputTileIteratorAccum::TensorCoord{ + p.num_queries, p.head_dim_value}, + thread_id(), + {0, col}); + }; + + curandStatePhilox4_32_10_t curand_state_init; + if (kSupportsDropout && p.use_dropout) { + const auto [seed, offset] = at::cuda::philox::unpack(p.rng_engine_inputs); + if (p.rng_engine_inputs.captured_) { + // See Note [Seed and Offset Device] + // When we are in cuda graph capture mode the seed and offset are stored + // on device We pass in int64_t* seed, and int64_t* offset to act as + // scratch space for storing the rng state during the forward pass and + // saving for backwards. 
+ *p.seed = seed; + *p.extragraph_offset = offset; + } + // each element of the attention matrix P with shape + // (batch_sz, n_heads, n_queries, n_keys) is associated with a single + // offset in RNG sequence. we initialize the RNG state with offset that + // starts at the beginning of a (n_queries, n_keys) matrix for this + // block's batch_id and head_id + // initializing rng state is very expensive, so we run once per kernel, + // rather than once per iteration. each iteration takes a copy of the + // initialized RNG state and offsets it as needed. + curand_init( + seed, + 0, + offset + p.dropout_batch_head_rng_offset, + &curand_state_init); + } + + // Iterate through keys + for (int32_t iter_key_start = 0; iter_key_start < p.num_keys; + iter_key_start += kKeysPerBlock) { + if (p.window_size > 0) { + // don't compute anything if below attention band + if (iter_key_start + kKeysPerBlock < + int32_t(query_start + p.causal_diagonal_offset) - p.window_size) { + continue; + } + } + int32_t problem_size_0_m = + cutlass::fast_min((int32_t)kQueriesPerBlock, p.num_queries); + int32_t problem_size_0_n = cutlass::fast_min( + int32_t(kKeysPerBlock), p.num_keys - iter_key_start); + int32_t const& problem_size_0_k = p.head_dim; + int32_t const& problem_size_1_n = p.head_dim_value; + int32_t const& problem_size_1_k = problem_size_0_n; + + auto prologueV = [&](int blockN) { + typename MM1::Mma::IteratorB iterator_V( + typename MM1::IteratorB::Params{MM1::LayoutB(p.v_strideM)}, + const_cast(p.value_ptr + iter_key_start * p.v_strideM), + {problem_size_1_k, problem_size_1_n}, + thread_id(), + cutlass::MatrixCoord{0, blockN * MM1::Mma::Shape::kN}); + MM1::Mma::prologue( + shared_storage.after_mm0.mm1, + iterator_V, + thread_id(), + problem_size_1_k); + }; + + __syncthreads(); // Need to have shared memory initialized, and `m_prime` + // updated from end of prev iter + // + // MATMUL: Q.K_t + // + // Computes the block-matrix product of: + // (a) query[query_start:query_end, :] + 
// with + // (b) key[iter_key_start:iter_key_start + kKeysPerBlock] + // and stores that into `shared_storage.si` + // + + // Compute threadblock location + cutlass::gemm::GemmCoord tb_tile_offset = {0, 0, 0}; + + cutlass::MatrixCoord tb_offset_A{ + tb_tile_offset.m() * MM0::Mma::Shape::kM, tb_tile_offset.k()}; + + cutlass::MatrixCoord tb_offset_B{ + tb_tile_offset.k(), tb_tile_offset.n() * MM0::Mma::Shape::kN}; + + // Construct iterators to A and B operands + typename MM0::IteratorA iterator_A( + typename MM0::IteratorA::Params( + typename MM0::MmaCore::LayoutA(p.q_strideM)), + const_cast(p.query_ptr), + {problem_size_0_m, problem_size_0_k}, + thread_id(), + tb_offset_A); + + typename MM0::IteratorB iterator_B( + typename MM0::IteratorB::Params( + typename MM0::MmaCore::LayoutB(p.k_strideM)), + const_cast(p.key_ptr + iter_key_start * p.k_strideM), + {problem_size_0_k, problem_size_0_n}, + thread_id(), + tb_offset_B); + + auto my_warp_id = warp_uniform(warp_id()); + auto my_lane_id = lane_id(); + + // Construct thread-scoped matrix multiply + typename MM0::Mma mma( + shared_storage.mm0, thread_id(), my_warp_id, my_lane_id); + + typename MM0::Mma::FragmentC accum; + + accum.clear(); + + auto gemm_k_iterations = + (problem_size_0_k + MM0::Mma::Shape::kK - 1) / MM0::Mma::Shape::kK; + + // Compute threadblock-scoped matrix multiply-add + mma(gemm_k_iterations, accum, iterator_A, iterator_B, accum); + __syncthreads(); + + if (kPreloadV) { + prologueV(0); + } + + typename MM0::Mma::Operator::IteratorC::TensorCoord + iteratorC_tile_offset = { + (tb_tile_offset.m() * MM0::Mma::WarpCount::kM) + + (my_warp_id % MM0::Mma::WarpCount::kM), + (tb_tile_offset.n() * MM0::Mma::WarpCount::kN) + + (my_warp_id / MM0::Mma::WarpCount::kM)}; + + // multiply by scaling factor + if (kSupportsBias) { + accum = + cutlass::multiplies()(p.scale, accum); + } + + // apply attention bias if applicable + if (kSupportsBias && p.attn_bias_ptr != nullptr) { + // load bias tile Bij into shared memory 
+ typename MM0::BiasLoader::GmemTileIterator bias_iter( + {cutlass::layout::RowMajor(p.bias_strideM)}, + // attn_bias_pointer points to matrix of size (n_queries, n_keys) + // for the relevant batch_id and head_id + const_cast(p.attn_bias_ptr + query_start * p.bias_strideM + iter_key_start), + {problem_size_0_m, problem_size_0_n}, + thread_id()); + cutlass::TensorRef bias_tensor_ref( + shared_storage.after_mm0.bias.data(), + cutlass::layout::RowMajor(MM0::ThreadblockShape::kN)); + typename MM0::BiasLoader::SmemTileIterator smem_tile_iter( + bias_tensor_ref, thread_id()); + MM0::BiasLoader::load(bias_iter, smem_tile_iter); + + // Pij += Bij, Pij is in register fragment and Bij is in shared memory + auto lane_offset = MM0::AccumLambdaIterator::get_lane_offset( + my_lane_id, my_warp_id, iteratorC_tile_offset); + MM0::AccumLambdaIterator::iterateRows( + lane_offset, + [&](int accum_m) {}, + [&](int accum_m, int accum_n, int idx) { + if (accum_m < problem_size_0_m && accum_n < problem_size_0_n) { + accum[idx] += bias_tensor_ref.at({accum_m, accum_n}); + } + }, + [&](int accum_m) {}); + } + + // Mask out last if causal + // This is only needed if upper-right corner of current query / key block + // intersects the mask Coordinates of upper-right corner of current block + // is y=query_start x=min(iter_key_start + kKeysPerBlock, num_keys)) The + // first masked element is x = y + offset -> query_start + offset There is + // intersection (and we need to mask) if min(iter_key_start + + // kKeysPerBlock, num_keys)) >= query_start + offset + if (p.custom_mask_type && + cutlass::fast_min(iter_key_start + kKeysPerBlock, p.num_keys) >= + (query_start + p.causal_diagonal_offset)) { + auto query_start = blockIdx.x * kQueriesPerBlock; + auto lane_offset = MM0::AccumLambdaIterator::get_lane_offset( + my_lane_id, my_warp_id, iteratorC_tile_offset); + int32_t last_col; + MM0::AccumLambdaIterator::iterateRows( + lane_offset, + [&](int accum_m) { + // last absolute col is (last absolute 
query + offset) + // last local col is (last absolute query + offset - + // iter_key_start) + last_col = query_start + accum_m + p.causal_diagonal_offset - + iter_key_start; + }, + [&](int accum_m, int accum_n, int idx) { + if (accum_n > last_col) { + accum[idx] = + -cutlass::platform::numeric_limits::infinity(); + } + }, + [&](int accum_m) {}); + } + + // Mask out lower left corner of block if window_size > 0 + // only required if current block intersects with the lower left corner + // block starts at x_lowerleft = iter_key_start // y = query_start + + // kQueriesPerBlock first non masked value at this y is : x_first = + // query_start + kQueriesPerBlock - window_size mask if x_fist > + // x_lowerleft + + if (p.window_size > 0 && + (query_start + p.causal_diagonal_offset + + cutlass::fast_min( + int32_t(kQueriesPerBlock), int32_t(p.num_queries)) - + p.window_size >= + iter_key_start)) { + auto query_start = blockIdx.x * kQueriesPerBlock; + auto lane_offset = MM0::AccumLambdaIterator::get_lane_offset( + my_lane_id, my_warp_id, iteratorC_tile_offset); + int32_t first_col; + const int32_t offset = query_start + p.causal_diagonal_offset - + p.window_size - iter_key_start; + MM0::AccumLambdaIterator::iterateRows( + lane_offset, + [&](int accum_m) { first_col = accum_m + offset; }, + [&](int accum_m, int accum_n, int idx) { + if (accum_n <= first_col) { + accum[idx] = + -cutlass::platform::numeric_limits::infinity(); + } + }, + [&](int accum_m) {}); + // print_warp_accum(accum, lane_offset, 12, + // 12); + } + + // Update `mi` from accum stored in registers + // Also does accum[i] <- exp(accum[i] - mi) + iterative_softmax( + accum_o, + accum, + mi, + m_prime, + s_prime, + out_rescale, + shared_storage.addition_storage, + my_lane_id, + thread_id(), + my_warp_id, + p.num_keys - iter_key_start, + iter_key_start == 0, + iteratorC_tile_offset, + kSupportsBias ? 
1.0f : p.scale); + + // Output results to shared-memory + int warp_idx_mn_0 = my_warp_id % + (MM0::Mma::Base::WarpCount::kM * MM0::Mma::Base::WarpCount::kN); + auto output_tile_coords = cutlass::MatrixCoord{ + warp_idx_mn_0 % MM0::Mma::Base::WarpCount::kM, + warp_idx_mn_0 / MM0::Mma::Base::WarpCount::kM}; + + MM0::B2bGemm::accumToSmem( + shared_storage.after_mm0.si, accum, my_lane_id, output_tile_coords); + + __syncthreads(); + + // apply dropout (if applicable) after we've written Pij to smem. + // dropout is applied by multiplying each element of Pij by: + // - 0 with probability dropout_p + // - 1 / (1 - dropout_p) with probability 1 - dropout_p + // + // for backward purposes we want to be able to map each element of the + // attention matrix to the same random uniform number as the one we used + // in forward, without needing to use the same iteration order or having + // to store the dropout matrix. its possible to do this in registers but + // it ends up being very slow because each thread having noncontiguous + // strips of the Pij tile means we have to skip around a lot, and also + // have to generate a single random number at a time + if (kSupportsDropout && p.use_dropout) { + auto si = shared_storage.after_mm0.si.accum_ref(); + // each thread handles a contiguous sequence of elements from Sij, all + // coming from the same row. the reason they have to come from the same + // row is that the sampling random numbers from a contiguous random + // number sequence is much more efficient than jumping around, and the + // linear offset of each element of S (the global matrix) maps to an + // offset in a random number sequence. for S, the end of a row and the + // beginning of the next have adjacent offsets, but for Sij, this is not + // necessarily the case. 
+ const int num_threads = blockDim.x * blockDim.y * blockDim.z; + const int threads_per_row = + cutlass::fast_min(num_threads / problem_size_0_m, problem_size_0_n); + const int elts_per_thread = cutlass::round_nearest( + cutlass::ceil_div(problem_size_0_n, threads_per_row), 4); + + const int thread_i = thread_id() / threads_per_row; + const int thread_start_j = + (thread_id() % threads_per_row) * elts_per_thread; + + if (thread_i < problem_size_0_m && thread_start_j < problem_size_0_n) { + curandStatePhilox4_32_10_t curand_state = curand_state_init; + skipahead( + static_cast( + (query_start + thread_i) * p.num_keys_absolute + + (iter_key_start + thread_start_j)), + &curand_state); + const float dropout_scale = 1.0 / (1.0 - p.dropout_prob); + + // apply dropout scaling to elements this thread is responsible for, + // in chunks of 4 + for (int sij_start_col_idx = thread_start_j; sij_start_col_idx < + cutlass::fast_min(thread_start_j + elts_per_thread, + problem_size_0_n); + sij_start_col_idx += 4) { + const float4 rand_uniform_quad = curand_uniform4(&curand_state); + + CUTLASS_PRAGMA_UNROLL + for (int quad_idx = 0; quad_idx < 4; ++quad_idx) { + si.at({thread_i, sij_start_col_idx + quad_idx}) *= + static_cast( + dropout_scale * + ((&rand_uniform_quad.x)[quad_idx] > p.dropout_prob)); + } + } + } + __syncthreads(); // p.use_dropout should have same value kernel-wide + } + + // + // MATMUL: Attn . V + // Run the matmul `attn @ V` for a block of attn and V. + // `attn` is read from shared memory (in `shared_storage_si`) + // `V` is read from global memory (with iterator_B) + // + + const int64_t nBlockN = kSingleValueIteration + ? 
1 + : ceil_div( + (int64_t)problem_size_1_n, int64_t(MM1::ThreadblockShape::kN)); + for (int blockN = 0; blockN < nBlockN; ++blockN) { + int gemm_k_iterations = + (problem_size_1_k + MM1::Mma::Shape::kK - 1) / MM1::Mma::Shape::kK; + + // Compute threadblock-scoped matrix multiply-add and store it in accum + // (in registers) + if (!kPreloadV) { + __syncthreads(); // we share shmem between mma and epilogue + } + + typename MM1::Mma::IteratorB iterator_V( + typename MM1::IteratorB::Params{MM1::LayoutB(p.v_strideM)}, + const_cast(p.value_ptr + iter_key_start * p.v_strideM), + {problem_size_1_k, problem_size_1_n}, + thread_id(), + cutlass::MatrixCoord{0, blockN * MM1::Mma::Shape::kN}); + typename MM1::Mma mma_pv( + // operand A: Pij_dropped in shared memory + shared_storage.after_mm0.si.accum_ref(), + // operand B: shared memory staging area for Vj, which is loaded + // from global memory + shared_storage.after_mm0.mm1.operand_B_ref(), + (int)thread_id(), + (int)my_warp_id, + (int)my_lane_id); + mma_pv.set_prologue_done(kPreloadV); + if (!kKeepOutputInRF) { + accum_o.clear(); + } + mma_pv(gemm_k_iterations, accum_o, iterator_V, accum_o); + __syncthreads(); + + if (kPreloadV && !kSingleValueIteration && blockN + 1 < nBlockN) { + prologueV(blockN + 1); + } + + if (!kKeepOutputInRF) { + int first_key = 0; + if (p.window_size > 0) { + first_key = (cutlass::fast_max( + int(query_start + p.causal_diagonal_offset) - + p.window_size + 1, + 0) / + kKeysPerBlock) * + kKeysPerBlock; + } + + // int first_key_block = 0; + // MM1::Mma::drain_cp_asyncs(); # TODO figure out if this is needed for correctness + DISPATCH_BOOL( + iter_key_start == first_key, kIsFirst, ([&] { + DISPATCH_BOOL( + (iter_key_start + kKeysPerBlock) >= p.num_keys, + kIsLast, + ([&] { + using DefaultEpilogue = typename MM1::DefaultEpilogue; + using DefaultOp = + typename MM1::DefaultConfig::EpilogueOutputOp; + using ElementCompute = typename DefaultOp::ElementCompute; + using EpilogueOutputOp = typename 
cutlass::epilogue:: + thread::MemoryEfficientAttentionNormalize< + typename cutlass::platform::conditional< + kIsLast, + output_t, + output_accum_t>::type, + output_accum_t, + DefaultOp::kCount, + typename DefaultOp::ElementAccumulator, + ElementCompute, + kIsFirst, + kIsLast, + cutlass::Array>; + using Epilogue = typename cutlass::epilogue::threadblock:: + EpiloguePipelined< + typename DefaultEpilogue::Shape, + typename MM1::Mma::Operator, + DefaultEpilogue::kPartitionsK, + typename cutlass::platform::conditional< + kIsLast, + typename MM1::OutputTileIterator, + typename MM1::OutputTileIteratorAccum>::type, + typename DefaultEpilogue:: + AccumulatorFragmentIterator, + typename DefaultEpilogue::WarpTileIterator, + typename DefaultEpilogue::SharedLoadIterator, + EpilogueOutputOp, + typename DefaultEpilogue::Padding, + DefaultEpilogue::kFragmentsPerIteration, + true, // IterationsUnroll + typename MM1::OutputTileIteratorAccum // Read + // iterator + >; + + int col = blockN * MM1::Mma::Shape::kN; + auto source_iter = createOutputAccumIter(col); + auto dest_iter = call_conditional< + kIsLast, + decltype(createOutputIter), + decltype(createOutputAccumIter)>:: + apply(createOutputIter, createOutputAccumIter, col); + EpilogueOutputOp rescale(s_prime, out_rescale); + Epilogue epilogue( + shared_storage.epilogue_shared_storage(), + thread_id(), + my_warp_id, + my_lane_id); + epilogue(rescale, dest_iter, accum_o, source_iter); + })); + })); + if (!kSingleValueIteration) { + __syncthreads(); + } + } + } + __syncthreads(); // we modify `m_prime` after + } + + if (kKeepOutputInRF) { + constexpr bool kIsFirst = true; + constexpr bool kIsLast = true; + using DefaultEpilogue = typename MM1::DefaultEpilogue; + using DefaultOp = typename MM1::DefaultConfig::EpilogueOutputOp; + using ElementCompute = typename DefaultOp::ElementCompute; + using EpilogueOutputOp = + typename cutlass::epilogue::thread::MemoryEfficientAttentionNormalize< + output_t, // output + output_accum_t, // source 
+ DefaultOp::kCount, + typename DefaultOp::ElementAccumulator, // accum + output_accum_t, // compute + kIsFirst, + kIsLast, + cutlass::Array>; + using Epilogue = + typename cutlass::epilogue::threadblock::EpiloguePipelined< + typename DefaultEpilogue::Shape, + typename MM1::Mma::Operator, + DefaultEpilogue::kPartitionsK, + typename MM1::OutputTileIterator, // destination + typename DefaultEpilogue::AccumulatorFragmentIterator, + typename DefaultEpilogue::WarpTileIterator, + typename DefaultEpilogue::SharedLoadIterator, + EpilogueOutputOp, + typename DefaultEpilogue::Padding, + DefaultEpilogue::kFragmentsPerIteration, + true, // IterationsUnroll + typename MM1::OutputTileIteratorAccum // source tile + >; + auto dest_iter = createOutputIter(0); + EpilogueOutputOp rescale(s_prime, out_rescale); + Epilogue epilogue( + shared_storage.epilogue_shared_storage(), + thread_id(), + warp_id(), + lane_id()); + epilogue(rescale, dest_iter, accum_o); + } + + // 7. Calculate logsumexp + // To make the backward easier, we pad logsumexp with `inf` + // this avoids a few bound checks, and is not more expensive during fwd + static_assert(kQueriesPerBlock < kNumWarpsPerBlock * kWarpSize, ""); + if (p.logsumexp_ptr && thread_id() < kQueriesPerBlock) { + auto lse_dim = ceil_div((int32_t)p.num_queries, kAlignLSE) * kAlignLSE; + constexpr float kLog2e = 1.4426950408889634074; // log_2(e) = M_LOG2E + if (thread_id() < p.num_queries) { + // We set fully masked out rows to 0, the sumexp for masked out rows will be 0 + // We update it to be 1 prior to calling log so that log(1) = 0 + s_prime[thread_id()] = (s_prime[thread_id()] == 0) ? 1: s_prime[thread_id()]; + mi[thread_id()] = (mi[thread_id()] == -cutlass::platform::numeric_limits::infinity()) ? 
0: mi[thread_id()]; + p.logsumexp_ptr[thread_id()] = accum_t(mi[thread_id()] / kLog2e) + + cutlass::fast_log(accum_t(s_prime[thread_id()])); + } else if (thread_id() < lse_dim) { + p.logsumexp_ptr[thread_id()] = + cutlass::platform::numeric_limits::infinity(); + } + } + } + + template + CUTLASS_DEVICE static void iterative_softmax( + typename WarpIteratorC::Fragment& frag_o, // output so far + typename WarpIteratorC::Fragment& frag, + cutlass::Array& mi, + cutlass::Array& m_prime, + cutlass::Array& s_prime, + cutlass::Array& out_rescale, + cutlass::Array& + addition_storage, + int8_t lane_id, + int8_t thread_id, + int8_t warp_id, + int max_col, + bool is_first, + typename WarpIteratorC::TensorCoord const& tile_offset, + float scaling) { + /* Iterates on the accumulator and corresponding position on result matrix + + (1) Update `mi[r]` to the max value of the row `r` + (2) In a second iteration do the following: + (a) accum <- exp(accum - mi) + (b) m_prime <- exp(m_prime - mi) + (c) s_prime <- s_prime * m_prime + sum(accum) + + All of this is done on registers, before we store all of this + on shared memory for the next matmul with Value. 
+ */ + using Fragment = typename WarpIteratorC::Fragment; + using LambdaIterator = typename DefaultMmaAccumLambdaIterator< + WarpIteratorC, + accum_t, + kWarpSize>::Iterator; + // Convert to `accum_t` (rather than double) + constexpr float kLog2e = 1.4426950408889634074; // log_2(e) = M_LOG2E + + static_assert(kQueriesPerBlock % kNumWarpsPerBlock == 0, ""); + static constexpr int kLinesPerWarp = kQueriesPerBlock / kNumWarpsPerBlock; + + frag = cutlass::multiplies()(scaling * kLog2e, frag); + + auto lane_offset = + LambdaIterator::get_lane_offset(lane_id, warp_id, tile_offset); + + // First update `mi` to the max per-row + { + accum_t max; + LambdaIterator::iterateRows( + lane_offset, + [&](int accum_m) { + max = -cutlass::platform::numeric_limits::infinity(); + }, + [&](int accum_m, int accum_n, int idx) { + if (accum_n < max_col) { + max = cutlass::fast_max(max, frag[idx]); + } + }, + [&](int accum_m) { + // Having 4x atomicMax seems faster than reduce within warp + // first... + atomicMaxFloat(&mi[accum_m], max); + }); + } + + // Make sure we all share the update values for `mi` + __syncthreads(); + + // Doing this `exp` is quite expensive. Let's + // split it across the warps + bool restore_mi_to_minus_inf = false; + if (lane_id < kLinesPerWarp) { + int id = warp_id * kLinesPerWarp + lane_id; + auto m_prime_id = m_prime[id]; + auto mi_id = mi[id]; + bool changed = m_prime_id < mi_id; // `false` if both are -inf + if (changed) { + auto m_prime_exp = exp2f(m_prime_id - mi_id); + out_rescale[id] = m_prime_exp; + s_prime[id] *= m_prime_exp; + } else { + // Only when bias is enabled, it's possible that all the first values + // of attention are masked to `-inf`. 
In that case we want to avoid + // `nan = exp2f(-inf - (-inf))` so we temporarily set `mi` to 0 + if (kSupportsBias && + mi_id == -cutlass::platform::numeric_limits::infinity()) { + restore_mi_to_minus_inf = true; + mi[id] = 0.0f; + } + out_rescale[id] = 1.0f; + } + } + __syncthreads(); // Update output fragments + if (kKeepOutputInRF && !is_first) { + accum_t line_rescale; + LambdaIterator::iterateRows( + lane_offset, + [&](int accum_m) { line_rescale = out_rescale[accum_m]; }, + [&](int accum_m, int accum_n, int idx) { + frag_o[idx] = frag_o[idx] * line_rescale; + }, + [&](int accum_m) {}); + } + // Update accum_m, accum_n, ... + { + accum_t mi_row, total_row; + LambdaIterator::iterateRows( + lane_offset, + [&](int accum_m) { mi_row = mi[accum_m]; }, + [&](int accum_m, int accum_n, int idx) { + frag[idx] = + (accum_n < max_col) ? exp2f(frag[idx] - mi_row) : accum_t(0.0); + }, + [&](int accum_m) {}); + LambdaIterator::iterateRows( + lane_offset, + [&](int accum_m) { total_row = 0.0; }, + [&](int accum_m, int accum_n, int idx) { total_row += frag[idx]; }, + [&](int accum_m) { + if (LambdaIterator::reduceSameRow( + lane_id, total_row, [](accum_t a, accum_t b) { + return a + b; + })) { + // NOTE: we could atomically add `total_row` to `s_prime`, but + // it's faster (and deterministic) to avoid atomics here + addition_storage + [accum_m + kQueriesPerBlock * tile_offset.column()] = + total_row; + } + }); + } + __syncthreads(); + if (lane_id < kLinesPerWarp) { + int id = warp_id * kLinesPerWarp + lane_id; + accum_t total_row = s_prime[id]; + if (restore_mi_to_minus_inf) { + // Restore `mi`, see above when we set `restore_mi_to_minus_inf=true` + mi[id] = -cutlass::platform::numeric_limits::infinity(); + } else { + m_prime[id] = mi[id]; + } + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < MM0::MmaCore::WarpCount::kN; ++i) { + total_row += addition_storage[id + kQueriesPerBlock * i]; + } + s_prime[id] = total_row; + } + } + + static CUTLASS_DEVICE int8_t lane_id() { + return 
threadIdx.x; + } + static CUTLASS_DEVICE int8_t warp_id() { + return threadIdx.y; + } + static CUTLASS_DEVICE int16_t thread_id() { + return threadIdx.x + threadIdx.y * blockDim.x; + } +}; + +template +__global__ void __launch_bounds__(AK::kNumThreads, AK::kMinBlocksPerSm) + attention_kernel_batched_impl(typename AK::Params p) { + if (!p.advance_to_block()) { + return; + } + AK::attention_kernel(p); +} + +template +__global__ void __launch_bounds__(AK::kNumThreads, AK::kMinBlocksPerSm) + attention_kernel_batched(typename AK::Params params); + +} // namespace PyTorchMemEffAttention + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/transformers/cuda/mem_eff_attention/kernels/cutlassB.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/transformers/cuda/mem_eff_attention/kernels/cutlassB.h new file mode 100644 index 0000000000000000000000000000000000000000..4bcfdfae1ad9ce457ec724fce972654d6091eb00 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/transformers/cuda/mem_eff_attention/kernels/cutlassB.h @@ -0,0 +1,919 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ +// This file is auto-generated. 
See "generate_kernels.py" +#pragma once +#include +using namespace PyTorchMemEffAttention; +// ======== f16 / sm70 ======== +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f16_aligned_64x64_k32_seqaligned_sm70(typename AttentionBackwardKernel::Params p); +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f16_aligned_64x64_k32_sm70(typename AttentionBackwardKernel::Params p); +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f16_aligned_64x64_k64_seqaligned_sm70(typename AttentionBackwardKernel::Params p); +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f16_aligned_64x64_k64_sm70(typename AttentionBackwardKernel::Params p); +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f16_aligned_128x64_k128_seqaligned_sm70(typename AttentionBackwardKernel::Params p); +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f16_aligned_128x64_k128_sm70(typename AttentionBackwardKernel::Params p); +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f16_aligned_64x64_k128_seqaligned_sm70(typename AttentionBackwardKernel::Params p); +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f16_aligned_64x64_k128_sm70(typename AttentionBackwardKernel::Params p); +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) 
+fmha_cutlassB_f16_aligned_128x64_k65536_sm70(typename AttentionBackwardKernel::Params p); +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f16_aligned_64x64_k65536_sm70(typename AttentionBackwardKernel::Params p); +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f16_aligned_64x64_k32_dropout_sm70(typename AttentionBackwardKernel::Params p); +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f16_aligned_64x64_k64_dropout_sm70(typename AttentionBackwardKernel::Params p); +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f16_aligned_128x64_k128_dropout_sm70(typename AttentionBackwardKernel::Params p); +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f16_aligned_64x64_k128_dropout_sm70(typename AttentionBackwardKernel::Params p); +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f16_aligned_128x64_k65536_dropout_sm70(typename AttentionBackwardKernel::Params p); +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f16_aligned_64x64_k65536_dropout_sm70(typename AttentionBackwardKernel::Params p); +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f16_notaligned_64x64_k32_sm70(typename AttentionBackwardKernel::Params p); +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f16_notaligned_64x64_k64_sm70(typename 
AttentionBackwardKernel::Params p); +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f16_notaligned_128x64_k128_sm70(typename AttentionBackwardKernel::Params p); +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f16_notaligned_64x64_k128_sm70(typename AttentionBackwardKernel::Params p); +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f16_notaligned_128x64_k65536_sm70(typename AttentionBackwardKernel::Params p); +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f16_notaligned_64x64_k65536_sm70(typename AttentionBackwardKernel::Params p); +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f16_notaligned_64x64_k32_dropout_sm70(typename AttentionBackwardKernel::Params p); +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f16_notaligned_64x64_k64_dropout_sm70(typename AttentionBackwardKernel::Params p); +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f16_notaligned_128x64_k128_dropout_sm70(typename AttentionBackwardKernel::Params p); +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f16_notaligned_64x64_k128_dropout_sm70(typename AttentionBackwardKernel::Params p); +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f16_notaligned_128x64_k65536_dropout_sm70(typename AttentionBackwardKernel::Params p); +__global__ void 
__launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f16_notaligned_64x64_k65536_dropout_sm70(typename AttentionBackwardKernel::Params p); + +template void dispatch_cutlassB_f16_sm70(T cb, int cc) { + cb(AttentionBackwardKernel(), fmha_cutlassB_f16_aligned_64x64_k32_seqaligned_sm70); + cb(AttentionBackwardKernel(), fmha_cutlassB_f16_aligned_64x64_k32_sm70); + cb(AttentionBackwardKernel(), fmha_cutlassB_f16_aligned_64x64_k64_seqaligned_sm70); + cb(AttentionBackwardKernel(), fmha_cutlassB_f16_aligned_64x64_k64_sm70); + cb(AttentionBackwardKernel(), fmha_cutlassB_f16_aligned_128x64_k128_seqaligned_sm70); + cb(AttentionBackwardKernel(), fmha_cutlassB_f16_aligned_128x64_k128_sm70); + cb(AttentionBackwardKernel(), fmha_cutlassB_f16_aligned_64x64_k128_seqaligned_sm70); + cb(AttentionBackwardKernel(), fmha_cutlassB_f16_aligned_64x64_k128_sm70); + cb(AttentionBackwardKernel(), fmha_cutlassB_f16_aligned_128x64_k65536_sm70); + cb(AttentionBackwardKernel(), fmha_cutlassB_f16_aligned_64x64_k65536_sm70); + cb(AttentionBackwardKernel(), fmha_cutlassB_f16_aligned_64x64_k32_dropout_sm70); + cb(AttentionBackwardKernel(), fmha_cutlassB_f16_aligned_64x64_k64_dropout_sm70); + cb(AttentionBackwardKernel(), fmha_cutlassB_f16_aligned_128x64_k128_dropout_sm70); + cb(AttentionBackwardKernel(), fmha_cutlassB_f16_aligned_64x64_k128_dropout_sm70); + cb(AttentionBackwardKernel(), fmha_cutlassB_f16_aligned_128x64_k65536_dropout_sm70); + cb(AttentionBackwardKernel(), fmha_cutlassB_f16_aligned_64x64_k65536_dropout_sm70); + cb(AttentionBackwardKernel(), fmha_cutlassB_f16_notaligned_64x64_k32_sm70); + cb(AttentionBackwardKernel(), fmha_cutlassB_f16_notaligned_64x64_k64_sm70); + cb(AttentionBackwardKernel(), fmha_cutlassB_f16_notaligned_128x64_k128_sm70); + cb(AttentionBackwardKernel(), fmha_cutlassB_f16_notaligned_64x64_k128_sm70); + cb(AttentionBackwardKernel(), fmha_cutlassB_f16_notaligned_128x64_k65536_sm70); + 
cb(AttentionBackwardKernel(), fmha_cutlassB_f16_notaligned_64x64_k65536_sm70); + cb(AttentionBackwardKernel(), fmha_cutlassB_f16_notaligned_64x64_k32_dropout_sm70); + cb(AttentionBackwardKernel(), fmha_cutlassB_f16_notaligned_64x64_k64_dropout_sm70); + cb(AttentionBackwardKernel(), fmha_cutlassB_f16_notaligned_128x64_k128_dropout_sm70); + cb(AttentionBackwardKernel(), fmha_cutlassB_f16_notaligned_64x64_k128_dropout_sm70); + cb(AttentionBackwardKernel(), fmha_cutlassB_f16_notaligned_128x64_k65536_dropout_sm70); + cb(AttentionBackwardKernel(), fmha_cutlassB_f16_notaligned_64x64_k65536_dropout_sm70); +} + +// ======== bf16 / sm80 ======== +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_bf16_aligned_64x64_k32_seqaligned_sm80(typename AttentionBackwardKernel::Params p); +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_bf16_aligned_64x64_k32_sm80(typename AttentionBackwardKernel::Params p); +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_bf16_aligned_64x64_k64_seqaligned_sm80(typename AttentionBackwardKernel::Params p); +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_bf16_aligned_64x64_k64_sm80(typename AttentionBackwardKernel::Params p); +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_bf16_aligned_128x64_k96_sm80(typename AttentionBackwardKernel::Params p); +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_bf16_aligned_128x128_k128_seqaligned_sm80(typename AttentionBackwardKernel::Params p); +__global__ void __launch_bounds__( + 
AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_bf16_aligned_128x128_k128_sm80(typename AttentionBackwardKernel::Params p); +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_bf16_aligned_64x64_k128_seqaligned_sm80(typename AttentionBackwardKernel::Params p); +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_bf16_aligned_64x64_k128_sm80(typename AttentionBackwardKernel::Params p); +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_bf16_aligned_128x64_k65536_sm80(typename AttentionBackwardKernel::Params p); +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_bf16_aligned_64x64_k65536_sm80(typename AttentionBackwardKernel::Params p); +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_bf16_aligned_64x64_k32_dropout_sm80(typename AttentionBackwardKernel::Params p); +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_bf16_aligned_64x64_k64_dropout_sm80(typename AttentionBackwardKernel::Params p); +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_bf16_aligned_128x128_k128_dropout_sm80(typename AttentionBackwardKernel::Params p); +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_bf16_aligned_64x64_k128_dropout_sm80(typename AttentionBackwardKernel::Params p); +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + 
AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_bf16_aligned_128x64_k65536_dropout_sm80(typename AttentionBackwardKernel::Params p); +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_bf16_aligned_64x64_k65536_dropout_sm80(typename AttentionBackwardKernel::Params p); + +template void dispatch_cutlassB_bf16_sm80(T cb, int cc) { + cb(AttentionBackwardKernel(), fmha_cutlassB_bf16_aligned_64x64_k32_seqaligned_sm80); + cb(AttentionBackwardKernel(), fmha_cutlassB_bf16_aligned_64x64_k32_sm80); + cb(AttentionBackwardKernel(), fmha_cutlassB_bf16_aligned_64x64_k64_seqaligned_sm80); + cb(AttentionBackwardKernel(), fmha_cutlassB_bf16_aligned_64x64_k64_sm80); + if (cc == 86 || cc == 89) cb(AttentionBackwardKernel(), fmha_cutlassB_bf16_aligned_128x64_k96_sm80); + cb(AttentionBackwardKernel(), fmha_cutlassB_bf16_aligned_128x128_k128_seqaligned_sm80); + cb(AttentionBackwardKernel(), fmha_cutlassB_bf16_aligned_128x128_k128_sm80); + cb(AttentionBackwardKernel(), fmha_cutlassB_bf16_aligned_64x64_k128_seqaligned_sm80); + cb(AttentionBackwardKernel(), fmha_cutlassB_bf16_aligned_64x64_k128_sm80); + cb(AttentionBackwardKernel(), fmha_cutlassB_bf16_aligned_128x64_k65536_sm80); + cb(AttentionBackwardKernel(), fmha_cutlassB_bf16_aligned_64x64_k65536_sm80); + cb(AttentionBackwardKernel(), fmha_cutlassB_bf16_aligned_64x64_k32_dropout_sm80); + cb(AttentionBackwardKernel(), fmha_cutlassB_bf16_aligned_64x64_k64_dropout_sm80); + cb(AttentionBackwardKernel(), fmha_cutlassB_bf16_aligned_128x128_k128_dropout_sm80); + cb(AttentionBackwardKernel(), fmha_cutlassB_bf16_aligned_64x64_k128_dropout_sm80); + cb(AttentionBackwardKernel(), fmha_cutlassB_bf16_aligned_128x64_k65536_dropout_sm80); + cb(AttentionBackwardKernel(), fmha_cutlassB_bf16_aligned_64x64_k65536_dropout_sm80); +} + +// ======== f16 / sm80 ======== +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + 
AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f16_aligned_64x64_k32_seqaligned_sm80(typename AttentionBackwardKernel::Params p); +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f16_aligned_64x64_k32_sm80(typename AttentionBackwardKernel::Params p); +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f16_aligned_64x64_k64_seqaligned_sm80(typename AttentionBackwardKernel::Params p); +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f16_aligned_64x64_k64_sm80(typename AttentionBackwardKernel::Params p); +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f16_aligned_128x64_k96_sm80(typename AttentionBackwardKernel::Params p); +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f16_aligned_128x128_k128_seqaligned_sm80(typename AttentionBackwardKernel::Params p); +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f16_aligned_128x128_k128_sm80(typename AttentionBackwardKernel::Params p); +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f16_aligned_64x64_k128_seqaligned_sm80(typename AttentionBackwardKernel::Params p); +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f16_aligned_64x64_k128_sm80(typename AttentionBackwardKernel::Params p); +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) 
+fmha_cutlassB_f16_aligned_128x64_k65536_sm80(typename AttentionBackwardKernel::Params p); +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f16_aligned_64x64_k65536_sm80(typename AttentionBackwardKernel::Params p); +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f16_aligned_64x64_k32_dropout_sm80(typename AttentionBackwardKernel::Params p); +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f16_aligned_64x64_k64_dropout_sm80(typename AttentionBackwardKernel::Params p); +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f16_aligned_128x128_k128_dropout_sm80(typename AttentionBackwardKernel::Params p); +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f16_aligned_64x64_k128_dropout_sm80(typename AttentionBackwardKernel::Params p); +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f16_aligned_128x64_k65536_dropout_sm80(typename AttentionBackwardKernel::Params p); +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f16_aligned_64x64_k65536_dropout_sm80(typename AttentionBackwardKernel::Params p); + +template void dispatch_cutlassB_f16_sm80(T cb, int cc) { + cb(AttentionBackwardKernel(), fmha_cutlassB_f16_aligned_64x64_k32_seqaligned_sm80); + cb(AttentionBackwardKernel(), fmha_cutlassB_f16_aligned_64x64_k32_sm80); + cb(AttentionBackwardKernel(), fmha_cutlassB_f16_aligned_64x64_k64_seqaligned_sm80); + cb(AttentionBackwardKernel(), fmha_cutlassB_f16_aligned_64x64_k64_sm80); + if (cc == 86 
|| cc == 89) cb(AttentionBackwardKernel(), fmha_cutlassB_f16_aligned_128x64_k96_sm80); + cb(AttentionBackwardKernel(), fmha_cutlassB_f16_aligned_128x128_k128_seqaligned_sm80); + cb(AttentionBackwardKernel(), fmha_cutlassB_f16_aligned_128x128_k128_sm80); + cb(AttentionBackwardKernel(), fmha_cutlassB_f16_aligned_64x64_k128_seqaligned_sm80); + cb(AttentionBackwardKernel(), fmha_cutlassB_f16_aligned_64x64_k128_sm80); + cb(AttentionBackwardKernel(), fmha_cutlassB_f16_aligned_128x64_k65536_sm80); + cb(AttentionBackwardKernel(), fmha_cutlassB_f16_aligned_64x64_k65536_sm80); + cb(AttentionBackwardKernel(), fmha_cutlassB_f16_aligned_64x64_k32_dropout_sm80); + cb(AttentionBackwardKernel(), fmha_cutlassB_f16_aligned_64x64_k64_dropout_sm80); + cb(AttentionBackwardKernel(), fmha_cutlassB_f16_aligned_128x128_k128_dropout_sm80); + cb(AttentionBackwardKernel(), fmha_cutlassB_f16_aligned_64x64_k128_dropout_sm80); + cb(AttentionBackwardKernel(), fmha_cutlassB_f16_aligned_128x64_k65536_dropout_sm80); + cb(AttentionBackwardKernel(), fmha_cutlassB_f16_aligned_64x64_k65536_dropout_sm80); +} + +// ======== f16 / sm50 ======== +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f16_aligned_64x64_k32_sm50(typename AttentionBackwardKernel::Params p); +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f16_aligned_64x64_k64_sm50(typename AttentionBackwardKernel::Params p); +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f16_aligned_64x64_k128_sm50(typename AttentionBackwardKernel::Params p); +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f16_aligned_64x64_k65536_sm50(typename AttentionBackwardKernel::Params p); +__global__ void __launch_bounds__( + 
AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f16_aligned_64x64_k32_dropout_sm50(typename AttentionBackwardKernel::Params p); +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f16_aligned_64x64_k64_dropout_sm50(typename AttentionBackwardKernel::Params p); +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f16_aligned_64x64_k128_dropout_sm50(typename AttentionBackwardKernel::Params p); +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f16_aligned_64x64_k65536_dropout_sm50(typename AttentionBackwardKernel::Params p); +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f16_notaligned_64x64_k32_sm50(typename AttentionBackwardKernel::Params p); +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f16_notaligned_64x64_k64_sm50(typename AttentionBackwardKernel::Params p); +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f16_notaligned_64x64_k128_sm50(typename AttentionBackwardKernel::Params p); +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f16_notaligned_64x64_k65536_sm50(typename AttentionBackwardKernel::Params p); +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f16_notaligned_64x64_k32_dropout_sm50(typename AttentionBackwardKernel::Params p); +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + 
AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f16_notaligned_64x64_k64_dropout_sm50(typename AttentionBackwardKernel::Params p); +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f16_notaligned_64x64_k128_dropout_sm50(typename AttentionBackwardKernel::Params p); +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f16_notaligned_64x64_k65536_dropout_sm50(typename AttentionBackwardKernel::Params p); + +template void dispatch_cutlassB_f16_sm50(T cb, int cc) { + cb(AttentionBackwardKernel(), fmha_cutlassB_f16_aligned_64x64_k32_sm50); + cb(AttentionBackwardKernel(), fmha_cutlassB_f16_aligned_64x64_k64_sm50); + cb(AttentionBackwardKernel(), fmha_cutlassB_f16_aligned_64x64_k128_sm50); + cb(AttentionBackwardKernel(), fmha_cutlassB_f16_aligned_64x64_k65536_sm50); + cb(AttentionBackwardKernel(), fmha_cutlassB_f16_aligned_64x64_k32_dropout_sm50); + cb(AttentionBackwardKernel(), fmha_cutlassB_f16_aligned_64x64_k64_dropout_sm50); + cb(AttentionBackwardKernel(), fmha_cutlassB_f16_aligned_64x64_k128_dropout_sm50); + cb(AttentionBackwardKernel(), fmha_cutlassB_f16_aligned_64x64_k65536_dropout_sm50); + cb(AttentionBackwardKernel(), fmha_cutlassB_f16_notaligned_64x64_k32_sm50); + cb(AttentionBackwardKernel(), fmha_cutlassB_f16_notaligned_64x64_k64_sm50); + cb(AttentionBackwardKernel(), fmha_cutlassB_f16_notaligned_64x64_k128_sm50); + cb(AttentionBackwardKernel(), fmha_cutlassB_f16_notaligned_64x64_k65536_sm50); + cb(AttentionBackwardKernel(), fmha_cutlassB_f16_notaligned_64x64_k32_dropout_sm50); + cb(AttentionBackwardKernel(), fmha_cutlassB_f16_notaligned_64x64_k64_dropout_sm50); + cb(AttentionBackwardKernel(), fmha_cutlassB_f16_notaligned_64x64_k128_dropout_sm50); + cb(AttentionBackwardKernel(), fmha_cutlassB_f16_notaligned_64x64_k65536_dropout_sm50); +} + +// ======== f32 / sm50 ======== +__global__ void 
__launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f32_aligned_64x64_k32_sm50(typename AttentionBackwardKernel::Params p); +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f32_aligned_64x64_k64_sm50(typename AttentionBackwardKernel::Params p); +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f32_aligned_64x64_k128_sm50(typename AttentionBackwardKernel::Params p); +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f32_aligned_64x64_k65536_sm50(typename AttentionBackwardKernel::Params p); +#if defined(CUDA_VERSION) && CUDA_VERSION == 12040 && !defined(USE_ROCM) +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f32_aligned_32x32_k32_dropout_sm50(typename AttentionBackwardKernel::Params p); +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f32_aligned_32x32_k64_dropout_sm50(typename AttentionBackwardKernel::Params p); +#else +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f32_aligned_64x64_k32_dropout_sm50(typename AttentionBackwardKernel::Params p); +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f32_aligned_64x64_k64_dropout_sm50(typename AttentionBackwardKernel::Params p); +#endif +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f32_aligned_64x64_k128_dropout_sm50(typename AttentionBackwardKernel::Params p); +__global__ void 
__launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f32_aligned_64x64_k65536_dropout_sm50(typename AttentionBackwardKernel::Params p); +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f32_notaligned_64x64_k32_sm50(typename AttentionBackwardKernel::Params p); +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f32_notaligned_64x64_k64_sm50(typename AttentionBackwardKernel::Params p); +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f32_notaligned_64x64_k128_sm50(typename AttentionBackwardKernel::Params p); +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f32_notaligned_64x64_k65536_sm50(typename AttentionBackwardKernel::Params p); +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f32_notaligned_64x64_k32_dropout_sm50(typename AttentionBackwardKernel::Params p); +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f32_notaligned_64x64_k64_dropout_sm50(typename AttentionBackwardKernel::Params p); +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f32_notaligned_64x64_k128_dropout_sm50(typename AttentionBackwardKernel::Params p); +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f32_notaligned_64x64_k65536_dropout_sm50(typename AttentionBackwardKernel::Params p); + +template void dispatch_cutlassB_f32_sm50(T cb, int cc) { + 
cb(AttentionBackwardKernel(), fmha_cutlassB_f32_aligned_64x64_k32_sm50); + cb(AttentionBackwardKernel(), fmha_cutlassB_f32_aligned_64x64_k64_sm50); + cb(AttentionBackwardKernel(), fmha_cutlassB_f32_aligned_64x64_k128_sm50); + cb(AttentionBackwardKernel(), fmha_cutlassB_f32_aligned_64x64_k65536_sm50); +#if defined(CUDA_VERSION) && CUDA_VERSION == 12040 && !defined(USE_ROCM) + cb(AttentionBackwardKernel(), fmha_cutlassB_f32_aligned_32x32_k32_dropout_sm50); + cb(AttentionBackwardKernel(), fmha_cutlassB_f32_aligned_32x32_k64_dropout_sm50); +#else + cb(AttentionBackwardKernel(), fmha_cutlassB_f32_aligned_64x64_k32_dropout_sm50); + cb(AttentionBackwardKernel(), fmha_cutlassB_f32_aligned_64x64_k64_dropout_sm50); +#endif + cb(AttentionBackwardKernel(), fmha_cutlassB_f32_aligned_64x64_k128_dropout_sm50); + cb(AttentionBackwardKernel(), fmha_cutlassB_f32_aligned_64x64_k65536_dropout_sm50); + cb(AttentionBackwardKernel(), fmha_cutlassB_f32_notaligned_64x64_k32_sm50); + cb(AttentionBackwardKernel(), fmha_cutlassB_f32_notaligned_64x64_k64_sm50); + cb(AttentionBackwardKernel(), fmha_cutlassB_f32_notaligned_64x64_k128_sm50); + cb(AttentionBackwardKernel(), fmha_cutlassB_f32_notaligned_64x64_k65536_sm50); + cb(AttentionBackwardKernel(), fmha_cutlassB_f32_notaligned_64x64_k32_dropout_sm50); + cb(AttentionBackwardKernel(), fmha_cutlassB_f32_notaligned_64x64_k64_dropout_sm50); + cb(AttentionBackwardKernel(), fmha_cutlassB_f32_notaligned_64x64_k128_dropout_sm50); + cb(AttentionBackwardKernel(), fmha_cutlassB_f32_notaligned_64x64_k65536_dropout_sm50); +} + +// ======== f32 / sm70 ======== +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f32_aligned_64x64_k32_sm70(typename AttentionBackwardKernel::Params p); +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f32_aligned_64x64_k64_sm70(typename 
AttentionBackwardKernel::Params p); +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f32_aligned_64x64_k128_sm70(typename AttentionBackwardKernel::Params p); +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f32_aligned_64x64_k65536_sm70(typename AttentionBackwardKernel::Params p); +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f32_aligned_64x64_k32_dropout_sm70(typename AttentionBackwardKernel::Params p); +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f32_aligned_64x64_k64_dropout_sm70(typename AttentionBackwardKernel::Params p); +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f32_aligned_64x64_k128_dropout_sm70(typename AttentionBackwardKernel::Params p); +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f32_aligned_64x64_k65536_dropout_sm70(typename AttentionBackwardKernel::Params p); +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f32_notaligned_64x64_k32_sm70(typename AttentionBackwardKernel::Params p); +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f32_notaligned_64x64_k64_sm70(typename AttentionBackwardKernel::Params p); +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f32_notaligned_64x64_k128_sm70(typename AttentionBackwardKernel::Params p); +__global__ void __launch_bounds__( + 
AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f32_notaligned_64x64_k65536_sm70(typename AttentionBackwardKernel::Params p); +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f32_notaligned_64x64_k32_dropout_sm70(typename AttentionBackwardKernel::Params p); +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f32_notaligned_64x64_k64_dropout_sm70(typename AttentionBackwardKernel::Params p); +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f32_notaligned_64x64_k128_dropout_sm70(typename AttentionBackwardKernel::Params p); +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f32_notaligned_64x64_k65536_dropout_sm70(typename AttentionBackwardKernel::Params p); + +template void dispatch_cutlassB_f32_sm70(T cb, int cc) { + cb(AttentionBackwardKernel(), fmha_cutlassB_f32_aligned_64x64_k32_sm70); + cb(AttentionBackwardKernel(), fmha_cutlassB_f32_aligned_64x64_k64_sm70); + cb(AttentionBackwardKernel(), fmha_cutlassB_f32_aligned_64x64_k128_sm70); + cb(AttentionBackwardKernel(), fmha_cutlassB_f32_aligned_64x64_k65536_sm70); + cb(AttentionBackwardKernel(), fmha_cutlassB_f32_aligned_64x64_k32_dropout_sm70); + cb(AttentionBackwardKernel(), fmha_cutlassB_f32_aligned_64x64_k64_dropout_sm70); + cb(AttentionBackwardKernel(), fmha_cutlassB_f32_aligned_64x64_k128_dropout_sm70); + cb(AttentionBackwardKernel(), fmha_cutlassB_f32_aligned_64x64_k65536_dropout_sm70); + cb(AttentionBackwardKernel(), fmha_cutlassB_f32_notaligned_64x64_k32_sm70); + cb(AttentionBackwardKernel(), fmha_cutlassB_f32_notaligned_64x64_k64_sm70); + cb(AttentionBackwardKernel(), fmha_cutlassB_f32_notaligned_64x64_k128_sm70); + 
cb(AttentionBackwardKernel(), fmha_cutlassB_f32_notaligned_64x64_k65536_sm70); + cb(AttentionBackwardKernel(), fmha_cutlassB_f32_notaligned_64x64_k32_dropout_sm70); + cb(AttentionBackwardKernel(), fmha_cutlassB_f32_notaligned_64x64_k64_dropout_sm70); + cb(AttentionBackwardKernel(), fmha_cutlassB_f32_notaligned_64x64_k128_dropout_sm70); + cb(AttentionBackwardKernel(), fmha_cutlassB_f32_notaligned_64x64_k65536_dropout_sm70); +} + +// ======== f16 / sm75 ======== +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f16_aligned_64x64_k32_sm75(typename AttentionBackwardKernel::Params p); +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f16_aligned_64x64_k64_sm75(typename AttentionBackwardKernel::Params p); +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f16_aligned_128x64_k128_sm75(typename AttentionBackwardKernel::Params p); +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f16_aligned_64x64_k128_sm75(typename AttentionBackwardKernel::Params p); +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f16_aligned_128x64_k65536_sm75(typename AttentionBackwardKernel::Params p); +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f16_aligned_64x64_k65536_sm75(typename AttentionBackwardKernel::Params p); +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f16_aligned_64x64_k32_dropout_sm75(typename AttentionBackwardKernel::Params p); +__global__ void __launch_bounds__( + 
AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f16_aligned_64x64_k64_dropout_sm75(typename AttentionBackwardKernel::Params p); +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f16_aligned_128x64_k128_dropout_sm75(typename AttentionBackwardKernel::Params p); +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f16_aligned_64x64_k128_dropout_sm75(typename AttentionBackwardKernel::Params p); +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f16_aligned_128x64_k65536_dropout_sm75(typename AttentionBackwardKernel::Params p); +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f16_aligned_64x64_k65536_dropout_sm75(typename AttentionBackwardKernel::Params p); +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f16_notaligned_64x64_k32_sm75(typename AttentionBackwardKernel::Params p); +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f16_notaligned_64x64_k64_sm75(typename AttentionBackwardKernel::Params p); +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f16_notaligned_128x64_k128_sm75(typename AttentionBackwardKernel::Params p); +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f16_notaligned_64x64_k128_sm75(typename AttentionBackwardKernel::Params p); +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + 
AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f16_notaligned_128x64_k65536_sm75(typename AttentionBackwardKernel::Params p); +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f16_notaligned_64x64_k65536_sm75(typename AttentionBackwardKernel::Params p); +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f16_notaligned_64x64_k32_dropout_sm75(typename AttentionBackwardKernel::Params p); +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f16_notaligned_64x64_k64_dropout_sm75(typename AttentionBackwardKernel::Params p); +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f16_notaligned_128x64_k128_dropout_sm75(typename AttentionBackwardKernel::Params p); +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f16_notaligned_64x64_k128_dropout_sm75(typename AttentionBackwardKernel::Params p); +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f16_notaligned_128x64_k65536_dropout_sm75(typename AttentionBackwardKernel::Params p); +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f16_notaligned_64x64_k65536_dropout_sm75(typename AttentionBackwardKernel::Params p); + +template void dispatch_cutlassB_f16_sm75(T cb, int cc) { + cb(AttentionBackwardKernel(), fmha_cutlassB_f16_aligned_64x64_k32_sm75); + cb(AttentionBackwardKernel(), fmha_cutlassB_f16_aligned_64x64_k64_sm75); + cb(AttentionBackwardKernel(), fmha_cutlassB_f16_aligned_128x64_k128_sm75); + cb(AttentionBackwardKernel(), 
fmha_cutlassB_f16_aligned_64x64_k128_sm75); + cb(AttentionBackwardKernel(), fmha_cutlassB_f16_aligned_128x64_k65536_sm75); + cb(AttentionBackwardKernel(), fmha_cutlassB_f16_aligned_64x64_k65536_sm75); + cb(AttentionBackwardKernel(), fmha_cutlassB_f16_aligned_64x64_k32_dropout_sm75); + cb(AttentionBackwardKernel(), fmha_cutlassB_f16_aligned_64x64_k64_dropout_sm75); + cb(AttentionBackwardKernel(), fmha_cutlassB_f16_aligned_128x64_k128_dropout_sm75); + cb(AttentionBackwardKernel(), fmha_cutlassB_f16_aligned_64x64_k128_dropout_sm75); + cb(AttentionBackwardKernel(), fmha_cutlassB_f16_aligned_128x64_k65536_dropout_sm75); + cb(AttentionBackwardKernel(), fmha_cutlassB_f16_aligned_64x64_k65536_dropout_sm75); + cb(AttentionBackwardKernel(), fmha_cutlassB_f16_notaligned_64x64_k32_sm75); + cb(AttentionBackwardKernel(), fmha_cutlassB_f16_notaligned_64x64_k64_sm75); + cb(AttentionBackwardKernel(), fmha_cutlassB_f16_notaligned_128x64_k128_sm75); + cb(AttentionBackwardKernel(), fmha_cutlassB_f16_notaligned_64x64_k128_sm75); + cb(AttentionBackwardKernel(), fmha_cutlassB_f16_notaligned_128x64_k65536_sm75); + cb(AttentionBackwardKernel(), fmha_cutlassB_f16_notaligned_64x64_k65536_sm75); + cb(AttentionBackwardKernel(), fmha_cutlassB_f16_notaligned_64x64_k32_dropout_sm75); + cb(AttentionBackwardKernel(), fmha_cutlassB_f16_notaligned_64x64_k64_dropout_sm75); + cb(AttentionBackwardKernel(), fmha_cutlassB_f16_notaligned_128x64_k128_dropout_sm75); + cb(AttentionBackwardKernel(), fmha_cutlassB_f16_notaligned_64x64_k128_dropout_sm75); + cb(AttentionBackwardKernel(), fmha_cutlassB_f16_notaligned_128x64_k65536_dropout_sm75); + cb(AttentionBackwardKernel(), fmha_cutlassB_f16_notaligned_64x64_k65536_dropout_sm75); +} + +// ======== f32 / sm75 ======== +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f32_aligned_64x64_k32_sm75(typename AttentionBackwardKernel::Params p); +__global__ void __launch_bounds__( + 
AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f32_aligned_64x64_k64_sm75(typename AttentionBackwardKernel::Params p); +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f32_aligned_64x64_k128_sm75(typename AttentionBackwardKernel::Params p); +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f32_aligned_64x64_k65536_sm75(typename AttentionBackwardKernel::Params p); +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f32_aligned_64x64_k32_dropout_sm75(typename AttentionBackwardKernel::Params p); +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f32_aligned_64x64_k64_dropout_sm75(typename AttentionBackwardKernel::Params p); +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f32_aligned_64x64_k128_dropout_sm75(typename AttentionBackwardKernel::Params p); +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f32_aligned_64x64_k65536_dropout_sm75(typename AttentionBackwardKernel::Params p); +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f32_notaligned_64x64_k32_sm75(typename AttentionBackwardKernel::Params p); +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f32_notaligned_64x64_k64_sm75(typename AttentionBackwardKernel::Params p); +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) 
+fmha_cutlassB_f32_notaligned_64x64_k128_sm75(typename AttentionBackwardKernel::Params p); +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f32_notaligned_64x64_k65536_sm75(typename AttentionBackwardKernel::Params p); +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f32_notaligned_64x64_k32_dropout_sm75(typename AttentionBackwardKernel::Params p); +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f32_notaligned_64x64_k64_dropout_sm75(typename AttentionBackwardKernel::Params p); +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f32_notaligned_64x64_k128_dropout_sm75(typename AttentionBackwardKernel::Params p); +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f32_notaligned_64x64_k65536_dropout_sm75(typename AttentionBackwardKernel::Params p); + +template void dispatch_cutlassB_f32_sm75(T cb, int cc) { + cb(AttentionBackwardKernel(), fmha_cutlassB_f32_aligned_64x64_k32_sm75); + cb(AttentionBackwardKernel(), fmha_cutlassB_f32_aligned_64x64_k64_sm75); + cb(AttentionBackwardKernel(), fmha_cutlassB_f32_aligned_64x64_k128_sm75); + cb(AttentionBackwardKernel(), fmha_cutlassB_f32_aligned_64x64_k65536_sm75); + cb(AttentionBackwardKernel(), fmha_cutlassB_f32_aligned_64x64_k32_dropout_sm75); + cb(AttentionBackwardKernel(), fmha_cutlassB_f32_aligned_64x64_k64_dropout_sm75); + cb(AttentionBackwardKernel(), fmha_cutlassB_f32_aligned_64x64_k128_dropout_sm75); + cb(AttentionBackwardKernel(), fmha_cutlassB_f32_aligned_64x64_k65536_dropout_sm75); + cb(AttentionBackwardKernel(), fmha_cutlassB_f32_notaligned_64x64_k32_sm75); + cb(AttentionBackwardKernel(), 
fmha_cutlassB_f32_notaligned_64x64_k64_sm75); + cb(AttentionBackwardKernel(), fmha_cutlassB_f32_notaligned_64x64_k128_sm75); + cb(AttentionBackwardKernel(), fmha_cutlassB_f32_notaligned_64x64_k65536_sm75); + cb(AttentionBackwardKernel(), fmha_cutlassB_f32_notaligned_64x64_k32_dropout_sm75); + cb(AttentionBackwardKernel(), fmha_cutlassB_f32_notaligned_64x64_k64_dropout_sm75); + cb(AttentionBackwardKernel(), fmha_cutlassB_f32_notaligned_64x64_k128_dropout_sm75); + cb(AttentionBackwardKernel(), fmha_cutlassB_f32_notaligned_64x64_k65536_dropout_sm75); +} + +// ======== f32 / sm80 ======== +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f32_aligned_64x64_k32_sm80(typename AttentionBackwardKernel::Params p); +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f32_aligned_64x64_k64_sm80(typename AttentionBackwardKernel::Params p); +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f32_aligned_128x64_k128_sm80(typename AttentionBackwardKernel::Params p); +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f32_aligned_64x64_k128_sm80(typename AttentionBackwardKernel::Params p); +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f32_aligned_128x64_k65536_sm80(typename AttentionBackwardKernel::Params p); +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f32_aligned_64x64_k65536_sm80(typename AttentionBackwardKernel::Params p); +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) 
+fmha_cutlassB_f32_aligned_64x64_k32_dropout_sm80(typename AttentionBackwardKernel::Params p); +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f32_aligned_64x64_k64_dropout_sm80(typename AttentionBackwardKernel::Params p); +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f32_aligned_128x64_k128_dropout_sm80(typename AttentionBackwardKernel::Params p); +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f32_aligned_64x64_k128_dropout_sm80(typename AttentionBackwardKernel::Params p); +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f32_aligned_128x64_k65536_dropout_sm80(typename AttentionBackwardKernel::Params p); +__global__ void __launch_bounds__( + AttentionBackwardKernel::kNumThreads, + AttentionBackwardKernel::kMinBlocksPerSm) +fmha_cutlassB_f32_aligned_64x64_k65536_dropout_sm80(typename AttentionBackwardKernel::Params p); + +template void dispatch_cutlassB_f32_sm80(T cb, int cc) { + cb(AttentionBackwardKernel(), fmha_cutlassB_f32_aligned_64x64_k32_sm80); + cb(AttentionBackwardKernel(), fmha_cutlassB_f32_aligned_64x64_k64_sm80); + cb(AttentionBackwardKernel(), fmha_cutlassB_f32_aligned_128x64_k128_sm80); + cb(AttentionBackwardKernel(), fmha_cutlassB_f32_aligned_64x64_k128_sm80); + cb(AttentionBackwardKernel(), fmha_cutlassB_f32_aligned_128x64_k65536_sm80); + cb(AttentionBackwardKernel(), fmha_cutlassB_f32_aligned_64x64_k65536_sm80); + cb(AttentionBackwardKernel(), fmha_cutlassB_f32_aligned_64x64_k32_dropout_sm80); + cb(AttentionBackwardKernel(), fmha_cutlassB_f32_aligned_64x64_k64_dropout_sm80); + cb(AttentionBackwardKernel(), fmha_cutlassB_f32_aligned_128x64_k128_dropout_sm80); + cb(AttentionBackwardKernel(), 
fmha_cutlassB_f32_aligned_64x64_k128_dropout_sm80); + cb(AttentionBackwardKernel(), fmha_cutlassB_f32_aligned_128x64_k65536_dropout_sm80); + cb(AttentionBackwardKernel(), fmha_cutlassB_f32_aligned_64x64_k65536_dropout_sm80); +} + + +template +void dispatch_cutlassB(T cb, int cc = 0) { + + if (std::is_same_v && 70 <= cc && cc < 75) { + dispatch_cutlassB_f16_sm70(cb, cc); + } + if (std::is_same_v && 80 <= cc && cc <= 120) { + dispatch_cutlassB_bf16_sm80(cb, cc); + } + if (std::is_same_v && 80 <= cc && cc <= 120) { + dispatch_cutlassB_f16_sm80(cb, cc); + } + if (std::is_same_v && 50 <= cc && cc < 70) { + dispatch_cutlassB_f16_sm50(cb, cc); + } + if (std::is_same_v && 50 <= cc && cc < 70) { + dispatch_cutlassB_f32_sm50(cb, cc); + } + if (std::is_same_v && 70 <= cc && cc < 75) { + dispatch_cutlassB_f32_sm70(cb, cc); + } + if (std::is_same_v && 75 <= cc && cc < 80) { + dispatch_cutlassB_f16_sm75(cb, cc); + } + if (std::is_same_v && 75 <= cc && cc < 80) { + dispatch_cutlassB_f32_sm75(cb, cc); + } + if (std::is_same_v && 80 <= cc && cc <= 120) { + dispatch_cutlassB_f32_sm80(cb, cc); + } +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/transformers/cuda/mem_eff_attention/kernels/cutlassF.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/transformers/cuda/mem_eff_attention/kernels/cutlassF.h new file mode 100644 index 0000000000000000000000000000000000000000..68a4dcad6bb7d015435df371b011c43469ddfb01 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/transformers/cuda/mem_eff_attention/kernels/cutlassF.h @@ -0,0 +1,318 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. 
+ * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ +// This file is auto-generated. See "generate_kernels.py" +#pragma once +#include +using namespace PyTorchMemEffAttention; +// ======== bf16 / sm80 ======== +__global__ void __launch_bounds__( + AttentionKernel::kNumThreads, + AttentionKernel::kMinBlocksPerSm) +fmha_cutlassF_bf16_aligned_64x64_rf_sm80(typename AttentionKernel::Params p); +__global__ void __launch_bounds__( + AttentionKernel::kNumThreads, + AttentionKernel::kMinBlocksPerSm) +fmha_cutlassF_bf16_aligned_64x128_rf_sm80(typename AttentionKernel::Params p); +__global__ void __launch_bounds__( + AttentionKernel::kNumThreads, + AttentionKernel::kMinBlocksPerSm) +fmha_cutlassF_bf16_aligned_32x128_gmem_sm80(typename AttentionKernel::Params p); + +template void dispatch_cutlassF_bf16_sm80(T cb, int cc) { + cb(AttentionKernel(), fmha_cutlassF_bf16_aligned_64x64_rf_sm80); + cb(AttentionKernel(), fmha_cutlassF_bf16_aligned_64x128_rf_sm80); + cb(AttentionKernel(), fmha_cutlassF_bf16_aligned_32x128_gmem_sm80); +} + +// ======== f16 / sm50 ======== +__global__ void __launch_bounds__( + AttentionKernel::kNumThreads, + AttentionKernel::kMinBlocksPerSm) +fmha_cutlassF_f16_aligned_64x64_rf_sm50(typename AttentionKernel::Params p); +__global__ void __launch_bounds__( + AttentionKernel::kNumThreads, + AttentionKernel::kMinBlocksPerSm) +fmha_cutlassF_f16_aligned_32x128_rf_sm50(typename AttentionKernel::Params p); +__global__ void __launch_bounds__( + AttentionKernel::kNumThreads, + AttentionKernel::kMinBlocksPerSm) +fmha_cutlassF_f16_aligned_32x128_gmem_sm50(typename AttentionKernel::Params p); +__global__ void __launch_bounds__( + AttentionKernel::kNumThreads, + AttentionKernel::kMinBlocksPerSm) +fmha_cutlassF_f16_notaligned_64x64_rf_sm50(typename AttentionKernel::Params p); +__global__ void __launch_bounds__( + AttentionKernel::kNumThreads, + 
AttentionKernel::kMinBlocksPerSm) +fmha_cutlassF_f16_notaligned_32x128_rf_sm50(typename AttentionKernel::Params p); +__global__ void __launch_bounds__( + AttentionKernel::kNumThreads, + AttentionKernel::kMinBlocksPerSm) +fmha_cutlassF_f16_notaligned_32x128_gmem_sm50(typename AttentionKernel::Params p); + +template void dispatch_cutlassF_f16_sm50(T cb, int cc) { + cb(AttentionKernel(), fmha_cutlassF_f16_aligned_64x64_rf_sm50); + cb(AttentionKernel(), fmha_cutlassF_f16_aligned_32x128_rf_sm50); + cb(AttentionKernel(), fmha_cutlassF_f16_aligned_32x128_gmem_sm50); + cb(AttentionKernel(), fmha_cutlassF_f16_notaligned_64x64_rf_sm50); + cb(AttentionKernel(), fmha_cutlassF_f16_notaligned_32x128_rf_sm50); + cb(AttentionKernel(), fmha_cutlassF_f16_notaligned_32x128_gmem_sm50); +} + +// ======== f16 / sm70 ======== +__global__ void __launch_bounds__( + AttentionKernel::kNumThreads, + AttentionKernel::kMinBlocksPerSm) +fmha_cutlassF_f16_aligned_64x64_rf_sm70(typename AttentionKernel::Params p); +__global__ void __launch_bounds__( + AttentionKernel::kNumThreads, + AttentionKernel::kMinBlocksPerSm) +fmha_cutlassF_f16_aligned_32x128_rf_sm70(typename AttentionKernel::Params p); +__global__ void __launch_bounds__( + AttentionKernel::kNumThreads, + AttentionKernel::kMinBlocksPerSm) +fmha_cutlassF_f16_aligned_32x128_gmem_sm70(typename AttentionKernel::Params p); +__global__ void __launch_bounds__( + AttentionKernel::kNumThreads, + AttentionKernel::kMinBlocksPerSm) +fmha_cutlassF_f16_notaligned_64x64_rf_sm70(typename AttentionKernel::Params p); +__global__ void __launch_bounds__( + AttentionKernel::kNumThreads, + AttentionKernel::kMinBlocksPerSm) +fmha_cutlassF_f16_notaligned_32x128_rf_sm70(typename AttentionKernel::Params p); +__global__ void __launch_bounds__( + AttentionKernel::kNumThreads, + AttentionKernel::kMinBlocksPerSm) +fmha_cutlassF_f16_notaligned_32x128_gmem_sm70(typename AttentionKernel::Params p); + +template void dispatch_cutlassF_f16_sm70(T cb, int cc) { + 
cb(AttentionKernel(), fmha_cutlassF_f16_aligned_64x64_rf_sm70); + cb(AttentionKernel(), fmha_cutlassF_f16_aligned_32x128_rf_sm70); + cb(AttentionKernel(), fmha_cutlassF_f16_aligned_32x128_gmem_sm70); + cb(AttentionKernel(), fmha_cutlassF_f16_notaligned_64x64_rf_sm70); + cb(AttentionKernel(), fmha_cutlassF_f16_notaligned_32x128_rf_sm70); + cb(AttentionKernel(), fmha_cutlassF_f16_notaligned_32x128_gmem_sm70); +} + +// ======== f16 / sm75 ======== +__global__ void __launch_bounds__( + AttentionKernel::kNumThreads, + AttentionKernel::kMinBlocksPerSm) +fmha_cutlassF_f16_aligned_64x64_rf_sm75(typename AttentionKernel::Params p); +__global__ void __launch_bounds__( + AttentionKernel::kNumThreads, + AttentionKernel::kMinBlocksPerSm) +fmha_cutlassF_f16_aligned_32x128_rf_sm75(typename AttentionKernel::Params p); +__global__ void __launch_bounds__( + AttentionKernel::kNumThreads, + AttentionKernel::kMinBlocksPerSm) +fmha_cutlassF_f16_aligned_32x128_gmem_sm75(typename AttentionKernel::Params p); +__global__ void __launch_bounds__( + AttentionKernel::kNumThreads, + AttentionKernel::kMinBlocksPerSm) +fmha_cutlassF_f16_notaligned_64x64_rf_sm75(typename AttentionKernel::Params p); +__global__ void __launch_bounds__( + AttentionKernel::kNumThreads, + AttentionKernel::kMinBlocksPerSm) +fmha_cutlassF_f16_notaligned_32x128_rf_sm75(typename AttentionKernel::Params p); +__global__ void __launch_bounds__( + AttentionKernel::kNumThreads, + AttentionKernel::kMinBlocksPerSm) +fmha_cutlassF_f16_notaligned_32x128_gmem_sm75(typename AttentionKernel::Params p); + +template void dispatch_cutlassF_f16_sm75(T cb, int cc) { + cb(AttentionKernel(), fmha_cutlassF_f16_aligned_64x64_rf_sm75); + cb(AttentionKernel(), fmha_cutlassF_f16_aligned_32x128_rf_sm75); + cb(AttentionKernel(), fmha_cutlassF_f16_aligned_32x128_gmem_sm75); + cb(AttentionKernel(), fmha_cutlassF_f16_notaligned_64x64_rf_sm75); + cb(AttentionKernel(), fmha_cutlassF_f16_notaligned_32x128_rf_sm75); + cb(AttentionKernel(), 
fmha_cutlassF_f16_notaligned_32x128_gmem_sm75); +} + +// ======== f16 / sm80 ======== +__global__ void __launch_bounds__( + AttentionKernel::kNumThreads, + AttentionKernel::kMinBlocksPerSm) +fmha_cutlassF_f16_aligned_64x64_rf_sm80(typename AttentionKernel::Params p); +__global__ void __launch_bounds__( + AttentionKernel::kNumThreads, + AttentionKernel::kMinBlocksPerSm) +fmha_cutlassF_f16_aligned_64x128_rf_sm80(typename AttentionKernel::Params p); +__global__ void __launch_bounds__( + AttentionKernel::kNumThreads, + AttentionKernel::kMinBlocksPerSm) +fmha_cutlassF_f16_aligned_32x128_gmem_sm80(typename AttentionKernel::Params p); + +template void dispatch_cutlassF_f16_sm80(T cb, int cc) { + cb(AttentionKernel(), fmha_cutlassF_f16_aligned_64x64_rf_sm80); + cb(AttentionKernel(), fmha_cutlassF_f16_aligned_64x128_rf_sm80); + cb(AttentionKernel(), fmha_cutlassF_f16_aligned_32x128_gmem_sm80); +} + +// ======== f32 / sm50 ======== +__global__ void __launch_bounds__( + AttentionKernel::kNumThreads, + AttentionKernel::kMinBlocksPerSm) +fmha_cutlassF_f32_aligned_64x64_rf_sm50(typename AttentionKernel::Params p); +__global__ void __launch_bounds__( + AttentionKernel::kNumThreads, + AttentionKernel::kMinBlocksPerSm) +fmha_cutlassF_f32_aligned_32x128_rf_sm50(typename AttentionKernel::Params p); +__global__ void __launch_bounds__( + AttentionKernel::kNumThreads, + AttentionKernel::kMinBlocksPerSm) +fmha_cutlassF_f32_aligned_32x128_gmem_sm50(typename AttentionKernel::Params p); +__global__ void __launch_bounds__( + AttentionKernel::kNumThreads, + AttentionKernel::kMinBlocksPerSm) +fmha_cutlassF_f32_notaligned_64x64_rf_sm50(typename AttentionKernel::Params p); +__global__ void __launch_bounds__( + AttentionKernel::kNumThreads, + AttentionKernel::kMinBlocksPerSm) +fmha_cutlassF_f32_notaligned_32x128_rf_sm50(typename AttentionKernel::Params p); +__global__ void __launch_bounds__( + AttentionKernel::kNumThreads, + AttentionKernel::kMinBlocksPerSm) 
+fmha_cutlassF_f32_notaligned_32x128_gmem_sm50(typename AttentionKernel::Params p); + +template void dispatch_cutlassF_f32_sm50(T cb, int cc) { + cb(AttentionKernel(), fmha_cutlassF_f32_aligned_64x64_rf_sm50); + cb(AttentionKernel(), fmha_cutlassF_f32_aligned_32x128_rf_sm50); + cb(AttentionKernel(), fmha_cutlassF_f32_aligned_32x128_gmem_sm50); + cb(AttentionKernel(), fmha_cutlassF_f32_notaligned_64x64_rf_sm50); + cb(AttentionKernel(), fmha_cutlassF_f32_notaligned_32x128_rf_sm50); + cb(AttentionKernel(), fmha_cutlassF_f32_notaligned_32x128_gmem_sm50); +} + +// ======== f32 / sm70 ======== +__global__ void __launch_bounds__( + AttentionKernel::kNumThreads, + AttentionKernel::kMinBlocksPerSm) +fmha_cutlassF_f32_aligned_64x64_rf_sm70(typename AttentionKernel::Params p); +__global__ void __launch_bounds__( + AttentionKernel::kNumThreads, + AttentionKernel::kMinBlocksPerSm) +fmha_cutlassF_f32_aligned_32x128_rf_sm70(typename AttentionKernel::Params p); +__global__ void __launch_bounds__( + AttentionKernel::kNumThreads, + AttentionKernel::kMinBlocksPerSm) +fmha_cutlassF_f32_aligned_32x128_gmem_sm70(typename AttentionKernel::Params p); +__global__ void __launch_bounds__( + AttentionKernel::kNumThreads, + AttentionKernel::kMinBlocksPerSm) +fmha_cutlassF_f32_notaligned_64x64_rf_sm70(typename AttentionKernel::Params p); +__global__ void __launch_bounds__( + AttentionKernel::kNumThreads, + AttentionKernel::kMinBlocksPerSm) +fmha_cutlassF_f32_notaligned_32x128_rf_sm70(typename AttentionKernel::Params p); +__global__ void __launch_bounds__( + AttentionKernel::kNumThreads, + AttentionKernel::kMinBlocksPerSm) +fmha_cutlassF_f32_notaligned_32x128_gmem_sm70(typename AttentionKernel::Params p); + +template void dispatch_cutlassF_f32_sm70(T cb, int cc) { + cb(AttentionKernel(), fmha_cutlassF_f32_aligned_64x64_rf_sm70); + cb(AttentionKernel(), fmha_cutlassF_f32_aligned_32x128_rf_sm70); + cb(AttentionKernel(), fmha_cutlassF_f32_aligned_32x128_gmem_sm70); + cb(AttentionKernel(), 
fmha_cutlassF_f32_notaligned_64x64_rf_sm70); + cb(AttentionKernel(), fmha_cutlassF_f32_notaligned_32x128_rf_sm70); + cb(AttentionKernel(), fmha_cutlassF_f32_notaligned_32x128_gmem_sm70); +} + +// ======== f32 / sm75 ======== +__global__ void __launch_bounds__( + AttentionKernel::kNumThreads, + AttentionKernel::kMinBlocksPerSm) +fmha_cutlassF_f32_aligned_64x64_rf_sm75(typename AttentionKernel::Params p); +__global__ void __launch_bounds__( + AttentionKernel::kNumThreads, + AttentionKernel::kMinBlocksPerSm) +fmha_cutlassF_f32_aligned_32x128_rf_sm75(typename AttentionKernel::Params p); +__global__ void __launch_bounds__( + AttentionKernel::kNumThreads, + AttentionKernel::kMinBlocksPerSm) +fmha_cutlassF_f32_aligned_32x128_gmem_sm75(typename AttentionKernel::Params p); +__global__ void __launch_bounds__( + AttentionKernel::kNumThreads, + AttentionKernel::kMinBlocksPerSm) +fmha_cutlassF_f32_notaligned_64x64_rf_sm75(typename AttentionKernel::Params p); +__global__ void __launch_bounds__( + AttentionKernel::kNumThreads, + AttentionKernel::kMinBlocksPerSm) +fmha_cutlassF_f32_notaligned_32x128_rf_sm75(typename AttentionKernel::Params p); +__global__ void __launch_bounds__( + AttentionKernel::kNumThreads, + AttentionKernel::kMinBlocksPerSm) +fmha_cutlassF_f32_notaligned_32x128_gmem_sm75(typename AttentionKernel::Params p); + +template void dispatch_cutlassF_f32_sm75(T cb, int cc) { + cb(AttentionKernel(), fmha_cutlassF_f32_aligned_64x64_rf_sm75); + cb(AttentionKernel(), fmha_cutlassF_f32_aligned_32x128_rf_sm75); + cb(AttentionKernel(), fmha_cutlassF_f32_aligned_32x128_gmem_sm75); + cb(AttentionKernel(), fmha_cutlassF_f32_notaligned_64x64_rf_sm75); + cb(AttentionKernel(), fmha_cutlassF_f32_notaligned_32x128_rf_sm75); + cb(AttentionKernel(), fmha_cutlassF_f32_notaligned_32x128_gmem_sm75); +} + +// ======== f32 / sm80 ======== +__global__ void __launch_bounds__( + AttentionKernel::kNumThreads, + AttentionKernel::kMinBlocksPerSm) +fmha_cutlassF_f32_aligned_64x64_rf_sm80(typename 
AttentionKernel::Params p); +__global__ void __launch_bounds__( + AttentionKernel::kNumThreads, + AttentionKernel::kMinBlocksPerSm) +fmha_cutlassF_f32_aligned_64x128_rf_sm80(typename AttentionKernel::Params p); +__global__ void __launch_bounds__( + AttentionKernel::kNumThreads, + AttentionKernel::kMinBlocksPerSm) +fmha_cutlassF_f32_aligned_32x128_gmem_sm80(typename AttentionKernel::Params p); + +template void dispatch_cutlassF_f32_sm80(T cb, int cc) { + cb(AttentionKernel(), fmha_cutlassF_f32_aligned_64x64_rf_sm80); + cb(AttentionKernel(), fmha_cutlassF_f32_aligned_64x128_rf_sm80); + cb(AttentionKernel(), fmha_cutlassF_f32_aligned_32x128_gmem_sm80); +} + + +template +void dispatch_cutlassF(T cb, int cc = 0) { + + if (std::is_same_v && 80 <= cc && cc <= 120) { + dispatch_cutlassF_bf16_sm80(cb, cc); + } + if (std::is_same_v && 50 <= cc && cc < 70) { + dispatch_cutlassF_f16_sm50(cb, cc); + } + if (std::is_same_v && 70 <= cc && cc < 75) { + dispatch_cutlassF_f16_sm70(cb, cc); + } + if (std::is_same_v && 75 <= cc && cc < 80) { + dispatch_cutlassF_f16_sm75(cb, cc); + } + if (std::is_same_v && 80 <= cc && cc <= 120) { + dispatch_cutlassF_f16_sm80(cb, cc); + } + if (std::is_same_v && 50 <= cc && cc < 70) { + dispatch_cutlassF_f32_sm50(cb, cc); + } + if (std::is_same_v && 70 <= cc && cc < 75) { + dispatch_cutlassF_f32_sm70(cb, cc); + } + if (std::is_same_v && 75 <= cc && cc < 80) { + dispatch_cutlassF_f32_sm75(cb, cc); + } + if (std::is_same_v && 80 <= cc && cc <= 120) { + dispatch_cutlassF_f32_sm80(cb, cc); + } +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/transformers/cuda/mem_eff_attention/pytorch_utils.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/transformers/cuda/mem_eff_attention/pytorch_utils.h new file mode 100644 index 0000000000000000000000000000000000000000..0bc1d1b36a1014d4dc8db3c777189f342ffe54fc --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/transformers/cuda/mem_eff_attention/pytorch_utils.h @@ -0,0 +1,49 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ +#pragma once + +#include + +#include +#include + + +template +struct CutlassToAtenDtype; + +template <> +struct CutlassToAtenDtype { + using scalar_t = cutlass::half_t; + + static constexpr __host__ at::ScalarType atScalarType() { + return at::ScalarType::Half; + } +}; + +template <> +struct CutlassToAtenDtype { + using scalar_t = cutlass::bfloat16_t; + + static constexpr __host__ at::ScalarType atScalarType() { + return at::ScalarType::BFloat16; + } +}; + +template <> +struct CutlassToAtenDtype { + using scalar_t = float; + + static constexpr __host__ at::ScalarType atScalarType() { + return at::ScalarType::Float; + } +}; + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/transformers/cuda/mem_eff_attention/transform/tile_smem_loader.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/transformers/cuda/mem_eff_attention/transform/tile_smem_loader.h new file mode 100644 index 0000000000000000000000000000000000000000..c2ea60fe3b642bd55892e4998b03fcc1f8492200 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/transformers/cuda/mem_eff_attention/transform/tile_smem_loader.h @@ -0,0 +1,71 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +template < + typename scalar_t, // scalar type + typename ThreadblockTileShape, // size of tile to load + int Threads, // number of participating threads + int ElementsPerAccess> // thread access width in elements +class TileSmemLoader { + public: + using SmemTile = + cutlass::AlignedBuffer; + + using ThreadMap = cutlass::transform::PitchLinearStripminedThreadMap< + cutlass::layout::PitchLinearShape< + ThreadblockTileShape::kColumn, // contiguous + ThreadblockTileShape::kRow>, // strided + Threads, // Threads + ElementsPerAccess>; // ElementsPerAccess + + using GmemTileIterator = + cutlass::transform::threadblock::PredicatedTileIterator< + ThreadblockTileShape, // Shape + scalar_t, // Element + cutlass::layout::RowMajor, // Layout + 0, // AdvanceRank + ThreadMap>; // ThreadMap + + using SmemTileIterator = cutlass::transform::threadblock::RegularTileIterator< + ThreadblockTileShape, // Shape + 
scalar_t, // Element + cutlass::layout::RowMajor, // Layout + 0, // AdvanceRank + ThreadMap>; // ThreadMap + + using Fragment = typename GmemTileIterator::Fragment; + + /// load a tile from global memory into shared memory + CUTLASS_DEVICE + static void load( + GmemTileIterator tile_load_iter, + SmemTileIterator tile_store_iter) { + Fragment tb_frag; + tb_frag.clear(); + tile_load_iter.load(tb_frag); + tile_store_iter.store(tb_frag); + + __syncthreads(); + } +}; + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/transformers/cuda/sdp_utils.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/transformers/cuda/sdp_utils.h new file mode 100644 index 0000000000000000000000000000000000000000..29248fe216b89a88359e6386cdc46caf1d3b7a33 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/transformers/cuda/sdp_utils.h @@ -0,0 +1,22 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include +#include +#include +#include + +namespace sdp { + +bool check_for_seq_len_1_nested_tensor(sdp_params const& params, bool debug); +SDPBackend select_sdp_backend(sdp_params const& kernel_params); +C10_EXPORT bool is_flash_attention_available(); +C10_EXPORT bool can_use_flash_attention(sdp_params const& params, bool debug); +C10_EXPORT bool can_use_mem_efficient_attention(sdp_params const& params, bool debug); +C10_EXPORT bool can_use_cudnn_attention(sdp_params const& params, bool debug); + +} // namespace sdp + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/transformers/hip/aotriton_adapter.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/transformers/hip/aotriton_adapter.h new file mode 100644 index 0000000000000000000000000000000000000000..ea7959d0ff81a48be7e71c1878fe9200ca7506df --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/transformers/hip/aotriton_adapter.h @@ -0,0 +1,190 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#ifdef USE_ROCM + +// Expect to be included after headers of at::zeros_like and at::empty_like + +#include +#include +#include +#include + +//////////////////////////////////////////////////////////////////////////////// +// Common macros copied from cuda/mem_eff_attention/gemm_kernel_utils.h +//////////////////////////////////////////////////////////////////////////////// + +namespace sdp { + +namespace aotriton_adapter { + +inline aotriton::DType cast_dtype(caffe2::TypeMeta t_dtype) +{ +#define CAST_TYPE(aname, dtname) if (t_dtype == at::aname) return aotriton::DType::dtname + CAST_TYPE(kByte, kUInt8); + CAST_TYPE(kUInt16, kUInt16); + CAST_TYPE(kUInt32, kUInt32); + CAST_TYPE(kUInt64, kUInt64); + CAST_TYPE(kChar, kInt8); + CAST_TYPE(kShort, kInt16); + CAST_TYPE(kInt, kInt32); + CAST_TYPE(kLong, kInt64); + CAST_TYPE(kHalf, kFloat16); + CAST_TYPE(kFloat, kFloat32); + CAST_TYPE(kBFloat16, kBFloat16); + return aotriton::DType::kUnknown; +#undef CAST_TYPE +} + +template +struct IntArrayRefCaster { + // std::array cast(IntArrayRef); +}; + +template +struct IntArrayRefCaster { + static auto cast(at::IntArrayRef ref) { + return std::array{{ static_cast(ref.at(0)) }}; + } +}; + +template +struct IntArrayRefCaster { + static auto cast(at::IntArrayRef ref) { + return std::array{{ + static_cast(ref.at(0)), 
+ static_cast(ref.at(1)) + }}; + } +}; + +template +struct IntArrayRefCaster { + static auto cast(at::IntArrayRef ref) { + return std::array{{ + static_cast(ref.at(0)), + static_cast(ref.at(1)), + static_cast(ref.at(2)) + }}; + } +}; + +template +struct IntArrayRefCaster { + static auto cast(at::IntArrayRef ref) { + return std::array{{ + static_cast(ref.at(0)), + static_cast(ref.at(1)), + static_cast(ref.at(2)), + static_cast(ref.at(3)) + }}; + } +}; + + +template +aotriton::TensorView mk_aotensor(const at::Tensor& q, std::string_view tensor_name) +{ + const auto strides = q.strides(); + int real_rank = strides.size(); + if (real_rank != Rank) { // Lazy conversion of tensor_name + TORCH_CHECK(false, + std::string(tensor_name) + "'s rank should be " + std::to_string(Rank) + + " but is " + std::to_string(real_rank)); + } + return aotriton::TensorView(reinterpret_cast(q.data_ptr()), + IntArrayRefCaster::cast(q.sizes()), + IntArrayRefCaster::cast(strides), + cast_dtype(q.dtype())); +} + +inline aotriton::TensorView<0> mk_aoscalartensor(const at::Tensor& q) +{ + return aotriton::TensorView<0>(reinterpret_cast(q.data_ptr()), + cast_dtype(q.dtype())); +} + +inline aotriton::TensorView<0> mk_philoxtensor(const int64_t* ptr) +{ + return aotriton::TensorView<0>(reinterpret_cast(ptr), + aotriton::DType::kUInt64); // AOTriton accepts unsigned int64 +} + +inline aotriton::TensorView<0> mk_atomictensor(const int32_t* ptr) +{ + return aotriton::TensorView<0>(reinterpret_cast(ptr), + aotriton::DType::kInt32); +} + +#if AOTRITON_VERSION_CURRENT >= AOTRITON_VERSION_INT(0, 11) + +struct LazyTensorContext { + at::Tensor like_tensor; + std::string_view tensor_name; + at::Tensor tensor; +}; + +template +struct LazyTensorFunctions : public LazyTensorContext { + static aotriton::TensorView acquire(void* cookie) { + auto ctx = (LazyTensorContext*)cookie; + if (!ctx->tensor.defined()) { + auto q = ctx->like_tensor; + if constexpr (kRequireZeros) { + ctx->tensor = at::zeros(q.sizes(), + 
namespace at::native {

// Integer division of `numerator` by `denominator`, rounded up by biasing the
// numerator with (denominator - 1) before the truncating division.
//
// The result equals the mathematical ceiling for a non-negative numerator and
// a positive denominator; no checks are performed on the arguments.
inline int64_t ceil_div(int64_t numerator, int64_t denominator) {
  const int64_t rounding_bias = denominator - 1;
  const int64_t biased_numerator = numerator + rounding_bias;
  return biased_numerator / denominator;
}

} // namespace at::native
// Packs a (major, minor) version pair into one comparable integer:
// major * 100 + minor, e.g. (0, 11) -> 11 and (1, 2) -> 102.
//
// Every macro parameter is parenthesized so that expression arguments such as
// AOTRITON_VERSION_INT(1 + 1, 2 + 3) are grouped as intended instead of being
// re-associated by operator precedence after textual substitution.
#define AOTRITON_VERSION_INT(x, y) ((x) * 100 + (y))

// Packed version of the AOTriton library being compiled against. Relies on
// AOTRITON_VERSION_MAJOR and AOTRITON_VERSION_MINOR being defined elsewhere;
// they are parenthesized for the same reason as above.
#define AOTRITON_VERSION_CURRENT \
  ((AOTRITON_VERSION_MAJOR) * 100 + (AOTRITON_VERSION_MINOR))
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/transformers/hip/flash_attn/ck/me_ck_api.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/transformers/hip/flash_attn/ck/me_ck_api.h new file mode 100644 index 0000000000000000000000000000000000000000..960eb5f101bc4ef5fe4c67222f91799cd53fbbe5 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/transformers/hip/flash_attn/ck/me_ck_api.h @@ -0,0 +1,72 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +#include + +#include + +#if defined(USE_ROCM_CK_SDPA) +namespace pytorch_flash { + +std::tuple< + at::Tensor, // output + at::Tensor, // q + at::Tensor, // k + at::Tensor, // v + at::Tensor, // lse + at::Tensor, // seed + at::Tensor, // offset + at::Tensor> // dropout randval +mem_eff_forward_ck( + const at::Tensor& q, + const at::Tensor& k, + const at::Tensor& v, + float p_dropout, + bool return_dropout_randval, + std::optional is_causal, + std::optional scale, + const std::optional& attn_bias_, + std::optional& out_, + const std::optional& cu_seqlens_q, + const std::optional& cu_seqlens_k, + const std::optional& seqstart_q, + const std::optional& seqstart_k, + std::optional gen_, + std::optional& seqused_k_ +); + +std::tuple< + at::Tensor, // dQ + at::Tensor, // dK + at::Tensor, // dV + at::Tensor> // dBias +mem_eff_backward_ck( + const at::Tensor &dout, + const at::Tensor &q, + const at::Tensor &k, + const at::Tensor &v, + const at::Tensor &out, + const at::Tensor &softmax_lse, + const at::Tensor &dq_, + const at::Tensor &dk_, + const at::Tensor &dv_, + std::optional &attn_bias, + bool bias_requires_grad, + std::optional &grad_bias, + std::optional &cu_seqlens_q, + std::optional &cu_seqlens_k, + int max_seqlen_q, + int max_seqlen_k, + float p_dropout, + float scale, + bool 
is_causal, + bool deterministic, + bool zero_tensors, + const at::Tensor philox_seed, + const at::Tensor philox_offset); + +} // namespace pytorch_flash +#endif // USE_ROCM_CK_SDPA + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/transformers/hip/flash_attn/flash_api.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/transformers/hip/flash_attn/flash_api.h new file mode 100644 index 0000000000000000000000000000000000000000..a8248d01a14e583ac8889d555d047877a7e7821a --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/transformers/hip/flash_attn/flash_api.h @@ -0,0 +1,568 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +#include + +#include +#include +#include + +#define CHECK_NOSPARSE_CONTIGUOUS_CUDA(TENSOR) \ + TORCH_CHECK(TENSOR.is_cuda(), #TENSOR " must be a CUDA tensor"); \ + TORCH_CHECK(!TENSOR.is_sparse(), #TENSOR " must be a dense tensor"); \ + TORCH_CHECK(TENSOR.is_contiguous()); + +#define CHECK_NOSPARSE_LASTCONTIGUOUS_CUDA(TENSOR) \ + TORCH_CHECK(TENSOR.is_cuda(), #TENSOR " must be a CUDA tensor"); \ + TORCH_CHECK(!TENSOR.is_sparse(), #TENSOR " must be a dense tensor"); \ + TORCH_CHECK( \ + TENSOR.stride(-1) == 1, #TENSOR ": last dimension must be contiguous"); + +#define CHECK_ALIGNED_PTR(PTR, ALIGNMENT) \ + TORCH_CHECK( \ + uint64_t(PTR) % ALIGNMENT == 0, #PTR " is not correctly aligned") + +#define ASSIGN_CHECK_OVERFLOW(A, B) \ + { \ + A = B; \ + TORCH_CHECK( \ + B < std::numeric_limits::max(), #B " overflows"); \ + } + +namespace pytorch_flash { + +// AOTriton Implementation +TORCH_API +std::tuple< + at::Tensor, + at::Tensor, + at::Tensor, + at::Tensor, + at::Tensor, + at::Tensor, + 
at::Tensor, + at::Tensor> +mha_fwd_aot( + const at::Tensor& q, // batch_size x seqlen_q x num_heads x head_size + const at::Tensor& k, // batch_size x seqlen_k x num_heads_k x head_size + const at::Tensor& v, // batch_size x seqlen_k x num_heads_k x head_size + std::optional& + out_, // batch_size x seqlen_q x num_heads x head_size + std::optional& + alibi_slopes_, // num_heads or batch_size x num_heads + const float p_dropout, + const float softmax_scale, + bool is_causal, + std::optional window_size_left, + std::optional window_size_right, + const bool return_softmax, + const std::optional& gen_); + +std::tuple< + at::Tensor, + at::Tensor, + at::Tensor, + at::Tensor, + at::Tensor, + at::Tensor, + at::Tensor, + at::Tensor> +mha_varlen_fwd_aot( + const at::Tensor& + q, // total_q x num_heads x head_size, total_q := \sum_{i=0}^{b} s_i + const at::Tensor& + k, // total_k x num_heads_k x head_size, total_k := \sum_{i=0}^{b} s_i + const at::Tensor& + v, // total_k x num_heads_k x head_size, total_k := \sum_{i=0}^{b} s_i + std::optional& + out_, // total_q x num_heads x head_size, total_k := \sum_{i=0}^{b} s_i + const at::Tensor& cu_seqlens_q, // b+1 + const at::Tensor& cu_seqlens_k, // b+1 + std::optional& + seqused_k, // b. If given, only this many elements of each batch + // element's keys are used. 
+ std::optional& block_table_, + std::optional& alibi_slopes_, // num_heads or b x num_heads + int max_seqlen_q, + const int max_seqlen_k, + const float p_dropout, + const float softmax_scale, + const bool zero_tensors, + bool is_causal, + std::optional window_size_left, + std::optional window_size_right, + const bool return_softmax, + const std::optional& gen_); + +std::tuple mha_bwd_aot( + const at::Tensor& dout, // batch_size x seqlen_q x num_heads, x head_size_og + const at::Tensor& q, // batch_size x seqlen_q x num_heads x head_size + const at::Tensor& k, // batch_size x seqlen_k x num_heads_k x head_size + const at::Tensor& v, // batch_size x seqlen_k x num_heads_k x head_size + const at::Tensor& out, // batch_size x seqlen_q x num_heads x head_size + const at::Tensor& softmax_lse, // b x h x seqlen_q + std::optional& + dq_, // batch_size x seqlen_q x num_heads x head_size + std::optional& + dk_, // batch_size x seqlen_k x num_heads_k x head_size + std::optional& + dv_, // batch_size x seqlen_k x num_heads_k x head_size + std::optional& + alibi_slopes_, // num_heads or batch_size x num_heads + const float p_dropout, // probability to drop + const float softmax_scale, + const bool is_causal, + std::optional window_size_left, + std::optional window_size_right, + const bool deterministic, + const at::Tensor& philox_seed, + const at::Tensor& philox_offset); + +std::tuple mha_varlen_bwd_aot( + const at::Tensor& dout, // total_q x num_heads, x head_size + const at::Tensor& + q, // total_q x num_heads x head_size, total_q := \sum_{i=0}^{b} s_i + const at::Tensor& + k, // total_k x num_heads_k x head_size, total_k := \sum_{i=0}^{b} s_i + const at::Tensor& + v, // total_k x num_heads_k x head_size, total_k := \sum_{i=0}^{b} s_i + const at::Tensor& out, // total_q x num_heads x head_size + const at::Tensor& softmax_lse, // b x h x s softmax logsumexp + std::optional& + dq_, // total_q x num_heads x head_size, total_q := \sum_{i=0}^{b} s_i + std::optional& + dk_, // 
total_k x num_heads_k x head_size, total_k := \sum_{i=0}^{b} s_i + std::optional& + dv_, // total_k x num_heads_k x head_size, total_k := \sum_{i=0}^{b} s_i + const at::Tensor& cu_seqlens_q, // b+1 + const at::Tensor& cu_seqlens_k, // b+1 + std::optional& alibi_slopes_, // num_heads or b x num_heads + const int max_seqlen_q, + const int max_seqlen_k, // max sequence length to choose the kernel + const float p_dropout, // probability to drop + const float softmax_scale, + const bool zero_tensors, + const bool is_causal, + std::optional window_size_left, + std::optional window_size_right, + const bool deterministic, + const at::Tensor& philox_seed, + const at::Tensor& philox_offset); + +#if defined(USE_ROCM_CK_SDPA) +// CK implementation +TORCH_API +std::tuple< + at::Tensor, + at::Tensor, + at::Tensor, + at::Tensor, + at::Tensor, + at::Tensor, + at::Tensor, + at::Tensor> +mha_fwd_ck( + const at::Tensor& q, // batch_size x seqlen_q x num_heads x head_size + const at::Tensor& k, // batch_size x seqlen_k x num_heads_k x head_size + const at::Tensor& v, // batch_size x seqlen_k x num_heads_k x head_size + std::optional& + out_, // batch_size x seqlen_q x num_heads x head_size + const float p_dropout, + const float softmax_scale, + bool is_causal, + int window_size_left, + int window_size_right, + const bool return_softmax, + std::optional gen_, + const std::optional& attn_bias_); // batch_size x nheads x seqlen_q x seqlen_k + +std::tuple< + at::Tensor, + at::Tensor, + at::Tensor, + at::Tensor, + at::Tensor, + at::Tensor, + at::Tensor, + at::Tensor> +mha_varlen_fwd_ck( + const at::Tensor& + q, // total_q x num_heads x head_size, total_q := \sum_{i=0}^{b} s_i + const at::Tensor& + k, // total_k x num_heads_k x head_size, total_k := \sum_{i=0}^{b} s_i + const at::Tensor& + v, // total_k x num_heads_k x head_size, total_k := \sum_{i=0}^{b} s_i + std::optional& + out_, // total_q x num_heads x head_size, total_k := \sum_{i=0}^{b} s_i + const at::Tensor& cu_seqlens_q, // b+1 + 
const at::Tensor& cu_seqlens_k, // b+1 + std::optional& + seqused_k, // b. If given, only this many elements of each batch + // element's keys are used. + int max_seqlen_q, + const int max_seqlen_k, + const float p_dropout, + const float softmax_scale, + const bool zero_tensors, + bool is_causal, + int window_size_left, + int window_size_right, + const bool return_softmax, + std::optional gen_, + const std::optional& attn_bias_); + +std::tuple mha_bwd_ck( + const at::Tensor& dout, // batch_size x seqlen_q x num_heads, x head_size_og + const at::Tensor& q, // batch_size x seqlen_q x num_heads x head_size + const at::Tensor& k, // batch_size x seqlen_k x num_heads_k x head_size + const at::Tensor& v, // batch_size x seqlen_k x num_heads_k x head_size + const at::Tensor& out, // batch_size x seqlen_q x num_heads x head_size + const at::Tensor& softmax_lse, // b x h x seqlen_q + std::optional& + dq_, // batch_size x seqlen_q x num_heads x head_size + std::optional& + dk_, // batch_size x seqlen_k x num_heads_k x head_size + std::optional& + dv_, // batch_size x seqlen_k x num_heads_k x head_size + std::optional& + attn_bias_, // batch_size x num_heads x seqlen_q x seqlen_k + bool bias_requires_grad, + std::optional& grad_bias, + const float p_dropout, // probability to drop + const float softmax_scale, + const bool is_causal, + int window_size_left, + int window_size_right, + const bool deterministic, + const at::Tensor philox_seed, + const at::Tensor philox_offset); + +std::tuple mha_varlen_bwd_ck( + const at::Tensor& dout, // total_q x num_heads, x head_size + const at::Tensor& + q, // total_q x num_heads x head_size, total_q := \sum_{i=0}^{b} s_i + const at::Tensor& + k, // total_k x num_heads_k x head_size, total_k := \sum_{i=0}^{b} s_i + const at::Tensor& + v, // total_k x num_heads_k x head_size, total_k := \sum_{i=0}^{b} s_i + const at::Tensor& out, // total_q x num_heads x head_size + const at::Tensor& softmax_lse, // b x h x s softmax logsumexp + 
std::optional& + dq_, // total_q x num_heads x head_size, total_q := \sum_{i=0}^{b} s_i + std::optional& + dk_, // total_k x num_heads_k x head_size, total_k := \sum_{i=0}^{b} s_i + std::optional& + dv_, // total_k x num_heads_k x head_size, total_k := \sum_{i=0}^{b} s_i + const at::Tensor& cu_seqlens_q, // b+1 + const at::Tensor& cu_seqlens_k, // b+1 + std::optional& attn_bias_, // num_heads or b x num_heads + bool bias_requires_grad, + std::optional& grad_bias, + const int max_seqlen_q, + const int max_seqlen_k, // max sequence length to choose the kernel + const float p_dropout, // probability to drop + const float softmax_scale, + const bool zero_tensors, + const bool is_causal, + int window_size_left, + int window_size_right, + const bool deterministic, + const at::Tensor philox_seed, + const at::Tensor philox_offset); +#endif + +TORCH_API +std::tuple< + at::Tensor, + at::Tensor, + at::Tensor, + at::Tensor, + at::Tensor, + at::Tensor, + at::Tensor, + at::Tensor> +mha_fwd( + const at::Tensor& q, // batch_size x seqlen_q x num_heads x head_size + const at::Tensor& k, // batch_size x seqlen_k x num_heads_k x head_size + const at::Tensor& v, // batch_size x seqlen_k x num_heads_k x head_size + std::optional& + out_, // batch_size x seqlen_q x num_heads x head_size + std::optional& + alibi_slopes_, // num_heads or batch_size x num_heads + const float p_dropout, + const float softmax_scale, + bool is_causal, + std::optional window_size_left, + std::optional window_size_right, + const float softcap, + const bool return_softmax, + std::optional gen_); + +inline std::tuple< + at::Tensor, + at::Tensor, + at::Tensor, + at::Tensor, + at::Tensor, + at::Tensor, + at::Tensor, + at::Tensor> +mha_varlen_fwd( + const at::Tensor& + q, // total_q x num_heads x head_size, total_q := \sum_{i=0}^{b} s_i + const at::Tensor& + k, // total_k x num_heads_k x head_size, total_k := \sum_{i=0}^{b} s_i + const at::Tensor& + v, // total_k x num_heads_k x head_size, total_k := \sum_{i=0}^{b} 
s_i + std::optional& + out_, // total_q x num_heads x head_size, total_k := \sum_{i=0}^{b} s_i + const at::Tensor& cu_seqlens_q, // b+1 + const at::Tensor& cu_seqlens_k, // b+1 + std::optional& + seqused_k, // b. If given, only this many elements of each batch + // element's keys are used. + std::optional& + block_table_, // Not used on ROCm. Keeping for parity with CUDA + std::optional& alibi_slopes_, // num_heads or b x num_heads + int max_seqlen_q, + const int max_seqlen_k, + const float p_dropout, + const float softmax_scale, + const bool zero_tensors, + bool is_causal, + std::optional window_size_left, + std::optional window_size_right, + const float softcap, + const bool return_softmax, + std::optional gen_) { +#if defined(USE_ROCM_CK_SDPA) + if (at::globalContext().getROCmFAPreferredBackend() == + at::ROCmFABackend::Ck) { + std::optional dummy_attn_bias = std::nullopt; + const int non_null_window_left = window_size_left.value_or(-1); + const int non_null_window_right = window_size_right.value_or(-1); + return mha_varlen_fwd_ck( + q, + k, + v, + out_, + cu_seqlens_q, + cu_seqlens_k, + seqused_k, + max_seqlen_q, + max_seqlen_k, + p_dropout, + softmax_scale, + zero_tensors, + is_causal, + non_null_window_left, + non_null_window_right, + return_softmax, + gen_, + dummy_attn_bias); // Not used in flash attention + } +#endif + return mha_varlen_fwd_aot( + q, + k, + v, + out_, + cu_seqlens_q, + cu_seqlens_k, + seqused_k, + block_table_, + alibi_slopes_, + max_seqlen_q, + max_seqlen_k, + p_dropout, + softmax_scale, + zero_tensors, + is_causal, + window_size_left, + window_size_right, + return_softmax, + gen_); +} + +inline std::tuple mha_bwd( + const at::Tensor& dout, // batch_size x seqlen_q x num_heads, x head_size_og + const at::Tensor& q, // batch_size x seqlen_q x num_heads x head_size + const at::Tensor& k, // batch_size x seqlen_k x num_heads_k x head_size + const at::Tensor& v, // batch_size x seqlen_k x num_heads_k x head_size + const at::Tensor& out, // 
batch_size x seqlen_q x num_heads x head_size + const at::Tensor& softmax_lse, // b x h x seqlen_q + std::optional& + dq_, // batch_size x seqlen_q x num_heads x head_size + std::optional& + dk_, // batch_size x seqlen_k x num_heads_k x head_size + std::optional& + dv_, // batch_size x seqlen_k x num_heads_k x head_size + std::optional& + alibi_slopes_, // num_heads or batch_size x num_heads + const float p_dropout, // probability to drop + const float softmax_scale, + const bool is_causal, + std::optional window_size_left, + std::optional window_size_right, + const float softcap, + const bool deterministic, + const at::Tensor philox_seed, + const at::Tensor philox_offset) { + +#if defined(USE_ROCM_CK_SDPA) + if (at::globalContext().getROCmFAPreferredBackend() == + at::ROCmFABackend::Ck) { + std::optional non_null_dbias = std::nullopt; + const int non_null_window_left = window_size_left.value_or(-1); + const int non_null_window_right = window_size_right.value_or(-1); + auto[dQuery, + dKey, + dValue, + dSoftmax, + dBias] = mha_bwd_ck( + dout, + q, + k, + v, + out, + softmax_lse, + dq_, + dk_, + dv_, + alibi_slopes_, + false, // bias_requires_grad + non_null_dbias, + p_dropout, + softmax_scale, + is_causal, + non_null_window_left, + non_null_window_right, + deterministic, + philox_seed, + philox_offset); + // for FA return [dQ, dV, dK, dSoftmax] + return std::make_tuple(std::move(dQuery), std::move(dKey), std::move(dValue), std::move(dSoftmax)); + } +#endif + return mha_bwd_aot( + dout, + q, + k, + v, + out, + softmax_lse, + dq_, + dk_, + dv_, + alibi_slopes_, + p_dropout, + softmax_scale, + is_causal, + window_size_left, + window_size_right, + deterministic, + philox_seed, + philox_offset); +} + +inline std::tuple mha_varlen_bwd( + const at::Tensor& dout, // total_q x num_heads, x head_size + const at::Tensor& + q, // total_q x num_heads x head_size, total_q := \sum_{i=0}^{b} s_i + const at::Tensor& + k, // total_k x num_heads_k x head_size, total_k := 
\sum_{i=0}^{b} s_i + const at::Tensor& + v, // total_k x num_heads_k x head_size, total_k := \sum_{i=0}^{b} s_i + const at::Tensor& out, // total_q x num_heads x head_size + const at::Tensor& softmax_lse, // b x h x s softmax logsumexp + std::optional& + dq_, // total_q x num_heads x head_size, total_q := \sum_{i=0}^{b} s_i + std::optional& + dk_, // total_k x num_heads_k x head_size, total_k := \sum_{i=0}^{b} s_i + std::optional& + dv_, // total_k x num_heads_k x head_size, total_k := \sum_{i=0}^{b} s_i + const at::Tensor& cu_seqlens_q, // b+1 + const at::Tensor& cu_seqlens_k, // b+1 + std::optional& alibi_slopes_, // num_heads or b x num_heads + const int max_seqlen_q, + const int max_seqlen_k, // max sequence length to choose the kernel + const float p_dropout, // probability to drop + const float softmax_scale, + const bool zero_tensors, + const bool is_causal, + std::optional window_size_left, + std::optional window_size_right, + const float softcap, + const bool deterministic, + const at::Tensor philox_seed, + const at::Tensor philox_offset) { +#if defined(USE_ROCM_CK_SDPA) + if (at::globalContext().getROCmFAPreferredBackend() == + at::ROCmFABackend::Ck) { + std::optional non_null_dbias = std::nullopt; + const int non_null_window_left = window_size_left.value_or(-1); + const int non_null_window_right = window_size_right.value_or(-1); + auto[dQuery, + dKey, + dValue, + dSoftmax, + dBias] = mha_varlen_bwd_ck( + dout, + q, + k, + v, + out, + softmax_lse, + dq_, + dk_, + dv_, + cu_seqlens_q, + cu_seqlens_k, + alibi_slopes_, + false, // bias_requires_grad + non_null_dbias, + max_seqlen_q, + max_seqlen_k, + p_dropout, + softmax_scale, + zero_tensors, + is_causal, + non_null_window_left, + non_null_window_right, + deterministic, + philox_seed, + philox_offset); + // for FA return [dQ, dV, dK, dSoftmax] + return std::make_tuple(std::move(dQuery), std::move(dKey), std::move(dValue), std::move(dSoftmax)); + } +#endif + return mha_varlen_bwd_aot( + dout, + q, + k, + v, 
+ out, + softmax_lse, + dq_, + dk_, + dv_, + cu_seqlens_q, + cu_seqlens_k, + alibi_slopes_, + max_seqlen_q, + max_seqlen_k, + p_dropout, + softmax_scale, + zero_tensors, + is_causal, + window_size_left, + window_size_right, + deterministic, + philox_seed, + philox_offset); +} + +} // namespace pytorch_flash + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/transformers/hip/gemm_kernel_utils.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/transformers/hip/gemm_kernel_utils.h new file mode 100644 index 0000000000000000000000000000000000000000..c1ef5d0ce501e14c665cc767fa51400938c01ec9 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/transformers/hip/gemm_kernel_utils.h @@ -0,0 +1,37 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +// This file is a trimmed version of cuda/mem_eff_attention/gemm_kernel_utils.h +#pragma once + +#define CHECK_NOSPARSE_CONTIGUOUS_CUDA(TENSOR) \ + TORCH_CHECK(TENSOR.is_cuda(), #TENSOR " must be a CUDA tensor"); \ + TORCH_CHECK(!TENSOR.is_sparse(), #TENSOR " must be a dense tensor"); \ + TORCH_CHECK(TENSOR.is_contiguous()); + +#define CHECK_NOSPARSE_LASTCONTIGUOUS_CUDA(TENSOR) \ + TORCH_CHECK(TENSOR.is_cuda(), #TENSOR " must be a CUDA tensor"); \ + TORCH_CHECK(!TENSOR.is_sparse(), #TENSOR " must be a dense tensor"); \ + TORCH_CHECK( \ + TENSOR.stride(-1) == 1, #TENSOR ": last dimension must be contiguous"); + +#define CHECK_ALIGNED_PTR(PTR, ALIGNMENT) \ + TORCH_CHECK( \ + uint64_t(PTR) % ALIGNMENT == 0, #PTR " is not correctly aligned") + +#define ASSIGN_CHECK_OVERFLOW(A, B) \ + { \ + A = B; \ + TORCH_CHECK( \ + B < std::numeric_limits::max(), #B " overflows"); \ + } + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/transformers/sdp_utils.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/transformers/sdp_utils.h new file mode 100644 index 0000000000000000000000000000000000000000..f0d1ee26d31ec437ed2d66b6f8eedfb25277736d --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/transformers/sdp_utils.h @@ -0,0 +1,93 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +#include +#include + +namespace at::native { + +void alloc_with_matching_layout( + const Tensor& q, + Tensor& output, + const std::vector& shape) { + TORCH_INTERNAL_ASSERT( + shape.size() == q.sizes().size(), + "SDPA alloc_with_matching_layout got requested shape ndim != q ndim"); + + if (std::equal(q.sizes().begin(), q.sizes().end(), shape.begin())) { + output = at::empty_like(q); + return; + } + + // get the "fill order," which is just an argsort on the strides + std::vector fill_order(shape.size()); + std::iota(fill_order.begin(), fill_order.end(), 0); + const auto q_strides = q.strides(); + std::stable_sort( + fill_order.begin(), fill_order.end(), [&q_strides](int idx1, int idx2) { + return q_strides[idx1] ? q_strides[idx1] : 1 < q_strides[idx2] ? 
q_strides[idx2] : 1; + }); + std::vector ordered_strides(shape.size()); + int64_t current_stride = 1; + for (const int dim_idx : fill_order) { + ordered_strides[dim_idx] = current_stride; + current_stride *= shape[dim_idx]; + } + output = at::empty(at::IntArrayRef(shape), q.options()) + .as_strided( + at::IntArrayRef(shape), at::IntArrayRef(ordered_strides), 0); +} + +void permute_to_matching_layout(const Tensor& output, Tensor& grad_output) { + const int dims = output.sizes().size(); + std::vector outer_to_inner(dims); + std::iota(outer_to_inner.begin(), outer_to_inner.end(), 0); + const auto o_strides = output.strides(); + std::stable_sort( + outer_to_inner.begin(), + outer_to_inner.end(), + [&o_strides](int idx1, int idx2) { + return o_strides[idx1] > o_strides[idx2]; + }); + std::vector inverse(dims); + for (int d = 0; d < dims; d++) { + inverse[d] = std::find(outer_to_inner.begin(), outer_to_inner.end(), d) - + outer_to_inner.begin(); + } + grad_output = grad_output.permute(at::IntArrayRef(outer_to_inner)) + .contiguous() + .permute(at::IntArrayRef(inverse)); +} + +bool same_strides(const Tensor& t1, const Tensor& t2) { + std::vector t1_strides_no_ones; + std::vector t2_strides_no_ones; + const auto t1strides = t1.strides(); + const auto t2strides = t2.strides(); + const int dim = t1strides.size(); + if (dim != (int)t2strides.size()) { + return false; + } + const auto t1sizes = t1.sizes(); + const auto t2sizes = t2.sizes(); + + // we are going through strides backward here, but if both are backward it's + // comparable + for (int i = 0; i < dim; i++) { + if (t1sizes[i] > 1) { + t1_strides_no_ones.push_back(t1strides[i]); + } + if (t2sizes[i] > 1) { + t2_strides_no_ones.push_back(t2strides[i]); + } + } + return std::equal( + t1_strides_no_ones.begin(), + t1_strides_no_ones.end(), + t2_strides_no_ones.begin(), + t2_strides_no_ones.end()); +} +} // namespace at::native + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or 
TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/transformers/sdp_utils_cpp.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/transformers/sdp_utils_cpp.h new file mode 100644 index 0000000000000000000000000000000000000000..6ffddcdb008d1a6faf8e31187bc166d6581d7755 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/transformers/sdp_utils_cpp.h @@ -0,0 +1,565 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +namespace sdp { + +constexpr int32_t num_backends = at::num_sdp_backends; +using SDPBackend = at::SDPBackend; + +// Note that if this changed make sure to update +// the templated enum in mem_eff/kernel_forward.h and mem_eff/kernel_backward.h +enum class CustomMaskType { + NoCustomMask = 0, + CausalFromTopLeft = 1, + CausalFromBottomRight = 2, + NumCustomMaskTypes, +}; + +struct sdp_params { + at::Tensor query; + at::Tensor key; + at::Tensor value; + std::optional attn_mask; + double dropout; + bool is_causal; + bool enable_gqa; +}; + +SDPBackend select_sdp_backend_cpp(sdp_params const& kernel_params); + +inline c10::SymFloat calculate_scale( + const at::Tensor& query, + std::optional scale) { + const auto softmax_scale = scale.has_value() + ? 
scale.value() + : (c10::SymFloat(1.0) / (c10::SymFloat(query.sym_size(-1)).sqrt())); + return c10::SymFloat(softmax_scale); +} + +inline bool input_requires_grad(sdp_params const& params) { + const bool any_inputs_require_grad = params.query.requires_grad() || + params.key.requires_grad() || params.value.requires_grad(); + const bool gradmode_enabled = at::GradMode::is_enabled(); + return any_inputs_require_grad && gradmode_enabled; +} + +inline bool has_for_nested_inputs(sdp_params const& params) { + return + (params.query.is_nested() && params.query.layout() == c10::kStrided) || + (params.key.is_nested() && params.key.layout() == c10::kStrided) || + (params.value.is_nested() && params.value.layout() == c10::kStrided); +} + +inline bool has_for_dense_inputs(sdp_params const& params) { + return !params.query.is_nested() || !params.key.is_nested() || !params.value.is_nested(); +} + +inline bool has_only_dense_inputs(sdp_params const& params) { + return !params.query.is_nested() && !params.key.is_nested() && !params.value.is_nested(); +} + +template +inline bool check_tensor_dtype( + sdp_params const& params, + dtype_vector allowed_dtypes, + bool debug) { + auto query_dtype = params.query.dtype(); + if (!(query_dtype == params.key.dtype() && + query_dtype == params.value.dtype() && + (std::find(allowed_dtypes.begin(), allowed_dtypes.end(), query_dtype) != + allowed_dtypes.end()))) { + if (debug) { + TORCH_WARN( + "Expected query, key and value to all be of dtype: {", + c10::Join(", ", allowed_dtypes), + "}. 
Got ", + "Query dtype: ", + params.query.dtype(), + ", Key dtype: ", + params.key.dtype(), + ", and Value dtype: ", + params.value.dtype(), + " instead."); + } + return false; + } + return true; +} + + +inline bool try_broadcast_param_size( + const c10::SymInt q_size, + const c10::SymInt k_size, + const c10::SymInt v_size, + std::string_view param_name, + bool debug) { + auto max_size = std::max({q_size, k_size, v_size}); + if ((q_size != max_size && q_size != 1) || + (k_size != max_size && k_size != 1) || + (v_size != max_size && v_size != 1)) { + if (debug) { + TORCH_WARN( + "Both fused kernels require query, key and value to have broadcastable ", + param_name, + "got Query ", + param_name, + q_size, + ", Key ", + param_name, + k_size, + ", Value ", + param_name, + v_size, + " instead."); + } + return false; + } + return true; +} + +inline bool check_for_seq_len_0_and_consistent_head_dim_nested_tensor_helper( + at::Tensor const& param, + std::string_view param_name, + bool debug) { + const auto nt_tensor_impl = at::native::get_nested_tensor_impl(param); + const at::Tensor& sizes = nt_tensor_impl->get_nested_sizes(); + auto num_head_dims = nt_tensor_impl->opt_size(1); + if (!num_head_dims.has_value()) { + // num_head_dims is ragged + if (debug) { + TORCH_WARN( + "Fused kernels do not support ragged num_head_dims, ", + param_name, + "has a ragged num_heads."); + } + return false; + } + + auto* sizes_ptr = sizes.data_ptr(); + const int64_t n_tensors = param.size(0); + const int64_t size_tensor_stride = sizes.stride(0); + + // This is being called inside sdp with shape [batch, heads, {seq_len}, dim] + for (const auto i : c10::irange(n_tensors)) { + if (sizes_ptr[(i * size_tensor_stride) + 1] == 0) { + if (debug) { + TORCH_WARN( + "Fused kernels do not support seq_len == 0, ", + param_name, + "has a seq len of 0."); + } + return false; + } + } + return true; +} + +inline bool check_for_seq_len_0_nested_tensor(sdp_params const& params, bool debug) { + // When this 
function is called we are assured that the nt is dim==4 + bool q_is_safe = params.query.is_nested() + ? check_for_seq_len_0_and_consistent_head_dim_nested_tensor_helper( + params.query, "query ", debug) + : true; + // short circuit if any is unsafe + if (!q_is_safe) { + return false; + } + + bool k_is_safe = params.key.is_nested() + ? check_for_seq_len_0_and_consistent_head_dim_nested_tensor_helper( + params.key, "key ", debug) + : true; + if (!k_is_safe) { + return false; + } + + bool v_is_safe = params.value.is_nested() + ? check_for_seq_len_0_and_consistent_head_dim_nested_tensor_helper( + params.value, "value ", debug) + : true; + if (!v_is_safe) { + return false; + } + + // We now know none of the inputs have ragged num_heads, so we can safely + // access .size(1) + auto q_num_heads = params.query.size(1); + auto k_num_heads = params.key.size(1); + auto v_num_heads = params.value.size(1); + bool same_num_heads = + q_num_heads == k_num_heads && q_num_heads == v_num_heads; + + if (!same_num_heads) { + if (input_requires_grad(params)){ + if (debug) { + TORCH_WARN( + "Both fused kernels do not support training with broadcasted NT inputs."); + } + return false; + } + return try_broadcast_param_size( + q_num_heads, k_num_heads, v_num_heads, "num heads ", debug); + } + + return true; +} + +inline bool check_nested_tensor(sdp_params const& params, bool debug) { + // Return false if have nested tensor + if (!has_only_dense_inputs(params)) { + if (debug) { + TORCH_WARN( + "Both fused kernels of cpp version currently do not support Nested Tensor inputs."); + } + return false; + } + return true; +} + +inline bool check_for_dropout(sdp_params const& params, bool debug) { + if (params.dropout > 0.0) { + if (debug) { + TORCH_WARN("Both fused kernels do not support non-zero dropout."); + } + return false; + } + return true; +} + +inline bool check_requires_grad_and_nested(sdp_params const& params, bool debug) { + if (input_requires_grad(params)) { + if (debug) { + TORCH_WARN( 
+ "Memory efficient attention currently doesn't support training with NT inputs."); + } + return false; + } + return true; +} + +inline bool check_for_attn_mask(sdp_params const& params, bool debug) { + if (params.attn_mask.has_value()) { + if (debug) { + TORCH_WARN("Flash Attention does not support non-null attn_mask."); + } + return false; + } + return true; +} + +inline bool check_attn_mask_shape(sdp_params const& params, bool debug) { + auto attn_mask = params.attn_mask; + if (!attn_mask.has_value()) { + return true; + } + if (attn_mask.value().requires_grad()) { + return false; + } + auto batchSize = params.query.sym_size(0); + auto qSize = params.query.sym_size(2); + auto kvSize = params.key.sym_size(2); + auto num_head = params.query.sym_size(1); + if (attn_mask.value().sym_size(-2) != qSize && attn_mask.value().sym_size(-2) != 1) { + return false; + } + if (attn_mask.value().sym_size(-1) != kvSize && attn_mask.value().sym_size(-1) != 1) { + return false; + } + if (attn_mask.value().dim() == 2) { + return true; + } else if (attn_mask.value().dim() == 4) { + if ((attn_mask.value().sym_size(0) == 1 || attn_mask.value().sym_size(0) == batchSize) + && (attn_mask.value().sym_size(1) == 1 || attn_mask.value().sym_size(1) == num_head)) { + return true; + } + } + if (debug) { + TORCH_WARN("Please use the following attn mask shapes: ", + "2d - ({Q_seq_len, 1} x {KV_seq_len, 1}); ", + "4d - ({Batch, 1} x {Num_heads, 1} x {Q_seq_len, 1} x {KV_seq_len, 1})"); + } + return false; +} + +inline bool check_tensor_shapes(sdp_params const& params, bool debug) { + auto query_dim = params.query.dim(); + if (!(query_dim == params.key.dim() && query_dim == params.value.dim() && + (query_dim == 4))) { + if (debug) { + TORCH_WARN( + "All fused kernels requires query, key and value to be 4 dimensional, but got Query dim: ", + query_dim, + ", Key dim: ", + params.key.dim(), + ", Value dim: ", + params.value.dim(), + " instead."); + } + return false; + } + return true; +} + +inline 
bool check_safe_kv_broadcast(at::Tensor const& param, bool debug) { + const auto nt_tensor_impl = at::native::get_nested_tensor_impl(param); + auto seq_len = nt_tensor_impl->opt_size(2); + if (!seq_len.has_value()) { + if (debug) { + TORCH_WARN( + "For both fused kernels, if one of key/value batch_size requires " + "broadcasting and the other does not, then the other must have a ", + "consistent seq_len dim.") + } + return false; + } + return true; +} + +template +inline bool check_grouped_query_attention(sdp_params const& params, bool debug) { + const auto q_num_heads = params.query.sym_size(-3); + const auto k_num_heads = params.key.sym_size(-3); + const auto v_num_heads = params.value.sym_size(-3); + const bool same_kv_heads = k_num_heads == v_num_heads; + + if (requires_same_num_heads && !same_kv_heads){ + if (debug) { + TORCH_WARN( + "Both fused kernels require key and value to have the same num_heads and batch_size but got: ", + "Key sizes: ", + params.key.sizes(), + ", Value sizes: ", + params.value.sizes(), + ", Query sizes: ", + params.query.sizes(), + " instead."); + } + return false; + } + // Check if grouped query attention is supported and validate the number of + // heads + if (q_num_heads % k_num_heads != 0 || (!requires_same_num_heads && (q_num_heads % v_num_heads != 0))) { + if (debug) { + TORCH_WARN( + "The number of heads in key/value must divide number of heads in query.", + "Got input Key sizes(): ", + params.key.sym_size(-3), + ", Value sizes(): ", + params.value.sym_size(-3), + ", Query sizes(): ", + params.query.sym_size(-3), + " instead."); + } + return false; + } + return true; +} + +template +inline bool check_batch_size_and_num_heads_dense(sdp_params const& params, bool debug) { + // This is expected to be called after check_tensor_shapes ensuring that the + // size() calls won't error since the inputs are all 4 dimensional + + auto q_batch_size = params.query.sym_size(0); + auto k_batch_size = params.key.sym_size(0); + auto v_batch_size 
= params.value.sym_size(0); + + bool same_batch_size = + q_batch_size == k_batch_size && q_batch_size == v_batch_size; + + auto q_num_heads = params.query.sym_size(-3); + auto k_num_heads = params.key.sym_size(-3); + auto v_num_heads = params.value.sym_size(-3); + + bool same_num_heads = + q_num_heads == k_num_heads && q_num_heads == v_num_heads; + + if (!same_batch_size){ + if(debug) { + TORCH_WARN( + "For dense inputs, both fused kernels require query, key and value to have the same batch_size. ", + "Query.sizes(): ", + params.query.sizes(), + ", Key.sizes(): ", + params.key.sizes(), + ", Value.sizes(): ", + params.value.sizes(), + " instead. To broadcast dense inputs, try using unsqueeze and expand_to before passing them into the kernel."); + } + return false; + } + + if(params.enable_gqa && supports_gqa){ + return check_grouped_query_attention(params, debug); + } + + // same num heads condition for non-gqa case + if (!same_num_heads){ + if (debug) { + TORCH_WARN( + "For dense input, both fused kernels require query, key and value to have the same num_heads. ", + "Query.sizes(): ", + params.query.sizes(), + ", Key sizes(): ", + params.key.sizes(), + ", Value sizes(): ", + params.value.sizes(), + " instead. 
To broadcast dense inputs, try using unsqueeze and expand_to before passing them into the kernel."); + } + return false; + } + // If all checks pass, return true + return true; +} + +inline bool check_batch_size_nested(sdp_params const& params, bool debug) { + // This is expected to be called after check_tensor_shapes ensuring that the + // size() calls won't error since the inputs are all 4 dimensional + auto q_batch_size = params.query.sym_size(0); + auto k_batch_size = params.key.sym_size(0); + auto v_batch_size = params.value.sym_size(0); + + bool same_batch_size = + q_batch_size == k_batch_size && q_batch_size == v_batch_size; + + // num_heads logic for nested input is checked in + // check_for_seq_len_0_nested_tensor as there is handling there to make sure + // num_heads is not ragged + bool broadcastable_batch_size = true; + if (!same_batch_size) { + if (input_requires_grad(params)){ + if (debug) { + TORCH_WARN( + "Both fused kernels do not support training with broadcasted NT inputs."); + } + return false; + } + // try to broadcast batchsize + broadcastable_batch_size = try_broadcast_param_size( + q_batch_size, k_batch_size, v_batch_size, "batch size ", debug); + + // if only one of k or v require broadcasting of batch size, the other + // must have a consistent seq_len dim + if (broadcastable_batch_size) { + if (k_batch_size == 1 && v_batch_size != 1 && + !check_safe_kv_broadcast(params.value, debug)) { + return false; + } + if (v_batch_size == 1 && k_batch_size != 1 && + !check_safe_kv_broadcast(params.key, debug)) { + return false; + } + } + } + return broadcastable_batch_size; +} + +inline bool check_nonzero_sequence_lengths_dense(sdp_params const& params, bool debug) { + // In some cases people will pass in 0 sized tensors, this will + // cause the fused path to error with unaligned mask + bool zero_seq_len_q = params.query.sym_size(-2) == 0; + bool zero_seq_len_k = params.key.sym_size(-2) == 0; + if (zero_seq_len_q || zero_seq_len_k) { + if (debug) { 
+ TORCH_WARN( + "All fused kernels do not support zero seq_len_q or seq_len_kv."); + } + return false; + } + return true; +} + +template +inline bool check_last_dim_stride_equals_1_dense(sdp_params const& params, bool debug) { + // The stride checking for NestedTensors is done within the kernel + // And .contiguous will be called if needed + + // This function checks that the last dimension of the inputs to + // fused_attention have stride 1 + bool qkv_strides_equal_1 = params.query.sym_stride(-1) == 1 && + params.key.sym_stride(-1) == 1 && params.value.sym_stride(-1) == 1; + + // https://github.com/pytorch/pytorch/issues/116333 + // If the head_dim is size 1 the stride won't matter, but we + // check this condition before padding the head_dim to 1 + if (ignore_singleton_dim){ + qkv_strides_equal_1 = qkv_strides_equal_1 || params.query.sym_size(-1) == 1; + } + bool is_cpu = params.query.device().type() == c10::DeviceType::CPU; + bool mask_stride_equal_1 = params.attn_mask.has_value() + ? params.attn_mask.value().sym_stride(-1) == 1 + : true; + bool mask_stride_valid = is_cpu ? true : mask_stride_equal_1; + if (!(qkv_strides_equal_1 && mask_stride_valid)) { + if (debug) { + std::ostringstream message; + message + << "All fused kernels require the last dimension of the input to have stride 1. 
"; + message << "Got Query.stride(-1): " << params.query.sym_stride(-1) + << ", Key.stride(-1): " << params.key.sym_stride(-1) + << ", Value.stride(-1): " << params.value.sym_stride(-1); + + if (params.attn_mask.has_value()) { + message + << ", Attn_mask.stride(-1): " + << params.attn_mask.value().sym_stride(-1) + << " (GPU backends require attn_mask's last dimension to have stride 1 while the CPU does not)."; + } + TORCH_WARN(message.str()); + } + + return false; + } + return true; +} + +inline bool check_runtime_disabled_flash(sdp_params const& params, bool debug) { + // We check the global context to see if user has explicitly turned of flash + // sdp kernels + if (!at::globalContext().userEnabledFlashSDP()) { + if (debug) { + TORCH_WARN("Flash attention has been runtime disabled."); + } + return false; + } + return true; +} + +inline bool check_runtime_disabled_mem_efficient(sdp_params const& params, bool debug) { + // We check the global context to see if user has explicitly turned of + // mem_efficient sdp kernels + if (!at::globalContext().userEnabledMemEfficientSDP()) { + if (debug) { + TORCH_WARN("Memory Efficient attention has been runtime disabled."); + } + return false; + } + return true; +} + + +} // namespace sdp + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/transformers/xpu/sdp_utils.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/transformers/xpu/sdp_utils.h new file mode 100644 index 0000000000000000000000000000000000000000..b2ddf6d52bc58be0bb901a0dee0375cefc3224a5 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/transformers/xpu/sdp_utils.h @@ -0,0 +1,22 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include +#include +#include +#include +#include + +namespace sdp { + +C10_EXPORT bool is_flash_attention_available(); +C10_EXPORT bool can_use_flash_attention(sdp_params const& params, bool debug); +C10_EXPORT bool check_flash_attention_hardware_support( + sdp_params const& params, + bool debug); + +} // namespace sdp + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/utils/Factory.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/utils/Factory.h new file mode 100644 index 0000000000000000000000000000000000000000..f338911e70c0d2356d58f340a937d16798a65fec --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/utils/Factory.h @@ -0,0 +1,25 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include + +namespace at::native::mobile { + +Tensor allocate_padded_contiguous_if_needed( + const Tensor& input, + c10::MemoryFormat memory_format); + +// TODO: Remove this function when at::native::empty() is modified to accept a +// custom memory allocator. 
+ +at::Tensor empty_with_tail_padding( + IntArrayRef size, + const caffe2::TypeMeta dtype, + c10::MemoryFormat memory_format, + std::optional maybe_names); + +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/utils/ParamUtils.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/utils/ParamUtils.h new file mode 100644 index 0000000000000000000000000000000000000000..2d8c66b5e99d8b40b69363ecfa6324247a47454c --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/utils/ParamUtils.h @@ -0,0 +1,47 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include +#include + +namespace at { +namespace native { + +template +inline std::vector _expand_param_if_needed( + ArrayRef list_param, + const char* param_name, + int64_t expected_dim) { + if (list_param.size() == 1) { + return std::vector(expected_dim, list_param[0]); + } else if ((int64_t)list_param.size() != expected_dim) { + std::ostringstream ss; + ss << "expected " << param_name << " to be a single integer value or a " + << "list of " << expected_dim << " values to match the convolution " + << "dimensions, but got " << param_name << '=' << list_param; + TORCH_CHECK(false, ss.str()); + } else { + return list_param.vec(); + } +} + +inline std::vector expand_param_if_needed( + IntArrayRef list_param, + const char* param_name, + int64_t expected_dim) { + return _expand_param_if_needed(list_param, param_name, expected_dim); +} + +inline std::vector expand_param_if_needed( + SymIntArrayRef list_param, + const char* param_name, + int64_t expected_dim) { + return _expand_param_if_needed(list_param, param_name, expected_dim); +} + +} // namespace native +} 
// namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/utils/ParamsHash.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/utils/ParamsHash.h new file mode 100644 index 0000000000000000000000000000000000000000..46c05e706c9a9d230e6bafadb4d3caa82241c43c --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/native/utils/ParamsHash.h @@ -0,0 +1,109 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +#include +#include +#include + +namespace at::native { + +// Hashing machinery for Params +// Fowler–Noll–Vo hash function +// see +// https://en.wikipedia.org/wiki/Fowler%E2%80%93Noll%E2%80%93Vo_hash_function +template +struct ParamsHash { + // Params must be a POD because we read out its memory + // contents as char* when hashing + static_assert(std::is_standard_layout_v, "Params is not POD"); + + size_t operator()(const Params& params) const { + auto ptr = reinterpret_cast(¶ms); + uint32_t value = 0x811C9DC5; + for (const auto i : c10::irange(sizeof(Params))) { + value ^= ptr[i]; + value *= 0x01000193; + } + return (size_t)value; + } +}; + +template +struct ParamsEqual { + // Params must be a POD because we read out its memory + // contents as char* when comparing + static_assert(std::is_standard_layout_v, "Params is not POD"); + + bool operator()(const Params& a, const Params& b) const { + auto ptr1 = reinterpret_cast(&a); + auto ptr2 = reinterpret_cast(&b); + return memcmp(ptr1, ptr2, sizeof(Params)) == 0; + } +}; + +// Provide explicit byte-for-byte constructors to avoid uwittingly leaving +// padding bytes uninitialized (e.g., when passing Params by value) +template +struct ParamsWrapper { + T pod; + 
static_assert( + std::is_standard_layout_v, + "ParamsWrapper cannot wrap non-POD data"); + + ParamsWrapper() { + memset(&(this->pod), 0, sizeof(this->pod)); + } + + ParamsWrapper(const ParamsWrapper& other) { + memcpy(&(this->pod), &(other.pod), sizeof(this->pod)); + } + + ParamsWrapper(ParamsWrapper&& other) noexcept { + memcpy(&(this->pod), &(other.pod), sizeof(this->pod)); + } + + ParamsWrapper& operator=(const ParamsWrapper& other) { + memcpy(&(this->pod), &(other.pod), sizeof(this->pod)); + return *this; + } + + ParamsWrapper& operator=(ParamsWrapper&& other) noexcept { + memcpy(&(this->pod), &(other.pod), sizeof(this->pod)); + return *this; + } + + inline friend bool operator==( + const ParamsWrapper& lhs, + const ParamsWrapper& rhs) noexcept { + auto ptr1 = reinterpret_cast(&(lhs.pod)); + auto ptr2 = reinterpret_cast(&(rhs.pod)); + return memcmp(ptr1, ptr2, sizeof(lhs.pod)) == 0; + } +}; + +// Wrapped version: this allows the outer struct to have custom copy and move +// constructors for additional safety +template +struct ParamsWrapperHash { + // Params must be a POD because we read out its memory + // contents as char* when hashing + static_assert( + std::is_standard_layout_v, + "ParamsWrapper cannot wrap non-POD data"); + + size_t operator()(const ParamsWrapper& params_wrapper) const { + auto ptr = reinterpret_cast(&(params_wrapper.pod)); + uint32_t value = 0x811C9DC5; + for (const auto i : c10::irange(sizeof(params_wrapper.pod))) { + value ^= ptr[i]; + value *= 0x01000193; + } + return (size_t)value; + } +}; + +} // namespace at::native + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_adaptive_avg_pool2d.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_adaptive_avg_pool2d.h new file mode 100644 index 0000000000000000000000000000000000000000..d68fcec634a241df67faa94606c635d300922a98 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_adaptive_avg_pool2d.h @@ -0,0 +1,97 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::_adaptive_avg_pool2d(Tensor self, SymInt[2] output_size) -> Tensor +inline at::Tensor _adaptive_avg_pool2d(const at::Tensor & self, at::IntArrayRef output_size) { + return at::_ops::_adaptive_avg_pool2d::call(self, c10::fromIntArrayRefSlow(output_size)); +} +namespace symint { + template >> + at::Tensor _adaptive_avg_pool2d(const at::Tensor & self, at::IntArrayRef output_size) { + return at::_ops::_adaptive_avg_pool2d::call(self, c10::fromIntArrayRefSlow(output_size)); + } +} + +// aten::_adaptive_avg_pool2d(Tensor self, SymInt[2] output_size) -> Tensor +inline at::Tensor _adaptive_avg_pool2d_symint(const at::Tensor & self, c10::SymIntArrayRef output_size) { + return at::_ops::_adaptive_avg_pool2d::call(self, output_size); +} +namespace symint { + template >> + at::Tensor _adaptive_avg_pool2d(const at::Tensor & self, c10::SymIntArrayRef output_size) { + return at::_ops::_adaptive_avg_pool2d::call(self, output_size); + } +} + +// aten::_adaptive_avg_pool2d.out(Tensor self, SymInt[2] output_size, *, Tensor(a!) out) -> Tensor(a!) 
+inline at::Tensor & _adaptive_avg_pool2d_out(at::Tensor & out, const at::Tensor & self, at::IntArrayRef output_size) { + return at::_ops::_adaptive_avg_pool2d_out::call(self, c10::fromIntArrayRefSlow(output_size), out); +} +namespace symint { + template >> + at::Tensor & _adaptive_avg_pool2d_out(at::Tensor & out, const at::Tensor & self, at::IntArrayRef output_size) { + return at::_ops::_adaptive_avg_pool2d_out::call(self, c10::fromIntArrayRefSlow(output_size), out); + } +} + +// aten::_adaptive_avg_pool2d.out(Tensor self, SymInt[2] output_size, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & _adaptive_avg_pool2d_outf(const at::Tensor & self, at::IntArrayRef output_size, at::Tensor & out) { + return at::_ops::_adaptive_avg_pool2d_out::call(self, c10::fromIntArrayRefSlow(output_size), out); +} +namespace symint { + template >> + at::Tensor & _adaptive_avg_pool2d_outf(const at::Tensor & self, at::IntArrayRef output_size, at::Tensor & out) { + return at::_ops::_adaptive_avg_pool2d_out::call(self, c10::fromIntArrayRefSlow(output_size), out); + } +} + +// aten::_adaptive_avg_pool2d.out(Tensor self, SymInt[2] output_size, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & _adaptive_avg_pool2d_symint_out(at::Tensor & out, const at::Tensor & self, c10::SymIntArrayRef output_size) { + return at::_ops::_adaptive_avg_pool2d_out::call(self, output_size, out); +} +namespace symint { + template >> + at::Tensor & _adaptive_avg_pool2d_out(at::Tensor & out, const at::Tensor & self, c10::SymIntArrayRef output_size) { + return at::_ops::_adaptive_avg_pool2d_out::call(self, output_size, out); + } +} + +// aten::_adaptive_avg_pool2d.out(Tensor self, SymInt[2] output_size, *, Tensor(a!) out) -> Tensor(a!) 
+inline at::Tensor & _adaptive_avg_pool2d_symint_outf(const at::Tensor & self, c10::SymIntArrayRef output_size, at::Tensor & out) { + return at::_ops::_adaptive_avg_pool2d_out::call(self, output_size, out); +} +namespace symint { + template >> + at::Tensor & _adaptive_avg_pool2d_outf(const at::Tensor & self, c10::SymIntArrayRef output_size, at::Tensor & out) { + return at::_ops::_adaptive_avg_pool2d_out::call(self, output_size, out); + } +} + +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_adaptive_avg_pool2d_backward_compositeexplicitautograd_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_adaptive_avg_pool2d_backward_compositeexplicitautograd_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..a20c52c189212f6de6cd76eb5dd8d4f850e5a38a --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_adaptive_avg_pool2d_backward_compositeexplicitautograd_dispatch.h @@ -0,0 +1,29 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. 
+#include + +namespace at { + +namespace compositeexplicitautograd { + +TORCH_API at::Tensor & _adaptive_avg_pool2d_backward_out(at::Tensor & out, const at::Tensor & grad_output, const at::Tensor & self); +TORCH_API at::Tensor & _adaptive_avg_pool2d_backward_outf(const at::Tensor & grad_output, const at::Tensor & self, at::Tensor & out); + +} // namespace compositeexplicitautograd +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_adaptive_avg_pool2d_backward_cuda_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_adaptive_avg_pool2d_backward_cuda_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..48f9e6a94bb6fd272c4e91ccc5f8a348fb6cd8ed --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_adaptive_avg_pool2d_backward_cuda_dispatch.h @@ -0,0 +1,28 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. 
+#include + +namespace at { + +namespace cuda { + +TORCH_API at::Tensor _adaptive_avg_pool2d_backward(const at::Tensor & grad_output, const at::Tensor & self); + +} // namespace cuda +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_adaptive_avg_pool2d_cuda_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_adaptive_avg_pool2d_cuda_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..d901f635b8d97afd4b4f66e6571642a5cb2b7109 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_adaptive_avg_pool2d_cuda_dispatch.h @@ -0,0 +1,29 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace cuda { + +TORCH_API at::Tensor _adaptive_avg_pool2d(const at::Tensor & self, at::IntArrayRef output_size); +TORCH_API at::Tensor _adaptive_avg_pool2d_symint(const at::Tensor & self, c10::SymIntArrayRef output_size); + +} // namespace cuda +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_addmm_activation_meta.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_addmm_activation_meta.h new file mode 100644 index 0000000000000000000000000000000000000000..50d7cce05d2a82eefae74722676792a760103dc7 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_addmm_activation_meta.h @@ -0,0 +1,32 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeMetaFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace at { +namespace meta { + +struct TORCH_API structured__addmm_activation : public at::impl::MetaBase { + + + void meta(const at::Tensor & self, const at::Tensor & mat1, const at::Tensor & mat2, const at::Scalar & beta, const at::Scalar & alpha, bool use_gelu); +}; + +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_amp_foreach_non_finite_check_and_unscale_cuda_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_amp_foreach_non_finite_check_and_unscale_cuda_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..1498519fc761e2d944fb66f9d5b1b9bf13b91fb2 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_amp_foreach_non_finite_check_and_unscale_cuda_dispatch.h @@ -0,0 +1,28 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace cuda { + +TORCH_API void _amp_foreach_non_finite_check_and_unscale_(at::TensorList self, at::Tensor & found_inf, const at::Tensor & inv_scale); + +} // namespace cuda +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_assert_async_ops.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_assert_async_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..9cc228c853f61f90564fde676a002204157d21bf --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_assert_async_ops.h @@ -0,0 +1,45 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { +namespace _ops { + + +struct TORCH_API _assert_async { + using schema = void (const at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::_assert_async"; + static constexpr const char* overload_name = ""; + static constexpr const char* schema_str = "_assert_async(Tensor self) -> ()"; + static void call(const at::Tensor & self); + static void redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self); +}; + +struct TORCH_API _assert_async_msg { + using schema = void (const at::Tensor &, c10::string_view); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::_assert_async"; + static constexpr const char* overload_name = "msg"; + static constexpr const char* schema_str = "_assert_async.msg(Tensor self, str assert_msg) -> ()"; + static void call(const at::Tensor & self, c10::string_view assert_msg); + static 
void redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, c10::string_view assert_msg); +}; + +}} // namespace at::_ops + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_assert_scalar.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_assert_scalar.h new file mode 100644 index 0000000000000000000000000000000000000000..ea926a539c2546667b28f6577ebb0f5afb190736 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_assert_scalar.h @@ -0,0 +1,36 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::_assert_scalar(Scalar self, str assert_msg) -> () +inline void _assert_scalar(const at::Scalar & self, c10::string_view assert_msg) { + return at::_ops::_assert_scalar::call(self, assert_msg); +} + +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_autocast_to_full_precision.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_autocast_to_full_precision.h new file mode 100644 index 0000000000000000000000000000000000000000..0cebc6cce88b004f8d9428fcbb1287974ba3f098 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_autocast_to_full_precision.h @@ -0,0 +1,32 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + + +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_autocast_to_reduced_precision_native.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_autocast_to_reduced_precision_native.h new file mode 100644 index 0000000000000000000000000000000000000000..2582855d91d96152c2428f0e56d519ed5bbe0b08 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_autocast_to_reduced_precision_native.h @@ -0,0 +1,26 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace at { +namespace native { +TORCH_API at::Tensor _autocast_to_reduced_precision(const at::Tensor & self, bool cuda_enabled, bool cpu_enabled, 
at::ScalarType cuda_dtype, at::ScalarType cpu_dtype); +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_backward.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_backward.h new file mode 100644 index 0000000000000000000000000000000000000000..283436a91eb5ee7e7dc6d5b3aa3b55a9ec44027b --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_backward.h @@ -0,0 +1,32 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + + +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_backward_ops.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_backward_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..f41b6017fab3eb8277c60bfbe4a4e4f1309a28e3 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_backward_ops.h @@ -0,0 +1,34 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. 
+// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { +namespace _ops { + + +struct TORCH_API _backward { + using schema = void (const at::Tensor &, at::TensorList, const ::std::optional &, ::std::optional, bool); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::_backward"; + static constexpr const char* overload_name = ""; + static constexpr const char* schema_str = "_backward(Tensor self, Tensor[] inputs, Tensor? gradient=None, bool? retain_graph=None, bool create_graph=False) -> ()"; + static void call(const at::Tensor & self, at::TensorList inputs, const ::std::optional & gradient, ::std::optional retain_graph, bool create_graph); + static void redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::TensorList inputs, const ::std::optional & gradient, ::std::optional retain_graph, bool create_graph); +}; + +}} // namespace at::_ops + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_batch_norm_impl_index_compositeimplicitautograd_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_batch_norm_impl_index_compositeimplicitautograd_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..179e583e27f877b69011dbaea0a36604081a8328 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_batch_norm_impl_index_compositeimplicitautograd_dispatch.h @@ -0,0 +1,28 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace compositeimplicitautograd { + +TORCH_API ::std::tuple _batch_norm_impl_index(const at::Tensor & input, const ::std::optional & weight, const ::std::optional & bias, const ::std::optional & running_mean, const ::std::optional & running_var, bool training, double momentum, double eps, bool cudnn_enabled); + +} // namespace compositeimplicitautograd +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_batch_norm_no_update.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_batch_norm_no_update.h new file mode 100644 index 0000000000000000000000000000000000000000..caf169fc868908bf7f24f3ee6818f11e105af58c --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_batch_norm_no_update.h @@ -0,0 +1,45 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::_batch_norm_no_update(Tensor input, Tensor? weight, Tensor? bias, Tensor? running_mean, Tensor? running_var, float momentum, float eps) -> (Tensor, Tensor, Tensor, Tensor) +inline ::std::tuple _batch_norm_no_update(const at::Tensor & input, const ::std::optional & weight, const ::std::optional & bias, const ::std::optional & running_mean, const ::std::optional & running_var, double momentum, double eps) { + return at::_ops::_batch_norm_no_update::call(input, weight, bias, running_mean, running_var, momentum, eps); +} + +// aten::_batch_norm_no_update.out(Tensor input, Tensor? weight, Tensor? bias, Tensor? running_mean, Tensor? running_var, float momentum, float eps, *, Tensor(a!) out0, Tensor(b!) out1, Tensor(c!) out2, Tensor(d!) 
out3) -> (Tensor(a!), Tensor(b!), Tensor(c!), Tensor(d!)) +inline ::std::tuple _batch_norm_no_update_out(at::Tensor & out0, at::Tensor & out1, at::Tensor & out2, at::Tensor & out3, const at::Tensor & input, const ::std::optional & weight, const ::std::optional & bias, const ::std::optional & running_mean, const ::std::optional & running_var, double momentum, double eps) { + return at::_ops::_batch_norm_no_update_out::call(input, weight, bias, running_mean, running_var, momentum, eps, out0, out1, out2, out3); +} +// aten::_batch_norm_no_update.out(Tensor input, Tensor? weight, Tensor? bias, Tensor? running_mean, Tensor? running_var, float momentum, float eps, *, Tensor(a!) out0, Tensor(b!) out1, Tensor(c!) out2, Tensor(d!) out3) -> (Tensor(a!), Tensor(b!), Tensor(c!), Tensor(d!)) +inline ::std::tuple _batch_norm_no_update_outf(const at::Tensor & input, const ::std::optional & weight, const ::std::optional & bias, const ::std::optional & running_mean, const ::std::optional & running_var, double momentum, double eps, at::Tensor & out0, at::Tensor & out1, at::Tensor & out2, at::Tensor & out3) { + return at::_ops::_batch_norm_no_update_out::call(input, weight, bias, running_mean, running_var, momentum, eps, out0, out1, out2, out3); +} + +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_batch_norm_with_update.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_batch_norm_with_update.h new file mode 100644 index 0000000000000000000000000000000000000000..31cdc8c12cd51444be5fb8d41cd511af646cb66c --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_batch_norm_with_update.h @@ -0,0 +1,50 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::_batch_norm_with_update(Tensor input, Tensor? weight, Tensor? bias, Tensor(a!) running_mean, Tensor(b!) running_var, float momentum, float eps) -> (Tensor, Tensor, Tensor, Tensor) +inline ::std::tuple _batch_norm_with_update(const at::Tensor & input, const ::std::optional & weight, const ::std::optional & bias, at::Tensor & running_mean, at::Tensor & running_var, double momentum, double eps) { + return at::_ops::_batch_norm_with_update::call(input, weight, bias, running_mean, running_var, momentum, eps); +} + +// aten::_batch_norm_with_update.out(Tensor input, Tensor? weight, Tensor? bias, Tensor(a!) running_mean, Tensor(b!) running_var, float momentum, float eps, *, Tensor(d!) out, Tensor(e!) save_mean, Tensor(f!) save_invstd, Tensor(g!) 
reserve) -> (Tensor(d!), Tensor(e!), Tensor(f!), Tensor(g!)) +inline ::std::tuple _batch_norm_with_update_out(at::Tensor & out, at::Tensor & save_mean, at::Tensor & save_invstd, at::Tensor & reserve, const at::Tensor & input, const ::std::optional & weight, const ::std::optional & bias, at::Tensor & running_mean, at::Tensor & running_var, double momentum, double eps) { + return at::_ops::_batch_norm_with_update_out::call(input, weight, bias, running_mean, running_var, momentum, eps, out, save_mean, save_invstd, reserve); +} +// aten::_batch_norm_with_update.out(Tensor input, Tensor? weight, Tensor? bias, Tensor(a!) running_mean, Tensor(b!) running_var, float momentum, float eps, *, Tensor(d!) out, Tensor(e!) save_mean, Tensor(f!) save_invstd, Tensor(g!) reserve) -> (Tensor(d!), Tensor(e!), Tensor(f!), Tensor(g!)) +inline ::std::tuple _batch_norm_with_update_outf(const at::Tensor & input, const ::std::optional & weight, const ::std::optional & bias, at::Tensor & running_mean, at::Tensor & running_var, double momentum, double eps, at::Tensor & out, at::Tensor & save_mean, at::Tensor & save_invstd, at::Tensor & reserve) { + return at::_ops::_batch_norm_with_update_out::call(input, weight, bias, running_mean, running_var, momentum, eps, out, save_mean, save_invstd, reserve); +} + +// aten::_batch_norm_with_update_functional(Tensor input, Tensor? weight, Tensor? 
bias, Tensor running_mean, Tensor running_var, float momentum, float eps) -> (Tensor, Tensor, Tensor, Tensor, Tensor running_mean_out, Tensor running_var_out) +inline ::std::tuple _batch_norm_with_update_functional(const at::Tensor & input, const ::std::optional & weight, const ::std::optional & bias, const at::Tensor & running_mean, const at::Tensor & running_var, double momentum, double eps) { + return at::_ops::_batch_norm_with_update_functional::call(input, weight, bias, running_mean, running_var, momentum, eps); +} + +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_batch_norm_with_update_ops.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_batch_norm_with_update_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..29338eceb392f0b212d418120deefb9931fc9f2f --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_batch_norm_with_update_ops.h @@ -0,0 +1,56 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. 
+#include + +namespace at { +namespace _ops { + + +struct TORCH_API _batch_norm_with_update { + using schema = ::std::tuple (const at::Tensor &, const ::std::optional &, const ::std::optional &, at::Tensor &, at::Tensor &, double, double); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::_batch_norm_with_update"; + static constexpr const char* overload_name = ""; + static constexpr const char* schema_str = "_batch_norm_with_update(Tensor input, Tensor? weight, Tensor? bias, Tensor(a!) running_mean, Tensor(b!) running_var, float momentum, float eps) -> (Tensor, Tensor, Tensor, Tensor)"; + static ::std::tuple call(const at::Tensor & input, const ::std::optional & weight, const ::std::optional & bias, at::Tensor & running_mean, at::Tensor & running_var, double momentum, double eps); + static ::std::tuple redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & input, const ::std::optional & weight, const ::std::optional & bias, at::Tensor & running_mean, at::Tensor & running_var, double momentum, double eps); +}; + +struct TORCH_API _batch_norm_with_update_out { + using schema = ::std::tuple (const at::Tensor &, const ::std::optional &, const ::std::optional &, at::Tensor &, at::Tensor &, double, double, at::Tensor &, at::Tensor &, at::Tensor &, at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::_batch_norm_with_update"; + static constexpr const char* overload_name = "out"; + static constexpr const char* schema_str = "_batch_norm_with_update.out(Tensor input, Tensor? weight, Tensor? bias, Tensor(a!) running_mean, Tensor(b!) running_var, float momentum, float eps, *, Tensor(d!) out, Tensor(e!) save_mean, Tensor(f!) save_invstd, Tensor(g!) 
reserve) -> (Tensor(d!), Tensor(e!), Tensor(f!), Tensor(g!))"; + static ::std::tuple call(const at::Tensor & input, const ::std::optional & weight, const ::std::optional & bias, at::Tensor & running_mean, at::Tensor & running_var, double momentum, double eps, at::Tensor & out, at::Tensor & save_mean, at::Tensor & save_invstd, at::Tensor & reserve); + static ::std::tuple redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & input, const ::std::optional & weight, const ::std::optional & bias, at::Tensor & running_mean, at::Tensor & running_var, double momentum, double eps, at::Tensor & out, at::Tensor & save_mean, at::Tensor & save_invstd, at::Tensor & reserve); +}; + +struct TORCH_API _batch_norm_with_update_functional { + using schema = ::std::tuple (const at::Tensor &, const ::std::optional &, const ::std::optional &, const at::Tensor &, const at::Tensor &, double, double); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::_batch_norm_with_update_functional"; + static constexpr const char* overload_name = ""; + static constexpr const char* schema_str = "_batch_norm_with_update_functional(Tensor input, Tensor? weight, Tensor? 
bias, Tensor running_mean, Tensor running_var, float momentum, float eps) -> (Tensor, Tensor, Tensor, Tensor, Tensor running_mean_out, Tensor running_var_out)"; + static ::std::tuple call(const at::Tensor & input, const ::std::optional & weight, const ::std::optional & bias, const at::Tensor & running_mean, const at::Tensor & running_var, double momentum, double eps); + static ::std::tuple redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & input, const ::std::optional & weight, const ::std::optional & bias, const at::Tensor & running_mean, const at::Tensor & running_var, double momentum, double eps); +}; + +}} // namespace at::_ops + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_cast_Half.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_cast_Half.h new file mode 100644 index 0000000000000000000000000000000000000000..53eec94f51c84761eebeeeeeb0e7f624bbb07fb1 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_cast_Half.h @@ -0,0 +1,36 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::_cast_Half(Tensor self, bool non_blocking=False) -> Tensor +inline at::Tensor _cast_Half(const at::Tensor & self, bool non_blocking=false) { + return at::_ops::_cast_Half::call(self, non_blocking); +} + +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_cdist_backward_native.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_cdist_backward_native.h new file mode 100644 index 0000000000000000000000000000000000000000..f9b80465aabf84f59fe9dd9074cfe1c0eaa1011f --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_cdist_backward_native.h @@ -0,0 +1,27 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace at { +namespace native { +TORCH_API at::Tensor & _cdist_backward_out(const at::Tensor & grad, const at::Tensor & x1, const at::Tensor & x2, double p, const at::Tensor & cdist, at::Tensor & out); +TORCH_API at::Tensor _cdist_backward(const at::Tensor & grad, const at::Tensor & x1, const at::Tensor & x2, double p, const at::Tensor & cdist); +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_cdist_forward_ops.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_cdist_forward_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..b4fcb28f70693c906e60d3f842b1b37d9d143004 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_cdist_forward_ops.h @@ -0,0 +1,45 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { +namespace _ops { + + +struct TORCH_API _cdist_forward { + using schema = at::Tensor (const at::Tensor &, const at::Tensor &, double, ::std::optional); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::_cdist_forward"; + static constexpr const char* overload_name = ""; + static constexpr const char* schema_str = "_cdist_forward(Tensor x1, Tensor x2, float p, int? 
compute_mode) -> Tensor"; + static at::Tensor call(const at::Tensor & x1, const at::Tensor & x2, double p, ::std::optional compute_mode); + static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & x1, const at::Tensor & x2, double p, ::std::optional compute_mode); +}; + +struct TORCH_API _cdist_forward_out { + using schema = at::Tensor & (const at::Tensor &, const at::Tensor &, double, ::std::optional, at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::_cdist_forward"; + static constexpr const char* overload_name = "out"; + static constexpr const char* schema_str = "_cdist_forward.out(Tensor x1, Tensor x2, float p, int? compute_mode, *, Tensor(a!) out) -> Tensor(a!)"; + static at::Tensor & call(const at::Tensor & x1, const at::Tensor & x2, double p, ::std::optional compute_mode, at::Tensor & out); + static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & x1, const at::Tensor & x2, double p, ::std::optional compute_mode, at::Tensor & out); +}; + +}} // namespace at::_ops + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_coalesce.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_coalesce.h new file mode 100644 index 0000000000000000000000000000000000000000..91c3009bf8a8d507bc56d71a39ebff0c033c634f --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_coalesce.h @@ -0,0 +1,45 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::_coalesce(Tensor self) -> Tensor +inline at::Tensor _coalesce(const at::Tensor & self) { + return at::_ops::_coalesce::call(self); +} + +// aten::_coalesce.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & _coalesce_out(at::Tensor & out, const at::Tensor & self) { + return at::_ops::_coalesce_out::call(self, out); +} +// aten::_coalesce.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & _coalesce_outf(const at::Tensor & self, at::Tensor & out) { + return at::_ops::_coalesce_out::call(self, out); +} + +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_coalesced_ops.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_coalesced_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..42fab17ac4cfaaa4aa4e077b9a87c2280830644b --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_coalesced_ops.h @@ -0,0 +1,56 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { +namespace _ops { + + +struct TORCH_API _coalesced_ { + using schema = at::Tensor & (at::Tensor &, bool); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::_coalesced_"; + static constexpr const char* overload_name = ""; + static constexpr const char* schema_str = "_coalesced_(Tensor(a!) self, bool coalesced) -> Tensor(a!)"; + static at::Tensor & call(at::Tensor & self, bool coalesced); + static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, at::Tensor & self, bool coalesced); +}; + +struct TORCH_API _coalesced_out { + using schema = at::Tensor & (const at::Tensor &, bool, at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::_coalesced"; + static constexpr const char* overload_name = "out"; + static constexpr const char* schema_str = "_coalesced.out(Tensor self, bool coalesced, *, Tensor(a!) 
out) -> Tensor(a!)"; + static at::Tensor & call(const at::Tensor & self, bool coalesced, at::Tensor & out); + static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, bool coalesced, at::Tensor & out); +}; + +struct TORCH_API _coalesced { + using schema = at::Tensor (const at::Tensor &, bool); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::_coalesced"; + static constexpr const char* overload_name = ""; + static constexpr const char* schema_str = "_coalesced(Tensor self, bool coalesced) -> Tensor"; + static at::Tensor call(const at::Tensor & self, bool coalesced); + static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, bool coalesced); +}; + +}} // namespace at::_ops + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_compute_linear_combination.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_compute_linear_combination.h new file mode 100644 index 0000000000000000000000000000000000000000..75598c326840e418fa10703634d8a184f92b910f --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_compute_linear_combination.h @@ -0,0 +1,45 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::_compute_linear_combination(Tensor input, Tensor coefficients) -> Tensor +inline at::Tensor _compute_linear_combination(const at::Tensor & input, const 
at::Tensor & coefficients) { + return at::_ops::_compute_linear_combination::call(input, coefficients); +} + +// aten::_compute_linear_combination.out(Tensor input, Tensor coefficients, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & _compute_linear_combination_out(at::Tensor & out, const at::Tensor & input, const at::Tensor & coefficients) { + return at::_ops::_compute_linear_combination_out::call(input, coefficients, out); +} +// aten::_compute_linear_combination.out(Tensor input, Tensor coefficients, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & _compute_linear_combination_outf(const at::Tensor & input, const at::Tensor & coefficients, at::Tensor & out) { + return at::_ops::_compute_linear_combination_out::call(input, coefficients, out); +} + +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_compute_linear_combination_cuda_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_compute_linear_combination_cuda_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..77d27890489bc3821ab6bb4316e8c353390b3c31 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_compute_linear_combination_cuda_dispatch.h @@ -0,0 +1,30 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. 
+// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace cuda { + +TORCH_API at::Tensor _compute_linear_combination(const at::Tensor & input, const at::Tensor & coefficients); +TORCH_API at::Tensor & _compute_linear_combination_out(at::Tensor & out, const at::Tensor & input, const at::Tensor & coefficients); +TORCH_API at::Tensor & _compute_linear_combination_outf(const at::Tensor & input, const at::Tensor & coefficients, at::Tensor & out); + +} // namespace cuda +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_conj_copy_ops.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_conj_copy_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..d51e2250ba2bec1ccd819a556ae4ec0cc3ae36f5 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_conj_copy_ops.h @@ -0,0 +1,45 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. 
+#include + +namespace at { +namespace _ops { + + +struct TORCH_API _conj_copy { + using schema = at::Tensor (const at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::_conj_copy"; + static constexpr const char* overload_name = ""; + static constexpr const char* schema_str = "_conj_copy(Tensor self) -> Tensor"; + static at::Tensor call(const at::Tensor & self); + static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self); +}; + +struct TORCH_API _conj_copy_out { + using schema = at::Tensor & (const at::Tensor &, at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::_conj_copy"; + static constexpr const char* overload_name = "out"; + static constexpr const char* schema_str = "_conj_copy.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)"; + static at::Tensor & call(const at::Tensor & self, at::Tensor & out); + static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::Tensor & out); +}; + +}} // namespace at::_ops + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_conj_ops.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_conj_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..2741d5ba85db43917f2f809bbd2e99a64647f0e0 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_conj_ops.h @@ -0,0 +1,34 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { +namespace _ops { + + +struct TORCH_API _conj { + using schema = at::Tensor (const at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::_conj"; + static constexpr const char* overload_name = ""; + static constexpr const char* schema_str = "_conj(Tensor(a) self) -> Tensor(a)"; + static at::Tensor call(const at::Tensor & self); + static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self); +}; + +}} // namespace at::_ops + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_conj_physical_native.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_conj_physical_native.h new file mode 100644 index 0000000000000000000000000000000000000000..62ccfad6ca11267796894962b8086a1a605ecf98 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_conj_physical_native.h @@ -0,0 +1,28 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace at { +namespace native { +TORCH_API at::Tensor _conj_physical(const at::Tensor & self); +TORCH_API at::Tensor & _conj_physical_out(const at::Tensor & self, at::Tensor & out); +TORCH_API at::Tensor conj_physical_sparse_csr(const at::Tensor & self); +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_conv_depthwise2d_ops.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_conv_depthwise2d_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..4004be60c0cb90c2a2cbff6da2d6bca73e722b6f --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_conv_depthwise2d_ops.h @@ -0,0 +1,45 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { +namespace _ops { + + +struct TORCH_API _conv_depthwise2d_out { + using schema = at::Tensor & (const at::Tensor &, const at::Tensor &, c10::SymIntArrayRef, const ::std::optional &, c10::SymIntArrayRef, c10::SymIntArrayRef, c10::SymIntArrayRef, at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::_conv_depthwise2d"; + static constexpr const char* overload_name = "out"; + static constexpr const char* schema_str = "_conv_depthwise2d.out(Tensor self, Tensor weight, SymInt[2] kernel_size, Tensor? bias, SymInt[2] stride, SymInt[2] padding, SymInt[2] dilation, *, Tensor(a!) 
out) -> Tensor(a!)"; + static at::Tensor & call(const at::Tensor & self, const at::Tensor & weight, c10::SymIntArrayRef kernel_size, const ::std::optional & bias, c10::SymIntArrayRef stride, c10::SymIntArrayRef padding, c10::SymIntArrayRef dilation, at::Tensor & out); + static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & weight, c10::SymIntArrayRef kernel_size, const ::std::optional & bias, c10::SymIntArrayRef stride, c10::SymIntArrayRef padding, c10::SymIntArrayRef dilation, at::Tensor & out); +}; + +struct TORCH_API _conv_depthwise2d { + using schema = at::Tensor (const at::Tensor &, const at::Tensor &, c10::SymIntArrayRef, const ::std::optional &, c10::SymIntArrayRef, c10::SymIntArrayRef, c10::SymIntArrayRef); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::_conv_depthwise2d"; + static constexpr const char* overload_name = ""; + static constexpr const char* schema_str = "_conv_depthwise2d(Tensor self, Tensor weight, SymInt[2] kernel_size, Tensor? bias, SymInt[2] stride, SymInt[2] padding, SymInt[2] dilation) -> Tensor"; + static at::Tensor call(const at::Tensor & self, const at::Tensor & weight, c10::SymIntArrayRef kernel_size, const ::std::optional & bias, c10::SymIntArrayRef stride, c10::SymIntArrayRef padding, c10::SymIntArrayRef dilation); + static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & weight, c10::SymIntArrayRef kernel_size, const ::std::optional & bias, c10::SymIntArrayRef stride, c10::SymIntArrayRef padding, c10::SymIntArrayRef dilation); +}; + +}} // namespace at::_ops + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_convert_indices_from_coo_to_csr_compositeexplicitautogradnonfunctional_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_convert_indices_from_coo_to_csr_compositeexplicitautogradnonfunctional_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..03984750d67e6107a376d38216fd3936a3d79192 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_convert_indices_from_coo_to_csr_compositeexplicitautogradnonfunctional_dispatch.h @@ -0,0 +1,28 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace compositeexplicitautogradnonfunctional { + +TORCH_API at::Tensor _convert_indices_from_coo_to_csr(const at::Tensor & self, int64_t size, bool out_int32=false); + +} // namespace compositeexplicitautogradnonfunctional +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_convert_weight_to_int4pack_native.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_convert_weight_to_int4pack_native.h new file mode 100644 index 0000000000000000000000000000000000000000..9f1af5cf71aa5c059c687d602ef44442f503ba57 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_convert_weight_to_int4pack_native.h @@ -0,0 +1,26 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace at { +namespace native { +TORCH_API at::Tensor _convert_weight_to_int4pack_cuda(const at::Tensor & self, int64_t innerKTiles); +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_convolution.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_convolution.h new file mode 100644 index 0000000000000000000000000000000000000000..00139d482f9f12248dc55b91947a498dc8025d3d --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_convolution.h @@ -0,0 +1,119 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::_convolution(Tensor input, Tensor weight, Tensor? bias, SymInt[] stride, SymInt[] padding, SymInt[] dilation, bool transposed, SymInt[] output_padding, SymInt groups, bool benchmark, bool deterministic, bool cudnn_enabled, bool allow_tf32) -> Tensor +inline at::Tensor _convolution(const at::Tensor & input, const at::Tensor & weight, const ::std::optional & bias, at::IntArrayRef stride, at::IntArrayRef padding, at::IntArrayRef dilation, bool transposed, at::IntArrayRef output_padding, int64_t groups, bool benchmark, bool deterministic, bool cudnn_enabled, bool allow_tf32) { + return at::_ops::_convolution::call(input, weight, bias, c10::fromIntArrayRefSlow(stride), c10::fromIntArrayRefSlow(padding), c10::fromIntArrayRefSlow(dilation), transposed, c10::fromIntArrayRefSlow(output_padding), groups, benchmark, deterministic, cudnn_enabled, allow_tf32); +} +namespace symint { + template >> + at::Tensor _convolution(const at::Tensor & input, const at::Tensor & weight, const ::std::optional & bias, at::IntArrayRef stride, at::IntArrayRef padding, at::IntArrayRef dilation, bool transposed, at::IntArrayRef output_padding, int64_t groups, bool 
benchmark, bool deterministic, bool cudnn_enabled, bool allow_tf32) { + return at::_ops::_convolution::call(input, weight, bias, c10::fromIntArrayRefSlow(stride), c10::fromIntArrayRefSlow(padding), c10::fromIntArrayRefSlow(dilation), transposed, c10::fromIntArrayRefSlow(output_padding), groups, benchmark, deterministic, cudnn_enabled, allow_tf32); + } +} + +// aten::_convolution(Tensor input, Tensor weight, Tensor? bias, SymInt[] stride, SymInt[] padding, SymInt[] dilation, bool transposed, SymInt[] output_padding, SymInt groups, bool benchmark, bool deterministic, bool cudnn_enabled, bool allow_tf32) -> Tensor +inline at::Tensor _convolution_symint(const at::Tensor & input, const at::Tensor & weight, const ::std::optional & bias, c10::SymIntArrayRef stride, c10::SymIntArrayRef padding, c10::SymIntArrayRef dilation, bool transposed, c10::SymIntArrayRef output_padding, c10::SymInt groups, bool benchmark, bool deterministic, bool cudnn_enabled, bool allow_tf32) { + return at::_ops::_convolution::call(input, weight, bias, stride, padding, dilation, transposed, output_padding, groups, benchmark, deterministic, cudnn_enabled, allow_tf32); +} +namespace symint { + template >> + at::Tensor _convolution(const at::Tensor & input, const at::Tensor & weight, const ::std::optional & bias, c10::SymIntArrayRef stride, c10::SymIntArrayRef padding, c10::SymIntArrayRef dilation, bool transposed, c10::SymIntArrayRef output_padding, c10::SymInt groups, bool benchmark, bool deterministic, bool cudnn_enabled, bool allow_tf32) { + return at::_ops::_convolution::call(input, weight, bias, stride, padding, dilation, transposed, output_padding, groups, benchmark, deterministic, cudnn_enabled, allow_tf32); + } +} + +// aten::_convolution.deprecated(Tensor input, Tensor weight, Tensor? 
bias, SymInt[] stride, SymInt[] padding, SymInt[] dilation, bool transposed, int[] output_padding, SymInt groups, bool benchmark, bool deterministic, bool cudnn_enabled) -> Tensor +inline at::Tensor _convolution(const at::Tensor & input, const at::Tensor & weight, const ::std::optional & bias, at::IntArrayRef stride, at::IntArrayRef padding, at::IntArrayRef dilation, bool transposed, at::IntArrayRef output_padding, int64_t groups, bool benchmark, bool deterministic, bool cudnn_enabled) { + return at::_ops::_convolution_deprecated::call(input, weight, bias, c10::fromIntArrayRefSlow(stride), c10::fromIntArrayRefSlow(padding), c10::fromIntArrayRefSlow(dilation), transposed, output_padding, groups, benchmark, deterministic, cudnn_enabled); +} +namespace symint { + template >> + at::Tensor _convolution(const at::Tensor & input, const at::Tensor & weight, const ::std::optional & bias, at::IntArrayRef stride, at::IntArrayRef padding, at::IntArrayRef dilation, bool transposed, at::IntArrayRef output_padding, int64_t groups, bool benchmark, bool deterministic, bool cudnn_enabled) { + return at::_ops::_convolution_deprecated::call(input, weight, bias, c10::fromIntArrayRefSlow(stride), c10::fromIntArrayRefSlow(padding), c10::fromIntArrayRefSlow(dilation), transposed, output_padding, groups, benchmark, deterministic, cudnn_enabled); + } +} + +// aten::_convolution.deprecated(Tensor input, Tensor weight, Tensor? 
bias, SymInt[] stride, SymInt[] padding, SymInt[] dilation, bool transposed, int[] output_padding, SymInt groups, bool benchmark, bool deterministic, bool cudnn_enabled) -> Tensor +inline at::Tensor _convolution_symint(const at::Tensor & input, const at::Tensor & weight, const ::std::optional & bias, c10::SymIntArrayRef stride, c10::SymIntArrayRef padding, c10::SymIntArrayRef dilation, bool transposed, at::IntArrayRef output_padding, c10::SymInt groups, bool benchmark, bool deterministic, bool cudnn_enabled) { + return at::_ops::_convolution_deprecated::call(input, weight, bias, stride, padding, dilation, transposed, output_padding, groups, benchmark, deterministic, cudnn_enabled); +} +namespace symint { + template >> + at::Tensor _convolution(const at::Tensor & input, const at::Tensor & weight, const ::std::optional & bias, c10::SymIntArrayRef stride, c10::SymIntArrayRef padding, c10::SymIntArrayRef dilation, bool transposed, at::IntArrayRef output_padding, c10::SymInt groups, bool benchmark, bool deterministic, bool cudnn_enabled) { + return at::_ops::_convolution_deprecated::call(input, weight, bias, stride, padding, dilation, transposed, output_padding, groups, benchmark, deterministic, cudnn_enabled); + } +} + +// aten::_convolution.out(Tensor input, Tensor weight, Tensor? bias, SymInt[] stride, SymInt[] padding, SymInt[] dilation, bool transposed, SymInt[] output_padding, SymInt groups, bool benchmark, bool deterministic, bool cudnn_enabled, bool allow_tf32, *, Tensor(a!) out) -> Tensor(a!) 
+inline at::Tensor & _convolution_out(at::Tensor & out, const at::Tensor & input, const at::Tensor & weight, const ::std::optional & bias, at::IntArrayRef stride, at::IntArrayRef padding, at::IntArrayRef dilation, bool transposed, at::IntArrayRef output_padding, int64_t groups, bool benchmark, bool deterministic, bool cudnn_enabled, bool allow_tf32) { + return at::_ops::_convolution_out::call(input, weight, bias, c10::fromIntArrayRefSlow(stride), c10::fromIntArrayRefSlow(padding), c10::fromIntArrayRefSlow(dilation), transposed, c10::fromIntArrayRefSlow(output_padding), groups, benchmark, deterministic, cudnn_enabled, allow_tf32, out); +} +namespace symint { + template >> + at::Tensor & _convolution_out(at::Tensor & out, const at::Tensor & input, const at::Tensor & weight, const ::std::optional & bias, at::IntArrayRef stride, at::IntArrayRef padding, at::IntArrayRef dilation, bool transposed, at::IntArrayRef output_padding, int64_t groups, bool benchmark, bool deterministic, bool cudnn_enabled, bool allow_tf32) { + return at::_ops::_convolution_out::call(input, weight, bias, c10::fromIntArrayRefSlow(stride), c10::fromIntArrayRefSlow(padding), c10::fromIntArrayRefSlow(dilation), transposed, c10::fromIntArrayRefSlow(output_padding), groups, benchmark, deterministic, cudnn_enabled, allow_tf32, out); + } +} + +// aten::_convolution.out(Tensor input, Tensor weight, Tensor? bias, SymInt[] stride, SymInt[] padding, SymInt[] dilation, bool transposed, SymInt[] output_padding, SymInt groups, bool benchmark, bool deterministic, bool cudnn_enabled, bool allow_tf32, *, Tensor(a!) out) -> Tensor(a!) 
+inline at::Tensor & _convolution_outf(const at::Tensor & input, const at::Tensor & weight, const ::std::optional & bias, at::IntArrayRef stride, at::IntArrayRef padding, at::IntArrayRef dilation, bool transposed, at::IntArrayRef output_padding, int64_t groups, bool benchmark, bool deterministic, bool cudnn_enabled, bool allow_tf32, at::Tensor & out) { + return at::_ops::_convolution_out::call(input, weight, bias, c10::fromIntArrayRefSlow(stride), c10::fromIntArrayRefSlow(padding), c10::fromIntArrayRefSlow(dilation), transposed, c10::fromIntArrayRefSlow(output_padding), groups, benchmark, deterministic, cudnn_enabled, allow_tf32, out); +} +namespace symint { + template >> + at::Tensor & _convolution_outf(const at::Tensor & input, const at::Tensor & weight, const ::std::optional & bias, at::IntArrayRef stride, at::IntArrayRef padding, at::IntArrayRef dilation, bool transposed, at::IntArrayRef output_padding, int64_t groups, bool benchmark, bool deterministic, bool cudnn_enabled, bool allow_tf32, at::Tensor & out) { + return at::_ops::_convolution_out::call(input, weight, bias, c10::fromIntArrayRefSlow(stride), c10::fromIntArrayRefSlow(padding), c10::fromIntArrayRefSlow(dilation), transposed, c10::fromIntArrayRefSlow(output_padding), groups, benchmark, deterministic, cudnn_enabled, allow_tf32, out); + } +} + +// aten::_convolution.out(Tensor input, Tensor weight, Tensor? bias, SymInt[] stride, SymInt[] padding, SymInt[] dilation, bool transposed, SymInt[] output_padding, SymInt groups, bool benchmark, bool deterministic, bool cudnn_enabled, bool allow_tf32, *, Tensor(a!) out) -> Tensor(a!) 
+inline at::Tensor & _convolution_symint_out(at::Tensor & out, const at::Tensor & input, const at::Tensor & weight, const ::std::optional & bias, c10::SymIntArrayRef stride, c10::SymIntArrayRef padding, c10::SymIntArrayRef dilation, bool transposed, c10::SymIntArrayRef output_padding, c10::SymInt groups, bool benchmark, bool deterministic, bool cudnn_enabled, bool allow_tf32) { + return at::_ops::_convolution_out::call(input, weight, bias, stride, padding, dilation, transposed, output_padding, groups, benchmark, deterministic, cudnn_enabled, allow_tf32, out); +} +namespace symint { + template >> + at::Tensor & _convolution_out(at::Tensor & out, const at::Tensor & input, const at::Tensor & weight, const ::std::optional & bias, c10::SymIntArrayRef stride, c10::SymIntArrayRef padding, c10::SymIntArrayRef dilation, bool transposed, c10::SymIntArrayRef output_padding, c10::SymInt groups, bool benchmark, bool deterministic, bool cudnn_enabled, bool allow_tf32) { + return at::_ops::_convolution_out::call(input, weight, bias, stride, padding, dilation, transposed, output_padding, groups, benchmark, deterministic, cudnn_enabled, allow_tf32, out); + } +} + +// aten::_convolution.out(Tensor input, Tensor weight, Tensor? bias, SymInt[] stride, SymInt[] padding, SymInt[] dilation, bool transposed, SymInt[] output_padding, SymInt groups, bool benchmark, bool deterministic, bool cudnn_enabled, bool allow_tf32, *, Tensor(a!) out) -> Tensor(a!) 
+inline at::Tensor & _convolution_symint_outf(const at::Tensor & input, const at::Tensor & weight, const ::std::optional & bias, c10::SymIntArrayRef stride, c10::SymIntArrayRef padding, c10::SymIntArrayRef dilation, bool transposed, c10::SymIntArrayRef output_padding, c10::SymInt groups, bool benchmark, bool deterministic, bool cudnn_enabled, bool allow_tf32, at::Tensor & out) { + return at::_ops::_convolution_out::call(input, weight, bias, stride, padding, dilation, transposed, output_padding, groups, benchmark, deterministic, cudnn_enabled, allow_tf32, out); +} +namespace symint { + template >> + at::Tensor & _convolution_outf(const at::Tensor & input, const at::Tensor & weight, const ::std::optional & bias, c10::SymIntArrayRef stride, c10::SymIntArrayRef padding, c10::SymIntArrayRef dilation, bool transposed, c10::SymIntArrayRef output_padding, c10::SymInt groups, bool benchmark, bool deterministic, bool cudnn_enabled, bool allow_tf32, at::Tensor & out) { + return at::_ops::_convolution_out::call(input, weight, bias, stride, padding, dilation, transposed, output_padding, groups, benchmark, deterministic, cudnn_enabled, allow_tf32, out); + } +} + +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_convolution_mode_ops.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_convolution_mode_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..aa1084d69c07929d4e7cdfdd5bd85b446cb9c62f --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_convolution_mode_ops.h @@ -0,0 +1,34 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { +namespace _ops { + + +struct TORCH_API _convolution_mode { + using schema = at::Tensor (const at::Tensor &, const at::Tensor &, const ::std::optional &, c10::SymIntArrayRef, c10::string_view, c10::SymIntArrayRef, c10::SymInt); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::_convolution_mode"; + static constexpr const char* overload_name = ""; + static constexpr const char* schema_str = "_convolution_mode(Tensor input, Tensor weight, Tensor? 
bias, SymInt[] stride, str padding, SymInt[] dilation, SymInt groups) -> Tensor"; + static at::Tensor call(const at::Tensor & input, const at::Tensor & weight, const ::std::optional & bias, c10::SymIntArrayRef stride, c10::string_view padding, c10::SymIntArrayRef dilation, c10::SymInt groups); + static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & input, const at::Tensor & weight, const ::std::optional & bias, c10::SymIntArrayRef stride, c10::string_view padding, c10::SymIntArrayRef dilation, c10::SymInt groups); +}; + +}} // namespace at::_ops + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_copy_from_and_resize.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_copy_from_and_resize.h new file mode 100644 index 0000000000000000000000000000000000000000..d6da5cac1012e77db60dbc9702ebf2448a18dcf6 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_copy_from_and_resize.h @@ -0,0 +1,45 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::_copy_from_and_resize(Tensor self, Tensor dst) -> Tensor +inline at::Tensor _copy_from_and_resize(const at::Tensor & self, const at::Tensor & dst) { + return at::_ops::_copy_from_and_resize::call(self, dst); +} + +// aten::_copy_from_and_resize.out(Tensor self, Tensor dst, *, Tensor(a!) out) -> Tensor(a!) 
+inline at::Tensor & _copy_from_and_resize_out(at::Tensor & out, const at::Tensor & self, const at::Tensor & dst) { + return at::_ops::_copy_from_and_resize_out::call(self, dst, out); +} +// aten::_copy_from_and_resize.out(Tensor self, Tensor dst, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & _copy_from_and_resize_outf(const at::Tensor & self, const at::Tensor & dst, at::Tensor & out) { + return at::_ops::_copy_from_and_resize_out::call(self, dst, out); +} + +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_copy_from_and_resize_native.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_copy_from_and_resize_native.h new file mode 100644 index 0000000000000000000000000000000000000000..f6d980b374b4da8e5465ccb3a9b0a7157b65d7f8 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_copy_from_and_resize_native.h @@ -0,0 +1,26 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace at { +namespace native { +TORCH_API at::Tensor & _copy_from_and_resize_out(const at::Tensor & self, const at::Tensor & dst, at::Tensor & out); +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_copy_from_ops.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_copy_from_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..8d567662e311c16216640cdaf57b1c4eb1abd8d7 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_copy_from_ops.h @@ -0,0 +1,45 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { +namespace _ops { + + +struct TORCH_API _copy_from { + using schema = at::Tensor (const at::Tensor &, const at::Tensor &, bool); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::_copy_from"; + static constexpr const char* overload_name = ""; + static constexpr const char* schema_str = "_copy_from(Tensor self, Tensor dst, bool non_blocking=False) -> Tensor"; + static at::Tensor call(const at::Tensor & self, const at::Tensor & dst, bool non_blocking); + static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & dst, bool non_blocking); +}; + +struct TORCH_API _copy_from_out { + using schema = at::Tensor & (const at::Tensor &, const at::Tensor &, bool, at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::_copy_from"; + static constexpr const char* overload_name = "out"; + static 
constexpr const char* schema_str = "_copy_from.out(Tensor self, Tensor dst, bool non_blocking=False, *, Tensor(a!) out) -> Tensor(a!)"; + static at::Tensor & call(const at::Tensor & self, const at::Tensor & dst, bool non_blocking, at::Tensor & out); + static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & dst, bool non_blocking, at::Tensor & out); +}; + +}} // namespace at::_ops + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_cslt_compress_ops.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_cslt_compress_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..48854286aea2258bda196c662d50ca1c50619edb --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_cslt_compress_ops.h @@ -0,0 +1,34 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. 
+#include + +namespace at { +namespace _ops { + + +struct TORCH_API _cslt_compress { + using schema = at::Tensor (const at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::_cslt_compress"; + static constexpr const char* overload_name = ""; + static constexpr const char* schema_str = "_cslt_compress(Tensor input) -> Tensor"; + static at::Tensor call(const at::Tensor & input); + static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & input); +}; + +}} // namespace at::_ops + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_cslt_sparse_mm_search.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_cslt_sparse_mm_search.h new file mode 100644 index 0000000000000000000000000000000000000000..848ddc0ba5978505aad7577a45d2bc1c5e618efb --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_cslt_sparse_mm_search.h @@ -0,0 +1,36 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::_cslt_sparse_mm_search(Tensor compressed_A, Tensor dense_B, Tensor? bias=None, Tensor? alpha=None, ScalarType? 
out_dtype=None, bool transpose_result=False) -> int +inline int64_t _cslt_sparse_mm_search(const at::Tensor & compressed_A, const at::Tensor & dense_B, const ::std::optional & bias={}, const ::std::optional & alpha={}, ::std::optional out_dtype=::std::nullopt, bool transpose_result=false) { + return at::_ops::_cslt_sparse_mm_search::call(compressed_A, dense_B, bias, alpha, out_dtype, transpose_result); +} + +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_cslt_sparse_mm_search_ops.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_cslt_sparse_mm_search_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..fc44f65860f09d2861448b9929070abeb8bcae96 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_cslt_sparse_mm_search_ops.h @@ -0,0 +1,34 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. 
+#include + +namespace at { +namespace _ops { + + +struct TORCH_API _cslt_sparse_mm_search { + using schema = int64_t (const at::Tensor &, const at::Tensor &, const ::std::optional &, const ::std::optional &, ::std::optional, bool); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::_cslt_sparse_mm_search"; + static constexpr const char* overload_name = ""; + static constexpr const char* schema_str = "_cslt_sparse_mm_search(Tensor compressed_A, Tensor dense_B, Tensor? bias=None, Tensor? alpha=None, ScalarType? out_dtype=None, bool transpose_result=False) -> int"; + static int64_t call(const at::Tensor & compressed_A, const at::Tensor & dense_B, const ::std::optional & bias, const ::std::optional & alpha, ::std::optional out_dtype, bool transpose_result); + static int64_t redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & compressed_A, const at::Tensor & dense_B, const ::std::optional & bias, const ::std::optional & alpha, ::std::optional out_dtype, bool transpose_result); +}; + +}} // namespace at::_ops + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_ctc_loss_backward_cpu_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_ctc_loss_backward_cpu_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..247dd29836de80b77b780d31022f4035bb80ea3b --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_ctc_loss_backward_cpu_dispatch.h @@ -0,0 +1,29 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace cpu { + +TORCH_API at::Tensor _ctc_loss_backward(const at::Tensor & grad, const at::Tensor & log_probs, const at::Tensor & targets, at::IntArrayRef input_lengths, at::IntArrayRef target_lengths, const at::Tensor & neg_log_likelihood, const at::Tensor & log_alpha, int64_t blank, bool zero_infinity=false); +TORCH_API at::Tensor _ctc_loss_backward(const at::Tensor & grad, const at::Tensor & log_probs, const at::Tensor & targets, const at::Tensor & input_lengths, const at::Tensor & target_lengths, const at::Tensor & neg_log_likelihood, const at::Tensor & log_alpha, int64_t blank, bool zero_infinity=false); + +} // namespace cpu +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_ctc_loss_ops.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_ctc_loss_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..b4ecd806780ffe27269f9cff20ac79895290c097 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_ctc_loss_ops.h @@ -0,0 +1,67 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { +namespace _ops { + + +struct TORCH_API _ctc_loss { + using schema = ::std::tuple (const at::Tensor &, const at::Tensor &, at::IntArrayRef, at::IntArrayRef, int64_t, bool); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::_ctc_loss"; + static constexpr const char* overload_name = ""; + static constexpr const char* schema_str = "_ctc_loss(Tensor log_probs, Tensor targets, int[] input_lengths, int[] target_lengths, int blank=0, bool zero_infinity=False) -> (Tensor, Tensor)"; + static ::std::tuple call(const at::Tensor & log_probs, const at::Tensor & targets, at::IntArrayRef input_lengths, at::IntArrayRef target_lengths, int64_t blank, bool zero_infinity); + static ::std::tuple redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & log_probs, const at::Tensor & targets, at::IntArrayRef input_lengths, at::IntArrayRef target_lengths, int64_t blank, bool zero_infinity); +}; + +struct TORCH_API _ctc_loss_Tensor { + using 
schema = ::std::tuple (const at::Tensor &, const at::Tensor &, const at::Tensor &, const at::Tensor &, int64_t, bool); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::_ctc_loss"; + static constexpr const char* overload_name = "Tensor"; + static constexpr const char* schema_str = "_ctc_loss.Tensor(Tensor log_probs, Tensor targets, Tensor input_lengths, Tensor target_lengths, int blank=0, bool zero_infinity=False) -> (Tensor, Tensor)"; + static ::std::tuple call(const at::Tensor & log_probs, const at::Tensor & targets, const at::Tensor & input_lengths, const at::Tensor & target_lengths, int64_t blank, bool zero_infinity); + static ::std::tuple redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & log_probs, const at::Tensor & targets, const at::Tensor & input_lengths, const at::Tensor & target_lengths, int64_t blank, bool zero_infinity); +}; + +struct TORCH_API _ctc_loss_out { + using schema = ::std::tuple (const at::Tensor &, const at::Tensor &, at::IntArrayRef, at::IntArrayRef, int64_t, bool, at::Tensor &, at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::_ctc_loss"; + static constexpr const char* overload_name = "out"; + static constexpr const char* schema_str = "_ctc_loss.out(Tensor log_probs, Tensor targets, int[] input_lengths, int[] target_lengths, int blank=0, bool zero_infinity=False, *, Tensor(a!) out0, Tensor(b!) 
out1) -> (Tensor(a!), Tensor(b!))"; + static ::std::tuple call(const at::Tensor & log_probs, const at::Tensor & targets, at::IntArrayRef input_lengths, at::IntArrayRef target_lengths, int64_t blank, bool zero_infinity, at::Tensor & out0, at::Tensor & out1); + static ::std::tuple redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & log_probs, const at::Tensor & targets, at::IntArrayRef input_lengths, at::IntArrayRef target_lengths, int64_t blank, bool zero_infinity, at::Tensor & out0, at::Tensor & out1); +}; + +struct TORCH_API _ctc_loss_Tensor_out { + using schema = ::std::tuple (const at::Tensor &, const at::Tensor &, const at::Tensor &, const at::Tensor &, int64_t, bool, at::Tensor &, at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::_ctc_loss"; + static constexpr const char* overload_name = "Tensor_out"; + static constexpr const char* schema_str = "_ctc_loss.Tensor_out(Tensor log_probs, Tensor targets, Tensor input_lengths, Tensor target_lengths, int blank=0, bool zero_infinity=False, *, Tensor(a!) out0, Tensor(b!) out1) -> (Tensor(a!), Tensor(b!))"; + static ::std::tuple call(const at::Tensor & log_probs, const at::Tensor & targets, const at::Tensor & input_lengths, const at::Tensor & target_lengths, int64_t blank, bool zero_infinity, at::Tensor & out0, at::Tensor & out1); + static ::std::tuple redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & log_probs, const at::Tensor & targets, const at::Tensor & input_lengths, const at::Tensor & target_lengths, int64_t blank, bool zero_infinity, at::Tensor & out0, at::Tensor & out1); +}; + +}} // namespace at::_ops + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_cudnn_ctc_loss.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_cudnn_ctc_loss.h new file mode 100644 index 0000000000000000000000000000000000000000..ee9ca2bae229a4baf6a0529401fd079bfd5db686 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_cudnn_ctc_loss.h @@ -0,0 +1,50 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::_cudnn_ctc_loss(Tensor log_probs, Tensor targets, int[] input_lengths, int[] target_lengths, int blank, bool deterministic, bool zero_infinity) -> (Tensor, Tensor) +inline ::std::tuple _cudnn_ctc_loss(const at::Tensor & log_probs, const at::Tensor & targets, at::IntArrayRef input_lengths, at::IntArrayRef target_lengths, int64_t blank, bool deterministic, bool zero_infinity) { + return at::_ops::_cudnn_ctc_loss::call(log_probs, targets, input_lengths, target_lengths, blank, deterministic, zero_infinity); +} + +// aten::_cudnn_ctc_loss.Tensor(Tensor log_probs, Tensor targets, Tensor input_lengths, Tensor target_lengths, int blank, bool deterministic, bool zero_infinity) -> (Tensor, Tensor) +inline ::std::tuple _cudnn_ctc_loss(const at::Tensor & log_probs, const at::Tensor & targets, const at::Tensor & input_lengths, const at::Tensor & target_lengths, int64_t blank, bool deterministic, bool zero_infinity) { + return at::_ops::_cudnn_ctc_loss_Tensor::call(log_probs, targets, input_lengths, target_lengths, blank, deterministic, zero_infinity); +} + +// aten::_cudnn_ctc_loss.out(Tensor log_probs, Tensor targets, int[] input_lengths, int[] 
target_lengths, int blank, bool deterministic, bool zero_infinity, *, Tensor(a!) out0, Tensor(b!) out1) -> (Tensor(a!), Tensor(b!)) +inline ::std::tuple _cudnn_ctc_loss_out(at::Tensor & out0, at::Tensor & out1, const at::Tensor & log_probs, const at::Tensor & targets, at::IntArrayRef input_lengths, at::IntArrayRef target_lengths, int64_t blank, bool deterministic, bool zero_infinity) { + return at::_ops::_cudnn_ctc_loss_out::call(log_probs, targets, input_lengths, target_lengths, blank, deterministic, zero_infinity, out0, out1); +} +// aten::_cudnn_ctc_loss.out(Tensor log_probs, Tensor targets, int[] input_lengths, int[] target_lengths, int blank, bool deterministic, bool zero_infinity, *, Tensor(a!) out0, Tensor(b!) out1) -> (Tensor(a!), Tensor(b!)) +inline ::std::tuple _cudnn_ctc_loss_outf(const at::Tensor & log_probs, const at::Tensor & targets, at::IntArrayRef input_lengths, at::IntArrayRef target_lengths, int64_t blank, bool deterministic, bool zero_infinity, at::Tensor & out0, at::Tensor & out1) { + return at::_ops::_cudnn_ctc_loss_out::call(log_probs, targets, input_lengths, target_lengths, blank, deterministic, zero_infinity, out0, out1); +} + +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_cudnn_init_dropout_state.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_cudnn_init_dropout_state.h new file mode 100644 index 0000000000000000000000000000000000000000..4ac592051664f5e384e05d5ccedabdd32f7e9703 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_cudnn_init_dropout_state.h @@ -0,0 +1,49 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::_cudnn_init_dropout_state(float dropout, bool train, int dropout_seed, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=False) -> Tensor +inline at::Tensor _cudnn_init_dropout_state(double dropout, bool train, int64_t dropout_seed, at::TensorOptions options) { + return at::_ops::_cudnn_init_dropout_state::call(dropout, train, dropout_seed, c10::optTypeMetaToScalarType(options.dtype_opt()), options.layout_opt(), options.device_opt(), options.pinned_memory_opt()); +} +// aten::_cudnn_init_dropout_state(float dropout, bool train, int dropout_seed, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=False) -> Tensor +inline at::Tensor _cudnn_init_dropout_state(double dropout, bool train, int64_t dropout_seed, ::std::optional dtype, ::std::optional layout, ::std::optional device, ::std::optional pin_memory) { + return at::_ops::_cudnn_init_dropout_state::call(dropout, train, dropout_seed, dtype, layout, device, pin_memory); +} + +// aten::_cudnn_init_dropout_state.out(float dropout, bool train, int dropout_seed, *, Tensor(a!) 
out) -> Tensor(a!) +inline at::Tensor & _cudnn_init_dropout_state_out(at::Tensor & out, double dropout, bool train, int64_t dropout_seed) { + return at::_ops::_cudnn_init_dropout_state_out::call(dropout, train, dropout_seed, out); +} +// aten::_cudnn_init_dropout_state.out(float dropout, bool train, int dropout_seed, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & _cudnn_init_dropout_state_outf(double dropout, bool train, int64_t dropout_seed, at::Tensor & out) { + return at::_ops::_cudnn_init_dropout_state_out::call(dropout, train, dropout_seed, out); +} + +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_cudnn_init_dropout_state_compositeexplicitautograd_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_cudnn_init_dropout_state_compositeexplicitautograd_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..7f8d410d77e4bff3e29e0268f9444eebd0e8da1f --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_cudnn_init_dropout_state_compositeexplicitautograd_dispatch.h @@ -0,0 +1,29 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. 
+#include + +namespace at { + +namespace compositeexplicitautograd { + +TORCH_API at::Tensor & _cudnn_init_dropout_state_out(at::Tensor & out, double dropout, bool train, int64_t dropout_seed); +TORCH_API at::Tensor & _cudnn_init_dropout_state_outf(double dropout, bool train, int64_t dropout_seed, at::Tensor & out); + +} // namespace compositeexplicitautograd +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_cudnn_init_dropout_state_cuda_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_cudnn_init_dropout_state_cuda_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..5a503d65ba1d2eaa098f002ed0799b29e6f42647 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_cudnn_init_dropout_state_cuda_dispatch.h @@ -0,0 +1,29 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. 
+#include + +namespace at { + +namespace cuda { + +TORCH_API at::Tensor _cudnn_init_dropout_state(double dropout, bool train, int64_t dropout_seed, at::TensorOptions options); +TORCH_API at::Tensor _cudnn_init_dropout_state(double dropout, bool train, int64_t dropout_seed, ::std::optional dtype, ::std::optional layout, ::std::optional device, ::std::optional pin_memory); + +} // namespace cuda +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_cudnn_init_dropout_state_ops.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_cudnn_init_dropout_state_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..bf36a0b5a2f2814e08362e3f829438df8fd4bb86 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_cudnn_init_dropout_state_ops.h @@ -0,0 +1,45 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. 
+#include + +namespace at { +namespace _ops { + + +struct TORCH_API _cudnn_init_dropout_state { + using schema = at::Tensor (double, bool, int64_t, ::std::optional, ::std::optional, ::std::optional, ::std::optional); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::_cudnn_init_dropout_state"; + static constexpr const char* overload_name = ""; + static constexpr const char* schema_str = "_cudnn_init_dropout_state(float dropout, bool train, int dropout_seed, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=False) -> Tensor"; + static at::Tensor call(double dropout, bool train, int64_t dropout_seed, ::std::optional dtype, ::std::optional layout, ::std::optional device, ::std::optional pin_memory); + static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, double dropout, bool train, int64_t dropout_seed, ::std::optional dtype, ::std::optional layout, ::std::optional device, ::std::optional pin_memory); +}; + +struct TORCH_API _cudnn_init_dropout_state_out { + using schema = at::Tensor & (double, bool, int64_t, at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::_cudnn_init_dropout_state"; + static constexpr const char* overload_name = "out"; + static constexpr const char* schema_str = "_cudnn_init_dropout_state.out(float dropout, bool train, int dropout_seed, *, Tensor(a!) out) -> Tensor(a!)"; + static at::Tensor & call(double dropout, bool train, int64_t dropout_seed, at::Tensor & out); + static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, double dropout, bool train, int64_t dropout_seed, at::Tensor & out); +}; + +}} // namespace at::_ops + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_cudnn_rnn_ops.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_cudnn_rnn_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..45f442955097aee3bff265b9cfa4bd2d27157a24 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_cudnn_rnn_ops.h @@ -0,0 +1,45 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { +namespace _ops { + + +struct TORCH_API _cudnn_rnn { + using schema = ::std::tuple (const at::Tensor &, at::TensorList, int64_t, const ::std::optional &, const at::Tensor &, const ::std::optional &, int64_t, c10::SymInt, c10::SymInt, int64_t, bool, double, bool, bool, c10::SymIntArrayRef, const ::std::optional &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::_cudnn_rnn"; + static constexpr const char* overload_name = ""; + static constexpr const char* schema_str = "_cudnn_rnn(Tensor input, Tensor[] weight, int weight_stride0, Tensor? weight_buf, Tensor hx, Tensor? cx, int mode, SymInt hidden_size, SymInt proj_size, int num_layers, bool batch_first, float dropout, bool train, bool bidirectional, SymInt[] batch_sizes, Tensor? 
dropout_state) -> (Tensor, Tensor, Tensor, Tensor, Tensor)"; + static ::std::tuple call(const at::Tensor & input, at::TensorList weight, int64_t weight_stride0, const ::std::optional & weight_buf, const at::Tensor & hx, const ::std::optional & cx, int64_t mode, c10::SymInt hidden_size, c10::SymInt proj_size, int64_t num_layers, bool batch_first, double dropout, bool train, bool bidirectional, c10::SymIntArrayRef batch_sizes, const ::std::optional & dropout_state); + static ::std::tuple redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & input, at::TensorList weight, int64_t weight_stride0, const ::std::optional & weight_buf, const at::Tensor & hx, const ::std::optional & cx, int64_t mode, c10::SymInt hidden_size, c10::SymInt proj_size, int64_t num_layers, bool batch_first, double dropout, bool train, bool bidirectional, c10::SymIntArrayRef batch_sizes, const ::std::optional & dropout_state); +}; + +struct TORCH_API _cudnn_rnn_out { + using schema = ::std::tuple (const at::Tensor &, at::TensorList, int64_t, const ::std::optional &, const at::Tensor &, const ::std::optional &, int64_t, c10::SymInt, c10::SymInt, int64_t, bool, double, bool, bool, c10::SymIntArrayRef, const ::std::optional &, at::Tensor &, at::Tensor &, at::Tensor &, at::Tensor &, at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::_cudnn_rnn"; + static constexpr const char* overload_name = "out"; + static constexpr const char* schema_str = "_cudnn_rnn.out(Tensor input, Tensor[] weight, int weight_stride0, Tensor? weight_buf, Tensor hx, Tensor? cx, int mode, SymInt hidden_size, SymInt proj_size, int num_layers, bool batch_first, float dropout, bool train, bool bidirectional, SymInt[] batch_sizes, Tensor? dropout_state, *, Tensor(a!) out0, Tensor(b!) out1, Tensor(c!) out2, Tensor(d!) out3, Tensor(e!) 
out4) -> (Tensor(a!), Tensor(b!), Tensor(c!), Tensor(d!), Tensor(e!))"; + static ::std::tuple call(const at::Tensor & input, at::TensorList weight, int64_t weight_stride0, const ::std::optional & weight_buf, const at::Tensor & hx, const ::std::optional & cx, int64_t mode, c10::SymInt hidden_size, c10::SymInt proj_size, int64_t num_layers, bool batch_first, double dropout, bool train, bool bidirectional, c10::SymIntArrayRef batch_sizes, const ::std::optional & dropout_state, at::Tensor & out0, at::Tensor & out1, at::Tensor & out2, at::Tensor & out3, at::Tensor & out4); + static ::std::tuple redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & input, at::TensorList weight, int64_t weight_stride0, const ::std::optional & weight_buf, const at::Tensor & hx, const ::std::optional & cx, int64_t mode, c10::SymInt hidden_size, c10::SymInt proj_size, int64_t num_layers, bool batch_first, double dropout, bool train, bool bidirectional, c10::SymIntArrayRef batch_sizes, const ::std::optional & dropout_state, at::Tensor & out0, at::Tensor & out1, at::Tensor & out2, at::Tensor & out3, at::Tensor & out4); +}; + +}} // namespace at::_ops + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_debug_has_internal_overlap_ops.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_debug_has_internal_overlap_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..984a0d5072a750908230258c7b29e0af0b65d68e --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_debug_has_internal_overlap_ops.h @@ -0,0 +1,34 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { +namespace _ops { + + +struct TORCH_API _debug_has_internal_overlap { + using schema = int64_t (const at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::_debug_has_internal_overlap"; + static constexpr const char* overload_name = ""; + static constexpr const char* schema_str = "_debug_has_internal_overlap(Tensor self) -> int"; + static int64_t call(const at::Tensor & self); + static int64_t redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self); +}; + +}} // namespace at::_ops + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_dirichlet_grad_native.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_dirichlet_grad_native.h new file mode 100644 index 0000000000000000000000000000000000000000..814b3bdc81eddc916e930d6baed8e3a4addd0fba --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_dirichlet_grad_native.h @@ -0,0 +1,28 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace at { +namespace native { +TORCH_API at::Tensor & _dirichlet_grad_out(const at::Tensor & x, const at::Tensor & alpha, const at::Tensor & total, at::Tensor & out); +TORCH_API at::Tensor _dirichlet_grad_cpu(const at::Tensor & x, const at::Tensor & alpha, const at::Tensor & total); +TORCH_API at::Tensor _dirichlet_grad_cuda(const at::Tensor & x, const at::Tensor & alpha, const at::Tensor & total); +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_efficient_attention_forward_native.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_efficient_attention_forward_native.h new file mode 100644 index 0000000000000000000000000000000000000000..8b9786361eb2dd0695526fb5e44932f40d00d782 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_efficient_attention_forward_native.h @@ -0,0 +1,26 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace at { +namespace native { +TORCH_API ::std::tuple _efficient_attention_forward(const at::Tensor & query, const at::Tensor & key, const at::Tensor & value, const ::std::optional & bias, const ::std::optional & cu_seqlens_q, const ::std::optional & cu_seqlens_k, ::std::optional max_seqlen_q, ::std::optional max_seqlen_k, double dropout_p, int64_t custom_mask_type, bool compute_log_sumexp=false, ::std::optional scale=::std::nullopt, const ::std::optional & seqlen_k={}, ::std::optional window_size=::std::nullopt); +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_efficientzerotensor_cuda_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_efficientzerotensor_cuda_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..38a3603d686962edc1f87b308c87edbe974a5661 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_efficientzerotensor_cuda_dispatch.h @@ -0,0 +1,31 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace cuda { + +TORCH_API at::Tensor _efficientzerotensor(at::IntArrayRef size, at::TensorOptions options={}); +TORCH_API at::Tensor _efficientzerotensor(at::IntArrayRef size, ::std::optional dtype, ::std::optional layout, ::std::optional device, ::std::optional pin_memory); +TORCH_API at::Tensor _efficientzerotensor_symint(c10::SymIntArrayRef size, at::TensorOptions options={}); +TORCH_API at::Tensor _efficientzerotensor_symint(c10::SymIntArrayRef size, ::std::optional dtype, ::std::optional layout, ::std::optional device, ::std::optional pin_memory); + +} // namespace cuda +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_embedding_bag_per_sample_weights_backward.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_embedding_bag_per_sample_weights_backward.h new file mode 100644 index 0000000000000000000000000000000000000000..40c48c22253136e3e88e5f85ee3814c25321e936 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_embedding_bag_per_sample_weights_backward.h @@ -0,0 +1,45 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::_embedding_bag_per_sample_weights_backward(Tensor grad, Tensor weight, Tensor indices, Tensor offsets, Tensor offset2bag, int mode, int padding_idx=-1) -> Tensor +inline at::Tensor _embedding_bag_per_sample_weights_backward(const at::Tensor & grad, const at::Tensor & weight, const at::Tensor & indices, const at::Tensor & offsets, const at::Tensor & offset2bag, int64_t mode, int64_t padding_idx=-1) { + return at::_ops::_embedding_bag_per_sample_weights_backward::call(grad, weight, indices, offsets, offset2bag, mode, padding_idx); +} + +// aten::_embedding_bag_per_sample_weights_backward.out(Tensor grad, Tensor weight, Tensor indices, Tensor offsets, Tensor offset2bag, int mode, int padding_idx=-1, *, Tensor(a!) out) -> Tensor(a!) 
+inline at::Tensor & _embedding_bag_per_sample_weights_backward_out(at::Tensor & out, const at::Tensor & grad, const at::Tensor & weight, const at::Tensor & indices, const at::Tensor & offsets, const at::Tensor & offset2bag, int64_t mode, int64_t padding_idx=-1) { + return at::_ops::_embedding_bag_per_sample_weights_backward_out::call(grad, weight, indices, offsets, offset2bag, mode, padding_idx, out); +} +// aten::_embedding_bag_per_sample_weights_backward.out(Tensor grad, Tensor weight, Tensor indices, Tensor offsets, Tensor offset2bag, int mode, int padding_idx=-1, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & _embedding_bag_per_sample_weights_backward_outf(const at::Tensor & grad, const at::Tensor & weight, const at::Tensor & indices, const at::Tensor & offsets, const at::Tensor & offset2bag, int64_t mode, int64_t padding_idx, at::Tensor & out) { + return at::_ops::_embedding_bag_per_sample_weights_backward_out::call(grad, weight, indices, offsets, offset2bag, mode, padding_idx, out); +} + +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_embedding_bag_per_sample_weights_backward_native.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_embedding_bag_per_sample_weights_backward_native.h new file mode 100644 index 0000000000000000000000000000000000000000..875564b98801ad2e644dcaf67ed368c73c33e120 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_embedding_bag_per_sample_weights_backward_native.h @@ -0,0 +1,28 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace at { +namespace native { +TORCH_API at::Tensor & _embedding_bag_per_sample_weights_backward_out(const at::Tensor & grad, const at::Tensor & weight, const at::Tensor & indices, const at::Tensor & offsets, const at::Tensor & offset2bag, int64_t mode, int64_t padding_idx, at::Tensor & out); +TORCH_API at::Tensor _embedding_bag_per_sample_weights_backward_cpu(const at::Tensor & grad, const at::Tensor & weight, const at::Tensor & indices, const at::Tensor & offsets, const at::Tensor & offset2bag, int64_t mode, int64_t padding_idx=-1); +TORCH_API at::Tensor _embedding_bag_per_sample_weights_backward_cuda(const at::Tensor & grad, const at::Tensor & weight, const at::Tensor & indices, const at::Tensor & offsets, const at::Tensor & offset2bag, int64_t mode, int64_t padding_idx=-1); +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_embedding_bag_sparse_backward_compositeimplicitautograd_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_embedding_bag_sparse_backward_compositeimplicitautograd_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..1ca17ea9befdd479bd66dea863bc488aa9e26fa0 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_embedding_bag_sparse_backward_compositeimplicitautograd_dispatch.h @@ -0,0 +1,29 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. 
+#include + +namespace at { + +namespace compositeimplicitautograd { + +TORCH_API at::Tensor _embedding_bag_sparse_backward(const at::Tensor & grad, const at::Tensor & indices, const at::Tensor & offsets, const at::Tensor & offset2bag, const at::Tensor & bag_size, int64_t num_weights, bool scale_grad_by_freq, int64_t mode, const ::std::optional & per_sample_weights, int64_t padding_idx=-1); +TORCH_API at::Tensor _embedding_bag_sparse_backward_symint(const at::Tensor & grad, const at::Tensor & indices, const at::Tensor & offsets, const at::Tensor & offset2bag, const at::Tensor & bag_size, c10::SymInt num_weights, bool scale_grad_by_freq, int64_t mode, const ::std::optional & per_sample_weights, int64_t padding_idx=-1); + +} // namespace compositeimplicitautograd +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_embedding_bag_sparse_backward_native.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_embedding_bag_sparse_backward_native.h new file mode 100644 index 0000000000000000000000000000000000000000..58d3d3f6025067f8b56c4d6ac265e958ced10e5e --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_embedding_bag_sparse_backward_native.h @@ -0,0 +1,26 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace at { +namespace native { +TORCH_API at::Tensor _embedding_bag_sparse_backward_symint(const at::Tensor & grad, const at::Tensor & indices, const at::Tensor & offsets, const at::Tensor & offset2bag, const at::Tensor & bag_size, 
c10::SymInt num_weights, bool scale_grad_by_freq, int64_t mode, const ::std::optional & per_sample_weights, int64_t padding_idx=-1); +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_empty_affine_quantized_ops.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_empty_affine_quantized_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..a68afff8bd1c495d4e3f9ae82ad62ad4ab40d2b1 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_empty_affine_quantized_ops.h @@ -0,0 +1,45 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { +namespace _ops { + + +struct TORCH_API _empty_affine_quantized { + using schema = at::Tensor (c10::SymIntArrayRef, ::std::optional, ::std::optional, ::std::optional, ::std::optional, double, int64_t, ::std::optional); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::_empty_affine_quantized"; + static constexpr const char* overload_name = ""; + static constexpr const char* schema_str = "_empty_affine_quantized(SymInt[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, float scale=1, int zero_point=0, MemoryFormat? 
memory_format=contiguous_format) -> Tensor"; + static at::Tensor call(c10::SymIntArrayRef size, ::std::optional dtype, ::std::optional layout, ::std::optional device, ::std::optional pin_memory, double scale, int64_t zero_point, ::std::optional memory_format); + static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, c10::SymIntArrayRef size, ::std::optional dtype, ::std::optional layout, ::std::optional device, ::std::optional pin_memory, double scale, int64_t zero_point, ::std::optional memory_format); +}; + +struct TORCH_API _empty_affine_quantized_out { + using schema = at::Tensor & (c10::SymIntArrayRef, double, int64_t, ::std::optional, at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::_empty_affine_quantized"; + static constexpr const char* overload_name = "out"; + static constexpr const char* schema_str = "_empty_affine_quantized.out(SymInt[] size, *, float scale=1, int zero_point=0, MemoryFormat? memory_format=contiguous_format, Tensor(a!) out) -> Tensor(a!)"; + static at::Tensor & call(c10::SymIntArrayRef size, double scale, int64_t zero_point, ::std::optional memory_format, at::Tensor & out); + static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, c10::SymIntArrayRef size, double scale, int64_t zero_point, ::std::optional memory_format, at::Tensor & out); +}; + +}} // namespace at::_ops + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_empty_per_channel_affine_quantized_compositeexplicitautograd_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_empty_per_channel_affine_quantized_compositeexplicitautograd_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..038327f183f308a3fc1781e5006d802c2c1966c7 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_empty_per_channel_affine_quantized_compositeexplicitautograd_dispatch.h @@ -0,0 +1,31 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. 
+#include + +namespace at { + +namespace compositeexplicitautograd { + +TORCH_API at::Tensor & _empty_per_channel_affine_quantized_out(at::Tensor & out, at::IntArrayRef size, const at::Tensor & scales, const at::Tensor & zero_points, int64_t axis, ::std::optional memory_format=c10::MemoryFormat::Contiguous); +TORCH_API at::Tensor & _empty_per_channel_affine_quantized_outf(at::IntArrayRef size, const at::Tensor & scales, const at::Tensor & zero_points, int64_t axis, ::std::optional memory_format, at::Tensor & out); +TORCH_API at::Tensor & _empty_per_channel_affine_quantized_symint_out(at::Tensor & out, c10::SymIntArrayRef size, const at::Tensor & scales, const at::Tensor & zero_points, int64_t axis, ::std::optional memory_format=c10::MemoryFormat::Contiguous); +TORCH_API at::Tensor & _empty_per_channel_affine_quantized_symint_outf(c10::SymIntArrayRef size, const at::Tensor & scales, const at::Tensor & zero_points, int64_t axis, ::std::optional memory_format, at::Tensor & out); + +} // namespace compositeexplicitautograd +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_fake_quantize_learnable_per_channel_affine_compositeexplicitautograd_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_fake_quantize_learnable_per_channel_affine_compositeexplicitautograd_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..81f7ff920c55ea5edd6edb2989515be6a8b9bf57 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_fake_quantize_learnable_per_channel_affine_compositeexplicitautograd_dispatch.h @@ -0,0 +1,29 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. 
+#include + +namespace at { + +namespace compositeexplicitautograd { + +TORCH_API at::Tensor & _fake_quantize_learnable_per_channel_affine_out(at::Tensor & out, const at::Tensor & self, const at::Tensor & scale, const at::Tensor & zero_point, int64_t axis, int64_t quant_min, int64_t quant_max, double grad_factor=1.0); +TORCH_API at::Tensor & _fake_quantize_learnable_per_channel_affine_outf(const at::Tensor & self, const at::Tensor & scale, const at::Tensor & zero_point, int64_t axis, int64_t quant_min, int64_t quant_max, double grad_factor, at::Tensor & out); + +} // namespace compositeexplicitautograd +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_fake_quantize_learnable_per_tensor_affine_backward.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_fake_quantize_learnable_per_tensor_affine_backward.h new file mode 100644 index 0000000000000000000000000000000000000000..ec64714d6b487a1d82ef7156a029968e7e990868 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_fake_quantize_learnable_per_tensor_affine_backward.h @@ -0,0 +1,36 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::_fake_quantize_learnable_per_tensor_affine_backward(Tensor grad, Tensor self, Tensor scale, Tensor zero_point, int quant_min, int quant_max, float grad_factor=1.0) -> (Tensor, Tensor, Tensor) +inline ::std::tuple _fake_quantize_learnable_per_tensor_affine_backward(const at::Tensor & grad, const 
at::Tensor & self, const at::Tensor & scale, const at::Tensor & zero_point, int64_t quant_min, int64_t quant_max, double grad_factor=1.0) { + return at::_ops::_fake_quantize_learnable_per_tensor_affine_backward::call(grad, self, scale, zero_point, quant_min, quant_max, grad_factor); +} + +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_fake_quantize_learnable_per_tensor_affine_backward_cuda_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_fake_quantize_learnable_per_tensor_affine_backward_cuda_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..6806a586c44e09b6e61773731c147f839c036cb9 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_fake_quantize_learnable_per_tensor_affine_backward_cuda_dispatch.h @@ -0,0 +1,28 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. 
+#include + +namespace at { + +namespace cuda { + +TORCH_API ::std::tuple _fake_quantize_learnable_per_tensor_affine_backward(const at::Tensor & grad, const at::Tensor & self, const at::Tensor & scale, const at::Tensor & zero_point, int64_t quant_min, int64_t quant_max, double grad_factor=1.0); + +} // namespace cuda +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_fake_quantize_learnable_per_tensor_affine_native.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_fake_quantize_learnable_per_tensor_affine_native.h new file mode 100644 index 0000000000000000000000000000000000000000..5e39605cb4cc6e62ba9e16866a62429f72eed1e7 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_fake_quantize_learnable_per_tensor_affine_native.h @@ -0,0 +1,27 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace at { +namespace native { +TORCH_API at::Tensor & _fake_quantize_learnable_per_tensor_affine_out(const at::Tensor & self, const at::Tensor & scale, const at::Tensor & zero_point, int64_t quant_min, int64_t quant_max, double grad_factor, at::Tensor & out); +TORCH_API at::Tensor _fake_quantize_learnable_per_tensor_affine(const at::Tensor & self, const at::Tensor & scale, const at::Tensor & zero_point, int64_t quant_min, int64_t quant_max, double grad_factor=1.0); +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_fake_quantize_per_tensor_affine_cachemask_tensor_qparams_cpu_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_fake_quantize_per_tensor_affine_cachemask_tensor_qparams_cpu_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..2824af81e46863d4d3529011f5ed20ab9bc8a603 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_fake_quantize_per_tensor_affine_cachemask_tensor_qparams_cpu_dispatch.h @@ -0,0 +1,28 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace cpu { + +TORCH_API ::std::tuple _fake_quantize_per_tensor_affine_cachemask_tensor_qparams(const at::Tensor & self, const at::Tensor & scale, const at::Tensor & zero_point, const at::Tensor & fake_quant_enabled, int64_t quant_min, int64_t quant_max); + +} // namespace cpu +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_flash_attention_backward_ops.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_flash_attention_backward_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..f22d632677fa3f0d81525698dd125f803c7b859a --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_flash_attention_backward_ops.h @@ -0,0 +1,34 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { +namespace _ops { + + +struct TORCH_API _flash_attention_backward { + using schema = ::std::tuple (const at::Tensor &, const at::Tensor &, const at::Tensor &, const at::Tensor &, const at::Tensor &, const at::Tensor &, const at::Tensor &, const at::Tensor &, c10::SymInt, c10::SymInt, double, bool, const at::Tensor &, const at::Tensor &, ::std::optional, ::std::optional, ::std::optional); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::_flash_attention_backward"; + static constexpr const char* overload_name = ""; + static constexpr const char* schema_str = "_flash_attention_backward(Tensor grad_out, Tensor query, Tensor key, Tensor value, Tensor out, Tensor logsumexp, Tensor cum_seq_q, Tensor cum_seq_k, SymInt max_q, SymInt max_k, float dropout_p, bool is_causal, Tensor rng_state, Tensor unused, *, float? scale=None, SymInt? window_size_left=None, SymInt? 
window_size_right=None) -> (Tensor, Tensor, Tensor)"; + static ::std::tuple call(const at::Tensor & grad_out, const at::Tensor & query, const at::Tensor & key, const at::Tensor & value, const at::Tensor & out, const at::Tensor & logsumexp, const at::Tensor & cum_seq_q, const at::Tensor & cum_seq_k, c10::SymInt max_q, c10::SymInt max_k, double dropout_p, bool is_causal, const at::Tensor & rng_state, const at::Tensor & unused, ::std::optional scale, ::std::optional window_size_left, ::std::optional window_size_right); + static ::std::tuple redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad_out, const at::Tensor & query, const at::Tensor & key, const at::Tensor & value, const at::Tensor & out, const at::Tensor & logsumexp, const at::Tensor & cum_seq_q, const at::Tensor & cum_seq_k, c10::SymInt max_q, c10::SymInt max_k, double dropout_p, bool is_causal, const at::Tensor & rng_state, const at::Tensor & unused, ::std::optional scale, ::std::optional window_size_left, ::std::optional window_size_right); +}; + +}} // namespace at::_ops + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_flash_attention_forward_ops.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_flash_attention_forward_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..4f62a900123112922c4c4437bf17f15ff7f06428 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_flash_attention_forward_ops.h @@ -0,0 +1,34 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. 
+// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { +namespace _ops { + + +struct TORCH_API _flash_attention_forward { + using schema = ::std::tuple (const at::Tensor &, const at::Tensor &, const at::Tensor &, const ::std::optional &, const ::std::optional &, c10::SymInt, c10::SymInt, double, bool, bool, ::std::optional, ::std::optional, ::std::optional, const ::std::optional &, const ::std::optional &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::_flash_attention_forward"; + static constexpr const char* overload_name = ""; + static constexpr const char* schema_str = "_flash_attention_forward(Tensor query, Tensor key, Tensor value, Tensor? cum_seq_q, Tensor? cum_seq_k, SymInt max_q, SymInt max_k, float dropout_p, bool is_causal, bool return_debug_mask, *, float? scale=None, SymInt? window_size_left=None, SymInt? window_size_right=None, Tensor? seqused_k=None, Tensor? 
alibi_slopes=None) -> (Tensor output, Tensor softmax_logsumexp, Tensor rng_state, Tensor unused, Tensor debug_attn_mask)"; + static ::std::tuple call(const at::Tensor & query, const at::Tensor & key, const at::Tensor & value, const ::std::optional & cum_seq_q, const ::std::optional & cum_seq_k, c10::SymInt max_q, c10::SymInt max_k, double dropout_p, bool is_causal, bool return_debug_mask, ::std::optional scale, ::std::optional window_size_left, ::std::optional window_size_right, const ::std::optional & seqused_k, const ::std::optional & alibi_slopes); + static ::std::tuple redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & query, const at::Tensor & key, const at::Tensor & value, const ::std::optional & cum_seq_q, const ::std::optional & cum_seq_k, c10::SymInt max_q, c10::SymInt max_k, double dropout_p, bool is_causal, bool return_debug_mask, ::std::optional scale, ::std::optional window_size_left, ::std::optional window_size_right, const ::std::optional & seqused_k, const ::std::optional & alibi_slopes); +}; + +}} // namespace at::_ops + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_foobar_compositeexplicitautograd_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_foobar_compositeexplicitautograd_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..3ca8f0c1739f3c39cff2c3f4387bed98c589faf5 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_foobar_compositeexplicitautograd_dispatch.h @@ -0,0 +1,29 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace compositeexplicitautograd { + +TORCH_API at::Tensor & _foobar_out(at::Tensor & out, const at::Tensor & self, bool arg1=true, bool arg2=true, bool arg3=true); +TORCH_API at::Tensor & _foobar_outf(const at::Tensor & self, bool arg1, bool arg2, bool arg3, at::Tensor & out); + +} // namespace compositeexplicitautograd +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_foreach_abs.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_foreach_abs.h new file mode 100644 index 0000000000000000000000000000000000000000..47400324a259c4ab57e70eef89bb29abfbb2202c --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_foreach_abs.h @@ -0,0 +1,50 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::_foreach_abs(Tensor[] self) -> Tensor[] +inline ::std::vector _foreach_abs(at::TensorList self) { + return at::_ops::_foreach_abs::call(self); +} + +// aten::_foreach_abs_(Tensor(a!)[] self) -> () +inline void _foreach_abs_(at::TensorList self) { + return at::_ops::_foreach_abs_::call(self); +} + +// aten::_foreach_abs.out(Tensor[] self, *, Tensor(a!)[] out) -> () +inline void _foreach_abs_out(at::TensorList out, at::TensorList self) { + return at::_ops::_foreach_abs_out::call(self, out); +} +// aten::_foreach_abs.out(Tensor[] self, *, Tensor(a!)[] out) -> () +inline void _foreach_abs_outf(at::TensorList self, at::TensorList out) { + return at::_ops::_foreach_abs_out::call(self, out); +} + +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_foreach_abs_cuda_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_foreach_abs_cuda_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..ffd8acfb086f8b3ddec204204ad8a543d8cde58f --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_foreach_abs_cuda_dispatch.h @@ -0,0 +1,29 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace cuda { + +TORCH_API ::std::vector _foreach_abs(at::TensorList self); +TORCH_API void _foreach_abs_(at::TensorList self); + +} // namespace cuda +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_foreach_atan_compositeexplicitautograd_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_foreach_atan_compositeexplicitautograd_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..f7376cf29c88bf5f075b938c284e581e4855f778 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_foreach_atan_compositeexplicitautograd_dispatch.h @@ -0,0 +1,31 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace compositeexplicitautograd { + +TORCH_API ::std::vector _foreach_atan(at::TensorList self); +TORCH_API void _foreach_atan_out(at::TensorList out, at::TensorList self); +TORCH_API void _foreach_atan_outf(at::TensorList self, at::TensorList out); +TORCH_API void _foreach_atan_(at::TensorList self); + +} // namespace compositeexplicitautograd +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_foreach_clamp_max_cuda_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_foreach_clamp_max_cuda_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..f2f397c780b26b4ac9ec09ae32bc993c67d29967 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_foreach_clamp_max_cuda_dispatch.h @@ -0,0 +1,33 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace cuda { + +TORCH_API ::std::vector _foreach_clamp_max(at::TensorList self, const at::Scalar & scalar); +TORCH_API void _foreach_clamp_max_(at::TensorList self, const at::Scalar & scalar); +TORCH_API ::std::vector _foreach_clamp_max(at::TensorList self, at::TensorList other); +TORCH_API void _foreach_clamp_max_(at::TensorList self, at::TensorList other); +TORCH_API ::std::vector _foreach_clamp_max(at::TensorList self, at::ArrayRef scalars); +TORCH_API void _foreach_clamp_max_(at::TensorList self, at::ArrayRef scalars); + +} // namespace cuda +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_foreach_clamp_min_cuda_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_foreach_clamp_min_cuda_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..9ceec46773fc6793695cc65168ec06732324a531 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_foreach_clamp_min_cuda_dispatch.h @@ -0,0 +1,33 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace cuda { + +TORCH_API ::std::vector _foreach_clamp_min(at::TensorList self, const at::Scalar & scalar); +TORCH_API void _foreach_clamp_min_(at::TensorList self, const at::Scalar & scalar); +TORCH_API ::std::vector _foreach_clamp_min(at::TensorList self, at::TensorList other); +TORCH_API void _foreach_clamp_min_(at::TensorList self, at::TensorList other); +TORCH_API ::std::vector _foreach_clamp_min(at::TensorList self, at::ArrayRef scalars); +TORCH_API void _foreach_clamp_min_(at::TensorList self, at::ArrayRef scalars); + +} // namespace cuda +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_foreach_div_native.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_foreach_div_native.h new file mode 100644 index 0000000000000000000000000000000000000000..da38d2765fd2e259f584ffa2ff6482b1734cf135 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_foreach_div_native.h @@ -0,0 +1,45 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace at { +namespace native { +TORCH_API ::std::vector foreach_tensor_div_scalar_kernel_slow(at::TensorList self, const at::Scalar & scalar); +TORCH_API void _foreach_div_Scalar_out(at::TensorList self, const at::Scalar & scalar, at::TensorList out); +TORCH_API void foreach_tensor_div_scalar_kernel_slow_(at::TensorList self, const at::Scalar & scalar); +TORCH_API ::std::vector foreach_tensor_div_scalar_kernel_cuda(at::TensorList self, const at::Scalar & scalar); +TORCH_API void foreach_tensor_div_scalar_kernel_cuda_(at::TensorList self, const at::Scalar & scalar); +TORCH_API ::std::vector foreach_tensor_div_list_kernel_slow(at::TensorList self, at::TensorList other); +TORCH_API void _foreach_div_List_out(at::TensorList self, at::TensorList other, at::TensorList out); +TORCH_API void foreach_tensor_div_list_kernel_slow_(at::TensorList self, at::TensorList other); +TORCH_API ::std::vector foreach_tensor_div_list_kernel_cuda(at::TensorList self, at::TensorList other); +TORCH_API void foreach_tensor_div_list_kernel_cuda_(at::TensorList self, at::TensorList other); +TORCH_API ::std::vector foreach_tensor_div_scalarlist_kernel_slow(at::TensorList self, at::ArrayRef scalars); +TORCH_API 
void _foreach_div_ScalarList_out(at::TensorList self, at::ArrayRef scalars, at::TensorList out); +TORCH_API void foreach_tensor_div_scalarlist_kernel_slow_(at::TensorList self, at::ArrayRef scalars); +TORCH_API ::std::vector foreach_tensor_div_scalarlist_kernel_cuda(at::TensorList self, at::ArrayRef scalars); +TORCH_API void foreach_tensor_div_scalarlist_kernel_cuda_(at::TensorList self, at::ArrayRef scalars); +TORCH_API ::std::vector foreach_tensor_div_tensor_kernel_slow(at::TensorList self, const at::Tensor & other); +TORCH_API void _foreach_div_Tensor_out(at::TensorList self, const at::Tensor & other, at::TensorList out); +TORCH_API void foreach_tensor_div_tensor_kernel_slow_(at::TensorList self, const at::Tensor & other); +TORCH_API ::std::vector foreach_tensor_div_tensor_kernel_cuda(at::TensorList self, const at::Tensor & other); +TORCH_API void foreach_tensor_div_tensor_kernel_cuda_(at::TensorList self, const at::Tensor & other); +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_foreach_erfc.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_foreach_erfc.h new file mode 100644 index 0000000000000000000000000000000000000000..e08c1416b3ac471d499745939e79a4956aeee78f --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_foreach_erfc.h @@ -0,0 +1,50 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::_foreach_erfc(Tensor[] self) -> Tensor[] +inline ::std::vector _foreach_erfc(at::TensorList self) { + return at::_ops::_foreach_erfc::call(self); +} + +// aten::_foreach_erfc_(Tensor(a!)[] self) -> () +inline void _foreach_erfc_(at::TensorList self) { + return at::_ops::_foreach_erfc_::call(self); +} + +// aten::_foreach_erfc.out(Tensor[] self, *, Tensor(a!)[] out) -> () +inline void _foreach_erfc_out(at::TensorList out, at::TensorList self) { + return at::_ops::_foreach_erfc_out::call(self, out); +} +// aten::_foreach_erfc.out(Tensor[] self, *, Tensor(a!)[] out) -> () +inline void _foreach_erfc_outf(at::TensorList self, at::TensorList out) { + return at::_ops::_foreach_erfc_out::call(self, out); +} + +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_foreach_erfc_ops.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_foreach_erfc_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..2da4ff5b3194282aba2e8dad36f39cd93c298fe9 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_foreach_erfc_ops.h @@ -0,0 +1,56 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { +namespace _ops { + + +struct TORCH_API _foreach_erfc { + using schema = ::std::vector (at::TensorList); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::_foreach_erfc"; + static constexpr const char* overload_name = ""; + static constexpr const char* schema_str = "_foreach_erfc(Tensor[] self) -> Tensor[]"; + static ::std::vector call(at::TensorList self); + static ::std::vector redispatch(c10::DispatchKeySet dispatchKeySet, at::TensorList self); +}; + +struct TORCH_API _foreach_erfc_ { + using schema = void (at::TensorList); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::_foreach_erfc_"; + static constexpr const char* overload_name = ""; + static constexpr const char* schema_str = "_foreach_erfc_(Tensor(a!)[] self) -> ()"; + static void call(at::TensorList self); + static void redispatch(c10::DispatchKeySet dispatchKeySet, 
at::TensorList self); +}; + +struct TORCH_API _foreach_erfc_out { + using schema = void (at::TensorList, at::TensorList); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::_foreach_erfc"; + static constexpr const char* overload_name = "out"; + static constexpr const char* schema_str = "_foreach_erfc.out(Tensor[] self, *, Tensor(a!)[] out) -> ()"; + static void call(at::TensorList self, at::TensorList out); + static void redispatch(c10::DispatchKeySet dispatchKeySet, at::TensorList self, at::TensorList out); +}; + +}} // namespace at::_ops + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_foreach_lerp_native.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_foreach_lerp_native.h new file mode 100644 index 0000000000000000000000000000000000000000..c4c652d7de84f87e966eeaff9c053da31b1b95c2 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_foreach_lerp_native.h @@ -0,0 +1,40 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace at { +namespace native { +TORCH_API ::std::vector foreach_tensor_ternary_lerp_slow(at::TensorList self, at::TensorList tensors1, at::TensorList weights); +TORCH_API void _foreach_lerp_List_out(at::TensorList self, at::TensorList tensors1, at::TensorList weights, at::TensorList out); +TORCH_API void foreach_tensor_ternary_lerp_slow_(at::TensorList self, at::TensorList tensors1, at::TensorList weights); +TORCH_API ::std::vector 
foreach_tensor_lerp_ternary_cuda(at::TensorList self, at::TensorList tensors1, at::TensorList weights); +TORCH_API void foreach_tensor_lerp_ternary_cuda_(at::TensorList self, at::TensorList tensors1, at::TensorList weights); +TORCH_API ::std::vector foreach_tensor_lerp_list_kernel_slow(at::TensorList self, at::TensorList tensors1, const at::Scalar & weight); +TORCH_API void _foreach_lerp_Scalar_out(at::TensorList self, at::TensorList tensors1, const at::Scalar & weight, at::TensorList out); +TORCH_API void foreach_tensor_lerp_list_kernel_slow_(at::TensorList self, at::TensorList tensors1, const at::Scalar & weight); +TORCH_API ::std::vector foreach_tensor_lerp_list_cuda(at::TensorList self, at::TensorList tensors1, const at::Scalar & weight); +TORCH_API void foreach_tensor_lerp_list_cuda_(at::TensorList self, at::TensorList tensors1, const at::Scalar & weight); +TORCH_API ::std::vector foreach_tensor_lerp_scalarlist_kernel_slow(at::TensorList self, at::TensorList tensors1, at::ArrayRef weight); +TORCH_API void _foreach_lerp_ScalarList_out(at::TensorList self, at::TensorList tensors1, at::ArrayRef weight, at::TensorList out); +TORCH_API void foreach_tensor_lerp_scalarlist_kernel_slow_(at::TensorList self, at::TensorList tensors1, at::ArrayRef weight); +TORCH_API ::std::vector foreach_tensor_lerp_scalarlist_cuda(at::TensorList self, at::TensorList tensors1, at::ArrayRef weight); +TORCH_API void foreach_tensor_lerp_scalarlist_cuda_(at::TensorList self, at::TensorList tensors1, at::ArrayRef weight); +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_foreach_lerp_ops.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_foreach_lerp_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..6b5ef0aec636dc312ab444db84db4ba28737abf8 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_foreach_lerp_ops.h @@ -0,0 +1,122 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { +namespace _ops { + + +struct TORCH_API _foreach_lerp_List { + using schema = ::std::vector (at::TensorList, at::TensorList, at::TensorList); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::_foreach_lerp"; + static constexpr const char* overload_name = "List"; + static constexpr const char* schema_str = "_foreach_lerp.List(Tensor[] self, Tensor[] tensors1, Tensor[] weights) -> Tensor[]"; + static ::std::vector call(at::TensorList self, at::TensorList tensors1, at::TensorList weights); + static ::std::vector redispatch(c10::DispatchKeySet dispatchKeySet, at::TensorList self, at::TensorList tensors1, at::TensorList weights); +}; + +struct TORCH_API _foreach_lerp__List { + using schema = void (at::TensorList, at::TensorList, at::TensorList); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::_foreach_lerp_"; + static constexpr 
const char* overload_name = "List"; + static constexpr const char* schema_str = "_foreach_lerp_.List(Tensor(a!)[] self, Tensor[] tensors1, Tensor[] weights) -> ()"; + static void call(at::TensorList self, at::TensorList tensors1, at::TensorList weights); + static void redispatch(c10::DispatchKeySet dispatchKeySet, at::TensorList self, at::TensorList tensors1, at::TensorList weights); +}; + +struct TORCH_API _foreach_lerp_Scalar { + using schema = ::std::vector (at::TensorList, at::TensorList, const at::Scalar &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::_foreach_lerp"; + static constexpr const char* overload_name = "Scalar"; + static constexpr const char* schema_str = "_foreach_lerp.Scalar(Tensor[] self, Tensor[] tensors1, Scalar weight) -> Tensor[]"; + static ::std::vector call(at::TensorList self, at::TensorList tensors1, const at::Scalar & weight); + static ::std::vector redispatch(c10::DispatchKeySet dispatchKeySet, at::TensorList self, at::TensorList tensors1, const at::Scalar & weight); +}; + +struct TORCH_API _foreach_lerp__Scalar { + using schema = void (at::TensorList, at::TensorList, const at::Scalar &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::_foreach_lerp_"; + static constexpr const char* overload_name = "Scalar"; + static constexpr const char* schema_str = "_foreach_lerp_.Scalar(Tensor(a!)[] self, Tensor[] tensors1, Scalar weight) -> ()"; + static void call(at::TensorList self, at::TensorList tensors1, const at::Scalar & weight); + static void redispatch(c10::DispatchKeySet dispatchKeySet, at::TensorList self, at::TensorList tensors1, const at::Scalar & weight); +}; + +struct TORCH_API _foreach_lerp_ScalarList { + using schema = ::std::vector (at::TensorList, at::TensorList, at::ArrayRef); + using ptr_schema = schema*; + // See Note [static constexpr char* 
members for windows NVCC] + static constexpr const char* name = "aten::_foreach_lerp"; + static constexpr const char* overload_name = "ScalarList"; + static constexpr const char* schema_str = "_foreach_lerp.ScalarList(Tensor[] self, Tensor[] tensors1, Scalar[] weight) -> Tensor[]"; + static ::std::vector call(at::TensorList self, at::TensorList tensors1, at::ArrayRef weight); + static ::std::vector redispatch(c10::DispatchKeySet dispatchKeySet, at::TensorList self, at::TensorList tensors1, at::ArrayRef weight); +}; + +struct TORCH_API _foreach_lerp__ScalarList { + using schema = void (at::TensorList, at::TensorList, at::ArrayRef); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::_foreach_lerp_"; + static constexpr const char* overload_name = "ScalarList"; + static constexpr const char* schema_str = "_foreach_lerp_.ScalarList(Tensor(a!)[] self, Tensor[] tensors1, Scalar[] weight) -> ()"; + static void call(at::TensorList self, at::TensorList tensors1, at::ArrayRef weight); + static void redispatch(c10::DispatchKeySet dispatchKeySet, at::TensorList self, at::TensorList tensors1, at::ArrayRef weight); +}; + +struct TORCH_API _foreach_lerp_List_out { + using schema = void (at::TensorList, at::TensorList, at::TensorList, at::TensorList); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::_foreach_lerp"; + static constexpr const char* overload_name = "List_out"; + static constexpr const char* schema_str = "_foreach_lerp.List_out(Tensor[] self, Tensor[] tensors1, Tensor[] weights, *, Tensor(a!)[] out) -> ()"; + static void call(at::TensorList self, at::TensorList tensors1, at::TensorList weights, at::TensorList out); + static void redispatch(c10::DispatchKeySet dispatchKeySet, at::TensorList self, at::TensorList tensors1, at::TensorList weights, at::TensorList out); +}; + +struct TORCH_API 
_foreach_lerp_Scalar_out { + using schema = void (at::TensorList, at::TensorList, const at::Scalar &, at::TensorList); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::_foreach_lerp"; + static constexpr const char* overload_name = "Scalar_out"; + static constexpr const char* schema_str = "_foreach_lerp.Scalar_out(Tensor[] self, Tensor[] tensors1, Scalar weight, *, Tensor(a!)[] out) -> ()"; + static void call(at::TensorList self, at::TensorList tensors1, const at::Scalar & weight, at::TensorList out); + static void redispatch(c10::DispatchKeySet dispatchKeySet, at::TensorList self, at::TensorList tensors1, const at::Scalar & weight, at::TensorList out); +}; + +struct TORCH_API _foreach_lerp_ScalarList_out { + using schema = void (at::TensorList, at::TensorList, at::ArrayRef, at::TensorList); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::_foreach_lerp"; + static constexpr const char* overload_name = "ScalarList_out"; + static constexpr const char* schema_str = "_foreach_lerp.ScalarList_out(Tensor[] self, Tensor[] tensors1, Scalar[] weight, *, Tensor(a!)[] out) -> ()"; + static void call(at::TensorList self, at::TensorList tensors1, at::ArrayRef weight, at::TensorList out); + static void redispatch(c10::DispatchKeySet dispatchKeySet, at::TensorList self, at::TensorList tensors1, at::ArrayRef weight, at::TensorList out); +}; + +}} // namespace at::_ops + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_foreach_lgamma_native.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_foreach_lgamma_native.h new file mode 100644 index 0000000000000000000000000000000000000000..2294cf599f2c68cc36c811e0d67fd5603352a848 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_foreach_lgamma_native.h @@ -0,0 +1,30 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace at { +namespace native { +TORCH_API ::std::vector foreach_tensor_lgamma_slow(at::TensorList self); +TORCH_API void _foreach_lgamma_out(at::TensorList self, at::TensorList out); +TORCH_API void foreach_tensor_lgamma_slow_(at::TensorList self); +TORCH_API ::std::vector foreach_tensor_lgamma_cuda(at::TensorList self); +TORCH_API void foreach_tensor_lgamma_cuda_(at::TensorList self); +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_foreach_lgamma_ops.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_foreach_lgamma_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..ff8d0e67a0cad09a3cf4d68858674c29e179a2de --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_foreach_lgamma_ops.h @@ -0,0 +1,56 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { +namespace _ops { + + +struct TORCH_API _foreach_lgamma { + using schema = ::std::vector (at::TensorList); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::_foreach_lgamma"; + static constexpr const char* overload_name = ""; + static constexpr const char* schema_str = "_foreach_lgamma(Tensor[] self) -> Tensor[]"; + static ::std::vector call(at::TensorList self); + static ::std::vector redispatch(c10::DispatchKeySet dispatchKeySet, at::TensorList self); +}; + +struct TORCH_API _foreach_lgamma_ { + using schema = void (at::TensorList); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::_foreach_lgamma_"; + static constexpr const char* overload_name = ""; + static constexpr const char* schema_str = "_foreach_lgamma_(Tensor(a!)[] self) -> ()"; + static void call(at::TensorList self); + static void 
redispatch(c10::DispatchKeySet dispatchKeySet, at::TensorList self); +}; + +struct TORCH_API _foreach_lgamma_out { + using schema = void (at::TensorList, at::TensorList); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::_foreach_lgamma"; + static constexpr const char* overload_name = "out"; + static constexpr const char* schema_str = "_foreach_lgamma.out(Tensor[] self, *, Tensor(a!)[] out) -> ()"; + static void call(at::TensorList self, at::TensorList out); + static void redispatch(c10::DispatchKeySet dispatchKeySet, at::TensorList self, at::TensorList out); +}; + +}} // namespace at::_ops + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_foreach_log2.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_foreach_log2.h new file mode 100644 index 0000000000000000000000000000000000000000..6d40d8d6b0ec450bfdcf84c6481a390e0c23e681 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_foreach_log2.h @@ -0,0 +1,50 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::_foreach_log2(Tensor[] self) -> Tensor[] +inline ::std::vector _foreach_log2(at::TensorList self) { + return at::_ops::_foreach_log2::call(self); +} + +// aten::_foreach_log2_(Tensor(a!)[] self) -> () +inline void _foreach_log2_(at::TensorList self) { + return at::_ops::_foreach_log2_::call(self); +} + +// aten::_foreach_log2.out(Tensor[] self, *, 
Tensor(a!)[] out) -> () +inline void _foreach_log2_out(at::TensorList out, at::TensorList self) { + return at::_ops::_foreach_log2_out::call(self, out); +} +// aten::_foreach_log2.out(Tensor[] self, *, Tensor(a!)[] out) -> () +inline void _foreach_log2_outf(at::TensorList self, at::TensorList out) { + return at::_ops::_foreach_log2_out::call(self, out); +} + +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_foreach_log_cuda_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_foreach_log_cuda_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..e407a37dfb1ce1d5900503b286b1626016399456 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_foreach_log_cuda_dispatch.h @@ -0,0 +1,29 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace cuda { + +TORCH_API ::std::vector _foreach_log(at::TensorList self); +TORCH_API void _foreach_log_(at::TensorList self); + +} // namespace cuda +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_foreach_log_ops.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_foreach_log_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..f05148dd04d8cb3798944f707984d0829d281ec6 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_foreach_log_ops.h @@ -0,0 +1,56 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { +namespace _ops { + + +struct TORCH_API _foreach_log { + using schema = ::std::vector (at::TensorList); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::_foreach_log"; + static constexpr const char* overload_name = ""; + static constexpr const char* schema_str = "_foreach_log(Tensor[] self) -> Tensor[]"; + static ::std::vector call(at::TensorList self); + static ::std::vector redispatch(c10::DispatchKeySet dispatchKeySet, at::TensorList self); +}; + +struct TORCH_API _foreach_log_ { + using schema = void (at::TensorList); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::_foreach_log_"; + static constexpr const char* overload_name = ""; + static constexpr const char* schema_str = "_foreach_log_(Tensor(a!)[] self) -> ()"; + static void call(at::TensorList self); + static void redispatch(c10::DispatchKeySet dispatchKeySet, 
at::TensorList self); +}; + +struct TORCH_API _foreach_log_out { + using schema = void (at::TensorList, at::TensorList); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::_foreach_log"; + static constexpr const char* overload_name = "out"; + static constexpr const char* schema_str = "_foreach_log.out(Tensor[] self, *, Tensor(a!)[] out) -> ()"; + static void call(at::TensorList self, at::TensorList out); + static void redispatch(c10::DispatchKeySet dispatchKeySet, at::TensorList self, at::TensorList out); +}; + +}} // namespace at::_ops + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_foreach_norm.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_foreach_norm.h new file mode 100644 index 0000000000000000000000000000000000000000..fb8d4e00acd7fadc8f66e60ce2cb64bbe1d1b6c9 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_foreach_norm.h @@ -0,0 +1,45 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::_foreach_norm.Scalar(Tensor[] self, Scalar ord=2, ScalarType? dtype=None) -> Tensor[] +inline ::std::vector _foreach_norm(at::TensorList self, const at::Scalar & ord=2, ::std::optional dtype=::std::nullopt) { + return at::_ops::_foreach_norm_Scalar::call(self, ord, dtype); +} + +// aten::_foreach_norm.Scalar_out(Tensor[] self, Scalar ord=2, ScalarType? 
dtype=None, *, Tensor(a!)[] out) -> () +inline void _foreach_norm_out(at::TensorList out, at::TensorList self, const at::Scalar & ord=2, ::std::optional dtype=::std::nullopt) { + return at::_ops::_foreach_norm_Scalar_out::call(self, ord, dtype, out); +} +// aten::_foreach_norm.Scalar_out(Tensor[] self, Scalar ord=2, ScalarType? dtype=None, *, Tensor(a!)[] out) -> () +inline void _foreach_norm_outf(at::TensorList self, const at::Scalar & ord, ::std::optional dtype, at::TensorList out) { + return at::_ops::_foreach_norm_Scalar_out::call(self, ord, dtype, out); +} + +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_foreach_reciprocal.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_foreach_reciprocal.h new file mode 100644 index 0000000000000000000000000000000000000000..f3aaa6d2ee628419a5a51099d8e8ff6dc1351bc6 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_foreach_reciprocal.h @@ -0,0 +1,50 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::_foreach_reciprocal(Tensor[] self) -> Tensor[] +inline ::std::vector _foreach_reciprocal(at::TensorList self) { + return at::_ops::_foreach_reciprocal::call(self); +} + +// aten::_foreach_reciprocal_(Tensor(a!)[] self) -> () +inline void _foreach_reciprocal_(at::TensorList self) { + return at::_ops::_foreach_reciprocal_::call(self); +} + +// aten::_foreach_reciprocal.out(Tensor[] self, *, Tensor(a!)[] out) -> () +inline void 
_foreach_reciprocal_out(at::TensorList out, at::TensorList self) { + return at::_ops::_foreach_reciprocal_out::call(self, out); +} +// aten::_foreach_reciprocal.out(Tensor[] self, *, Tensor(a!)[] out) -> () +inline void _foreach_reciprocal_outf(at::TensorList self, at::TensorList out) { + return at::_ops::_foreach_reciprocal_out::call(self, out); +} + +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_foreach_reciprocal_compositeexplicitautograd_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_foreach_reciprocal_compositeexplicitautograd_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..ab7cc3bf5f08125c5d4fa99b674f9edd57295a28 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_foreach_reciprocal_compositeexplicitautograd_dispatch.h @@ -0,0 +1,31 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. 
+#include + +namespace at { + +namespace compositeexplicitautograd { + +TORCH_API ::std::vector _foreach_reciprocal(at::TensorList self); +TORCH_API void _foreach_reciprocal_out(at::TensorList out, at::TensorList self); +TORCH_API void _foreach_reciprocal_outf(at::TensorList self, at::TensorList out); +TORCH_API void _foreach_reciprocal_(at::TensorList self); + +} // namespace compositeexplicitautograd +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_foreach_reciprocal_cuda_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_foreach_reciprocal_cuda_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..09c9336e306177f39b7a2f7e0fdf21c362704502 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_foreach_reciprocal_cuda_dispatch.h @@ -0,0 +1,29 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. 
+#include + +namespace at { + +namespace cuda { + +TORCH_API ::std::vector _foreach_reciprocal(at::TensorList self); +TORCH_API void _foreach_reciprocal_(at::TensorList self); + +} // namespace cuda +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_foreach_sigmoid_compositeexplicitautograd_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_foreach_sigmoid_compositeexplicitautograd_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..21b5a2e0a8a88061e354cc3be3c35652411ad506 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_foreach_sigmoid_compositeexplicitautograd_dispatch.h @@ -0,0 +1,31 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. 
+#include + +namespace at { + +namespace compositeexplicitautograd { + +TORCH_API ::std::vector _foreach_sigmoid(at::TensorList self); +TORCH_API void _foreach_sigmoid_out(at::TensorList out, at::TensorList self); +TORCH_API void _foreach_sigmoid_outf(at::TensorList self, at::TensorList out); +TORCH_API void _foreach_sigmoid_(at::TensorList self); + +} // namespace compositeexplicitautograd +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_foreach_sqrt_compositeexplicitautograd_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_foreach_sqrt_compositeexplicitautograd_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..cec9f2125109bc25555fb550dde41f1ae0c4c337 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_foreach_sqrt_compositeexplicitautograd_dispatch.h @@ -0,0 +1,31 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. 
+#include + +namespace at { + +namespace compositeexplicitautograd { + +TORCH_API ::std::vector _foreach_sqrt(at::TensorList self); +TORCH_API void _foreach_sqrt_out(at::TensorList out, at::TensorList self); +TORCH_API void _foreach_sqrt_outf(at::TensorList self, at::TensorList out); +TORCH_API void _foreach_sqrt_(at::TensorList self); + +} // namespace compositeexplicitautograd +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_foreach_tanh.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_foreach_tanh.h new file mode 100644 index 0000000000000000000000000000000000000000..64d39aea5ceae758ba9a7807c933f164f6e0ca23 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_foreach_tanh.h @@ -0,0 +1,50 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::_foreach_tanh(Tensor[] self) -> Tensor[] +inline ::std::vector _foreach_tanh(at::TensorList self) { + return at::_ops::_foreach_tanh::call(self); +} + +// aten::_foreach_tanh_(Tensor(a!)[] self) -> () +inline void _foreach_tanh_(at::TensorList self) { + return at::_ops::_foreach_tanh_::call(self); +} + +// aten::_foreach_tanh.out(Tensor[] self, *, Tensor(a!)[] out) -> () +inline void _foreach_tanh_out(at::TensorList out, at::TensorList self) { + return at::_ops::_foreach_tanh_out::call(self, out); +} +// aten::_foreach_tanh.out(Tensor[] self, *, Tensor(a!)[] out) -> () +inline void _foreach_tanh_outf(at::TensorList self, 
at::TensorList out) { + return at::_ops::_foreach_tanh_out::call(self, out); +} + +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_foreach_zero_compositeexplicitautograd_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_foreach_zero_compositeexplicitautograd_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..f9be73e5e4af26673687433475ff246c2bea037c --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_foreach_zero_compositeexplicitautograd_dispatch.h @@ -0,0 +1,31 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace compositeexplicitautograd { + +TORCH_API ::std::vector _foreach_zero(at::TensorList self); +TORCH_API void _foreach_zero_out(at::TensorList out, at::TensorList self); +TORCH_API void _foreach_zero_outf(at::TensorList self, at::TensorList out); +TORCH_API void _foreach_zero_(at::TensorList self); + +} // namespace compositeexplicitautograd +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_foreach_zero_cuda_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_foreach_zero_cuda_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..ce408fecc0a3972315119f2a338ce6a4a447f304 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_foreach_zero_cuda_dispatch.h @@ -0,0 +1,28 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace cuda { + +TORCH_API void _foreach_zero_(at::TensorList self); + +} // namespace cuda +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_functional_assert_scalar_native.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_functional_assert_scalar_native.h new file mode 100644 index 0000000000000000000000000000000000000000..b043faee8618cd4873bbf2dec283d7a6e2322da3 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_functional_assert_scalar_native.h @@ -0,0 +1,26 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace at { +namespace native { +TORCH_API at::Tensor _functional_assert_scalar(const at::Scalar & self, c10::string_view assert_msg, const at::Tensor & dep_token); +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_functional_sym_constrain_range_for_size.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_functional_sym_constrain_range_for_size.h new file mode 100644 index 0000000000000000000000000000000000000000..7b3749c7bec7fef256fbff4d2cc972e6c4458dbe --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_functional_sym_constrain_range_for_size.h @@ -0,0 +1,36 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::_functional_sym_constrain_range_for_size(Scalar size, int? min, int? max, Tensor dep_token) -> Tensor +inline at::Tensor _functional_sym_constrain_range_for_size(const at::Scalar & size, ::std::optional min, ::std::optional max, const at::Tensor & dep_token) { + return at::_ops::_functional_sym_constrain_range_for_size::call(size, min, max, dep_token); +} + +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_functional_sym_constrain_range_native.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_functional_sym_constrain_range_native.h new file mode 100644 index 0000000000000000000000000000000000000000..1cd10660ced37c9da28bb9a564ef3ce5f08a9105 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_functional_sym_constrain_range_native.h @@ -0,0 +1,26 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace at { +namespace native { +TORCH_API at::Tensor _functional_sym_constrain_range(const at::Scalar & size, ::std::optional min, ::std::optional max, const at::Tensor & dep_token); +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_fused_adagrad_native.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_fused_adagrad_native.h new file mode 100644 index 0000000000000000000000000000000000000000..9ec0f048e8e2592caf970d4c5b5f1f645a0dbe4e --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_fused_adagrad_native.h @@ -0,0 +1,33 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace at { +namespace native { +TORCH_API ::std::tuple<::std::vector,::std::vector,::std::vector,::std::vector> _fused_adagrad(at::TensorList self, at::TensorList grads, at::TensorList state_sums, at::TensorList state_steps, double lr, double lr_decay, double weight_decay, double eps, bool maximize, const ::std::optional & grad_scale={}, const ::std::optional & found_inf={}); +TORCH_API void _fused_adagrad_out(at::TensorList self, at::TensorList grads, at::TensorList state_sums, at::TensorList state_steps, double lr, double lr_decay, double weight_decay, double eps, bool maximize, const ::std::optional & grad_scale, const ::std::optional & found_inf, at::TensorList out); +TORCH_API void _fused_adagrad_kernel_cpu_(at::TensorList self, at::TensorList grads, at::TensorList state_sums, at::TensorList state_steps, double lr, double lr_decay, double weight_decay, double eps, bool maximize, const ::std::optional & grad_scale={}, const ::std::optional & found_inf={}); +TORCH_API void _fused_adagrad_kernel_cuda_(at::TensorList self, at::TensorList grads, at::TensorList state_sums, at::TensorList state_steps, double lr, double lr_decay, double weight_decay, double eps, bool maximize, 
const ::std::optional & grad_scale={}, const ::std::optional & found_inf={}); +TORCH_API ::std::tuple<::std::vector,::std::vector,::std::vector> _fused_adagrad(at::TensorList self, at::TensorList grads, at::TensorList state_sums, at::TensorList state_steps, const at::Tensor & lr, double lr_decay, double weight_decay, double eps, bool maximize, const ::std::optional & grad_scale={}, const ::std::optional & found_inf={}); +TORCH_API void _fused_adagrad_tensor_lr_out(at::TensorList self, at::TensorList grads, at::TensorList state_sums, at::TensorList state_steps, const at::Tensor & lr, double lr_decay, double weight_decay, double eps, bool maximize, const ::std::optional & grad_scale, const ::std::optional & found_inf, at::TensorList out); +TORCH_API void _fused_adagrad_kernel_cpu_(at::TensorList self, at::TensorList grads, at::TensorList state_sums, at::TensorList state_steps, const at::Tensor & lr, double lr_decay, double weight_decay, double eps, bool maximize, const ::std::optional & grad_scale={}, const ::std::optional & found_inf={}); +TORCH_API void _fused_adagrad_kernel_cuda_(at::TensorList self, at::TensorList grads, at::TensorList state_sums, at::TensorList state_steps, const at::Tensor & lr, double lr_decay, double weight_decay, double eps, bool maximize, const ::std::optional & grad_scale={}, const ::std::optional & found_inf={}); +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_fused_adagrad_ops.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_fused_adagrad_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..c405f213414bac9f4314d7e0e135bd7cb1715841 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_fused_adagrad_ops.h @@ -0,0 +1,89 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { +namespace _ops { + + +struct TORCH_API _fused_adagrad_ { + using schema = void (at::TensorList, at::TensorList, at::TensorList, at::TensorList, double, double, double, double, bool, const ::std::optional &, const ::std::optional &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::_fused_adagrad_"; + static constexpr const char* overload_name = ""; + static constexpr const char* schema_str = "_fused_adagrad_(Tensor(a!)[] self, Tensor(b!)[] grads, Tensor(c!)[] state_sums, Tensor(d!)[] state_steps, *, float lr, float lr_decay, float weight_decay, float eps, bool maximize, Tensor? grad_scale=None, Tensor? 
found_inf=None) -> ()"; + static void call(at::TensorList self, at::TensorList grads, at::TensorList state_sums, at::TensorList state_steps, double lr, double lr_decay, double weight_decay, double eps, bool maximize, const ::std::optional & grad_scale, const ::std::optional & found_inf); + static void redispatch(c10::DispatchKeySet dispatchKeySet, at::TensorList self, at::TensorList grads, at::TensorList state_sums, at::TensorList state_steps, double lr, double lr_decay, double weight_decay, double eps, bool maximize, const ::std::optional & grad_scale, const ::std::optional & found_inf); +}; + +struct TORCH_API _fused_adagrad__tensor_lr { + using schema = void (at::TensorList, at::TensorList, at::TensorList, at::TensorList, const at::Tensor &, double, double, double, bool, const ::std::optional &, const ::std::optional &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::_fused_adagrad_"; + static constexpr const char* overload_name = "tensor_lr"; + static constexpr const char* schema_str = "_fused_adagrad_.tensor_lr(Tensor(a!)[] self, Tensor(b!)[] grads, Tensor(c!)[] state_sums, Tensor[] state_steps, *, Tensor lr, float lr_decay, float weight_decay, float eps, bool maximize, Tensor? grad_scale=None, Tensor? 
found_inf=None) -> ()"; + static void call(at::TensorList self, at::TensorList grads, at::TensorList state_sums, at::TensorList state_steps, const at::Tensor & lr, double lr_decay, double weight_decay, double eps, bool maximize, const ::std::optional & grad_scale, const ::std::optional & found_inf); + static void redispatch(c10::DispatchKeySet dispatchKeySet, at::TensorList self, at::TensorList grads, at::TensorList state_sums, at::TensorList state_steps, const at::Tensor & lr, double lr_decay, double weight_decay, double eps, bool maximize, const ::std::optional & grad_scale, const ::std::optional & found_inf); +}; + +struct TORCH_API _fused_adagrad_out { + using schema = void (at::TensorList, at::TensorList, at::TensorList, at::TensorList, double, double, double, double, bool, const ::std::optional &, const ::std::optional &, at::TensorList); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::_fused_adagrad"; + static constexpr const char* overload_name = "out"; + static constexpr const char* schema_str = "_fused_adagrad.out(Tensor[] self, Tensor(b!)[] grads, Tensor(c!)[] state_sums, Tensor(d!)[] state_steps, *, float lr, float lr_decay, float weight_decay, float eps, bool maximize, Tensor? grad_scale=None, Tensor? 
found_inf=None, Tensor(a!)[] out) -> ()"; + static void call(at::TensorList self, at::TensorList grads, at::TensorList state_sums, at::TensorList state_steps, double lr, double lr_decay, double weight_decay, double eps, bool maximize, const ::std::optional & grad_scale, const ::std::optional & found_inf, at::TensorList out); + static void redispatch(c10::DispatchKeySet dispatchKeySet, at::TensorList self, at::TensorList grads, at::TensorList state_sums, at::TensorList state_steps, double lr, double lr_decay, double weight_decay, double eps, bool maximize, const ::std::optional & grad_scale, const ::std::optional & found_inf, at::TensorList out); +}; + +struct TORCH_API _fused_adagrad { + using schema = ::std::tuple<::std::vector,::std::vector,::std::vector,::std::vector> (at::TensorList, at::TensorList, at::TensorList, at::TensorList, double, double, double, double, bool, const ::std::optional &, const ::std::optional &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::_fused_adagrad"; + static constexpr const char* overload_name = ""; + static constexpr const char* schema_str = "_fused_adagrad(Tensor[] self, Tensor[] grads, Tensor[] state_sums, Tensor[] state_steps, *, float lr, float lr_decay, float weight_decay, float eps, bool maximize, Tensor? grad_scale=None, Tensor? 
found_inf=None) -> (Tensor[] self_out, Tensor[] grads_out, Tensor[] state_sums_out, Tensor[] state_steps_out)"; + static ::std::tuple<::std::vector,::std::vector,::std::vector,::std::vector> call(at::TensorList self, at::TensorList grads, at::TensorList state_sums, at::TensorList state_steps, double lr, double lr_decay, double weight_decay, double eps, bool maximize, const ::std::optional & grad_scale, const ::std::optional & found_inf); + static ::std::tuple<::std::vector,::std::vector,::std::vector,::std::vector> redispatch(c10::DispatchKeySet dispatchKeySet, at::TensorList self, at::TensorList grads, at::TensorList state_sums, at::TensorList state_steps, double lr, double lr_decay, double weight_decay, double eps, bool maximize, const ::std::optional & grad_scale, const ::std::optional & found_inf); +}; + +struct TORCH_API _fused_adagrad_tensor_lr_out { + using schema = void (at::TensorList, at::TensorList, at::TensorList, at::TensorList, const at::Tensor &, double, double, double, bool, const ::std::optional &, const ::std::optional &, at::TensorList); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::_fused_adagrad"; + static constexpr const char* overload_name = "tensor_lr_out"; + static constexpr const char* schema_str = "_fused_adagrad.tensor_lr_out(Tensor[] self, Tensor(b!)[] grads, Tensor(c!)[] state_sums, Tensor[] state_steps, *, Tensor lr, float lr_decay, float weight_decay, float eps, bool maximize, Tensor? grad_scale=None, Tensor? 
found_inf=None, Tensor(a!)[] out) -> ()"; + static void call(at::TensorList self, at::TensorList grads, at::TensorList state_sums, at::TensorList state_steps, const at::Tensor & lr, double lr_decay, double weight_decay, double eps, bool maximize, const ::std::optional & grad_scale, const ::std::optional & found_inf, at::TensorList out); + static void redispatch(c10::DispatchKeySet dispatchKeySet, at::TensorList self, at::TensorList grads, at::TensorList state_sums, at::TensorList state_steps, const at::Tensor & lr, double lr_decay, double weight_decay, double eps, bool maximize, const ::std::optional & grad_scale, const ::std::optional & found_inf, at::TensorList out); +}; + +struct TORCH_API _fused_adagrad_tensor_lr { + using schema = ::std::tuple<::std::vector,::std::vector,::std::vector> (at::TensorList, at::TensorList, at::TensorList, at::TensorList, const at::Tensor &, double, double, double, bool, const ::std::optional &, const ::std::optional &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::_fused_adagrad"; + static constexpr const char* overload_name = "tensor_lr"; + static constexpr const char* schema_str = "_fused_adagrad.tensor_lr(Tensor[] self, Tensor[] grads, Tensor[] state_sums, Tensor[] state_steps, *, Tensor lr, float lr_decay, float weight_decay, float eps, bool maximize, Tensor? grad_scale=None, Tensor? 
found_inf=None) -> (Tensor[] self_out, Tensor[] grads_out, Tensor[] state_sums_out)"; + static ::std::tuple<::std::vector,::std::vector,::std::vector> call(at::TensorList self, at::TensorList grads, at::TensorList state_sums, at::TensorList state_steps, const at::Tensor & lr, double lr_decay, double weight_decay, double eps, bool maximize, const ::std::optional & grad_scale, const ::std::optional & found_inf); + static ::std::tuple<::std::vector,::std::vector,::std::vector> redispatch(c10::DispatchKeySet dispatchKeySet, at::TensorList self, at::TensorList grads, at::TensorList state_sums, at::TensorList state_steps, const at::Tensor & lr, double lr_decay, double weight_decay, double eps, bool maximize, const ::std::optional & grad_scale, const ::std::optional & found_inf); +}; + +}} // namespace at::_ops + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_grid_sampler_2d_cpu_fallback_backward_compositeimplicitautograd_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_grid_sampler_2d_cpu_fallback_backward_compositeimplicitautograd_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..96144cae319b0bede01ea0dbebc38fb9dd428cae --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_grid_sampler_2d_cpu_fallback_backward_compositeimplicitautograd_dispatch.h @@ -0,0 +1,28 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of 
any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace compositeimplicitautograd { + +TORCH_API ::std::tuple _grid_sampler_2d_cpu_fallback_backward(const at::Tensor & grad_output, const at::Tensor & input, const at::Tensor & grid, int64_t interpolation_mode, int64_t padding_mode, bool align_corners); + +} // namespace compositeimplicitautograd +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_has_compatible_shallow_copy_type_native.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_has_compatible_shallow_copy_type_native.h new file mode 100644 index 0000000000000000000000000000000000000000..de7e07fff6da10fac02156c9a008a3225c2c5d53 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_has_compatible_shallow_copy_type_native.h @@ -0,0 +1,26 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace at { +namespace native { +TORCH_API bool _has_compatible_shallow_copy_type(const at::Tensor & self, const at::Tensor & from); +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_histogramdd_bin_edges_cpu_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_histogramdd_bin_edges_cpu_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..3ebbd414f8ae5436b41d69a36fa021bc9d7b5993 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_histogramdd_bin_edges_cpu_dispatch.h @@ -0,0 +1,28 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace cpu { + +TORCH_API ::std::vector _histogramdd_bin_edges(const at::Tensor & self, at::IntArrayRef bins, ::std::optional> range=::std::nullopt, const ::std::optional & weight={}, bool density=false); + +} // namespace cpu +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_histogramdd_from_bin_tensors_ops.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_histogramdd_from_bin_tensors_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..8010026d1c3ca0ba546f33fbb6f78f1e2559eccc --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_histogramdd_from_bin_tensors_ops.h @@ -0,0 +1,45 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { +namespace _ops { + + +struct TORCH_API _histogramdd_from_bin_tensors { + using schema = at::Tensor (const at::Tensor &, at::TensorList, const ::std::optional &, bool); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::_histogramdd_from_bin_tensors"; + static constexpr const char* overload_name = ""; + static constexpr const char* schema_str = "_histogramdd_from_bin_tensors(Tensor self, Tensor[] bins, *, Tensor? 
weight=None, bool density=False) -> Tensor"; + static at::Tensor call(const at::Tensor & self, at::TensorList bins, const ::std::optional & weight, bool density); + static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::TensorList bins, const ::std::optional & weight, bool density); +}; + +struct TORCH_API _histogramdd_from_bin_tensors_out { + using schema = at::Tensor & (const at::Tensor &, at::TensorList, const ::std::optional &, bool, at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::_histogramdd_from_bin_tensors"; + static constexpr const char* overload_name = "out"; + static constexpr const char* schema_str = "_histogramdd_from_bin_tensors.out(Tensor self, Tensor[] bins, *, Tensor? weight=None, bool density=False, Tensor(a!) out) -> Tensor(a!)"; + static at::Tensor & call(const at::Tensor & self, at::TensorList bins, const ::std::optional & weight, bool density, at::Tensor & out); + static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::TensorList bins, const ::std::optional & weight, bool density, at::Tensor & out); +}; + +}} // namespace at::_ops + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_index_put_impl_meta_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_index_put_impl_meta_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..57f52265eb554b090f67af0e7a86be3a4974bf55 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_index_put_impl_meta_dispatch.h @@ -0,0 +1,28 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace meta { + +TORCH_API at::Tensor & _index_put_impl_(at::Tensor & self, const c10::List<::std::optional> & indices, const at::Tensor & values, bool accumulate=false, bool unsafe=false); + +} // namespace meta +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_int_mm_native.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_int_mm_native.h new file mode 100644 index 0000000000000000000000000000000000000000..66dd22fe218e5d4ca448978bd637520b13251e75 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_int_mm_native.h @@ -0,0 +1,29 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace at { +namespace native { +TORCH_API at::Tensor _int_mm_cpu(const at::Tensor & self, const at::Tensor & mat2); +TORCH_API at::Tensor & _int_mm_out_cpu(const at::Tensor & self, const at::Tensor & mat2, at::Tensor & out); +TORCH_API at::Tensor _int_mm_cuda(const at::Tensor & self, const at::Tensor & mat2); +TORCH_API at::Tensor & _int_mm_out_cuda(const at::Tensor & self, const at::Tensor & mat2, at::Tensor & out); +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_is_any_true_native.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_is_any_true_native.h new file mode 100644 index 0000000000000000000000000000000000000000..3f970116420699adb1c87228ac513c1e23f15719 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_is_any_true_native.h @@ -0,0 +1,26 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace at { +namespace native { +TORCH_API at::Tensor _is_any_true(const at::Tensor & self); +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_is_zerotensor_native.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_is_zerotensor_native.h new file mode 100644 index 0000000000000000000000000000000000000000..b6eb678580de542cb8c3044bc1faea4ba27bd9ad --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_is_zerotensor_native.h @@ -0,0 +1,26 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace at { +namespace native { +TORCH_API bool _is_zerotensor(const at::Tensor & self); +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_jagged_to_padded_dense_forward_native.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_jagged_to_padded_dense_forward_native.h new file mode 100644 index 0000000000000000000000000000000000000000..e436589d498f5bc79ba0fa255db2fc75afcc60db --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_jagged_to_padded_dense_forward_native.h @@ -0,0 +1,27 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace at { +namespace native { +TORCH_API at::Tensor _jagged_to_padded_dense_forward_cpu(const at::Tensor & values, at::TensorList offsets, at::IntArrayRef max_lengths, double padding_value=0.0); +TORCH_API at::Tensor _fbgemm_jagged_to_padded_dense_forward(const at::Tensor & values, at::TensorList offsets, at::IntArrayRef max_lengths, double padding_value=0.0); +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_jagged_to_padded_dense_forward_ops.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_jagged_to_padded_dense_forward_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..1b9b16ac7a0f8533b1eea5fd9750322afe5120c3 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_jagged_to_padded_dense_forward_ops.h @@ -0,0 +1,34 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. 
+#include + +namespace at { +namespace _ops { + + +struct TORCH_API _jagged_to_padded_dense_forward { + using schema = at::Tensor (const at::Tensor &, at::TensorList, c10::SymIntArrayRef, double); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::_jagged_to_padded_dense_forward"; + static constexpr const char* overload_name = ""; + static constexpr const char* schema_str = "_jagged_to_padded_dense_forward(Tensor values, Tensor[] offsets, SymInt[] max_lengths, float padding_value=0.0) -> Tensor"; + static at::Tensor call(const at::Tensor & values, at::TensorList offsets, c10::SymIntArrayRef max_lengths, double padding_value); + static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & values, at::TensorList offsets, c10::SymIntArrayRef max_lengths, double padding_value); +}; + +}} // namespace at::_ops + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_lazy_clone_native.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_lazy_clone_native.h new file mode 100644 index 0000000000000000000000000000000000000000..abb51ddbbb78e4995d4fd4dfc06ba37007f281d3 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_lazy_clone_native.h @@ -0,0 +1,26 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace at { +namespace native { +TORCH_API at::Tensor _lazy_clone(const at::Tensor & self); +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_linalg_eigh_meta_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_linalg_eigh_meta_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..27fccf84ec8ae28217d61b44fd8636b6842e6b13 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_linalg_eigh_meta_dispatch.h @@ -0,0 +1,30 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace meta { + +TORCH_API ::std::tuple _linalg_eigh(const at::Tensor & A, c10::string_view UPLO="L", bool compute_v=true); +TORCH_API ::std::tuple _linalg_eigh_out(at::Tensor & eigenvalues, at::Tensor & eigenvectors, const at::Tensor & A, c10::string_view UPLO="L", bool compute_v=true); +TORCH_API ::std::tuple _linalg_eigh_outf(const at::Tensor & A, c10::string_view UPLO, bool compute_v, at::Tensor & eigenvalues, at::Tensor & eigenvectors); + +} // namespace meta +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_linalg_solve_ex.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_linalg_solve_ex.h new file mode 100644 index 0000000000000000000000000000000000000000..2bb785c5086759c5900f936dc4e22c08336e59c1 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_linalg_solve_ex.h @@ -0,0 +1,45 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::_linalg_solve_ex(Tensor A, Tensor B, *, bool left=True, bool check_errors=False) -> (Tensor result, Tensor LU, Tensor pivots, Tensor info) +inline ::std::tuple _linalg_solve_ex(const at::Tensor & A, const at::Tensor & B, bool left=true, bool check_errors=false) { + return at::_ops::_linalg_solve_ex::call(A, B, left, check_errors); +} + +// aten::_linalg_solve_ex.result(Tensor A, Tensor B, *, bool left=True, bool check_errors=False, Tensor(a!) result, Tensor(b!) LU, Tensor(c!) pivots, Tensor(d!) info) -> (Tensor(a!) result, Tensor(b!) LU, Tensor(c!) pivots, Tensor(d!) info) +inline ::std::tuple _linalg_solve_ex_out(at::Tensor & result, at::Tensor & LU, at::Tensor & pivots, at::Tensor & info, const at::Tensor & A, const at::Tensor & B, bool left=true, bool check_errors=false) { + return at::_ops::_linalg_solve_ex_result::call(A, B, left, check_errors, result, LU, pivots, info); +} +// aten::_linalg_solve_ex.result(Tensor A, Tensor B, *, bool left=True, bool check_errors=False, Tensor(a!) result, Tensor(b!) LU, Tensor(c!) pivots, Tensor(d!) info) -> (Tensor(a!) result, Tensor(b!) LU, Tensor(c!) pivots, Tensor(d!) 
info) +inline ::std::tuple _linalg_solve_ex_outf(const at::Tensor & A, const at::Tensor & B, bool left, bool check_errors, at::Tensor & result, at::Tensor & LU, at::Tensor & pivots, at::Tensor & info) { + return at::_ops::_linalg_solve_ex_result::call(A, B, left, check_errors, result, LU, pivots, info); +} + +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_linalg_svd_compositeexplicitautogradnonfunctional_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_linalg_svd_compositeexplicitautogradnonfunctional_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..b95a407b401d0ee146036e4d4aa7ab71f2598a40 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_linalg_svd_compositeexplicitautogradnonfunctional_dispatch.h @@ -0,0 +1,28 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. 
+#include + +namespace at { + +namespace compositeexplicitautogradnonfunctional { + +TORCH_API ::std::tuple _linalg_svd(const at::Tensor & A, bool full_matrices=false, bool compute_uv=true, ::std::optional driver=::std::nullopt); + +} // namespace compositeexplicitautogradnonfunctional +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_linalg_svd_meta.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_linalg_svd_meta.h new file mode 100644 index 0000000000000000000000000000000000000000..11790b6389fa75823969ddd95aa03cd319dd91c5 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_linalg_svd_meta.h @@ -0,0 +1,32 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeMetaFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace at { +namespace meta { + +struct TORCH_API structured__linalg_svd : public at::impl::MetaBase { + + + void meta(const at::Tensor & A, bool full_matrices, bool compute_uv, ::std::optional driver); +}; + +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_linalg_svd_ops.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_linalg_svd_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..c480fc7c69d6ab0b3bef48042478d96182ce78a6 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_linalg_svd_ops.h @@ -0,0 +1,45 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { +namespace _ops { + + +struct TORCH_API _linalg_svd { + using schema = ::std::tuple (const at::Tensor &, bool, bool, ::std::optional); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::_linalg_svd"; + static constexpr const char* overload_name = ""; + static constexpr const char* schema_str = "_linalg_svd(Tensor A, bool full_matrices=False, bool compute_uv=True, *, str? 
driver=None) -> (Tensor U, Tensor S, Tensor Vh)"; + static ::std::tuple call(const at::Tensor & A, bool full_matrices, bool compute_uv, ::std::optional driver); + static ::std::tuple redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & A, bool full_matrices, bool compute_uv, ::std::optional driver); +}; + +struct TORCH_API _linalg_svd_U { + using schema = ::std::tuple (const at::Tensor &, bool, bool, ::std::optional, at::Tensor &, at::Tensor &, at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::_linalg_svd"; + static constexpr const char* overload_name = "U"; + static constexpr const char* schema_str = "_linalg_svd.U(Tensor A, bool full_matrices=False, bool compute_uv=True, *, str? driver=None, Tensor(a!) U, Tensor(b!) S, Tensor(c!) Vh) -> (Tensor(a!) U, Tensor(b!) S, Tensor(c!) Vh)"; + static ::std::tuple call(const at::Tensor & A, bool full_matrices, bool compute_uv, ::std::optional driver, at::Tensor & U, at::Tensor & S, at::Tensor & Vh); + static ::std::tuple redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & A, bool full_matrices, bool compute_uv, ::std::optional driver, at::Tensor & U, at::Tensor & S, at::Tensor & Vh); +}; + +}} // namespace at::_ops + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_local_scalar_dense.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_local_scalar_dense.h new file mode 100644 index 0000000000000000000000000000000000000000..372cf08e89f646a5c7b925bf5c9d549737305aef --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_local_scalar_dense.h @@ -0,0 +1,36 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::_local_scalar_dense(Tensor self) -> Scalar +inline at::Scalar _local_scalar_dense(const at::Tensor & self) { + return at::_ops::_local_scalar_dense::call(self); +} + +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_local_scalar_dense_cpu_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_local_scalar_dense_cpu_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..88014d1a605d41b814365f382df79ae497fcadfb --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_local_scalar_dense_cpu_dispatch.h @@ -0,0 +1,28 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace cpu { + +TORCH_API at::Scalar _local_scalar_dense(const at::Tensor & self); + +} // namespace cpu +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_log_softmax_backward_data_meta_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_log_softmax_backward_data_meta_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..8ef313606714305d8aa67783ee0bb55104dfa024 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_log_softmax_backward_data_meta_dispatch.h @@ -0,0 +1,30 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace meta { + +TORCH_API at::Tensor _log_softmax_backward_data(const at::Tensor & grad_output, const at::Tensor & output, int64_t dim, at::ScalarType input_dtype); +TORCH_API at::Tensor & _log_softmax_backward_data_out(at::Tensor & out, const at::Tensor & grad_output, const at::Tensor & output, int64_t dim, at::ScalarType input_dtype); +TORCH_API at::Tensor & _log_softmax_backward_data_outf(const at::Tensor & grad_output, const at::Tensor & output, int64_t dim, at::ScalarType input_dtype, at::Tensor & out); + +} // namespace meta +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_logcumsumexp_native.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_logcumsumexp_native.h new file mode 100644 index 0000000000000000000000000000000000000000..b0eaa1052e385ffef1c5b41c66d534911996bbb3 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_logcumsumexp_native.h @@ -0,0 +1,29 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace at { +namespace native { +TORCH_API at::Tensor _logcumsumexp_cpu(const at::Tensor & self, int64_t dim); +TORCH_API at::Tensor & _logcumsumexp_out_cpu(const at::Tensor & self, int64_t dim, at::Tensor & out); +TORCH_API at::Tensor _logcumsumexp_cuda(const at::Tensor & self, int64_t dim); +TORCH_API at::Tensor & _logcumsumexp_out_cuda(const at::Tensor & self, int64_t dim, at::Tensor & out); +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_lstm_mps.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_lstm_mps.h new file mode 100644 index 0000000000000000000000000000000000000000..fca80d725ad1264af48467fa58af499bf6ebf16c --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_lstm_mps.h @@ -0,0 +1,45 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::_lstm_mps(Tensor input, Tensor[] hx, Tensor[] params, bool has_biases, int num_layers, float dropout, bool train, bool bidirectional, bool batch_first) -> (Tensor, Tensor, Tensor, Tensor, Tensor, Tensor) +inline ::std::tuple _lstm_mps(const at::Tensor & input, at::TensorList hx, at::TensorList params, bool has_biases, int64_t num_layers, double dropout, bool train, bool bidirectional, bool batch_first) { + return at::_ops::_lstm_mps::call(input, hx, params, has_biases, num_layers, dropout, train, bidirectional, batch_first); +} + +// aten::_lstm_mps.out(Tensor input, Tensor[] hx, Tensor[] params, bool has_biases, int num_layers, float dropout, bool train, bool bidirectional, bool batch_first, *, Tensor(a!) out0, Tensor(b!) out1, Tensor(c!) out2, Tensor(d!) out3, Tensor(e!) out4, Tensor(f!) 
out5) -> (Tensor(a!), Tensor(b!), Tensor(c!), Tensor(d!), Tensor(e!), Tensor(f!)) +inline ::std::tuple _lstm_mps_out(at::Tensor & out0, at::Tensor & out1, at::Tensor & out2, at::Tensor & out3, at::Tensor & out4, at::Tensor & out5, const at::Tensor & input, at::TensorList hx, at::TensorList params, bool has_biases, int64_t num_layers, double dropout, bool train, bool bidirectional, bool batch_first) { + return at::_ops::_lstm_mps_out::call(input, hx, params, has_biases, num_layers, dropout, train, bidirectional, batch_first, out0, out1, out2, out3, out4, out5); +} +// aten::_lstm_mps.out(Tensor input, Tensor[] hx, Tensor[] params, bool has_biases, int num_layers, float dropout, bool train, bool bidirectional, bool batch_first, *, Tensor(a!) out0, Tensor(b!) out1, Tensor(c!) out2, Tensor(d!) out3, Tensor(e!) out4, Tensor(f!) out5) -> (Tensor(a!), Tensor(b!), Tensor(c!), Tensor(d!), Tensor(e!), Tensor(f!)) +inline ::std::tuple _lstm_mps_outf(const at::Tensor & input, at::TensorList hx, at::TensorList params, bool has_biases, int64_t num_layers, double dropout, bool train, bool bidirectional, bool batch_first, at::Tensor & out0, at::Tensor & out1, at::Tensor & out2, at::Tensor & out3, at::Tensor & out4, at::Tensor & out5) { + return at::_ops::_lstm_mps_out::call(input, hx, params, has_biases, num_layers, dropout, train, bidirectional, batch_first, out0, out1, out2, out3, out4, out5); +} + +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_lu_with_info.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_lu_with_info.h new file mode 100644 index 0000000000000000000000000000000000000000..907ff3f14c12147356a55dad34809c865d4f0e47 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_lu_with_info.h @@ -0,0 +1,36 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::_lu_with_info(Tensor self, bool pivot=True, bool check_errors=True) -> (Tensor LU, Tensor pivots, Tensor info) +inline ::std::tuple _lu_with_info(const at::Tensor & self, bool pivot=true, bool check_errors=true) { + return at::_ops::_lu_with_info::call(self, pivot, check_errors); +} + +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_make_dep_token_ops.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_make_dep_token_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..9cd4c4197ffb52c56f4d23e5850a592f99007efc --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_make_dep_token_ops.h @@ -0,0 +1,34 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { +namespace _ops { + + +struct TORCH_API _make_dep_token { + using schema = at::Tensor (::std::optional, ::std::optional, ::std::optional, ::std::optional, ::std::optional); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::_make_dep_token"; + static constexpr const char* overload_name = ""; + static constexpr const char* schema_str = "_make_dep_token(*, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, MemoryFormat? 
memory_format=None) -> Tensor"; + static at::Tensor call(::std::optional dtype, ::std::optional layout, ::std::optional device, ::std::optional pin_memory, ::std::optional memory_format); + static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, ::std::optional dtype, ::std::optional layout, ::std::optional device, ::std::optional pin_memory, ::std::optional memory_format); +}; + +}} // namespace at::_ops + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_make_dual_copy_compositeexplicitautogradnonfunctional_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_make_dual_copy_compositeexplicitautogradnonfunctional_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..8742c7a8e4d200226fc622deca2f8ecc2a6f9997 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_make_dual_copy_compositeexplicitautogradnonfunctional_dispatch.h @@ -0,0 +1,28 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. 
+#include + +namespace at { + +namespace compositeexplicitautogradnonfunctional { + +TORCH_API at::Tensor _make_dual_copy(const at::Tensor & primal, const at::Tensor & tangent, int64_t level); + +} // namespace compositeexplicitautogradnonfunctional +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_make_per_channel_quantized_tensor_cpu_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_make_per_channel_quantized_tensor_cpu_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..24104a392cabc0b0aa865a0c133b36ae4c495d11 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_make_per_channel_quantized_tensor_cpu_dispatch.h @@ -0,0 +1,28 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace cpu { + +TORCH_API at::Tensor _make_per_channel_quantized_tensor(const at::Tensor & self, const at::Tensor & scale, const at::Tensor & zero_point, int64_t axis); + +} // namespace cpu +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_make_per_tensor_quantized_tensor_compositeexplicitautograd_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_make_per_tensor_quantized_tensor_compositeexplicitautograd_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..cde2f592ddee6a3e1d2606378f6d97558694d787 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_make_per_tensor_quantized_tensor_compositeexplicitautograd_dispatch.h @@ -0,0 +1,29 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace compositeexplicitautograd { + +TORCH_API at::Tensor & _make_per_tensor_quantized_tensor_out(at::Tensor & out, const at::Tensor & self, double scale, int64_t zero_point); +TORCH_API at::Tensor & _make_per_tensor_quantized_tensor_outf(const at::Tensor & self, double scale, int64_t zero_point, at::Tensor & out); + +} // namespace compositeexplicitautograd +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_masked_scale.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_masked_scale.h new file mode 100644 index 0000000000000000000000000000000000000000..33a51018a217325c430d33b222a25e425f0c7d65 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_masked_scale.h @@ -0,0 +1,45 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::_masked_scale(Tensor self, Tensor mask, float scale) -> Tensor +inline at::Tensor _masked_scale(const at::Tensor & self, const at::Tensor & mask, double scale) { + return at::_ops::_masked_scale::call(self, mask, scale); +} + +// aten::_masked_scale.out(Tensor self, Tensor mask, float scale, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & _masked_scale_out(at::Tensor & out, const at::Tensor & self, const at::Tensor & mask, double scale) { + return at::_ops::_masked_scale_out::call(self, mask, scale, out); +} +// aten::_masked_scale.out(Tensor self, Tensor mask, float scale, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & _masked_scale_outf(const at::Tensor & self, const at::Tensor & mask, double scale, at::Tensor & out) { + return at::_ops::_masked_scale_out::call(self, mask, scale, out); +} + +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_masked_scale_cuda_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_masked_scale_cuda_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..b95bb9a5cda0342a5422f11278f2c4466d65b9f7 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_masked_scale_cuda_dispatch.h @@ -0,0 +1,28 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace cuda { + +TORCH_API at::Tensor _masked_scale(const at::Tensor & self, const at::Tensor & mask, double scale); + +} // namespace cuda +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_masked_softmax_backward_native.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_masked_softmax_backward_native.h new file mode 100644 index 0000000000000000000000000000000000000000..2e73239afcb868afa2d1032289c10d9588ce0188 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_masked_softmax_backward_native.h @@ -0,0 +1,28 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace at { +namespace native { +TORCH_API at::Tensor & _masked_softmax_backward_out(const at::Tensor & grad_output, const at::Tensor & output, const at::Tensor & mask, ::std::optional dim, at::Tensor & out); +TORCH_API at::Tensor masked_softmax_backward_cpu(const at::Tensor & grad_output, const at::Tensor & output, const at::Tensor & mask, ::std::optional dim=::std::nullopt); +TORCH_API at::Tensor masked_softmax_backward_cuda(const at::Tensor & grad_output, const at::Tensor & output, const at::Tensor & mask, ::std::optional dim=::std::nullopt); +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_masked_softmax_backward_ops.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_masked_softmax_backward_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..8ae4354935814764678de5d11afa2e89e9bb701c --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_masked_softmax_backward_ops.h @@ -0,0 +1,45 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { +namespace _ops { + + +struct TORCH_API _masked_softmax_backward { + using schema = at::Tensor (const at::Tensor &, const at::Tensor &, const at::Tensor &, ::std::optional); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::_masked_softmax_backward"; + static constexpr const char* overload_name = ""; + static constexpr const char* schema_str = "_masked_softmax_backward(Tensor grad_output, Tensor output, Tensor mask, int? 
dim=None) -> Tensor"; + static at::Tensor call(const at::Tensor & grad_output, const at::Tensor & output, const at::Tensor & mask, ::std::optional dim); + static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad_output, const at::Tensor & output, const at::Tensor & mask, ::std::optional dim); +}; + +struct TORCH_API _masked_softmax_backward_out { + using schema = at::Tensor & (const at::Tensor &, const at::Tensor &, const at::Tensor &, ::std::optional, at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::_masked_softmax_backward"; + static constexpr const char* overload_name = "out"; + static constexpr const char* schema_str = "_masked_softmax_backward.out(Tensor grad_output, Tensor output, Tensor mask, int? dim=None, *, Tensor(a!) out) -> Tensor(a!)"; + static at::Tensor & call(const at::Tensor & grad_output, const at::Tensor & output, const at::Tensor & mask, ::std::optional dim, at::Tensor & out); + static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad_output, const at::Tensor & output, const at::Tensor & mask, ::std::optional dim, at::Tensor & out); +}; + +}} // namespace at::_ops + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_masked_softmax_compositeexplicitautograd_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_masked_softmax_compositeexplicitautograd_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..59c3453ecc8d7b6facd3268e2805cbd6d84f5f71 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_masked_softmax_compositeexplicitautograd_dispatch.h @@ -0,0 +1,29 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace compositeexplicitautograd { + +TORCH_API at::Tensor & _masked_softmax_out(at::Tensor & out, const at::Tensor & self, const at::Tensor & mask, ::std::optional dim=::std::nullopt, ::std::optional mask_type=::std::nullopt); +TORCH_API at::Tensor & _masked_softmax_outf(const at::Tensor & self, const at::Tensor & mask, ::std::optional dim, ::std::optional mask_type, at::Tensor & out); + +} // namespace compositeexplicitautograd +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_masked_softmax_cpu_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_masked_softmax_cpu_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..cd491a92c0f7469ce2531c8eb189c5a0cfebb9bb --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_masked_softmax_cpu_dispatch.h @@ -0,0 +1,28 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace cpu { + +TORCH_API at::Tensor _masked_softmax(const at::Tensor & self, const at::Tensor & mask, ::std::optional dim=::std::nullopt, ::std::optional mask_type=::std::nullopt); + +} // namespace cpu +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_mixed_dtypes_linear_ops.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_mixed_dtypes_linear_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..c3af178fc58eddb3a9499cf9d993a4138d2b5872 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_mixed_dtypes_linear_ops.h @@ -0,0 +1,34 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { +namespace _ops { + + +struct TORCH_API _mixed_dtypes_linear { + using schema = at::Tensor (const at::Tensor &, const at::Tensor &, const at::Tensor &, const ::std::optional &, ::std::optional); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::_mixed_dtypes_linear"; + static constexpr const char* overload_name = ""; + static constexpr const char* schema_str = "_mixed_dtypes_linear(Tensor input, Tensor weight, Tensor scale, *, Tensor? bias=None, str? 
activation=None) -> Tensor"; + static at::Tensor call(const at::Tensor & input, const at::Tensor & weight, const at::Tensor & scale, const ::std::optional & bias, ::std::optional activation); + static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & input, const at::Tensor & weight, const at::Tensor & scale, const ::std::optional & bias, ::std::optional activation); +}; + +}} // namespace at::_ops + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_mps_convolution_compositeexplicitautograd_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_mps_convolution_compositeexplicitautograd_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..b1b37e423a0fe93149ab22f83edb2c7f44b60e0a --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_mps_convolution_compositeexplicitautograd_dispatch.h @@ -0,0 +1,31 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. 
+#include + +namespace at { + +namespace compositeexplicitautograd { + +TORCH_API at::Tensor & _mps_convolution_out(at::Tensor & out, const at::Tensor & self, const at::Tensor & weight, const ::std::optional & bias, at::IntArrayRef padding, at::IntArrayRef stride, at::IntArrayRef dilation, int64_t groups); +TORCH_API at::Tensor & _mps_convolution_outf(const at::Tensor & self, const at::Tensor & weight, const ::std::optional & bias, at::IntArrayRef padding, at::IntArrayRef stride, at::IntArrayRef dilation, int64_t groups, at::Tensor & out); +TORCH_API at::Tensor & _mps_convolution_symint_out(at::Tensor & out, const at::Tensor & self, const at::Tensor & weight, const ::std::optional & bias, c10::SymIntArrayRef padding, c10::SymIntArrayRef stride, c10::SymIntArrayRef dilation, c10::SymInt groups); +TORCH_API at::Tensor & _mps_convolution_symint_outf(const at::Tensor & self, const at::Tensor & weight, const ::std::optional & bias, c10::SymIntArrayRef padding, c10::SymIntArrayRef stride, c10::SymIntArrayRef dilation, c10::SymInt groups, at::Tensor & out); + +} // namespace compositeexplicitautograd +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_nested_compute_contiguous_strides_offsets_ops.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_nested_compute_contiguous_strides_offsets_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..e72f964230f3504d92c068faf21b3b6a145939cb --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_nested_compute_contiguous_strides_offsets_ops.h @@ -0,0 +1,34 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { +namespace _ops { + + +struct TORCH_API _nested_compute_contiguous_strides_offsets { + using schema = ::std::tuple (const at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::_nested_compute_contiguous_strides_offsets"; + static constexpr const char* overload_name = ""; + static constexpr const char* schema_str = "_nested_compute_contiguous_strides_offsets(Tensor nested_size) -> (Tensor, Tensor)"; + static ::std::tuple call(const at::Tensor & nested_size); + static ::std::tuple redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & nested_size); +}; + +}} // namespace at::_ops + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_nested_from_padded_and_nested_example_native.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_nested_from_padded_and_nested_example_native.h new file mode 100644 index 0000000000000000000000000000000000000000..23d95a30a94052511d91c2307256c6365578e4f1 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_nested_from_padded_and_nested_example_native.h @@ -0,0 +1,27 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace at { +namespace native { +TORCH_API at::Tensor & _nested_from_padded_and_nested_example_out(const at::Tensor & padded, const at::Tensor & nt_example, at::Tensor & out); +TORCH_API at::Tensor NestedTensor_from_padded_and_nested_example(const at::Tensor & padded, const at::Tensor & nt_example); +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_nested_from_padded_cuda_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_nested_from_padded_cuda_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..f514d6f3ce2de208196370b85a4a000078b4a05c --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_nested_from_padded_cuda_dispatch.h @@ -0,0 +1,28 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace cuda { + +TORCH_API at::Tensor _nested_from_padded(const at::Tensor & padded, const at::Tensor & cpu_nested_shape_example, bool fuse_transform_0213=false); + +} // namespace cuda +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_nested_from_padded_tensor_ops.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_nested_from_padded_tensor_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..4de1a206d28e19524165a397566759d623b0e17f --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_nested_from_padded_tensor_ops.h @@ -0,0 +1,34 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { +namespace _ops { + + +struct TORCH_API _nested_from_padded_tensor { + using schema = at::Tensor (const at::Tensor &, const at::Tensor &, const at::Tensor &, int64_t, const ::std::optional &, const ::std::optional &, ::std::optional); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::_nested_from_padded_tensor"; + static constexpr const char* overload_name = ""; + static constexpr const char* schema_str = "_nested_from_padded_tensor(Tensor padded, Tensor offsets, Tensor dummy, int ragged_idx=1, Tensor? min_seqlen=None, Tensor? max_seqlen=None, SymInt? 
sum_S=None) -> Tensor"; + static at::Tensor call(const at::Tensor & padded, const at::Tensor & offsets, const at::Tensor & dummy, int64_t ragged_idx, const ::std::optional & min_seqlen, const ::std::optional & max_seqlen, ::std::optional sum_S); + static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & padded, const at::Tensor & offsets, const at::Tensor & dummy, int64_t ragged_idx, const ::std::optional & min_seqlen, const ::std::optional & max_seqlen, ::std::optional sum_S); +}; + +}} // namespace at::_ops + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_nested_get_max_seqlen_native.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_nested_get_max_seqlen_native.h new file mode 100644 index 0000000000000000000000000000000000000000..0463f07d0019e4a3a7c2e520af1d14cb76e15912 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_nested_get_max_seqlen_native.h @@ -0,0 +1,25 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace at { +namespace native { +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_nested_get_offsets.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_nested_get_offsets.h new file mode 100644 index 0000000000000000000000000000000000000000..fbd69d5f0c0802a7ff532ba9d22e804067aed483 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_nested_get_offsets.h @@ -0,0 +1,36 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::_nested_get_offsets(Tensor self) -> Tensor +inline at::Tensor _nested_get_offsets(const at::Tensor & self) { + return at::_ops::_nested_get_offsets::call(self); +} + +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_nested_get_ragged_idx_native.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_nested_get_ragged_idx_native.h new file mode 100644 index 0000000000000000000000000000000000000000..0463f07d0019e4a3a7c2e520af1d14cb76e15912 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_nested_get_ragged_idx_native.h @@ -0,0 +1,25 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace at { +namespace native { +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_nested_get_values_copy.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_nested_get_values_copy.h new file mode 100644 index 0000000000000000000000000000000000000000..adfe0843e29a1278e95b7a25f063675b83337d51 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_nested_get_values_copy.h @@ -0,0 +1,45 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::_nested_get_values_copy(Tensor self) -> Tensor +inline at::Tensor _nested_get_values_copy(const at::Tensor & self) { + return at::_ops::_nested_get_values_copy::call(self); +} + +// aten::_nested_get_values_copy.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & _nested_get_values_copy_out(at::Tensor & out, const at::Tensor & self) { + return at::_ops::_nested_get_values_copy_out::call(self, out); +} +// aten::_nested_get_values_copy.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & _nested_get_values_copy_outf(const at::Tensor & self, at::Tensor & out) { + return at::_ops::_nested_get_values_copy_out::call(self, out); +} + +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_nested_get_values_ops.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_nested_get_values_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..04db06d37df62322918aec64e3d7662b4eb35462 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_nested_get_values_ops.h @@ -0,0 +1,34 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { +namespace _ops { + + +struct TORCH_API _nested_get_values { + using schema = at::Tensor (const at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::_nested_get_values"; + static constexpr const char* overload_name = ""; + static constexpr const char* schema_str = "_nested_get_values(Tensor(a) self) -> Tensor(a)"; + static at::Tensor call(const at::Tensor & self); + static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self); +}; + +}} // namespace at::_ops + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_nested_sum_backward_native.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_nested_sum_backward_native.h new file mode 100644 index 0000000000000000000000000000000000000000..5d2be574bd1e02ed0ffe6523087caa653dd9e14c --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_nested_sum_backward_native.h @@ -0,0 +1,26 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace at { +namespace native { +TORCH_API at::Tensor _nested_sum_backward_cpu(const at::Tensor & grad, const at::Tensor & self, at::OptionalIntArrayRef dim, bool keepdim=false); +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_nested_tensor_from_mask.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_nested_tensor_from_mask.h new file mode 100644 index 0000000000000000000000000000000000000000..f02af7d29f82643df1ce850045f223ae47c59df1 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_nested_tensor_from_mask.h @@ -0,0 +1,45 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::_nested_tensor_from_mask(Tensor t, Tensor mask, bool mask_check=True) -> Tensor +inline at::Tensor _nested_tensor_from_mask(const at::Tensor & t, const at::Tensor & mask, bool mask_check=true) { + return at::_ops::_nested_tensor_from_mask::call(t, mask, mask_check); +} + +// aten::_nested_tensor_from_mask.out(Tensor t, Tensor mask, bool mask_check=True, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & _nested_tensor_from_mask_out(at::Tensor & out, const at::Tensor & t, const at::Tensor & mask, bool mask_check=true) { + return at::_ops::_nested_tensor_from_mask_out::call(t, mask, mask_check, out); +} +// aten::_nested_tensor_from_mask.out(Tensor t, Tensor mask, bool mask_check=True, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & _nested_tensor_from_mask_outf(const at::Tensor & t, const at::Tensor & mask, bool mask_check, at::Tensor & out) { + return at::_ops::_nested_tensor_from_mask_out::call(t, mask, mask_check, out); +} + +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_nested_tensor_from_mask_left_aligned_cpu_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_nested_tensor_from_mask_left_aligned_cpu_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..ab57a85a3f8a8c61fd77638bebb3b0fcde1632d8 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_nested_tensor_from_mask_left_aligned_cpu_dispatch.h @@ -0,0 +1,28 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace cpu { + +TORCH_API bool _nested_tensor_from_mask_left_aligned(const at::Tensor & t, const at::Tensor & mask); + +} // namespace cpu +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_nested_view_from_buffer_native.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_nested_view_from_buffer_native.h new file mode 100644 index 0000000000000000000000000000000000000000..50c0ed13c624ed2b2cddbae6d484b517d2d9949a --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_nested_view_from_buffer_native.h @@ -0,0 +1,26 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace at { +namespace native { +TORCH_API at::Tensor _nested_view_from_buffer(const at::Tensor & self, const at::Tensor & nested_size, const at::Tensor & nested_strides, const at::Tensor & offsets); +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_nested_view_from_jagged.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_nested_view_from_jagged.h new file mode 100644 index 0000000000000000000000000000000000000000..ed68bc78198a91c18cc9b680b1038d0087ca5504 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_nested_view_from_jagged.h @@ -0,0 +1,36 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::_nested_view_from_jagged(Tensor(a) self, Tensor offsets, Tensor dummy, Tensor? lengths=None, int ragged_idx=1, Tensor? min_seqlen=None, Tensor? max_seqlen=None) -> Tensor(a) +inline at::Tensor _nested_view_from_jagged(const at::Tensor & self, const at::Tensor & offsets, const at::Tensor & dummy, const ::std::optional & lengths={}, int64_t ragged_idx=1, const ::std::optional & min_seqlen={}, const ::std::optional & max_seqlen={}) { + return at::_ops::_nested_view_from_jagged::call(self, offsets, dummy, lengths, ragged_idx, min_seqlen, max_seqlen); +} + +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_nested_view_from_jagged_copy_compositeexplicitautograd_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_nested_view_from_jagged_copy_compositeexplicitautograd_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..4962f1e75348ba9c93c1826b997e93bd42262953 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_nested_view_from_jagged_copy_compositeexplicitautograd_dispatch.h @@ -0,0 +1,29 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. 
+#include + +namespace at { + +namespace compositeexplicitautograd { + +TORCH_API at::Tensor & _nested_view_from_jagged_copy_out(at::Tensor & out, const at::Tensor & self, const at::Tensor & offsets, const at::Tensor & dummy, const ::std::optional & lengths={}, int64_t ragged_idx=1, const ::std::optional & min_seqlen={}, const ::std::optional & max_seqlen={}); +TORCH_API at::Tensor & _nested_view_from_jagged_copy_outf(const at::Tensor & self, const at::Tensor & offsets, const at::Tensor & dummy, const ::std::optional & lengths, int64_t ragged_idx, const ::std::optional & min_seqlen, const ::std::optional & max_seqlen, at::Tensor & out); + +} // namespace compositeexplicitautograd +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_new_zeros_with_same_feature_meta_ops.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_new_zeros_with_same_feature_meta_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..eb3e312b5bd98849a21804c98e89714f8206a180 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_new_zeros_with_same_feature_meta_ops.h @@ -0,0 +1,45 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. 
+#include + +namespace at { +namespace _ops { + + +struct TORCH_API _new_zeros_with_same_feature_meta { + using schema = at::Tensor (const at::Tensor &, const at::Tensor &, int64_t); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::_new_zeros_with_same_feature_meta"; + static constexpr const char* overload_name = ""; + static constexpr const char* schema_str = "_new_zeros_with_same_feature_meta(Tensor self, Tensor other, *, int self_num_batch_dims=0) -> Tensor"; + static at::Tensor call(const at::Tensor & self, const at::Tensor & other, int64_t self_num_batch_dims); + static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & other, int64_t self_num_batch_dims); +}; + +struct TORCH_API _new_zeros_with_same_feature_meta_out { + using schema = at::Tensor & (const at::Tensor &, const at::Tensor &, int64_t, at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::_new_zeros_with_same_feature_meta"; + static constexpr const char* overload_name = "out"; + static constexpr const char* schema_str = "_new_zeros_with_same_feature_meta.out(Tensor self, Tensor other, *, int self_num_batch_dims=0, Tensor(a!) out) -> Tensor(a!)"; + static at::Tensor & call(const at::Tensor & self, const at::Tensor & other, int64_t self_num_batch_dims, at::Tensor & out); + static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & other, int64_t self_num_batch_dims, at::Tensor & out); +}; + +}} // namespace at::_ops + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_nnpack_available_compositeimplicitautograd_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_nnpack_available_compositeimplicitautograd_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..da44a324a595fe49afbcd31c355bc7f9bdc124f0 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_nnpack_available_compositeimplicitautograd_dispatch.h @@ -0,0 +1,28 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace compositeimplicitautograd { + +TORCH_API bool _nnpack_available(); + +} // namespace compositeimplicitautograd +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_pack_padded_sequence_backward_ops.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_pack_padded_sequence_backward_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..223088a20dbb4589b8b5f893df87bfd8aee1fa67 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_pack_padded_sequence_backward_ops.h @@ -0,0 +1,34 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. 
+#include + +namespace at { +namespace _ops { + + +struct TORCH_API _pack_padded_sequence_backward { + using schema = at::Tensor (const at::Tensor &, c10::SymIntArrayRef, const at::Tensor &, bool); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::_pack_padded_sequence_backward"; + static constexpr const char* overload_name = ""; + static constexpr const char* schema_str = "_pack_padded_sequence_backward(Tensor grad, SymInt[] input_size, Tensor batch_sizes, bool batch_first) -> Tensor"; + static at::Tensor call(const at::Tensor & grad, c10::SymIntArrayRef input_size, const at::Tensor & batch_sizes, bool batch_first); + static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad, c10::SymIntArrayRef input_size, const at::Tensor & batch_sizes, bool batch_first); +}; + +}} // namespace at::_ops + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_pack_padded_sequence_ops.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_pack_padded_sequence_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..e29638f0965e499296f18a24b0c8e080ce35bdf0 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_pack_padded_sequence_ops.h @@ -0,0 +1,45 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. 
+// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { +namespace _ops { + + +struct TORCH_API _pack_padded_sequence { + using schema = ::std::tuple (const at::Tensor &, const at::Tensor &, bool); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::_pack_padded_sequence"; + static constexpr const char* overload_name = ""; + static constexpr const char* schema_str = "_pack_padded_sequence(Tensor input, Tensor lengths, bool batch_first) -> (Tensor, Tensor)"; + static ::std::tuple call(const at::Tensor & input, const at::Tensor & lengths, bool batch_first); + static ::std::tuple redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & input, const at::Tensor & lengths, bool batch_first); +}; + +struct TORCH_API _pack_padded_sequence_out { + using schema = ::std::tuple (const at::Tensor &, const at::Tensor &, bool, at::Tensor &, at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::_pack_padded_sequence"; + static constexpr const char* overload_name = "out"; + static constexpr const char* schema_str = "_pack_padded_sequence.out(Tensor input, Tensor lengths, bool batch_first, *, Tensor(a!) out0, Tensor(b!) out1) -> (Tensor(a!), Tensor(b!))"; + static ::std::tuple call(const at::Tensor & input, const at::Tensor & lengths, bool batch_first, at::Tensor & out0, at::Tensor & out1); + static ::std::tuple redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & input, const at::Tensor & lengths, bool batch_first, at::Tensor & out0, at::Tensor & out1); +}; + +}} // namespace at::_ops + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_pad_circular.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_pad_circular.h new file mode 100644 index 0000000000000000000000000000000000000000..cf2eb31239b6da7eb6f50a47cce6dbcb8c2791db --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_pad_circular.h @@ -0,0 +1,53 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::_pad_circular(Tensor self, SymInt[] pad) -> Tensor +inline at::Tensor _pad_circular(const at::Tensor & self, at::IntArrayRef pad) { + return at::_ops::_pad_circular::call(self, c10::fromIntArrayRefSlow(pad)); +} +namespace symint { + template >> + at::Tensor _pad_circular(const at::Tensor & self, at::IntArrayRef pad) { + return at::_ops::_pad_circular::call(self, c10::fromIntArrayRefSlow(pad)); + } +} + +// aten::_pad_circular(Tensor self, SymInt[] pad) -> Tensor +inline at::Tensor _pad_circular_symint(const at::Tensor & self, c10::SymIntArrayRef pad) { + return at::_ops::_pad_circular::call(self, pad); +} +namespace symint { + template >> + at::Tensor _pad_circular(const at::Tensor & self, c10::SymIntArrayRef pad) { + return at::_ops::_pad_circular::call(self, pad); + } +} + +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_pad_circular_native.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_pad_circular_native.h new file mode 100644 index 0000000000000000000000000000000000000000..585178effb25488c4c24136d5abcbe617f0bfb23 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_pad_circular_native.h @@ -0,0 +1,26 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace at { +namespace native { +TORCH_API at::Tensor _pad_circular_symint(const at::Tensor & self, c10::SymIntArrayRef pad); +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_pad_packed_sequence_ops.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_pad_packed_sequence_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..d6c483ccd1b11e617b36a61e430ca7be667e5c0f --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_pad_packed_sequence_ops.h @@ -0,0 +1,34 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. 
+// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { +namespace _ops { + + +struct TORCH_API _pad_packed_sequence { + using schema = ::std::tuple (const at::Tensor &, const at::Tensor &, bool, const at::Scalar &, int64_t); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::_pad_packed_sequence"; + static constexpr const char* overload_name = ""; + static constexpr const char* schema_str = "_pad_packed_sequence(Tensor data, Tensor batch_sizes, bool batch_first, Scalar padding_value, int total_length) -> (Tensor, Tensor)"; + static ::std::tuple call(const at::Tensor & data, const at::Tensor & batch_sizes, bool batch_first, const at::Scalar & padding_value, int64_t total_length); + static ::std::tuple redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & data, const at::Tensor & batch_sizes, bool batch_first, const at::Scalar & padding_value, int64_t total_length); +}; + +}} // namespace at::_ops + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_pdist_forward_compositeexplicitautograd_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_pdist_forward_compositeexplicitautograd_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..0ecea6d7d1f4740ef09ee4487f026919679ad164 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_pdist_forward_compositeexplicitautograd_dispatch.h @@ -0,0 +1,29 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace compositeexplicitautograd { + +TORCH_API at::Tensor & _pdist_forward_out(at::Tensor & out, const at::Tensor & self, double p=2); +TORCH_API at::Tensor & _pdist_forward_outf(const at::Tensor & self, double p, at::Tensor & out); + +} // namespace compositeexplicitautograd +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_pdist_forward_cuda_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_pdist_forward_cuda_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..a77e26cec8cba39479bd42aa5f366531cb61840f --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_pdist_forward_cuda_dispatch.h @@ -0,0 +1,28 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace cuda { + +TORCH_API at::Tensor _pdist_forward(const at::Tensor & self, double p=2); + +} // namespace cuda +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_prelu_kernel_backward_native.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_prelu_kernel_backward_native.h new file mode 100644 index 0000000000000000000000000000000000000000..78559a7fb8cfc286a203b4720c7123bd3cdc88c0 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_prelu_kernel_backward_native.h @@ -0,0 +1,27 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace at { +namespace native { +TORCH_API ::std::tuple _prelu_kernel_backward(const at::Tensor & grad_output, const at::Tensor & self, const at::Tensor & weight); +TORCH_API ::std::tuple mkldnn_prelu_backward(const at::Tensor & grad_output, const at::Tensor & self, const at::Tensor & weight); +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_reshape_alias_meta_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_reshape_alias_meta_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..00ac62d6ed2dbe916b996f7ab025bf2dd6ce96d6 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_reshape_alias_meta_dispatch.h @@ -0,0 +1,29 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace meta { + +TORCH_API at::Tensor _reshape_alias(const at::Tensor & self, at::IntArrayRef size, at::IntArrayRef stride); +TORCH_API at::Tensor _reshape_alias_symint(const at::Tensor & self, c10::SymIntArrayRef size, c10::SymIntArrayRef stride); + +} // namespace meta +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_reshape_alias_native.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_reshape_alias_native.h new file mode 100644 index 0000000000000000000000000000000000000000..6982d664963fca24f019e6d92382c5fca69c4a12 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_reshape_alias_native.h @@ -0,0 +1,26 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace at { +namespace native { +TORCH_API at::Tensor _reshape_alias(const at::Tensor & self, at::IntArrayRef size, at::IntArrayRef stride); +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_reshape_alias_ops.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_reshape_alias_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..4a5916d3707c69a5e5e2b781d49c98a656ec2fb8 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_reshape_alias_ops.h @@ -0,0 +1,34 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. 
+// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { +namespace _ops { + + +struct TORCH_API _reshape_alias { + using schema = at::Tensor (const at::Tensor &, c10::SymIntArrayRef, c10::SymIntArrayRef); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::_reshape_alias"; + static constexpr const char* overload_name = ""; + static constexpr const char* schema_str = "_reshape_alias(Tensor(a) self, SymInt[] size, SymInt[] stride) -> Tensor(a)"; + static at::Tensor call(const at::Tensor & self, c10::SymIntArrayRef size, c10::SymIntArrayRef stride); + static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, c10::SymIntArrayRef size, c10::SymIntArrayRef stride); +}; + +}} // namespace at::_ops + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_reshape_copy.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_reshape_copy.h new file mode 100644 index 0000000000000000000000000000000000000000..b8f5bd5efdc8a5d9ea0c4e3a038d57c5ab645793 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_reshape_copy.h @@ -0,0 +1,53 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::_reshape_copy(Tensor self, SymInt[] size) -> Tensor +inline at::Tensor _reshape_copy(const at::Tensor & self, at::IntArrayRef size) { + return 
at::_ops::_reshape_copy::call(self, c10::fromIntArrayRefSlow(size)); +} +namespace symint { + template >> + at::Tensor _reshape_copy(const at::Tensor & self, at::IntArrayRef size) { + return at::_ops::_reshape_copy::call(self, c10::fromIntArrayRefSlow(size)); + } +} + +// aten::_reshape_copy(Tensor self, SymInt[] size) -> Tensor +inline at::Tensor _reshape_copy_symint(const at::Tensor & self, c10::SymIntArrayRef size) { + return at::_ops::_reshape_copy::call(self, size); +} +namespace symint { + template >> + at::Tensor _reshape_copy(const at::Tensor & self, c10::SymIntArrayRef size) { + return at::_ops::_reshape_copy::call(self, size); + } +} + +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_scaled_dot_product_efficient_attention.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_scaled_dot_product_efficient_attention.h new file mode 100644 index 0000000000000000000000000000000000000000..b1a7d306565d2ff7b53c4c3c7c2ae10b21ed2871 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_scaled_dot_product_efficient_attention.h @@ -0,0 +1,36 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::_scaled_dot_product_efficient_attention(Tensor query, Tensor key, Tensor value, Tensor? attn_bias, bool compute_log_sumexp, float dropout_p=0.0, bool is_causal=False, *, float? 
scale=None) -> (Tensor output, Tensor log_sumexp, Tensor philox_seed, Tensor philox_offset) +inline ::std::tuple _scaled_dot_product_efficient_attention(const at::Tensor & query, const at::Tensor & key, const at::Tensor & value, const ::std::optional & attn_bias, bool compute_log_sumexp, double dropout_p=0.0, bool is_causal=false, ::std::optional scale=::std::nullopt) { + return at::_ops::_scaled_dot_product_efficient_attention::call(query, key, value, attn_bias, compute_log_sumexp, dropout_p, is_causal, scale); +} + +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_scaled_dot_product_efficient_attention_backward_native.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_scaled_dot_product_efficient_attention_backward_native.h new file mode 100644 index 0000000000000000000000000000000000000000..c63734032ee74061deb153b1ddf13d78ce7cd535 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_scaled_dot_product_efficient_attention_backward_native.h @@ -0,0 +1,26 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace at { +namespace native { +TORCH_API ::std::tuple _scaled_dot_product_efficient_attention_backward_cuda(const at::Tensor & grad_out_, const at::Tensor & query, const at::Tensor & key, const at::Tensor & value, const at::Tensor & attn_bias, const at::Tensor & out, const at::Tensor & logsumexp, const at::Tensor & philox_seed, const at::Tensor & philox_offset, double dropout_p, ::std::array grad_input_mask, bool is_causal=false, ::std::optional 
scale=::std::nullopt); +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_scaled_dot_product_flash_attention_for_cpu_cpu_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_scaled_dot_product_flash_attention_for_cpu_cpu_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..811ad7ef3b80b8a9100c6cc0175f679fb52939af --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_scaled_dot_product_flash_attention_for_cpu_cpu_dispatch.h @@ -0,0 +1,28 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace cpu { + +TORCH_API ::std::tuple _scaled_dot_product_flash_attention_for_cpu(const at::Tensor & query, const at::Tensor & key, const at::Tensor & value, double dropout_p=0.0, bool is_causal=false, const ::std::optional & attn_mask={}, ::std::optional scale=::std::nullopt); + +} // namespace cpu +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_scaled_dot_product_flash_attention_native.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_scaled_dot_product_flash_attention_native.h new file mode 100644 index 0000000000000000000000000000000000000000..e4ff4b1170fe6159fb55c432347102b482b264ab --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_scaled_dot_product_flash_attention_native.h @@ -0,0 +1,27 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace at { +namespace native { +TORCH_API ::std::tuple _scaled_dot_product_flash_attention_cuda(const at::Tensor & query, const at::Tensor & key, const at::Tensor & value, double dropout_p=0.0, bool is_causal=false, bool return_debug_mask=false, ::std::optional scale=::std::nullopt); +TORCH_API ::std::tuple _scaled_dot_product_flash_attention_nestedtensor_cuda(const at::Tensor & query, const at::Tensor & key, const at::Tensor & value, double dropout_p=0.0, bool is_causal=false, bool return_debug_mask=false, ::std::optional scale=::std::nullopt); +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_scaled_mm_native.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_scaled_mm_native.h new file mode 100644 index 0000000000000000000000000000000000000000..c89233cc12bb80da96d50f6a9aa8d1d37374d139 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_scaled_mm_native.h @@ -0,0 +1,29 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace at { +namespace native { +TORCH_API at::Tensor _scaled_mm_cpu(const at::Tensor & self, const at::Tensor & mat2, const at::Tensor & scale_a, const at::Tensor & scale_b, const ::std::optional & bias={}, const ::std::optional & scale_result={}, ::std::optional out_dtype=::std::nullopt, bool use_fast_accum=false); +TORCH_API at::Tensor & _scaled_mm_out_cpu(const at::Tensor & self, const at::Tensor & mat2, const at::Tensor & scale_a, const at::Tensor & scale_b, const ::std::optional & bias, const ::std::optional & scale_result, ::std::optional out_dtype, bool use_fast_accum, at::Tensor & out); +TORCH_API at::Tensor _scaled_mm_cuda(const at::Tensor & self, const at::Tensor & mat2, const at::Tensor & scale_a, const at::Tensor & scale_b, const ::std::optional & bias={}, const ::std::optional & scale_result={}, ::std::optional out_dtype=::std::nullopt, bool use_fast_accum=false); +TORCH_API at::Tensor & _scaled_mm_out_cuda(const at::Tensor & self, const at::Tensor & mat2, const at::Tensor & scale_a, const at::Tensor & scale_b, const ::std::optional & bias, const ::std::optional & scale_result, ::std::optional out_dtype, bool use_fast_accum, at::Tensor & out); +} // namespace native +} // 
namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_sobol_engine_ff.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_sobol_engine_ff.h new file mode 100644 index 0000000000000000000000000000000000000000..d21839a9e526cbf3c621859ef9ff98a575bddf68 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_sobol_engine_ff.h @@ -0,0 +1,36 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::_sobol_engine_ff_(Tensor(a!) self, int n, Tensor sobolstate, int dimension, int num_generated) -> Tensor(a!) +inline at::Tensor & _sobol_engine_ff_(at::Tensor & self, int64_t n, const at::Tensor & sobolstate, int64_t dimension, int64_t num_generated) { + return at::_ops::_sobol_engine_ff_::call(self, n, sobolstate, dimension, num_generated); +} + +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_sobol_engine_ff_ops.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_sobol_engine_ff_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..471c3bfaa06567b2149278f567b1bbea6b44ea8d --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_sobol_engine_ff_ops.h @@ -0,0 +1,34 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { +namespace _ops { + + +struct TORCH_API _sobol_engine_ff_ { + using schema = at::Tensor & (at::Tensor &, int64_t, const at::Tensor &, int64_t, int64_t); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::_sobol_engine_ff_"; + static constexpr const char* overload_name = ""; + static constexpr const char* schema_str = "_sobol_engine_ff_(Tensor(a!) self, int n, Tensor sobolstate, int dimension, int num_generated) -> Tensor(a!)"; + static at::Tensor & call(at::Tensor & self, int64_t n, const at::Tensor & sobolstate, int64_t dimension, int64_t num_generated); + static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, at::Tensor & self, int64_t n, const at::Tensor & sobolstate, int64_t dimension, int64_t num_generated); +}; + +}} // namespace at::_ops + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_sobol_engine_initialize_state.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_sobol_engine_initialize_state.h new file mode 100644 index 0000000000000000000000000000000000000000..b2d744d5f8a24c9dfc7b80a599dc417e9f80ef79 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_sobol_engine_initialize_state.h @@ -0,0 +1,36 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::_sobol_engine_initialize_state_(Tensor(a!) self, int dimension) -> Tensor(a!) +inline at::Tensor & _sobol_engine_initialize_state_(at::Tensor & self, int64_t dimension) { + return at::_ops::_sobol_engine_initialize_state_::call(self, dimension); +} + +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_softmax_cuda_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_softmax_cuda_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..593c1e0bdbeea3a3b91d8814a0efa365c0a0ea7f --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_softmax_cuda_dispatch.h @@ -0,0 +1,30 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace cuda { + +TORCH_API at::Tensor _softmax(const at::Tensor & self, int64_t dim, bool half_to_float); +TORCH_API at::Tensor & _softmax_out(at::Tensor & out, const at::Tensor & self, int64_t dim, bool half_to_float); +TORCH_API at::Tensor & _softmax_outf(const at::Tensor & self, int64_t dim, bool half_to_float, at::Tensor & out); + +} // namespace cuda +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_softmax_meta.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_softmax_meta.h new file mode 100644 index 0000000000000000000000000000000000000000..43d337f6870b7175a1ab17a366be95728ea57118 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_softmax_meta.h @@ -0,0 +1,32 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeMetaFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace at { +namespace meta { + +struct TORCH_API structured__softmax : public at::impl::MetaBase { + + + void meta(const at::Tensor & self, int64_t dim, bool half_to_float); +}; + +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_sparse_broadcast_to.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_sparse_broadcast_to.h new file mode 100644 index 0000000000000000000000000000000000000000..911336e9f484aa76ff266fb1f3c0f0d197222736 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_sparse_broadcast_to.h @@ -0,0 +1,36 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::_sparse_broadcast_to(Tensor(a) self, int[] size) -> Tensor(a) +inline at::Tensor _sparse_broadcast_to(const at::Tensor & self, at::IntArrayRef size) { + return at::_ops::_sparse_broadcast_to::call(self, size); +} + +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_sparse_broadcast_to_copy_compositeexplicitautograd_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_sparse_broadcast_to_copy_compositeexplicitautograd_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..e9d198d9191643e221ec9a58a8e88d36c2879ccf --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_sparse_broadcast_to_copy_compositeexplicitautograd_dispatch.h @@ -0,0 +1,29 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace compositeexplicitautograd { + +TORCH_API at::Tensor & _sparse_broadcast_to_copy_out(at::Tensor & out, const at::Tensor & self, at::IntArrayRef size); +TORCH_API at::Tensor & _sparse_broadcast_to_copy_outf(const at::Tensor & self, at::IntArrayRef size, at::Tensor & out); + +} // namespace compositeexplicitautograd +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_sparse_broadcast_to_copy_ops.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_sparse_broadcast_to_copy_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..8db6652ce1728ab98aa25b3e78500fad3a88f630 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_sparse_broadcast_to_copy_ops.h @@ -0,0 +1,45 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { +namespace _ops { + + +struct TORCH_API _sparse_broadcast_to_copy { + using schema = at::Tensor (const at::Tensor &, at::IntArrayRef); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::_sparse_broadcast_to_copy"; + static constexpr const char* overload_name = ""; + static constexpr const char* schema_str = "_sparse_broadcast_to_copy(Tensor self, int[] size) -> Tensor"; + static at::Tensor call(const at::Tensor & self, at::IntArrayRef size); + static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::IntArrayRef size); +}; + +struct TORCH_API _sparse_broadcast_to_copy_out { + using schema = at::Tensor & (const at::Tensor &, at::IntArrayRef, at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::_sparse_broadcast_to_copy"; + static constexpr const 
char* overload_name = "out"; + static constexpr const char* schema_str = "_sparse_broadcast_to_copy.out(Tensor self, int[] size, *, Tensor(a!) out) -> Tensor(a!)"; + static at::Tensor & call(const at::Tensor & self, at::IntArrayRef size, at::Tensor & out); + static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::IntArrayRef size, at::Tensor & out); +}; + +}} // namespace at::_ops + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_sparse_compressed_tensor_unsafe.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_sparse_compressed_tensor_unsafe.h new file mode 100644 index 0000000000000000000000000000000000000000..6397d2bf44f25c43b26de2884b8860e0dfaf4d82 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_sparse_compressed_tensor_unsafe.h @@ -0,0 +1,75 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::_sparse_compressed_tensor_unsafe(Tensor compressed_indices, Tensor plain_indices, Tensor values, SymInt[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? 
pin_memory=None) -> Tensor +inline at::Tensor _sparse_compressed_tensor_unsafe(const at::Tensor & compressed_indices, const at::Tensor & plain_indices, const at::Tensor & values, at::IntArrayRef size, at::TensorOptions options={}) { + return at::_ops::_sparse_compressed_tensor_unsafe::call(compressed_indices, plain_indices, values, c10::fromIntArrayRefSlow(size), c10::optTypeMetaToScalarType(options.dtype_opt()), options.layout_opt(), options.device_opt(), options.pinned_memory_opt()); +} +namespace symint { + template >> + at::Tensor _sparse_compressed_tensor_unsafe(const at::Tensor & compressed_indices, const at::Tensor & plain_indices, const at::Tensor & values, at::IntArrayRef size, at::TensorOptions options={}) { + return at::_ops::_sparse_compressed_tensor_unsafe::call(compressed_indices, plain_indices, values, c10::fromIntArrayRefSlow(size), c10::optTypeMetaToScalarType(options.dtype_opt()), options.layout_opt(), options.device_opt(), options.pinned_memory_opt()); + } +} + +// aten::_sparse_compressed_tensor_unsafe(Tensor compressed_indices, Tensor plain_indices, Tensor values, SymInt[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? 
pin_memory=None) -> Tensor +inline at::Tensor _sparse_compressed_tensor_unsafe(const at::Tensor & compressed_indices, const at::Tensor & plain_indices, const at::Tensor & values, at::IntArrayRef size, ::std::optional dtype, ::std::optional layout, ::std::optional device, ::std::optional pin_memory) { + return at::_ops::_sparse_compressed_tensor_unsafe::call(compressed_indices, plain_indices, values, c10::fromIntArrayRefSlow(size), dtype, layout, device, pin_memory); +} +namespace symint { + template >> + at::Tensor _sparse_compressed_tensor_unsafe(const at::Tensor & compressed_indices, const at::Tensor & plain_indices, const at::Tensor & values, at::IntArrayRef size, ::std::optional dtype, ::std::optional layout, ::std::optional device, ::std::optional pin_memory) { + return at::_ops::_sparse_compressed_tensor_unsafe::call(compressed_indices, plain_indices, values, c10::fromIntArrayRefSlow(size), dtype, layout, device, pin_memory); + } +} + +// aten::_sparse_compressed_tensor_unsafe(Tensor compressed_indices, Tensor plain_indices, Tensor values, SymInt[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? 
pin_memory=None) -> Tensor +inline at::Tensor _sparse_compressed_tensor_unsafe_symint(const at::Tensor & compressed_indices, const at::Tensor & plain_indices, const at::Tensor & values, c10::SymIntArrayRef size, at::TensorOptions options={}) { + return at::_ops::_sparse_compressed_tensor_unsafe::call(compressed_indices, plain_indices, values, size, c10::optTypeMetaToScalarType(options.dtype_opt()), options.layout_opt(), options.device_opt(), options.pinned_memory_opt()); +} +namespace symint { + template >> + at::Tensor _sparse_compressed_tensor_unsafe(const at::Tensor & compressed_indices, const at::Tensor & plain_indices, const at::Tensor & values, c10::SymIntArrayRef size, at::TensorOptions options={}) { + return at::_ops::_sparse_compressed_tensor_unsafe::call(compressed_indices, plain_indices, values, size, c10::optTypeMetaToScalarType(options.dtype_opt()), options.layout_opt(), options.device_opt(), options.pinned_memory_opt()); + } +} + +// aten::_sparse_compressed_tensor_unsafe(Tensor compressed_indices, Tensor plain_indices, Tensor values, SymInt[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? 
pin_memory=None) -> Tensor +inline at::Tensor _sparse_compressed_tensor_unsafe_symint(const at::Tensor & compressed_indices, const at::Tensor & plain_indices, const at::Tensor & values, c10::SymIntArrayRef size, ::std::optional dtype, ::std::optional layout, ::std::optional device, ::std::optional pin_memory) { + return at::_ops::_sparse_compressed_tensor_unsafe::call(compressed_indices, plain_indices, values, size, dtype, layout, device, pin_memory); +} +namespace symint { + template >> + at::Tensor _sparse_compressed_tensor_unsafe(const at::Tensor & compressed_indices, const at::Tensor & plain_indices, const at::Tensor & values, c10::SymIntArrayRef size, ::std::optional dtype, ::std::optional layout, ::std::optional device, ::std::optional pin_memory) { + return at::_ops::_sparse_compressed_tensor_unsafe::call(compressed_indices, plain_indices, values, size, dtype, layout, device, pin_memory); + } +} + +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_sparse_coo_tensor_with_dims_and_tensors.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_sparse_coo_tensor_with_dims_and_tensors.h new file mode 100644 index 0000000000000000000000000000000000000000..34faf62b50721f987d3aec78ccfcb8903226f7ca --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_sparse_coo_tensor_with_dims_and_tensors.h @@ -0,0 +1,119 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::_sparse_coo_tensor_with_dims_and_tensors(int sparse_dim, int dense_dim, SymInt[] size, Tensor indices, Tensor values, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=False, bool? 
is_coalesced=None) -> Tensor +inline at::Tensor _sparse_coo_tensor_with_dims_and_tensors(int64_t sparse_dim, int64_t dense_dim, at::IntArrayRef size, const at::Tensor & indices, const at::Tensor & values, at::TensorOptions options, ::std::optional is_coalesced=::std::nullopt) { + return at::_ops::_sparse_coo_tensor_with_dims_and_tensors::call(sparse_dim, dense_dim, c10::fromIntArrayRefSlow(size), indices, values, c10::optTypeMetaToScalarType(options.dtype_opt()), options.layout_opt(), options.device_opt(), options.pinned_memory_opt(), is_coalesced); +} +namespace symint { + template >> + at::Tensor _sparse_coo_tensor_with_dims_and_tensors(int64_t sparse_dim, int64_t dense_dim, at::IntArrayRef size, const at::Tensor & indices, const at::Tensor & values, at::TensorOptions options, ::std::optional is_coalesced=::std::nullopt) { + return at::_ops::_sparse_coo_tensor_with_dims_and_tensors::call(sparse_dim, dense_dim, c10::fromIntArrayRefSlow(size), indices, values, c10::optTypeMetaToScalarType(options.dtype_opt()), options.layout_opt(), options.device_opt(), options.pinned_memory_opt(), is_coalesced); + } +} + +// aten::_sparse_coo_tensor_with_dims_and_tensors(int sparse_dim, int dense_dim, SymInt[] size, Tensor indices, Tensor values, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=False, bool? 
is_coalesced=None) -> Tensor +inline at::Tensor _sparse_coo_tensor_with_dims_and_tensors(int64_t sparse_dim, int64_t dense_dim, at::IntArrayRef size, const at::Tensor & indices, const at::Tensor & values, ::std::optional dtype, ::std::optional layout, ::std::optional device, ::std::optional pin_memory, ::std::optional is_coalesced) { + return at::_ops::_sparse_coo_tensor_with_dims_and_tensors::call(sparse_dim, dense_dim, c10::fromIntArrayRefSlow(size), indices, values, dtype, layout, device, pin_memory, is_coalesced); +} +namespace symint { + template >> + at::Tensor _sparse_coo_tensor_with_dims_and_tensors(int64_t sparse_dim, int64_t dense_dim, at::IntArrayRef size, const at::Tensor & indices, const at::Tensor & values, ::std::optional dtype, ::std::optional layout, ::std::optional device, ::std::optional pin_memory, ::std::optional is_coalesced) { + return at::_ops::_sparse_coo_tensor_with_dims_and_tensors::call(sparse_dim, dense_dim, c10::fromIntArrayRefSlow(size), indices, values, dtype, layout, device, pin_memory, is_coalesced); + } +} + +// aten::_sparse_coo_tensor_with_dims_and_tensors(int sparse_dim, int dense_dim, SymInt[] size, Tensor indices, Tensor values, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=False, bool? 
is_coalesced=None) -> Tensor +inline at::Tensor _sparse_coo_tensor_with_dims_and_tensors_symint(int64_t sparse_dim, int64_t dense_dim, c10::SymIntArrayRef size, const at::Tensor & indices, const at::Tensor & values, at::TensorOptions options, ::std::optional is_coalesced=::std::nullopt) { + return at::_ops::_sparse_coo_tensor_with_dims_and_tensors::call(sparse_dim, dense_dim, size, indices, values, c10::optTypeMetaToScalarType(options.dtype_opt()), options.layout_opt(), options.device_opt(), options.pinned_memory_opt(), is_coalesced); +} +namespace symint { + template >> + at::Tensor _sparse_coo_tensor_with_dims_and_tensors(int64_t sparse_dim, int64_t dense_dim, c10::SymIntArrayRef size, const at::Tensor & indices, const at::Tensor & values, at::TensorOptions options, ::std::optional is_coalesced=::std::nullopt) { + return at::_ops::_sparse_coo_tensor_with_dims_and_tensors::call(sparse_dim, dense_dim, size, indices, values, c10::optTypeMetaToScalarType(options.dtype_opt()), options.layout_opt(), options.device_opt(), options.pinned_memory_opt(), is_coalesced); + } +} + +// aten::_sparse_coo_tensor_with_dims_and_tensors(int sparse_dim, int dense_dim, SymInt[] size, Tensor indices, Tensor values, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=False, bool? 
is_coalesced=None) -> Tensor +inline at::Tensor _sparse_coo_tensor_with_dims_and_tensors_symint(int64_t sparse_dim, int64_t dense_dim, c10::SymIntArrayRef size, const at::Tensor & indices, const at::Tensor & values, ::std::optional dtype, ::std::optional layout, ::std::optional device, ::std::optional pin_memory, ::std::optional is_coalesced) { + return at::_ops::_sparse_coo_tensor_with_dims_and_tensors::call(sparse_dim, dense_dim, size, indices, values, dtype, layout, device, pin_memory, is_coalesced); +} +namespace symint { + template >> + at::Tensor _sparse_coo_tensor_with_dims_and_tensors(int64_t sparse_dim, int64_t dense_dim, c10::SymIntArrayRef size, const at::Tensor & indices, const at::Tensor & values, ::std::optional dtype, ::std::optional layout, ::std::optional device, ::std::optional pin_memory, ::std::optional is_coalesced) { + return at::_ops::_sparse_coo_tensor_with_dims_and_tensors::call(sparse_dim, dense_dim, size, indices, values, dtype, layout, device, pin_memory, is_coalesced); + } +} + +// aten::_sparse_coo_tensor_with_dims_and_tensors.out(int sparse_dim, int dense_dim, SymInt[] size, Tensor indices, Tensor values, *, bool? is_coalesced=None, Tensor(a!) out) -> Tensor(a!) 
+inline at::Tensor & _sparse_coo_tensor_with_dims_and_tensors_out(at::Tensor & out, int64_t sparse_dim, int64_t dense_dim, at::IntArrayRef size, const at::Tensor & indices, const at::Tensor & values, ::std::optional is_coalesced=::std::nullopt) { + return at::_ops::_sparse_coo_tensor_with_dims_and_tensors_out::call(sparse_dim, dense_dim, c10::fromIntArrayRefSlow(size), indices, values, is_coalesced, out); +} +namespace symint { + template >> + at::Tensor & _sparse_coo_tensor_with_dims_and_tensors_out(at::Tensor & out, int64_t sparse_dim, int64_t dense_dim, at::IntArrayRef size, const at::Tensor & indices, const at::Tensor & values, ::std::optional is_coalesced=::std::nullopt) { + return at::_ops::_sparse_coo_tensor_with_dims_and_tensors_out::call(sparse_dim, dense_dim, c10::fromIntArrayRefSlow(size), indices, values, is_coalesced, out); + } +} + +// aten::_sparse_coo_tensor_with_dims_and_tensors.out(int sparse_dim, int dense_dim, SymInt[] size, Tensor indices, Tensor values, *, bool? is_coalesced=None, Tensor(a!) out) -> Tensor(a!) 
+inline at::Tensor & _sparse_coo_tensor_with_dims_and_tensors_outf(int64_t sparse_dim, int64_t dense_dim, at::IntArrayRef size, const at::Tensor & indices, const at::Tensor & values, ::std::optional is_coalesced, at::Tensor & out) { + return at::_ops::_sparse_coo_tensor_with_dims_and_tensors_out::call(sparse_dim, dense_dim, c10::fromIntArrayRefSlow(size), indices, values, is_coalesced, out); +} +namespace symint { + template >> + at::Tensor & _sparse_coo_tensor_with_dims_and_tensors_outf(int64_t sparse_dim, int64_t dense_dim, at::IntArrayRef size, const at::Tensor & indices, const at::Tensor & values, ::std::optional is_coalesced, at::Tensor & out) { + return at::_ops::_sparse_coo_tensor_with_dims_and_tensors_out::call(sparse_dim, dense_dim, c10::fromIntArrayRefSlow(size), indices, values, is_coalesced, out); + } +} + +// aten::_sparse_coo_tensor_with_dims_and_tensors.out(int sparse_dim, int dense_dim, SymInt[] size, Tensor indices, Tensor values, *, bool? is_coalesced=None, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & _sparse_coo_tensor_with_dims_and_tensors_symint_out(at::Tensor & out, int64_t sparse_dim, int64_t dense_dim, c10::SymIntArrayRef size, const at::Tensor & indices, const at::Tensor & values, ::std::optional is_coalesced=::std::nullopt) { + return at::_ops::_sparse_coo_tensor_with_dims_and_tensors_out::call(sparse_dim, dense_dim, size, indices, values, is_coalesced, out); +} +namespace symint { + template >> + at::Tensor & _sparse_coo_tensor_with_dims_and_tensors_out(at::Tensor & out, int64_t sparse_dim, int64_t dense_dim, c10::SymIntArrayRef size, const at::Tensor & indices, const at::Tensor & values, ::std::optional is_coalesced=::std::nullopt) { + return at::_ops::_sparse_coo_tensor_with_dims_and_tensors_out::call(sparse_dim, dense_dim, size, indices, values, is_coalesced, out); + } +} + +// aten::_sparse_coo_tensor_with_dims_and_tensors.out(int sparse_dim, int dense_dim, SymInt[] size, Tensor indices, Tensor values, *, bool? 
is_coalesced=None, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & _sparse_coo_tensor_with_dims_and_tensors_symint_outf(int64_t sparse_dim, int64_t dense_dim, c10::SymIntArrayRef size, const at::Tensor & indices, const at::Tensor & values, ::std::optional is_coalesced, at::Tensor & out) { + return at::_ops::_sparse_coo_tensor_with_dims_and_tensors_out::call(sparse_dim, dense_dim, size, indices, values, is_coalesced, out); +} +namespace symint { + template >> + at::Tensor & _sparse_coo_tensor_with_dims_and_tensors_outf(int64_t sparse_dim, int64_t dense_dim, c10::SymIntArrayRef size, const at::Tensor & indices, const at::Tensor & values, ::std::optional is_coalesced, at::Tensor & out) { + return at::_ops::_sparse_coo_tensor_with_dims_and_tensors_out::call(sparse_dim, dense_dim, size, indices, values, is_coalesced, out); + } +} + +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_sparse_coo_tensor_with_dims_meta_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_sparse_coo_tensor_with_dims_meta_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..44012b176eae1299d0e6052d6b3d75b2782edcc5 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_sparse_coo_tensor_with_dims_meta_dispatch.h @@ -0,0 +1,29 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. 
+// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace meta { + +TORCH_API at::Tensor _sparse_coo_tensor_with_dims(int64_t sparse_dim, int64_t dense_dim, at::IntArrayRef size, at::TensorOptions options); +TORCH_API at::Tensor _sparse_coo_tensor_with_dims(int64_t sparse_dim, int64_t dense_dim, at::IntArrayRef size, ::std::optional dtype, ::std::optional layout, ::std::optional device, ::std::optional pin_memory); + +} // namespace meta +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_sparse_csr_prod_native.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_sparse_csr_prod_native.h new file mode 100644 index 0000000000000000000000000000000000000000..89a1960067ab9ec686fbb7222514f6138c6a852a --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_sparse_csr_prod_native.h @@ -0,0 +1,28 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace at { +namespace native { +TORCH_API at::Tensor & _sparse_csr_prod_dim_dtype_out(const at::Tensor & self, at::IntArrayRef dim, bool keepdim, ::std::optional dtype, at::Tensor & out); +TORCH_API at::Tensor _sparse_csr_prod_cpu(const at::Tensor & self, at::IntArrayRef dim, bool keepdim=false, ::std::optional dtype=::std::nullopt); +TORCH_API at::Tensor _sparse_csr_prod_cuda(const at::Tensor & self, at::IntArrayRef dim, bool keepdim=false, 
::std::optional dtype=::std::nullopt); +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_sparse_csr_tensor_unsafe_native.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_sparse_csr_tensor_unsafe_native.h new file mode 100644 index 0000000000000000000000000000000000000000..e7f2ab4cdb8217e321b98a308306a2e6f9b797ad --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_sparse_csr_tensor_unsafe_native.h @@ -0,0 +1,26 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace at { +namespace native { +TORCH_API at::Tensor _sparse_csr_tensor_unsafe(const at::Tensor & crow_indices, const at::Tensor & col_indices, const at::Tensor & values, at::IntArrayRef size, ::std::optional dtype={}, ::std::optional layout={}, ::std::optional device={}, ::std::optional pin_memory={}); +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_sparse_log_softmax.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_sparse_log_softmax.h new file mode 100644 index 0000000000000000000000000000000000000000..a3639ea96f2587566cbce6f2af9c52e77e510f59 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_sparse_log_softmax.h @@ -0,0 +1,55 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::_sparse_log_softmax.int(Tensor self, int dim, ScalarType? dtype=None) -> Tensor +inline at::Tensor _sparse_log_softmax(const at::Tensor & self, int64_t dim, ::std::optional dtype=::std::nullopt) { + return at::_ops::_sparse_log_softmax_int::call(self, dim, dtype); +} + +// aten::_sparse_log_softmax.Dimname(Tensor self, Dimname dim, *, ScalarType? dtype=None) -> Tensor +inline at::Tensor _sparse_log_softmax(const at::Tensor & self, at::Dimname dim, ::std::optional dtype=::std::nullopt) { + return at::_ops::_sparse_log_softmax_Dimname::call(self, dim, dtype); +} + +// aten::_sparse_log_softmax(Tensor self, int dim, bool half_to_float) -> Tensor +inline at::Tensor _sparse_log_softmax(const at::Tensor & self, int64_t dim, bool half_to_float) { + return at::_ops::_sparse_log_softmax::call(self, dim, half_to_float); +} + +// aten::_sparse_log_softmax.out(Tensor self, int dim, bool half_to_float, *, Tensor(a!) out) -> Tensor(a!) 
+inline at::Tensor & _sparse_log_softmax_out(at::Tensor & out, const at::Tensor & self, int64_t dim, bool half_to_float) { + return at::_ops::_sparse_log_softmax_out::call(self, dim, half_to_float, out); +} +// aten::_sparse_log_softmax.out(Tensor self, int dim, bool half_to_float, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & _sparse_log_softmax_outf(const at::Tensor & self, int64_t dim, bool half_to_float, at::Tensor & out) { + return at::_ops::_sparse_log_softmax_out::call(self, dim, half_to_float, out); +} + +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_sparse_mask_projection_compositeexplicitautograd_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_sparse_mask_projection_compositeexplicitautograd_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..8b1a0f1c0d10c99980327714c2b9125229965b20 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_sparse_mask_projection_compositeexplicitautograd_dispatch.h @@ -0,0 +1,29 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. 
+#include + +namespace at { + +namespace compositeexplicitautograd { + +TORCH_API at::Tensor & _sparse_mask_projection_out(at::Tensor & out, const at::Tensor & self, const at::Tensor & mask, bool accumulate_matches=false); +TORCH_API at::Tensor & _sparse_mask_projection_outf(const at::Tensor & self, const at::Tensor & mask, bool accumulate_matches, at::Tensor & out); + +} // namespace compositeexplicitautograd +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_sparse_mask_projection_ops.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_sparse_mask_projection_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..adfb5f7d62bb4216f53851cc90f0e45c1302c341 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_sparse_mask_projection_ops.h @@ -0,0 +1,45 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. 
+#include + +namespace at { +namespace _ops { + + +struct TORCH_API _sparse_mask_projection { + using schema = at::Tensor (const at::Tensor &, const at::Tensor &, bool); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::_sparse_mask_projection"; + static constexpr const char* overload_name = ""; + static constexpr const char* schema_str = "_sparse_mask_projection(Tensor self, Tensor mask, bool accumulate_matches=False) -> Tensor"; + static at::Tensor call(const at::Tensor & self, const at::Tensor & mask, bool accumulate_matches); + static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & mask, bool accumulate_matches); +}; + +struct TORCH_API _sparse_mask_projection_out { + using schema = at::Tensor & (const at::Tensor &, const at::Tensor &, bool, at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::_sparse_mask_projection"; + static constexpr const char* overload_name = "out"; + static constexpr const char* schema_str = "_sparse_mask_projection.out(Tensor self, Tensor mask, bool accumulate_matches=False, *, Tensor(a!) out) -> Tensor(a!)"; + static at::Tensor & call(const at::Tensor & self, const at::Tensor & mask, bool accumulate_matches, at::Tensor & out); + static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & mask, bool accumulate_matches, at::Tensor & out); +}; + +}} // namespace at::_ops + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_sparse_sparse_matmul_native.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_sparse_sparse_matmul_native.h new file mode 100644 index 0000000000000000000000000000000000000000..930d59669a87fbcd43da5716fd3cb2f31ea67dd7 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_sparse_sparse_matmul_native.h @@ -0,0 +1,28 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace at { +namespace native { +TORCH_API at::Tensor & _sparse_sparse_matmul_out(const at::Tensor & self, const at::Tensor & other, at::Tensor & out); +TORCH_API at::Tensor sparse_sparse_matmul_cpu(const at::Tensor & self, const at::Tensor & other); +TORCH_API at::Tensor sparse_sparse_matmul_cuda(const at::Tensor & self, const at::Tensor & other); +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_sparse_sum_native.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_sparse_sum_native.h new file mode 100644 index 0000000000000000000000000000000000000000..0ca2cd18cae7365fd8f189075f15072b75aa8ab9 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_sparse_sum_native.h @@ -0,0 +1,30 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace at { +namespace native { +TORCH_API at::Tensor _sparse_sum(const at::Tensor & self); +TORCH_API at::Tensor _sparse_sum(const at::Tensor & self, at::ScalarType dtype); +TORCH_API at::Tensor _sparse_sum(const at::Tensor & self, at::IntArrayRef dim); +TORCH_API at::Tensor & _sparse_sum_dim_out(const at::Tensor & self, at::IntArrayRef dim, at::Tensor & out); +TORCH_API at::Tensor _sparse_sum(const at::Tensor & self, at::IntArrayRef dim, at::ScalarType dtype); +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_spdiags_native.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_spdiags_native.h new file mode 100644 index 0000000000000000000000000000000000000000..bfc39f9f04c96a8e725fbe3960377cfd99b1b177 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_spdiags_native.h @@ -0,0 +1,27 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace at { +namespace native { +TORCH_API at::Tensor & _spdiags_out(const at::Tensor & diagonals, const at::Tensor & offsets, at::IntArrayRef shape, ::std::optional layout, at::Tensor & out); +TORCH_API at::Tensor spdiags(const at::Tensor & diagonals, const at::Tensor & offsets, at::IntArrayRef shape, ::std::optional layout=::std::nullopt); +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_stack_native.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_stack_native.h new file mode 100644 index 0000000000000000000000000000000000000000..fba4c186ab9052d3309a38cbddad2dfc35b36a3d --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_stack_native.h @@ -0,0 +1,29 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace at { +namespace native { +TORCH_API at::Tensor _stack(at::TensorList tensors, int64_t dim=0); +TORCH_API at::Tensor & _stack_out(at::TensorList tensors, int64_t dim, at::Tensor & out); +TORCH_API at::Tensor _stack_cpu(at::TensorList tensors, int64_t dim=0); +TORCH_API at::Tensor & _stack_out_cpu(at::TensorList tensors, int64_t dim, at::Tensor & out); +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_standard_gamma_compositeexplicitautograd_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_standard_gamma_compositeexplicitautograd_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..c249a0fc07d1461b42d58519266805bc51a6a94e --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_standard_gamma_compositeexplicitautograd_dispatch.h @@ -0,0 +1,29 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace compositeexplicitautograd { + +TORCH_API at::Tensor & _standard_gamma_out(at::Tensor & out, const at::Tensor & self, ::std::optional generator=::std::nullopt); +TORCH_API at::Tensor & _standard_gamma_outf(const at::Tensor & self, ::std::optional generator, at::Tensor & out); + +} // namespace compositeexplicitautograd +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_standard_gamma_grad_compositeexplicitautograd_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_standard_gamma_grad_compositeexplicitautograd_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..8b6f881f166a30ba184f2ec5f9af048b887dd640 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_standard_gamma_grad_compositeexplicitautograd_dispatch.h @@ -0,0 +1,29 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace compositeexplicitautograd { + +TORCH_API at::Tensor & _standard_gamma_grad_out(at::Tensor & out, const at::Tensor & self, const at::Tensor & output); +TORCH_API at::Tensor & _standard_gamma_grad_outf(const at::Tensor & self, const at::Tensor & output, at::Tensor & out); + +} // namespace compositeexplicitautograd +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_standard_gamma_grad_cpu_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_standard_gamma_grad_cpu_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..54341165b2b3f4bf25433b7898df896efeef2751 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_standard_gamma_grad_cpu_dispatch.h @@ -0,0 +1,28 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace cpu { + +TORCH_API at::Tensor _standard_gamma_grad(const at::Tensor & self, const at::Tensor & output); + +} // namespace cpu +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_test_ambiguous_defaults_native.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_test_ambiguous_defaults_native.h new file mode 100644 index 0000000000000000000000000000000000000000..19b9f5c64bd6a89d3a7ebe28878e42580b8c516d --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_test_ambiguous_defaults_native.h @@ -0,0 +1,27 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace at { +namespace native { +TORCH_API at::Tensor _test_ambiguous_defaults(const at::Tensor & dummy, int64_t a=1, int64_t b=1); +TORCH_API at::Tensor _test_ambiguous_defaults(const at::Tensor & dummy, int64_t a=2, c10::string_view b="2"); +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_test_autograd_multiple_dispatch_compositeimplicitautograd_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_test_autograd_multiple_dispatch_compositeimplicitautograd_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..f40182a055fe089878a17d09c7f6f2f8366a7bb0 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_test_autograd_multiple_dispatch_compositeimplicitautograd_dispatch.h @@ -0,0 +1,28 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace compositeimplicitautograd { + +TORCH_API at::Tensor _test_autograd_multiple_dispatch(const at::Tensor & self, bool b); + +} // namespace compositeimplicitautograd +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_test_check_tensor.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_test_check_tensor.h new file mode 100644 index 0000000000000000000000000000000000000000..219fa6399b3b02a7c7ae8cd3a92a00e76dc672b1 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_test_check_tensor.h @@ -0,0 +1,36 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::_test_check_tensor(Tensor self) -> Tensor +inline at::Tensor _test_check_tensor(const at::Tensor & self) { + return at::_ops::_test_check_tensor::call(self); +} + +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_test_check_tensor_native.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_test_check_tensor_native.h new file mode 100644 index 0000000000000000000000000000000000000000..752fc286ba3dc65363c80c6034abed9424fee34d --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_test_check_tensor_native.h @@ -0,0 +1,26 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace at { +namespace native { +TORCH_API at::Tensor _test_check_tensor(const at::Tensor & self); +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_test_functorch_fallback.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_test_functorch_fallback.h new file mode 100644 index 0000000000000000000000000000000000000000..fa3986e2d107e545a79bea4ab5d1cebbb16e0bce --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_test_functorch_fallback.h @@ -0,0 +1,45 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::_test_functorch_fallback(Tensor self, Tensor other) -> Tensor +inline at::Tensor _test_functorch_fallback(const at::Tensor & self, const at::Tensor & other) { + return at::_ops::_test_functorch_fallback::call(self, other); +} + +// aten::_test_functorch_fallback.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & _test_functorch_fallback_out(at::Tensor & out, const at::Tensor & self, const at::Tensor & other) { + return at::_ops::_test_functorch_fallback_out::call(self, other, out); +} +// aten::_test_functorch_fallback.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & _test_functorch_fallback_outf(const at::Tensor & self, const at::Tensor & other, at::Tensor & out) { + return at::_ops::_test_functorch_fallback_out::call(self, other, out); +} + +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_test_optional_floatlist_compositeexplicitautograd_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_test_optional_floatlist_compositeexplicitautograd_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..f0ecddf3e0510315944de0337a9442c789158245 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_test_optional_floatlist_compositeexplicitautograd_dispatch.h @@ -0,0 +1,29 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace compositeexplicitautograd { + +TORCH_API at::Tensor & _test_optional_floatlist_out(at::Tensor & out, const at::Tensor & values, ::std::optional> addends); +TORCH_API at::Tensor & _test_optional_floatlist_outf(const at::Tensor & values, ::std::optional> addends, at::Tensor & out); + +} // namespace compositeexplicitautograd +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_test_optional_intlist.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_test_optional_intlist.h new file mode 100644 index 0000000000000000000000000000000000000000..076eb8ced1db6a42344740fa94211dd5fc0db374 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_test_optional_intlist.h @@ -0,0 +1,45 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::_test_optional_intlist(Tensor values, int[]? addends) -> Tensor +inline at::Tensor _test_optional_intlist(const at::Tensor & values, at::OptionalIntArrayRef addends) { + return at::_ops::_test_optional_intlist::call(values, addends); +} + +// aten::_test_optional_intlist.out(Tensor values, int[]? addends, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & _test_optional_intlist_out(at::Tensor & out, const at::Tensor & values, at::OptionalIntArrayRef addends) { + return at::_ops::_test_optional_intlist_out::call(values, addends, out); +} +// aten::_test_optional_intlist.out(Tensor values, int[]? addends, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & _test_optional_intlist_outf(const at::Tensor & values, at::OptionalIntArrayRef addends, at::Tensor & out) { + return at::_ops::_test_optional_intlist_out::call(values, addends, out); +} + +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_test_parallel_materialize_native.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_test_parallel_materialize_native.h new file mode 100644 index 0000000000000000000000000000000000000000..2f39d5f659edcf809a1daf7a7a30ed0e159cf178 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_test_parallel_materialize_native.h @@ -0,0 +1,26 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace at { +namespace native { +TORCH_API at::Tensor _test_parallel_materialize(const at::Tensor & self, int64_t num_parallel, bool skip_first=false); +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_thnn_fused_gru_cell.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_thnn_fused_gru_cell.h new file mode 100644 index 0000000000000000000000000000000000000000..f211cc24353fbc9c7475698dc45393b0e571e12a --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_thnn_fused_gru_cell.h @@ -0,0 +1,45 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::_thnn_fused_gru_cell(Tensor input_gates, Tensor hidden_gates, Tensor hx, Tensor? input_bias=None, Tensor? hidden_bias=None) -> (Tensor, Tensor) +inline ::std::tuple _thnn_fused_gru_cell(const at::Tensor & input_gates, const at::Tensor & hidden_gates, const at::Tensor & hx, const ::std::optional & input_bias={}, const ::std::optional & hidden_bias={}) { + return at::_ops::_thnn_fused_gru_cell::call(input_gates, hidden_gates, hx, input_bias, hidden_bias); +} + +// aten::_thnn_fused_gru_cell.out(Tensor input_gates, Tensor hidden_gates, Tensor hx, Tensor? input_bias=None, Tensor? hidden_bias=None, *, Tensor(a!) out0, Tensor(b!) out1) -> (Tensor(a!), Tensor(b!)) +inline ::std::tuple _thnn_fused_gru_cell_out(at::Tensor & out0, at::Tensor & out1, const at::Tensor & input_gates, const at::Tensor & hidden_gates, const at::Tensor & hx, const ::std::optional & input_bias={}, const ::std::optional & hidden_bias={}) { + return at::_ops::_thnn_fused_gru_cell_out::call(input_gates, hidden_gates, hx, input_bias, hidden_bias, out0, out1); +} +// aten::_thnn_fused_gru_cell.out(Tensor input_gates, Tensor hidden_gates, Tensor hx, Tensor? 
input_bias=None, Tensor? hidden_bias=None, *, Tensor(a!) out0, Tensor(b!) out1) -> (Tensor(a!), Tensor(b!)) +inline ::std::tuple _thnn_fused_gru_cell_outf(const at::Tensor & input_gates, const at::Tensor & hidden_gates, const at::Tensor & hx, const ::std::optional & input_bias, const ::std::optional & hidden_bias, at::Tensor & out0, at::Tensor & out1) { + return at::_ops::_thnn_fused_gru_cell_out::call(input_gates, hidden_gates, hx, input_bias, hidden_bias, out0, out1); +} + +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_thnn_fused_gru_cell_backward_ops.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_thnn_fused_gru_cell_backward_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..2e1b161cfff7e7a8225b59ebf12ee2c0299a4fc4 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_thnn_fused_gru_cell_backward_ops.h @@ -0,0 +1,45 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. 
+#include + +namespace at { +namespace _ops { + + +struct TORCH_API _thnn_fused_gru_cell_backward { + using schema = ::std::tuple (const at::Tensor &, const at::Tensor &, bool); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::_thnn_fused_gru_cell_backward"; + static constexpr const char* overload_name = ""; + static constexpr const char* schema_str = "_thnn_fused_gru_cell_backward(Tensor grad_hy, Tensor workspace, bool has_bias) -> (Tensor, Tensor, Tensor, Tensor, Tensor)"; + static ::std::tuple call(const at::Tensor & grad_hy, const at::Tensor & workspace, bool has_bias); + static ::std::tuple redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad_hy, const at::Tensor & workspace, bool has_bias); +}; + +struct TORCH_API _thnn_fused_gru_cell_backward_out { + using schema = ::std::tuple (const at::Tensor &, const at::Tensor &, bool, at::Tensor &, at::Tensor &, at::Tensor &, at::Tensor &, at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::_thnn_fused_gru_cell_backward"; + static constexpr const char* overload_name = "out"; + static constexpr const char* schema_str = "_thnn_fused_gru_cell_backward.out(Tensor grad_hy, Tensor workspace, bool has_bias, *, Tensor(a!) out0, Tensor(b!) out1, Tensor(c!) out2, Tensor(d!) out3, Tensor(e!) 
out4) -> (Tensor(a!), Tensor(b!), Tensor(c!), Tensor(d!), Tensor(e!))"; + static ::std::tuple call(const at::Tensor & grad_hy, const at::Tensor & workspace, bool has_bias, at::Tensor & out0, at::Tensor & out1, at::Tensor & out2, at::Tensor & out3, at::Tensor & out4); + static ::std::tuple redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad_hy, const at::Tensor & workspace, bool has_bias, at::Tensor & out0, at::Tensor & out1, at::Tensor & out2, at::Tensor & out3, at::Tensor & out4); +}; + +}} // namespace at::_ops + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_to_copy.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_to_copy.h new file mode 100644 index 0000000000000000000000000000000000000000..d1ce0340c7d1100eff4e1c5bf1125d3d565cdead --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_to_copy.h @@ -0,0 +1,49 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::_to_copy(Tensor self, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, bool non_blocking=False, MemoryFormat? 
memory_format=None) -> Tensor +inline at::Tensor _to_copy(const at::Tensor & self, at::TensorOptions options={}, bool non_blocking=false, ::std::optional memory_format=::std::nullopt) { + return at::_ops::_to_copy::call(self, c10::optTypeMetaToScalarType(options.dtype_opt()), options.layout_opt(), options.device_opt(), options.pinned_memory_opt(), non_blocking, c10::impl::check_tensor_options_and_extract_memory_format(options, memory_format)); +} +// aten::_to_copy(Tensor self, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, bool non_blocking=False, MemoryFormat? memory_format=None) -> Tensor +inline at::Tensor _to_copy(const at::Tensor & self, ::std::optional dtype, ::std::optional layout, ::std::optional device, ::std::optional pin_memory, bool non_blocking, ::std::optional memory_format) { + return at::_ops::_to_copy::call(self, dtype, layout, device, pin_memory, non_blocking, memory_format); +} + +// aten::_to_copy.out(Tensor self, *, bool non_blocking=False, MemoryFormat? memory_format=None, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & _to_copy_out(at::Tensor & out, const at::Tensor & self, bool non_blocking=false, ::std::optional memory_format=::std::nullopt) { + return at::_ops::_to_copy_out::call(self, non_blocking, memory_format, out); +} +// aten::_to_copy.out(Tensor self, *, bool non_blocking=False, MemoryFormat? memory_format=None, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & _to_copy_outf(const at::Tensor & self, bool non_blocking, ::std::optional memory_format, at::Tensor & out) { + return at::_ops::_to_copy_out::call(self, non_blocking, memory_format, out); +} + +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_to_copy_native.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_to_copy_native.h new file mode 100644 index 0000000000000000000000000000000000000000..d2474019f7853b95bd4e81a1f3f66bb4f7bfb88e --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_to_copy_native.h @@ -0,0 +1,28 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace at { +namespace native { +TORCH_API at::Tensor _to_copy(const at::Tensor & self, ::std::optional dtype={}, ::std::optional layout={}, ::std::optional device={}, ::std::optional pin_memory={}, bool non_blocking=false, ::std::optional memory_format=::std::nullopt); +TORCH_API at::Tensor & _to_copy_out(const at::Tensor & self, bool non_blocking, ::std::optional memory_format, at::Tensor & out); +TORCH_API at::Tensor _to_copy_nested(const at::Tensor & self, ::std::optional dtype={}, ::std::optional layout={}, ::std::optional device={}, ::std::optional pin_memory={}, bool non_blocking=false, ::std::optional memory_format=::std::nullopt); +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_to_copy_ops.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_to_copy_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..0d01020c46abca44f72082c8f37cd0c9cc88537d --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_to_copy_ops.h @@ -0,0 +1,45 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { +namespace _ops { + + +struct TORCH_API _to_copy { + using schema = at::Tensor (const at::Tensor &, ::std::optional, ::std::optional, ::std::optional, ::std::optional, bool, ::std::optional); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::_to_copy"; + static constexpr const char* overload_name = ""; + static constexpr const char* schema_str = "_to_copy(Tensor self, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, bool non_blocking=False, MemoryFormat? 
memory_format=None) -> Tensor"; + static at::Tensor call(const at::Tensor & self, ::std::optional dtype, ::std::optional layout, ::std::optional device, ::std::optional pin_memory, bool non_blocking, ::std::optional memory_format); + static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, ::std::optional dtype, ::std::optional layout, ::std::optional device, ::std::optional pin_memory, bool non_blocking, ::std::optional memory_format); +}; + +struct TORCH_API _to_copy_out { + using schema = at::Tensor & (const at::Tensor &, bool, ::std::optional, at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::_to_copy"; + static constexpr const char* overload_name = "out"; + static constexpr const char* schema_str = "_to_copy.out(Tensor self, *, bool non_blocking=False, MemoryFormat? memory_format=None, Tensor(a!) out) -> Tensor(a!)"; + static at::Tensor & call(const at::Tensor & self, bool non_blocking, ::std::optional memory_format, at::Tensor & out); + static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, bool non_blocking, ::std::optional memory_format, at::Tensor & out); +}; + +}} // namespace at::_ops + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_to_cpu_native.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_to_cpu_native.h new file mode 100644 index 0000000000000000000000000000000000000000..06fd3ac304b9b9ede28ec9c7045da407eee8669a --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_to_cpu_native.h @@ -0,0 +1,26 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace at { +namespace native { +TORCH_API ::std::vector _to_cpu(at::TensorList tensors); +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_to_sparse_bsr.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_to_sparse_bsr.h new file mode 100644 index 0000000000000000000000000000000000000000..bfe3201138f9c7e315de16551794f16000d08bbc --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_to_sparse_bsr.h @@ -0,0 +1,40 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::_to_sparse_bsr.out(Tensor self, int[2] blocksize, int? dense_dim=None, *, Tensor(a!) out) -> Tensor(a!) 
+inline at::Tensor & _to_sparse_bsr_out(at::Tensor & out, const at::Tensor & self, at::IntArrayRef blocksize, ::std::optional dense_dim=::std::nullopt) { + return at::_ops::_to_sparse_bsr_out::call(self, blocksize, dense_dim, out); +} +// aten::_to_sparse_bsr.out(Tensor self, int[2] blocksize, int? dense_dim=None, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & _to_sparse_bsr_outf(const at::Tensor & self, at::IntArrayRef blocksize, ::std::optional dense_dim, at::Tensor & out) { + return at::_ops::_to_sparse_bsr_out::call(self, blocksize, dense_dim, out); +} + +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_to_sparse_bsr_ops.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_to_sparse_bsr_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..7e38fce86eb4a5cf711e50d808e9b5cf2ce30942 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_to_sparse_bsr_ops.h @@ -0,0 +1,45 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. 
+#include + +namespace at { +namespace _ops { + + +struct TORCH_API _to_sparse_bsr { + using schema = at::Tensor (const at::Tensor &, at::IntArrayRef, ::std::optional); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::_to_sparse_bsr"; + static constexpr const char* overload_name = ""; + static constexpr const char* schema_str = "_to_sparse_bsr(Tensor self, int[2] blocksize, int? dense_dim=None) -> Tensor"; + static at::Tensor call(const at::Tensor & self, at::IntArrayRef blocksize, ::std::optional dense_dim); + static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::IntArrayRef blocksize, ::std::optional dense_dim); +}; + +struct TORCH_API _to_sparse_bsr_out { + using schema = at::Tensor & (const at::Tensor &, at::IntArrayRef, ::std::optional, at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::_to_sparse_bsr"; + static constexpr const char* overload_name = "out"; + static constexpr const char* schema_str = "_to_sparse_bsr.out(Tensor self, int[2] blocksize, int? dense_dim=None, *, Tensor(a!) out) -> Tensor(a!)"; + static at::Tensor & call(const at::Tensor & self, at::IntArrayRef blocksize, ::std::optional dense_dim, at::Tensor & out); + static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::IntArrayRef blocksize, ::std::optional dense_dim, at::Tensor & out); +}; + +}} // namespace at::_ops + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_to_sparse_csc_cpu_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_to_sparse_csc_cpu_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..939346d9f1dcb5a7232297ed33578561da565021 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_to_sparse_csc_cpu_dispatch.h @@ -0,0 +1,28 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace cpu { + +TORCH_API at::Tensor _to_sparse_csc(const at::Tensor & self, ::std::optional dense_dim=::std::nullopt); + +} // namespace cpu +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_transform_bias_rescale_qkv_compositeexplicitautograd_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_transform_bias_rescale_qkv_compositeexplicitautograd_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..0ee3248d33e912610d22bec1ba303798f769823a --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_transform_bias_rescale_qkv_compositeexplicitautograd_dispatch.h @@ -0,0 +1,29 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace compositeexplicitautograd { + +TORCH_API ::std::tuple _transform_bias_rescale_qkv_out(at::Tensor & out0, at::Tensor & out1, at::Tensor & out2, const at::Tensor & qkv, const at::Tensor & qkv_bias, int64_t num_heads); +TORCH_API ::std::tuple _transform_bias_rescale_qkv_outf(const at::Tensor & qkv, const at::Tensor & qkv_bias, int64_t num_heads, at::Tensor & out0, at::Tensor & out1, at::Tensor & out2); + +} // namespace compositeexplicitautograd +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_transform_bias_rescale_qkv_ops.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_transform_bias_rescale_qkv_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..ae4083ff8356301f6dae6a745aa3edc9e438f495 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_transform_bias_rescale_qkv_ops.h @@ -0,0 +1,45 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { +namespace _ops { + + +struct TORCH_API _transform_bias_rescale_qkv { + using schema = ::std::tuple (const at::Tensor &, const at::Tensor &, int64_t); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::_transform_bias_rescale_qkv"; + static constexpr const char* overload_name = ""; + static constexpr const char* schema_str = "_transform_bias_rescale_qkv(Tensor qkv, Tensor qkv_bias, int num_heads) -> (Tensor, Tensor, Tensor)"; + static ::std::tuple call(const at::Tensor & qkv, const at::Tensor & qkv_bias, int64_t num_heads); + static ::std::tuple redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & qkv, const at::Tensor & qkv_bias, int64_t num_heads); +}; + +struct TORCH_API _transform_bias_rescale_qkv_out { + using schema = ::std::tuple (const at::Tensor &, const at::Tensor &, int64_t, at::Tensor &, at::Tensor &, at::Tensor &); + using ptr_schema = schema*; + // 
See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::_transform_bias_rescale_qkv"; + static constexpr const char* overload_name = "out"; + static constexpr const char* schema_str = "_transform_bias_rescale_qkv.out(Tensor qkv, Tensor qkv_bias, int num_heads, *, Tensor(a!) out0, Tensor(b!) out1, Tensor(c!) out2) -> (Tensor(a!), Tensor(b!), Tensor(c!))"; + static ::std::tuple call(const at::Tensor & qkv, const at::Tensor & qkv_bias, int64_t num_heads, at::Tensor & out0, at::Tensor & out1, at::Tensor & out2); + static ::std::tuple redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & qkv, const at::Tensor & qkv_bias, int64_t num_heads, at::Tensor & out0, at::Tensor & out1, at::Tensor & out2); +}; + +}} // namespace at::_ops + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_triton_scaled_dot_attention_compositeexplicitautograd_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_triton_scaled_dot_attention_compositeexplicitautograd_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..d2bd218485f9d276058ce2446cccd4bc123a8e60 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_triton_scaled_dot_attention_compositeexplicitautograd_dispatch.h @@ -0,0 +1,29 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. 
+// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace compositeexplicitautograd { + +TORCH_API at::Tensor & _triton_scaled_dot_attention_out(at::Tensor & out, const at::Tensor & q, const at::Tensor & k, const at::Tensor & v, double dropout_p=0.0); +TORCH_API at::Tensor & _triton_scaled_dot_attention_outf(const at::Tensor & q, const at::Tensor & k, const at::Tensor & v, double dropout_p, at::Tensor & out); + +} // namespace compositeexplicitautograd +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_unique.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_unique.h new file mode 100644 index 0000000000000000000000000000000000000000..b6891eedd3cf1797bac158db4b4d64bc2ca575cc --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_unique.h @@ -0,0 +1,45 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::_unique(Tensor self, bool sorted=True, bool return_inverse=False) -> (Tensor, Tensor) +inline ::std::tuple _unique(const at::Tensor & self, bool sorted=true, bool return_inverse=false) { + return at::_ops::_unique::call(self, sorted, return_inverse); +} + +// aten::_unique.out(Tensor self, bool sorted=True, bool return_inverse=False, *, Tensor(a!) out0, Tensor(b!) 
out1) -> (Tensor(a!), Tensor(b!)) +inline ::std::tuple _unique_out(at::Tensor & out0, at::Tensor & out1, const at::Tensor & self, bool sorted=true, bool return_inverse=false) { + return at::_ops::_unique_out::call(self, sorted, return_inverse, out0, out1); +} +// aten::_unique.out(Tensor self, bool sorted=True, bool return_inverse=False, *, Tensor(a!) out0, Tensor(b!) out1) -> (Tensor(a!), Tensor(b!)) +inline ::std::tuple _unique_outf(const at::Tensor & self, bool sorted, bool return_inverse, at::Tensor & out0, at::Tensor & out1) { + return at::_ops::_unique_out::call(self, sorted, return_inverse, out0, out1); +} + +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_unique2_compositeexplicitautograd_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_unique2_compositeexplicitautograd_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..b7b3986828d9576fca0e94f28e1396f693ffe2cc --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_unique2_compositeexplicitautograd_dispatch.h @@ -0,0 +1,29 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. 
+#include + +namespace at { + +namespace compositeexplicitautograd { + +TORCH_API ::std::tuple _unique2_out(at::Tensor & out0, at::Tensor & out1, at::Tensor & out2, const at::Tensor & self, bool sorted=true, bool return_inverse=false, bool return_counts=false); +TORCH_API ::std::tuple _unique2_outf(const at::Tensor & self, bool sorted, bool return_inverse, bool return_counts, at::Tensor & out0, at::Tensor & out1, at::Tensor & out2); + +} // namespace compositeexplicitautograd +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_unique_cuda_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_unique_cuda_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..73f2304be212fda3479f6266fb381295459cd817 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_unique_cuda_dispatch.h @@ -0,0 +1,28 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. 
+#include + +namespace at { + +namespace cuda { + +TORCH_API ::std::tuple _unique(const at::Tensor & self, bool sorted=true, bool return_inverse=false); + +} // namespace cuda +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_unique_ops.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_unique_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..0e1bc0e9b273277f6707ee1d5d9a28a5fd15d138 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_unique_ops.h @@ -0,0 +1,45 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. 
+#include + +namespace at { +namespace _ops { + + +struct TORCH_API _unique { + using schema = ::std::tuple (const at::Tensor &, bool, bool); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::_unique"; + static constexpr const char* overload_name = ""; + static constexpr const char* schema_str = "_unique(Tensor self, bool sorted=True, bool return_inverse=False) -> (Tensor, Tensor)"; + static ::std::tuple call(const at::Tensor & self, bool sorted, bool return_inverse); + static ::std::tuple redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, bool sorted, bool return_inverse); +}; + +struct TORCH_API _unique_out { + using schema = ::std::tuple (const at::Tensor &, bool, bool, at::Tensor &, at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::_unique"; + static constexpr const char* overload_name = "out"; + static constexpr const char* schema_str = "_unique.out(Tensor self, bool sorted=True, bool return_inverse=False, *, Tensor(a!) out0, Tensor(b!) out1) -> (Tensor(a!), Tensor(b!))"; + static ::std::tuple call(const at::Tensor & self, bool sorted, bool return_inverse, at::Tensor & out0, at::Tensor & out1); + static ::std::tuple redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, bool sorted, bool return_inverse, at::Tensor & out0, at::Tensor & out1); +}; + +}} // namespace at::_ops + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_unsafe_masked_index.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_unsafe_masked_index.h new file mode 100644 index 0000000000000000000000000000000000000000..611f94dd58e560533cd694ee800990a46595743d --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_unsafe_masked_index.h @@ -0,0 +1,36 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::_unsafe_masked_index(Tensor self, Tensor mask, Tensor?[] indices, Scalar fill) -> Tensor +inline at::Tensor _unsafe_masked_index(const at::Tensor & self, const at::Tensor & mask, const c10::List<::std::optional> & indices, const at::Scalar & fill) { + return at::_ops::_unsafe_masked_index::call(self, mask, indices, fill); +} + +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_unsafe_masked_index_compositeexplicitautograd_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_unsafe_masked_index_compositeexplicitautograd_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..87b55d7b8275073493f218460560cf468dc3bbae --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_unsafe_masked_index_compositeexplicitautograd_dispatch.h @@ -0,0 +1,28 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace compositeexplicitautograd { + +TORCH_API at::Tensor _unsafe_masked_index(const at::Tensor & self, const at::Tensor & mask, const c10::List<::std::optional> & indices, const at::Scalar & fill); + +} // namespace compositeexplicitautograd +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_unsafe_masked_index_put_accumulate_native.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_unsafe_masked_index_put_accumulate_native.h new file mode 100644 index 0000000000000000000000000000000000000000..3ada4a7be7abc8f3c84b3fbec2efcb6939bfe577 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_unsafe_masked_index_put_accumulate_native.h @@ -0,0 +1,26 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace at { +namespace native { +TORCH_API at::Tensor _unsafe_masked_index_put_accumulate(const at::Tensor & self, const at::Tensor & mask, const c10::List<::std::optional> & indices, const at::Tensor & values); +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_unsafe_masked_index_put_accumulate_ops.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_unsafe_masked_index_put_accumulate_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..716b3587080fd6c7623546d8a8ab39b50dbf429b --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_unsafe_masked_index_put_accumulate_ops.h @@ -0,0 +1,34 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. 
+#include + +namespace at { +namespace _ops { + + +struct TORCH_API _unsafe_masked_index_put_accumulate { + using schema = at::Tensor (const at::Tensor &, const at::Tensor &, const c10::List<::std::optional> &, const at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::_unsafe_masked_index_put_accumulate"; + static constexpr const char* overload_name = ""; + static constexpr const char* schema_str = "_unsafe_masked_index_put_accumulate(Tensor self, Tensor mask, Tensor?[] indices, Tensor values) -> Tensor"; + static at::Tensor call(const at::Tensor & self, const at::Tensor & mask, const c10::List<::std::optional> & indices, const at::Tensor & values); + static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & mask, const c10::List<::std::optional> & indices, const at::Tensor & values); +}; + +}} // namespace at::_ops + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_upsample_bicubic2d_aa_backward_ops.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_upsample_bicubic2d_aa_backward_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..0300e1f673106d97d40900676b505487f5628920 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_upsample_bicubic2d_aa_backward_ops.h @@ -0,0 +1,45 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. 
+// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { +namespace _ops { + + +struct TORCH_API _upsample_bicubic2d_aa_backward_grad_input { + using schema = at::Tensor & (const at::Tensor &, c10::SymIntArrayRef, c10::SymIntArrayRef, bool, ::std::optional, ::std::optional, at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::_upsample_bicubic2d_aa_backward"; + static constexpr const char* overload_name = "grad_input"; + static constexpr const char* schema_str = "_upsample_bicubic2d_aa_backward.grad_input(Tensor grad_output, SymInt[2] output_size, SymInt[4] input_size, bool align_corners, float? scales_h=None, float? scales_w=None, *, Tensor(a!) grad_input) -> Tensor(a!)"; + static at::Tensor & call(const at::Tensor & grad_output, c10::SymIntArrayRef output_size, c10::SymIntArrayRef input_size, bool align_corners, ::std::optional scales_h, ::std::optional scales_w, at::Tensor & grad_input); + static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad_output, c10::SymIntArrayRef output_size, c10::SymIntArrayRef input_size, bool align_corners, ::std::optional scales_h, ::std::optional scales_w, at::Tensor & grad_input); +}; + +struct TORCH_API _upsample_bicubic2d_aa_backward { + using schema = at::Tensor (const at::Tensor &, c10::SymIntArrayRef, c10::SymIntArrayRef, bool, ::std::optional, ::std::optional); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::_upsample_bicubic2d_aa_backward"; + static constexpr const char* overload_name = ""; + static constexpr const char* schema_str = "_upsample_bicubic2d_aa_backward(Tensor grad_output, SymInt[2] output_size, SymInt[4] input_size, bool align_corners, float? 
scales_h=None, float? scales_w=None) -> Tensor"; + static at::Tensor call(const at::Tensor & grad_output, c10::SymIntArrayRef output_size, c10::SymIntArrayRef input_size, bool align_corners, ::std::optional scales_h, ::std::optional scales_w); + static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad_output, c10::SymIntArrayRef output_size, c10::SymIntArrayRef input_size, bool align_corners, ::std::optional scales_h, ::std::optional scales_w); +}; + +}} // namespace at::_ops + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_upsample_bicubic2d_aa_ops.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_upsample_bicubic2d_aa_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..f19b1374d854ef82f45dcc607ddc88eaa05d0bc3 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_upsample_bicubic2d_aa_ops.h @@ -0,0 +1,56 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. 
+#include + +namespace at { +namespace _ops { + + +struct TORCH_API _upsample_bicubic2d_aa_vec { + using schema = at::Tensor (const at::Tensor &, at::OptionalSymIntArrayRef, bool, ::std::optional>); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::_upsample_bicubic2d_aa"; + static constexpr const char* overload_name = "vec"; + static constexpr const char* schema_str = "_upsample_bicubic2d_aa.vec(Tensor input, SymInt[]? output_size, bool align_corners, float[]? scale_factors) -> Tensor"; + static at::Tensor call(const at::Tensor & input, at::OptionalSymIntArrayRef output_size, bool align_corners, ::std::optional> scale_factors); + static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & input, at::OptionalSymIntArrayRef output_size, bool align_corners, ::std::optional> scale_factors); +}; + +struct TORCH_API _upsample_bicubic2d_aa_out { + using schema = at::Tensor & (const at::Tensor &, c10::SymIntArrayRef, bool, ::std::optional, ::std::optional, at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::_upsample_bicubic2d_aa"; + static constexpr const char* overload_name = "out"; + static constexpr const char* schema_str = "_upsample_bicubic2d_aa.out(Tensor self, SymInt[2] output_size, bool align_corners, float? scales_h=None, float? scales_w=None, *, Tensor(a!) 
out) -> Tensor(a!)"; + static at::Tensor & call(const at::Tensor & self, c10::SymIntArrayRef output_size, bool align_corners, ::std::optional scales_h, ::std::optional scales_w, at::Tensor & out); + static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, c10::SymIntArrayRef output_size, bool align_corners, ::std::optional scales_h, ::std::optional scales_w, at::Tensor & out); +}; + +struct TORCH_API _upsample_bicubic2d_aa { + using schema = at::Tensor (const at::Tensor &, c10::SymIntArrayRef, bool, ::std::optional, ::std::optional); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::_upsample_bicubic2d_aa"; + static constexpr const char* overload_name = ""; + static constexpr const char* schema_str = "_upsample_bicubic2d_aa(Tensor self, SymInt[2] output_size, bool align_corners, float? scales_h=None, float? scales_w=None) -> Tensor"; + static at::Tensor call(const at::Tensor & self, c10::SymIntArrayRef output_size, bool align_corners, ::std::optional scales_h, ::std::optional scales_w); + static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, c10::SymIntArrayRef output_size, bool align_corners, ::std::optional scales_h, ::std::optional scales_w); +}; + +}} // namespace at::_ops + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_upsample_bilinear2d_aa_backward.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_upsample_bilinear2d_aa_backward.h new file mode 100644 index 0000000000000000000000000000000000000000..e74dfcf162a5a51caf3c95a94b082b56990402bb --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_upsample_bilinear2d_aa_backward.h @@ -0,0 +1,97 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::_upsample_bilinear2d_aa_backward.grad_input(Tensor grad_output, SymInt[2] output_size, SymInt[4] input_size, bool align_corners, float? scales_h=None, float? scales_w=None, *, Tensor(a!) grad_input) -> Tensor(a!) 
+inline at::Tensor & _upsample_bilinear2d_aa_backward_out(at::Tensor & grad_input, const at::Tensor & grad_output, at::IntArrayRef output_size, at::IntArrayRef input_size, bool align_corners, ::std::optional scales_h=::std::nullopt, ::std::optional scales_w=::std::nullopt) { + return at::_ops::_upsample_bilinear2d_aa_backward_grad_input::call(grad_output, c10::fromIntArrayRefSlow(output_size), c10::fromIntArrayRefSlow(input_size), align_corners, scales_h, scales_w, grad_input); +} +namespace symint { + template >> + at::Tensor & _upsample_bilinear2d_aa_backward_out(at::Tensor & grad_input, const at::Tensor & grad_output, at::IntArrayRef output_size, at::IntArrayRef input_size, bool align_corners, ::std::optional scales_h=::std::nullopt, ::std::optional scales_w=::std::nullopt) { + return at::_ops::_upsample_bilinear2d_aa_backward_grad_input::call(grad_output, c10::fromIntArrayRefSlow(output_size), c10::fromIntArrayRefSlow(input_size), align_corners, scales_h, scales_w, grad_input); + } +} + +// aten::_upsample_bilinear2d_aa_backward.grad_input(Tensor grad_output, SymInt[2] output_size, SymInt[4] input_size, bool align_corners, float? scales_h=None, float? scales_w=None, *, Tensor(a!) grad_input) -> Tensor(a!) 
+inline at::Tensor & _upsample_bilinear2d_aa_backward_outf(const at::Tensor & grad_output, at::IntArrayRef output_size, at::IntArrayRef input_size, bool align_corners, ::std::optional scales_h, ::std::optional scales_w, at::Tensor & grad_input) { + return at::_ops::_upsample_bilinear2d_aa_backward_grad_input::call(grad_output, c10::fromIntArrayRefSlow(output_size), c10::fromIntArrayRefSlow(input_size), align_corners, scales_h, scales_w, grad_input); +} +namespace symint { + template >> + at::Tensor & _upsample_bilinear2d_aa_backward_outf(const at::Tensor & grad_output, at::IntArrayRef output_size, at::IntArrayRef input_size, bool align_corners, ::std::optional scales_h, ::std::optional scales_w, at::Tensor & grad_input) { + return at::_ops::_upsample_bilinear2d_aa_backward_grad_input::call(grad_output, c10::fromIntArrayRefSlow(output_size), c10::fromIntArrayRefSlow(input_size), align_corners, scales_h, scales_w, grad_input); + } +} + +// aten::_upsample_bilinear2d_aa_backward.grad_input(Tensor grad_output, SymInt[2] output_size, SymInt[4] input_size, bool align_corners, float? scales_h=None, float? scales_w=None, *, Tensor(a!) grad_input) -> Tensor(a!) 
+inline at::Tensor & _upsample_bilinear2d_aa_backward_symint_out(at::Tensor & grad_input, const at::Tensor & grad_output, c10::SymIntArrayRef output_size, c10::SymIntArrayRef input_size, bool align_corners, ::std::optional scales_h=::std::nullopt, ::std::optional scales_w=::std::nullopt) { + return at::_ops::_upsample_bilinear2d_aa_backward_grad_input::call(grad_output, output_size, input_size, align_corners, scales_h, scales_w, grad_input); +} +namespace symint { + template >> + at::Tensor & _upsample_bilinear2d_aa_backward_out(at::Tensor & grad_input, const at::Tensor & grad_output, c10::SymIntArrayRef output_size, c10::SymIntArrayRef input_size, bool align_corners, ::std::optional scales_h=::std::nullopt, ::std::optional scales_w=::std::nullopt) { + return at::_ops::_upsample_bilinear2d_aa_backward_grad_input::call(grad_output, output_size, input_size, align_corners, scales_h, scales_w, grad_input); + } +} + +// aten::_upsample_bilinear2d_aa_backward.grad_input(Tensor grad_output, SymInt[2] output_size, SymInt[4] input_size, bool align_corners, float? scales_h=None, float? scales_w=None, *, Tensor(a!) grad_input) -> Tensor(a!) 
+inline at::Tensor & _upsample_bilinear2d_aa_backward_symint_outf(const at::Tensor & grad_output, c10::SymIntArrayRef output_size, c10::SymIntArrayRef input_size, bool align_corners, ::std::optional scales_h, ::std::optional scales_w, at::Tensor & grad_input) { + return at::_ops::_upsample_bilinear2d_aa_backward_grad_input::call(grad_output, output_size, input_size, align_corners, scales_h, scales_w, grad_input); +} +namespace symint { + template >> + at::Tensor & _upsample_bilinear2d_aa_backward_outf(const at::Tensor & grad_output, c10::SymIntArrayRef output_size, c10::SymIntArrayRef input_size, bool align_corners, ::std::optional scales_h, ::std::optional scales_w, at::Tensor & grad_input) { + return at::_ops::_upsample_bilinear2d_aa_backward_grad_input::call(grad_output, output_size, input_size, align_corners, scales_h, scales_w, grad_input); + } +} + +// aten::_upsample_bilinear2d_aa_backward(Tensor grad_output, SymInt[2] output_size, SymInt[4] input_size, bool align_corners, float? scales_h=None, float? 
scales_w=None) -> Tensor +inline at::Tensor _upsample_bilinear2d_aa_backward(const at::Tensor & grad_output, at::IntArrayRef output_size, at::IntArrayRef input_size, bool align_corners, ::std::optional scales_h=::std::nullopt, ::std::optional scales_w=::std::nullopt) { + return at::_ops::_upsample_bilinear2d_aa_backward::call(grad_output, c10::fromIntArrayRefSlow(output_size), c10::fromIntArrayRefSlow(input_size), align_corners, scales_h, scales_w); +} +namespace symint { + template >> + at::Tensor _upsample_bilinear2d_aa_backward(const at::Tensor & grad_output, at::IntArrayRef output_size, at::IntArrayRef input_size, bool align_corners, ::std::optional scales_h=::std::nullopt, ::std::optional scales_w=::std::nullopt) { + return at::_ops::_upsample_bilinear2d_aa_backward::call(grad_output, c10::fromIntArrayRefSlow(output_size), c10::fromIntArrayRefSlow(input_size), align_corners, scales_h, scales_w); + } +} + +// aten::_upsample_bilinear2d_aa_backward(Tensor grad_output, SymInt[2] output_size, SymInt[4] input_size, bool align_corners, float? scales_h=None, float? 
scales_w=None) -> Tensor +inline at::Tensor _upsample_bilinear2d_aa_backward_symint(const at::Tensor & grad_output, c10::SymIntArrayRef output_size, c10::SymIntArrayRef input_size, bool align_corners, ::std::optional scales_h=::std::nullopt, ::std::optional scales_w=::std::nullopt) { + return at::_ops::_upsample_bilinear2d_aa_backward::call(grad_output, output_size, input_size, align_corners, scales_h, scales_w); +} +namespace symint { + template >> + at::Tensor _upsample_bilinear2d_aa_backward(const at::Tensor & grad_output, c10::SymIntArrayRef output_size, c10::SymIntArrayRef input_size, bool align_corners, ::std::optional scales_h=::std::nullopt, ::std::optional scales_w=::std::nullopt) { + return at::_ops::_upsample_bilinear2d_aa_backward::call(grad_output, output_size, input_size, align_corners, scales_h, scales_w); + } +} + +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_upsample_bilinear2d_aa_backward_cpu_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_upsample_bilinear2d_aa_backward_cpu_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..a327a94d0712e6328cbea4551994f9343fba1ecb --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_upsample_bilinear2d_aa_backward_cpu_dispatch.h @@ -0,0 +1,33 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. 
+// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace cpu { + +TORCH_API at::Tensor _upsample_bilinear2d_aa_backward(const at::Tensor & grad_output, at::IntArrayRef output_size, at::IntArrayRef input_size, bool align_corners, ::std::optional scales_h=::std::nullopt, ::std::optional scales_w=::std::nullopt); +TORCH_API at::Tensor _upsample_bilinear2d_aa_backward_symint(const at::Tensor & grad_output, c10::SymIntArrayRef output_size, c10::SymIntArrayRef input_size, bool align_corners, ::std::optional scales_h=::std::nullopt, ::std::optional scales_w=::std::nullopt); +TORCH_API at::Tensor & _upsample_bilinear2d_aa_backward_out(at::Tensor & grad_input, const at::Tensor & grad_output, at::IntArrayRef output_size, at::IntArrayRef input_size, bool align_corners, ::std::optional scales_h=::std::nullopt, ::std::optional scales_w=::std::nullopt); +TORCH_API at::Tensor & _upsample_bilinear2d_aa_backward_outf(const at::Tensor & grad_output, at::IntArrayRef output_size, at::IntArrayRef input_size, bool align_corners, ::std::optional scales_h, ::std::optional scales_w, at::Tensor & grad_input); +TORCH_API at::Tensor & _upsample_bilinear2d_aa_backward_symint_out(at::Tensor & grad_input, const at::Tensor & grad_output, c10::SymIntArrayRef output_size, c10::SymIntArrayRef input_size, bool align_corners, ::std::optional scales_h=::std::nullopt, ::std::optional scales_w=::std::nullopt); +TORCH_API at::Tensor & _upsample_bilinear2d_aa_backward_symint_outf(const at::Tensor & grad_output, c10::SymIntArrayRef output_size, c10::SymIntArrayRef input_size, bool align_corners, ::std::optional scales_h, ::std::optional scales_w, at::Tensor & grad_input); + +} // namespace cpu +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_upsample_bilinear2d_aa_backward_native.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_upsample_bilinear2d_aa_backward_native.h new file mode 100644 index 0000000000000000000000000000000000000000..4a4b329213f9d478294f85f77691275ec93708a2 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_upsample_bilinear2d_aa_backward_native.h @@ -0,0 +1,31 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace at { +namespace native { +struct TORCH_API structured__upsample_bilinear2d_aa_backward_out_cpu : public at::meta::structured__upsample_bilinear2d_aa_backward { +void impl(const at::Tensor & grad_output, at::ArrayRef output_size, at::ArrayRef input_size, bool align_corners, ::std::optional scales_h, ::std::optional scales_w, const at::Tensor & grad_input); +}; +struct TORCH_API structured__upsample_bilinear2d_aa_backward_out_cuda : public at::meta::structured__upsample_bilinear2d_aa_backward { +void impl(const at::Tensor & grad_output, at::ArrayRef output_size, at::ArrayRef input_size, bool align_corners, ::std::optional scales_h, ::std::optional scales_w, const at::Tensor & grad_input); +}; +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_upsample_bilinear2d_aa_compositeimplicitautograd_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_upsample_bilinear2d_aa_compositeimplicitautograd_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..e66ac643f7fdc2cef5bf38856463fcf0ad6072f3 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_upsample_bilinear2d_aa_compositeimplicitautograd_dispatch.h @@ -0,0 +1,29 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace compositeimplicitautograd { + +TORCH_API at::Tensor _upsample_bilinear2d_aa(const at::Tensor & input, at::OptionalIntArrayRef output_size, bool align_corners, ::std::optional> scale_factors); +TORCH_API at::Tensor _upsample_bilinear2d_aa_symint(const at::Tensor & input, at::OptionalSymIntArrayRef output_size, bool align_corners, ::std::optional> scale_factors); + +} // namespace compositeimplicitautograd +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_upsample_nearest_exact2d_backward_cpu_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_upsample_nearest_exact2d_backward_cpu_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..44e8f550e8e78b94bba586917a7e39ac0acf1869 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_upsample_nearest_exact2d_backward_cpu_dispatch.h @@ -0,0 +1,33 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. 
+#include + +namespace at { + +namespace cpu { + +TORCH_API at::Tensor _upsample_nearest_exact2d_backward(const at::Tensor & grad_output, at::IntArrayRef output_size, at::IntArrayRef input_size, ::std::optional scales_h=::std::nullopt, ::std::optional scales_w=::std::nullopt); +TORCH_API at::Tensor _upsample_nearest_exact2d_backward_symint(const at::Tensor & grad_output, c10::SymIntArrayRef output_size, c10::SymIntArrayRef input_size, ::std::optional scales_h=::std::nullopt, ::std::optional scales_w=::std::nullopt); +TORCH_API at::Tensor & _upsample_nearest_exact2d_backward_out(at::Tensor & grad_input, const at::Tensor & grad_output, at::IntArrayRef output_size, at::IntArrayRef input_size, ::std::optional scales_h=::std::nullopt, ::std::optional scales_w=::std::nullopt); +TORCH_API at::Tensor & _upsample_nearest_exact2d_backward_outf(const at::Tensor & grad_output, at::IntArrayRef output_size, at::IntArrayRef input_size, ::std::optional scales_h, ::std::optional scales_w, at::Tensor & grad_input); +TORCH_API at::Tensor & _upsample_nearest_exact2d_backward_symint_out(at::Tensor & grad_input, const at::Tensor & grad_output, c10::SymIntArrayRef output_size, c10::SymIntArrayRef input_size, ::std::optional scales_h=::std::nullopt, ::std::optional scales_w=::std::nullopt); +TORCH_API at::Tensor & _upsample_nearest_exact2d_backward_symint_outf(const at::Tensor & grad_output, c10::SymIntArrayRef output_size, c10::SymIntArrayRef input_size, ::std::optional scales_h, ::std::optional scales_w, at::Tensor & grad_input); + +} // namespace cpu +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_upsample_nearest_exact3d_cpu_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_upsample_nearest_exact3d_cpu_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..ce1394832fda673c044b579554af20ee047b435c --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_upsample_nearest_exact3d_cpu_dispatch.h @@ -0,0 +1,33 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. 
+#include + +namespace at { + +namespace cpu { + +TORCH_API at::Tensor _upsample_nearest_exact3d(const at::Tensor & self, at::IntArrayRef output_size, ::std::optional scales_d=::std::nullopt, ::std::optional scales_h=::std::nullopt, ::std::optional scales_w=::std::nullopt); +TORCH_API at::Tensor _upsample_nearest_exact3d_symint(const at::Tensor & self, c10::SymIntArrayRef output_size, ::std::optional scales_d=::std::nullopt, ::std::optional scales_h=::std::nullopt, ::std::optional scales_w=::std::nullopt); +TORCH_API at::Tensor & _upsample_nearest_exact3d_out(at::Tensor & out, const at::Tensor & self, at::IntArrayRef output_size, ::std::optional scales_d=::std::nullopt, ::std::optional scales_h=::std::nullopt, ::std::optional scales_w=::std::nullopt); +TORCH_API at::Tensor & _upsample_nearest_exact3d_outf(const at::Tensor & self, at::IntArrayRef output_size, ::std::optional scales_d, ::std::optional scales_h, ::std::optional scales_w, at::Tensor & out); +TORCH_API at::Tensor & _upsample_nearest_exact3d_symint_out(at::Tensor & out, const at::Tensor & self, c10::SymIntArrayRef output_size, ::std::optional scales_d=::std::nullopt, ::std::optional scales_h=::std::nullopt, ::std::optional scales_w=::std::nullopt); +TORCH_API at::Tensor & _upsample_nearest_exact3d_symint_outf(const at::Tensor & self, c10::SymIntArrayRef output_size, ::std::optional scales_d, ::std::optional scales_h, ::std::optional scales_w, at::Tensor & out); + +} // namespace cpu +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_upsample_nearest_exact3d_cuda_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_upsample_nearest_exact3d_cuda_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..920b6bb15d74b7ff09cdb07a8fc53faa9c2c81eb --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_upsample_nearest_exact3d_cuda_dispatch.h @@ -0,0 +1,33 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. 
+#include + +namespace at { + +namespace cuda { + +TORCH_API at::Tensor _upsample_nearest_exact3d(const at::Tensor & self, at::IntArrayRef output_size, ::std::optional scales_d=::std::nullopt, ::std::optional scales_h=::std::nullopt, ::std::optional scales_w=::std::nullopt); +TORCH_API at::Tensor _upsample_nearest_exact3d_symint(const at::Tensor & self, c10::SymIntArrayRef output_size, ::std::optional scales_d=::std::nullopt, ::std::optional scales_h=::std::nullopt, ::std::optional scales_w=::std::nullopt); +TORCH_API at::Tensor & _upsample_nearest_exact3d_out(at::Tensor & out, const at::Tensor & self, at::IntArrayRef output_size, ::std::optional scales_d=::std::nullopt, ::std::optional scales_h=::std::nullopt, ::std::optional scales_w=::std::nullopt); +TORCH_API at::Tensor & _upsample_nearest_exact3d_outf(const at::Tensor & self, at::IntArrayRef output_size, ::std::optional scales_d, ::std::optional scales_h, ::std::optional scales_w, at::Tensor & out); +TORCH_API at::Tensor & _upsample_nearest_exact3d_symint_out(at::Tensor & out, const at::Tensor & self, c10::SymIntArrayRef output_size, ::std::optional scales_d=::std::nullopt, ::std::optional scales_h=::std::nullopt, ::std::optional scales_w=::std::nullopt); +TORCH_API at::Tensor & _upsample_nearest_exact3d_symint_outf(const at::Tensor & self, c10::SymIntArrayRef output_size, ::std::optional scales_d, ::std::optional scales_h, ::std::optional scales_w, at::Tensor & out); + +} // namespace cuda +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_validate_compressed_sparse_indices_native.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_validate_compressed_sparse_indices_native.h new file mode 100644 index 0000000000000000000000000000000000000000..8ec5be86aff469f10ceb0eb9c335f5f6513e3ca2 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_validate_compressed_sparse_indices_native.h @@ -0,0 +1,27 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace at { +namespace native { +TORCH_API void _validate_compressed_sparse_indices_cpu(bool is_crow, const at::Tensor & compressed_idx, const at::Tensor & plain_idx, int64_t cdim, int64_t dim, int64_t nnz); +TORCH_API void _validate_compressed_sparse_indices_cuda(bool is_crow, const at::Tensor & compressed_idx, const at::Tensor & plain_idx, int64_t cdim, int64_t dim, int64_t nnz); +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_validate_sparse_bsc_tensor_args_ops.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_validate_sparse_bsc_tensor_args_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..600078dc02cf1542fc314968fc5752c7029742e1 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_validate_sparse_bsc_tensor_args_ops.h @@ -0,0 +1,34 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { +namespace _ops { + + +struct TORCH_API _validate_sparse_bsc_tensor_args { + using schema = void (const at::Tensor &, const at::Tensor &, const at::Tensor &, at::IntArrayRef, ::std::optional); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::_validate_sparse_bsc_tensor_args"; + static constexpr const char* overload_name = ""; + static constexpr const char* schema_str = "_validate_sparse_bsc_tensor_args(Tensor ccol_indices, Tensor row_indices, Tensor values, int[] size, bool? 
check_pinning=None) -> ()"; + static void call(const at::Tensor & ccol_indices, const at::Tensor & row_indices, const at::Tensor & values, at::IntArrayRef size, ::std::optional check_pinning); + static void redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & ccol_indices, const at::Tensor & row_indices, const at::Tensor & values, at::IntArrayRef size, ::std::optional check_pinning); +}; + +}} // namespace at::_ops + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_validate_sparse_coo_tensor_args.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_validate_sparse_coo_tensor_args.h new file mode 100644 index 0000000000000000000000000000000000000000..0a9928ceda0e5b1db79d53c11a350dbb1e6821b6 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_validate_sparse_coo_tensor_args.h @@ -0,0 +1,36 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::_validate_sparse_coo_tensor_args(Tensor indices, Tensor values, int[] size, bool? is_coalesced=None, bool? 
check_pinning=None) -> () +inline void _validate_sparse_coo_tensor_args(const at::Tensor & indices, const at::Tensor & values, at::IntArrayRef size, ::std::optional is_coalesced=::std::nullopt, ::std::optional check_pinning=::std::nullopt) { + return at::_ops::_validate_sparse_coo_tensor_args::call(indices, values, size, is_coalesced, check_pinning); +} + +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_validate_sparse_coo_tensor_args_ops.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_validate_sparse_coo_tensor_args_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..1875fc30a94296112e6a8e80bbd458cae0314779 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_validate_sparse_coo_tensor_args_ops.h @@ -0,0 +1,34 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. 
+#include + +namespace at { +namespace _ops { + + +struct TORCH_API _validate_sparse_coo_tensor_args { + using schema = void (const at::Tensor &, const at::Tensor &, at::IntArrayRef, ::std::optional, ::std::optional); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::_validate_sparse_coo_tensor_args"; + static constexpr const char* overload_name = ""; + static constexpr const char* schema_str = "_validate_sparse_coo_tensor_args(Tensor indices, Tensor values, int[] size, bool? is_coalesced=None, bool? check_pinning=None) -> ()"; + static void call(const at::Tensor & indices, const at::Tensor & values, at::IntArrayRef size, ::std::optional is_coalesced, ::std::optional check_pinning); + static void redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & indices, const at::Tensor & values, at::IntArrayRef size, ::std::optional is_coalesced, ::std::optional check_pinning); +}; + +}} // namespace at::_ops + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_validate_sparse_csc_tensor_args_compositeimplicitautograd_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_validate_sparse_csc_tensor_args_compositeimplicitautograd_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..d901e5d18edabdf0f36107cf81e4a2ed111044e0 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_validate_sparse_csc_tensor_args_compositeimplicitautograd_dispatch.h @@ -0,0 +1,28 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace compositeimplicitautograd { + +TORCH_API void _validate_sparse_csc_tensor_args(const at::Tensor & ccol_indices, const at::Tensor & row_indices, const at::Tensor & values, at::IntArrayRef size, ::std::optional check_pinning=::std::nullopt); + +} // namespace compositeimplicitautograd +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_validate_sparse_csc_tensor_args_ops.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_validate_sparse_csc_tensor_args_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..6460fb70684aec838e2e858201916df9c3842eb9 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_validate_sparse_csc_tensor_args_ops.h @@ -0,0 +1,34 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { +namespace _ops { + + +struct TORCH_API _validate_sparse_csc_tensor_args { + using schema = void (const at::Tensor &, const at::Tensor &, const at::Tensor &, at::IntArrayRef, ::std::optional); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::_validate_sparse_csc_tensor_args"; + static constexpr const char* overload_name = ""; + static constexpr const char* schema_str = "_validate_sparse_csc_tensor_args(Tensor ccol_indices, Tensor row_indices, Tensor values, int[] size, bool? 
check_pinning=None) -> ()"; + static void call(const at::Tensor & ccol_indices, const at::Tensor & row_indices, const at::Tensor & values, at::IntArrayRef size, ::std::optional check_pinning); + static void redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & ccol_indices, const at::Tensor & row_indices, const at::Tensor & values, at::IntArrayRef size, ::std::optional check_pinning); +}; + +}} // namespace at::_ops + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_validate_sparse_csr_tensor_args.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_validate_sparse_csr_tensor_args.h new file mode 100644 index 0000000000000000000000000000000000000000..0549a303cc0d7859e43f7e8829d8f86eaeca5c6b --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_validate_sparse_csr_tensor_args.h @@ -0,0 +1,36 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::_validate_sparse_csr_tensor_args(Tensor crow_indices, Tensor col_indices, Tensor values, int[] size, bool? 
check_pinning=None) -> () +inline void _validate_sparse_csr_tensor_args(const at::Tensor & crow_indices, const at::Tensor & col_indices, const at::Tensor & values, at::IntArrayRef size, ::std::optional check_pinning=::std::nullopt) { + return at::_ops::_validate_sparse_csr_tensor_args::call(crow_indices, col_indices, values, size, check_pinning); +} + +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_values_copy_native.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_values_copy_native.h new file mode 100644 index 0000000000000000000000000000000000000000..1f87c77ef661d05dcde2cc4a74bbf73b0d3f00e6 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_values_copy_native.h @@ -0,0 +1,27 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace at { +namespace native { +TORCH_API at::Tensor & _values_copy_out(const at::Tensor & self, at::Tensor & out); +TORCH_API at::Tensor _values_copy(const at::Tensor & self); +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_weight_int4pack_mm_with_scales_and_zeros_ops.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_weight_int4pack_mm_with_scales_and_zeros_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..8bdc660103f98e3ada129b8941d7bd7321aa85e0 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_weight_int4pack_mm_with_scales_and_zeros_ops.h @@ -0,0 +1,34 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. 
+#include + +namespace at { +namespace _ops { + + +struct TORCH_API _weight_int4pack_mm_with_scales_and_zeros { + using schema = at::Tensor (const at::Tensor &, const at::Tensor &, int64_t, const at::Tensor &, const at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::_weight_int4pack_mm_with_scales_and_zeros"; + static constexpr const char* overload_name = ""; + static constexpr const char* schema_str = "_weight_int4pack_mm_with_scales_and_zeros(Tensor self, Tensor mat2, int qGroupSize, Tensor qScale, Tensor qZeros) -> Tensor"; + static at::Tensor call(const at::Tensor & self, const at::Tensor & mat2, int64_t qGroupSize, const at::Tensor & qScale, const at::Tensor & qZeros); + static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & mat2, int64_t qGroupSize, const at::Tensor & qScale, const at::Tensor & qZeros); +}; + +}} // namespace at::_ops + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_weight_norm_differentiable_backward.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_weight_norm_differentiable_backward.h new file mode 100644 index 0000000000000000000000000000000000000000..f3625fb931b4817778e6d62967faf2a2002493b3 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_weight_norm_differentiable_backward.h @@ -0,0 +1,36 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::_weight_norm_differentiable_backward(Tensor grad_w, Tensor saved_v, Tensor saved_g, Tensor saved_norms, int dim) -> (Tensor, Tensor) +inline ::std::tuple _weight_norm_differentiable_backward(const at::Tensor & grad_w, const at::Tensor & saved_v, const at::Tensor & saved_g, const at::Tensor & saved_norms, int64_t dim) { + return at::_ops::_weight_norm_differentiable_backward::call(grad_w, saved_v, saved_g, saved_norms, dim); +} + +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_weight_norm_interface.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_weight_norm_interface.h new file mode 100644 index 0000000000000000000000000000000000000000..7284e36e5f93285f62881b41772cd9b08133c752 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_weight_norm_interface.h @@ -0,0 +1,45 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::_weight_norm_interface(Tensor v, Tensor g, int dim=0) -> (Tensor, Tensor) +inline ::std::tuple _weight_norm_interface(const at::Tensor & v, const at::Tensor & g, int64_t dim=0) { + return at::_ops::_weight_norm_interface::call(v, g, dim); +} + +// aten::_weight_norm_interface.out(Tensor v, Tensor g, int dim=0, *, Tensor(a!) out0, Tensor(b!) out1) -> (Tensor(a!), Tensor(b!)) +inline ::std::tuple _weight_norm_interface_out(at::Tensor & out0, at::Tensor & out1, const at::Tensor & v, const at::Tensor & g, int64_t dim=0) { + return at::_ops::_weight_norm_interface_out::call(v, g, dim, out0, out1); +} +// aten::_weight_norm_interface.out(Tensor v, Tensor g, int dim=0, *, Tensor(a!) out0, Tensor(b!) out1) -> (Tensor(a!), Tensor(b!)) +inline ::std::tuple _weight_norm_interface_outf(const at::Tensor & v, const at::Tensor & g, int64_t dim, at::Tensor & out0, at::Tensor & out1) { + return at::_ops::_weight_norm_interface_out::call(v, g, dim, out0, out1); +} + +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_weight_norm_interface_backward_ops.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_weight_norm_interface_backward_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..2cf979880efa45b32df00caa71086f490e485d23 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_weight_norm_interface_backward_ops.h @@ -0,0 +1,45 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. 
+#include + +namespace at { +namespace _ops { + + +struct TORCH_API _weight_norm_interface_backward { + using schema = ::std::tuple (const at::Tensor &, const at::Tensor &, const at::Tensor &, const at::Tensor &, int64_t); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::_weight_norm_interface_backward"; + static constexpr const char* overload_name = ""; + static constexpr const char* schema_str = "_weight_norm_interface_backward(Tensor grad_w, Tensor saved_v, Tensor saved_g, Tensor saved_norms, int dim) -> (Tensor, Tensor)"; + static ::std::tuple call(const at::Tensor & grad_w, const at::Tensor & saved_v, const at::Tensor & saved_g, const at::Tensor & saved_norms, int64_t dim); + static ::std::tuple redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad_w, const at::Tensor & saved_v, const at::Tensor & saved_g, const at::Tensor & saved_norms, int64_t dim); +}; + +struct TORCH_API _weight_norm_interface_backward_out { + using schema = ::std::tuple (const at::Tensor &, const at::Tensor &, const at::Tensor &, const at::Tensor &, int64_t, at::Tensor &, at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::_weight_norm_interface_backward"; + static constexpr const char* overload_name = "out"; + static constexpr const char* schema_str = "_weight_norm_interface_backward.out(Tensor grad_w, Tensor saved_v, Tensor saved_g, Tensor saved_norms, int dim, *, Tensor(a!) out0, Tensor(b!) 
out1) -> (Tensor(a!), Tensor(b!))"; + static ::std::tuple call(const at::Tensor & grad_w, const at::Tensor & saved_v, const at::Tensor & saved_g, const at::Tensor & saved_norms, int64_t dim, at::Tensor & out0, at::Tensor & out1); + static ::std::tuple redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad_w, const at::Tensor & saved_v, const at::Tensor & saved_g, const at::Tensor & saved_norms, int64_t dim, at::Tensor & out0, at::Tensor & out1); +}; + +}} // namespace at::_ops + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_weight_norm_interface_compositeexplicitautograd_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_weight_norm_interface_compositeexplicitautograd_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..6041c636e03296095bb7e8b6dbfa8ac6348bf90a --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_weight_norm_interface_compositeexplicitautograd_dispatch.h @@ -0,0 +1,29 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. 
+#include + +namespace at { + +namespace compositeexplicitautograd { + +TORCH_API ::std::tuple _weight_norm_interface_out(at::Tensor & out0, at::Tensor & out1, const at::Tensor & v, const at::Tensor & g, int64_t dim=0); +TORCH_API ::std::tuple _weight_norm_interface_outf(const at::Tensor & v, const at::Tensor & g, int64_t dim, at::Tensor & out0, at::Tensor & out1); + +} // namespace compositeexplicitautograd +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/acos_compositeexplicitautogradnonfunctional_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/acos_compositeexplicitautogradnonfunctional_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..a01589c0e7c0be131debfb3ecbd02947268305b6 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/acos_compositeexplicitautogradnonfunctional_dispatch.h @@ -0,0 +1,29 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. 
+#include + +namespace at { + +namespace compositeexplicitautogradnonfunctional { + +TORCH_API at::Tensor acos(const at::Tensor & self); +TORCH_API at::Tensor & acos_(at::Tensor & self); + +} // namespace compositeexplicitautogradnonfunctional +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/acos_meta_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/acos_meta_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..e20a5ba9353b8cafe3ba9acd2953c590ce8fd49e --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/acos_meta_dispatch.h @@ -0,0 +1,31 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace meta { + +TORCH_API at::Tensor acos(const at::Tensor & self); +TORCH_API at::Tensor & acos_out(at::Tensor & out, const at::Tensor & self); +TORCH_API at::Tensor & acos_outf(const at::Tensor & self, at::Tensor & out); +TORCH_API at::Tensor & acos_(at::Tensor & self); + +} // namespace meta +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/acosh_meta_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/acosh_meta_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..3e667e60844aa36b80a8ca2dbd4ae3b97b1b4e3a --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/acosh_meta_dispatch.h @@ -0,0 +1,31 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace meta { + +TORCH_API at::Tensor acosh(const at::Tensor & self); +TORCH_API at::Tensor & acosh_out(at::Tensor & out, const at::Tensor & self); +TORCH_API at::Tensor & acosh_outf(const at::Tensor & self, at::Tensor & out); +TORCH_API at::Tensor & acosh_(at::Tensor & self); + +} // namespace meta +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/adaptive_avg_pool1d_native.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/adaptive_avg_pool1d_native.h new file mode 100644 index 0000000000000000000000000000000000000000..a04d9c7ef945d841f22d96e901d9321d26900cc5 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/adaptive_avg_pool1d_native.h @@ -0,0 +1,27 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace at { +namespace native { +TORCH_API at::Tensor adaptive_avg_pool1d(const at::Tensor & self, at::IntArrayRef output_size); +TORCH_API at::Tensor & adaptive_avg_pool1d_out(const at::Tensor & self, at::IntArrayRef output_size, at::Tensor & out); +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/adaptive_avg_pool2d.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/adaptive_avg_pool2d.h new file mode 100644 index 0000000000000000000000000000000000000000..23a04ae49197dbab6b7082a3cccf6c09b81b9170 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/adaptive_avg_pool2d.h @@ -0,0 +1,97 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::adaptive_avg_pool2d.out(Tensor self, SymInt[2] output_size, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & adaptive_avg_pool2d_out(at::Tensor & out, const at::Tensor & self, at::IntArrayRef output_size) { + return at::_ops::adaptive_avg_pool2d_out::call(self, c10::fromIntArrayRefSlow(output_size), out); +} +namespace symint { + template >> + at::Tensor & adaptive_avg_pool2d_out(at::Tensor & out, const at::Tensor & self, at::IntArrayRef output_size) { + return at::_ops::adaptive_avg_pool2d_out::call(self, c10::fromIntArrayRefSlow(output_size), out); + } +} + +// aten::adaptive_avg_pool2d.out(Tensor self, SymInt[2] output_size, *, Tensor(a!) out) -> Tensor(a!) 
+inline at::Tensor & adaptive_avg_pool2d_outf(const at::Tensor & self, at::IntArrayRef output_size, at::Tensor & out) { + return at::_ops::adaptive_avg_pool2d_out::call(self, c10::fromIntArrayRefSlow(output_size), out); +} +namespace symint { + template >> + at::Tensor & adaptive_avg_pool2d_outf(const at::Tensor & self, at::IntArrayRef output_size, at::Tensor & out) { + return at::_ops::adaptive_avg_pool2d_out::call(self, c10::fromIntArrayRefSlow(output_size), out); + } +} + +// aten::adaptive_avg_pool2d.out(Tensor self, SymInt[2] output_size, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & adaptive_avg_pool2d_symint_out(at::Tensor & out, const at::Tensor & self, c10::SymIntArrayRef output_size) { + return at::_ops::adaptive_avg_pool2d_out::call(self, output_size, out); +} +namespace symint { + template >> + at::Tensor & adaptive_avg_pool2d_out(at::Tensor & out, const at::Tensor & self, c10::SymIntArrayRef output_size) { + return at::_ops::adaptive_avg_pool2d_out::call(self, output_size, out); + } +} + +// aten::adaptive_avg_pool2d.out(Tensor self, SymInt[2] output_size, *, Tensor(a!) out) -> Tensor(a!) 
+inline at::Tensor & adaptive_avg_pool2d_symint_outf(const at::Tensor & self, c10::SymIntArrayRef output_size, at::Tensor & out) { + return at::_ops::adaptive_avg_pool2d_out::call(self, output_size, out); +} +namespace symint { + template >> + at::Tensor & adaptive_avg_pool2d_outf(const at::Tensor & self, c10::SymIntArrayRef output_size, at::Tensor & out) { + return at::_ops::adaptive_avg_pool2d_out::call(self, output_size, out); + } +} + +// aten::adaptive_avg_pool2d(Tensor self, SymInt[2] output_size) -> Tensor +inline at::Tensor adaptive_avg_pool2d(const at::Tensor & self, at::IntArrayRef output_size) { + return at::_ops::adaptive_avg_pool2d::call(self, c10::fromIntArrayRefSlow(output_size)); +} +namespace symint { + template >> + at::Tensor adaptive_avg_pool2d(const at::Tensor & self, at::IntArrayRef output_size) { + return at::_ops::adaptive_avg_pool2d::call(self, c10::fromIntArrayRefSlow(output_size)); + } +} + +// aten::adaptive_avg_pool2d(Tensor self, SymInt[2] output_size) -> Tensor +inline at::Tensor adaptive_avg_pool2d_symint(const at::Tensor & self, c10::SymIntArrayRef output_size) { + return at::_ops::adaptive_avg_pool2d::call(self, output_size); +} +namespace symint { + template >> + at::Tensor adaptive_avg_pool2d(const at::Tensor & self, c10::SymIntArrayRef output_size) { + return at::_ops::adaptive_avg_pool2d::call(self, output_size); + } +} + +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/adaptive_avg_pool2d_cpu_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/adaptive_avg_pool2d_cpu_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..b7a694fdd8830471714f56928c94455f41d6549e --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/adaptive_avg_pool2d_cpu_dispatch.h @@ -0,0 +1,31 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace cpu { + +TORCH_API at::Tensor & adaptive_avg_pool2d_out(at::Tensor & out, const at::Tensor & self, at::IntArrayRef output_size); +TORCH_API at::Tensor & adaptive_avg_pool2d_outf(const at::Tensor & self, at::IntArrayRef output_size, at::Tensor & out); +TORCH_API at::Tensor & adaptive_avg_pool2d_symint_out(at::Tensor & out, const at::Tensor & self, c10::SymIntArrayRef output_size); +TORCH_API at::Tensor & adaptive_avg_pool2d_symint_outf(const at::Tensor & self, c10::SymIntArrayRef output_size, at::Tensor & out); + +} // namespace cpu +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/adaptive_avg_pool2d_cuda_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/adaptive_avg_pool2d_cuda_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..832dbb61199f59060cc83ff2b3de62c2775c54c4 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/adaptive_avg_pool2d_cuda_dispatch.h @@ -0,0 +1,31 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace cuda { + +TORCH_API at::Tensor & adaptive_avg_pool2d_out(at::Tensor & out, const at::Tensor & self, at::IntArrayRef output_size); +TORCH_API at::Tensor & adaptive_avg_pool2d_outf(const at::Tensor & self, at::IntArrayRef output_size, at::Tensor & out); +TORCH_API at::Tensor & adaptive_avg_pool2d_symint_out(at::Tensor & out, const at::Tensor & self, c10::SymIntArrayRef output_size); +TORCH_API at::Tensor & adaptive_avg_pool2d_symint_outf(const at::Tensor & self, c10::SymIntArrayRef output_size, at::Tensor & out); + +} // namespace cuda +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/adaptive_avg_pool3d_backward.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/adaptive_avg_pool3d_backward.h new file mode 100644 index 0000000000000000000000000000000000000000..98822d0c96f16a4419610620aa64bb9726207982 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/adaptive_avg_pool3d_backward.h @@ -0,0 +1,40 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::adaptive_avg_pool3d_backward.grad_input(Tensor grad_output, Tensor self, *, Tensor(a!) grad_input) -> Tensor(a!) +inline at::Tensor & adaptive_avg_pool3d_backward_out(at::Tensor & grad_input, const at::Tensor & grad_output, const at::Tensor & self) { + return at::_ops::adaptive_avg_pool3d_backward_grad_input::call(grad_output, self, grad_input); +} +// aten::adaptive_avg_pool3d_backward.grad_input(Tensor grad_output, Tensor self, *, Tensor(a!) grad_input) -> Tensor(a!) +inline at::Tensor & adaptive_avg_pool3d_backward_outf(const at::Tensor & grad_output, const at::Tensor & self, at::Tensor & grad_input) { + return at::_ops::adaptive_avg_pool3d_backward_grad_input::call(grad_output, self, grad_input); +} + +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/adaptive_max_pool1d_ops.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/adaptive_max_pool1d_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..46d2c374bf0b738cade598885aa0ec5389972b70 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/adaptive_max_pool1d_ops.h @@ -0,0 +1,34 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { +namespace _ops { + + +struct TORCH_API adaptive_max_pool1d { + using schema = ::std::tuple (const at::Tensor &, at::IntArrayRef); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::adaptive_max_pool1d"; + static constexpr const char* overload_name = ""; + static constexpr const char* schema_str = "adaptive_max_pool1d(Tensor self, int[1] output_size) -> (Tensor, Tensor)"; + static ::std::tuple call(const at::Tensor & self, at::IntArrayRef output_size); + static ::std::tuple redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::IntArrayRef output_size); +}; + +}} // namespace at::_ops + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/adaptive_max_pool2d_backward_native.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/adaptive_max_pool2d_backward_native.h new file mode 100644 index 0000000000000000000000000000000000000000..8330cc6042604efbe8beff0676c62a4ffc91210e --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/adaptive_max_pool2d_backward_native.h @@ -0,0 +1,31 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace at { +namespace native { +struct TORCH_API structured_adaptive_max_pool2d_backward_out_cpu : public at::meta::structured_adaptive_max_pool2d_backward { +void impl(const at::Tensor & grad_output, const at::Tensor & self, const at::Tensor & indices, const at::Tensor & grad_input); +}; +struct TORCH_API structured_adaptive_max_pool2d_backward_out_cuda : public at::meta::structured_adaptive_max_pool2d_backward { +void impl(const at::Tensor & grad_output, const at::Tensor & self, const at::Tensor & indices, const at::Tensor & grad_input); +}; +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/adaptive_max_pool2d_backward_ops.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/adaptive_max_pool2d_backward_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..2082e948e489b05d505222d09c7ebd8a267d696b --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/adaptive_max_pool2d_backward_ops.h @@ -0,0 +1,45 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { +namespace _ops { + + +struct TORCH_API adaptive_max_pool2d_backward_grad_input { + using schema = at::Tensor & (const at::Tensor &, const at::Tensor &, const at::Tensor &, at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::adaptive_max_pool2d_backward"; + static constexpr const char* overload_name = "grad_input"; + static constexpr const char* schema_str = "adaptive_max_pool2d_backward.grad_input(Tensor grad_output, Tensor self, Tensor indices, *, Tensor(a!) 
grad_input) -> Tensor(a!)"; + static at::Tensor & call(const at::Tensor & grad_output, const at::Tensor & self, const at::Tensor & indices, at::Tensor & grad_input); + static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad_output, const at::Tensor & self, const at::Tensor & indices, at::Tensor & grad_input); +}; + +struct TORCH_API adaptive_max_pool2d_backward { + using schema = at::Tensor (const at::Tensor &, const at::Tensor &, const at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::adaptive_max_pool2d_backward"; + static constexpr const char* overload_name = ""; + static constexpr const char* schema_str = "adaptive_max_pool2d_backward(Tensor grad_output, Tensor self, Tensor indices) -> Tensor"; + static at::Tensor call(const at::Tensor & grad_output, const at::Tensor & self, const at::Tensor & indices); + static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad_output, const at::Tensor & self, const at::Tensor & indices); +}; + +}} // namespace at::_ops + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/adaptive_max_pool3d_backward.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/adaptive_max_pool3d_backward.h new file mode 100644 index 0000000000000000000000000000000000000000..e7da7709328608e719aa367c3044093196489553 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/adaptive_max_pool3d_backward.h @@ -0,0 +1,45 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::adaptive_max_pool3d_backward.grad_input(Tensor grad_output, Tensor self, Tensor indices, *, Tensor(a!) grad_input) -> Tensor(a!) +inline at::Tensor & adaptive_max_pool3d_backward_out(at::Tensor & grad_input, const at::Tensor & grad_output, const at::Tensor & self, const at::Tensor & indices) { + return at::_ops::adaptive_max_pool3d_backward_grad_input::call(grad_output, self, indices, grad_input); +} +// aten::adaptive_max_pool3d_backward.grad_input(Tensor grad_output, Tensor self, Tensor indices, *, Tensor(a!) grad_input) -> Tensor(a!) 
+inline at::Tensor & adaptive_max_pool3d_backward_outf(const at::Tensor & grad_output, const at::Tensor & self, const at::Tensor & indices, at::Tensor & grad_input) { + return at::_ops::adaptive_max_pool3d_backward_grad_input::call(grad_output, self, indices, grad_input); +} + +// aten::adaptive_max_pool3d_backward(Tensor grad_output, Tensor self, Tensor indices) -> Tensor +inline at::Tensor adaptive_max_pool3d_backward(const at::Tensor & grad_output, const at::Tensor & self, const at::Tensor & indices) { + return at::_ops::adaptive_max_pool3d_backward::call(grad_output, self, indices); +} + +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/adaptive_max_pool3d_backward_compositeexplicitautogradnonfunctional_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/adaptive_max_pool3d_backward_compositeexplicitautogradnonfunctional_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..5262359de7788dbca32a7d8c6f1141e017c9c490 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/adaptive_max_pool3d_backward_compositeexplicitautogradnonfunctional_dispatch.h @@ -0,0 +1,28 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. 
+// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace compositeexplicitautogradnonfunctional { + +TORCH_API at::Tensor adaptive_max_pool3d_backward(const at::Tensor & grad_output, const at::Tensor & self, const at::Tensor & indices); + +} // namespace compositeexplicitautogradnonfunctional +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/adaptive_max_pool3d_ops.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/adaptive_max_pool3d_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..b4b7238d3f172499694ef027aa6a7b61ec0dfa46 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/adaptive_max_pool3d_ops.h @@ -0,0 +1,45 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { +namespace _ops { + + +struct TORCH_API adaptive_max_pool3d_out { + using schema = ::std::tuple (const at::Tensor &, at::IntArrayRef, at::Tensor &, at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::adaptive_max_pool3d"; + static constexpr const char* overload_name = "out"; + static constexpr const char* schema_str = "adaptive_max_pool3d.out(Tensor self, int[3] output_size, *, Tensor(a!) out, Tensor(b!) 
indices) -> (Tensor(a!), Tensor(b!))"; + static ::std::tuple call(const at::Tensor & self, at::IntArrayRef output_size, at::Tensor & out, at::Tensor & indices); + static ::std::tuple redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::IntArrayRef output_size, at::Tensor & out, at::Tensor & indices); +}; + +struct TORCH_API adaptive_max_pool3d { + using schema = ::std::tuple (const at::Tensor &, at::IntArrayRef); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::adaptive_max_pool3d"; + static constexpr const char* overload_name = ""; + static constexpr const char* schema_str = "adaptive_max_pool3d(Tensor self, int[3] output_size) -> (Tensor, Tensor)"; + static ::std::tuple call(const at::Tensor & self, at::IntArrayRef output_size); + static ::std::tuple redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::IntArrayRef output_size); +}; + +}} // namespace at::_ops + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/add.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/add.h new file mode 100644 index 0000000000000000000000000000000000000000..b316ed943e1cc3519a118da0315eec0d6c225240 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/add.h @@ -0,0 +1,59 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::add.Tensor(Tensor self, Tensor other, *, Scalar alpha=1) -> Tensor +inline at::Tensor add(const at::Tensor & self, const at::Tensor & other, const at::Scalar & alpha=1) { + return at::_ops::add_Tensor::call(self, other, alpha); +} + +// aten::add.out(Tensor self, Tensor other, *, Scalar alpha=1, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & add_out(at::Tensor & out, const at::Tensor & self, const at::Tensor & other, const at::Scalar & alpha=1) { + return at::_ops::add_out::call(self, other, alpha, out); +} +// aten::add.out(Tensor self, Tensor other, *, Scalar alpha=1, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & add_outf(const at::Tensor & self, const at::Tensor & other, const at::Scalar & alpha, at::Tensor & out) { + return at::_ops::add_out::call(self, other, alpha, out); +} + +// aten::add.Scalar(Tensor self, Scalar other, Scalar alpha=1) -> Tensor +inline at::Tensor add(const at::Tensor & self, const at::Scalar & other, const at::Scalar & alpha=1) { + return at::_ops::add_Scalar::call(self, other, alpha); +} + +// aten::add.Scalar_out(Tensor self, Scalar other, Scalar alpha=1, *, Tensor(a!) out) -> Tensor(a!) 
+inline at::Tensor & add_out(at::Tensor & out, const at::Tensor & self, const at::Scalar & other, const at::Scalar & alpha=1) { + return at::_ops::add_Scalar_out::call(self, other, alpha, out); +} +// aten::add.Scalar_out(Tensor self, Scalar other, Scalar alpha=1, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & add_outf(const at::Tensor & self, const at::Scalar & other, const at::Scalar & alpha, at::Tensor & out) { + return at::_ops::add_Scalar_out::call(self, other, alpha, out); +} + +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/add_compositeexplicitautogradnonfunctional_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/add_compositeexplicitautogradnonfunctional_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..6e2b2e15c4be4b38d0bbd2314521e3c452186390 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/add_compositeexplicitautogradnonfunctional_dispatch.h @@ -0,0 +1,29 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. 
+#include + +namespace at { + +namespace compositeexplicitautogradnonfunctional { + +TORCH_API at::Tensor add(const at::Tensor & self, const at::Tensor & other, const at::Scalar & alpha=1); +TORCH_API at::Tensor & add_(at::Tensor & self, const at::Tensor & other, const at::Scalar & alpha=1); + +} // namespace compositeexplicitautogradnonfunctional +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/add_meta_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/add_meta_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..3f993e23a6cfa658a0e40fa5a98c152595bfd63e --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/add_meta_dispatch.h @@ -0,0 +1,31 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. 
+#include + +namespace at { + +namespace meta { + +TORCH_API at::Tensor add(const at::Tensor & self, const at::Tensor & other, const at::Scalar & alpha=1); +TORCH_API at::Tensor & add_out(at::Tensor & out, const at::Tensor & self, const at::Tensor & other, const at::Scalar & alpha=1); +TORCH_API at::Tensor & add_outf(const at::Tensor & self, const at::Tensor & other, const at::Scalar & alpha, at::Tensor & out); +TORCH_API at::Tensor & add_(at::Tensor & self, const at::Tensor & other, const at::Scalar & alpha=1); + +} // namespace meta +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/addbmm_cpu_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/addbmm_cpu_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..0d04c770d424c17f4814645d2a4f5074c61c8418 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/addbmm_cpu_dispatch.h @@ -0,0 +1,31 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. 
+#include + +namespace at { + +namespace cpu { + +TORCH_API at::Tensor addbmm(const at::Tensor & self, const at::Tensor & batch1, const at::Tensor & batch2, const at::Scalar & beta=1, const at::Scalar & alpha=1); +TORCH_API at::Tensor & addbmm_out(at::Tensor & out, const at::Tensor & self, const at::Tensor & batch1, const at::Tensor & batch2, const at::Scalar & beta=1, const at::Scalar & alpha=1); +TORCH_API at::Tensor & addbmm_outf(const at::Tensor & self, const at::Tensor & batch1, const at::Tensor & batch2, const at::Scalar & beta, const at::Scalar & alpha, at::Tensor & out); +TORCH_API at::Tensor & addbmm_(at::Tensor & self, const at::Tensor & batch1, const at::Tensor & batch2, const at::Scalar & beta=1, const at::Scalar & alpha=1); + +} // namespace cpu +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/addbmm_cuda_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/addbmm_cuda_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..fea941faee70905d220d36e62a113f3ffa3afe97 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/addbmm_cuda_dispatch.h @@ -0,0 +1,31 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. 
+// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace cuda { + +TORCH_API at::Tensor addbmm(const at::Tensor & self, const at::Tensor & batch1, const at::Tensor & batch2, const at::Scalar & beta=1, const at::Scalar & alpha=1); +TORCH_API at::Tensor & addbmm_out(at::Tensor & out, const at::Tensor & self, const at::Tensor & batch1, const at::Tensor & batch2, const at::Scalar & beta=1, const at::Scalar & alpha=1); +TORCH_API at::Tensor & addbmm_outf(const at::Tensor & self, const at::Tensor & batch1, const at::Tensor & batch2, const at::Scalar & beta, const at::Scalar & alpha, at::Tensor & out); +TORCH_API at::Tensor & addbmm_(at::Tensor & self, const at::Tensor & batch1, const at::Tensor & batch2, const at::Scalar & beta=1, const at::Scalar & alpha=1); + +} // namespace cuda +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/addcdiv_cuda_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/addcdiv_cuda_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..81929351f37a05f163e8857b8758a2ddbe8cb39c --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/addcdiv_cuda_dispatch.h @@ -0,0 +1,31 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. 
+// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace cuda { + +TORCH_API at::Tensor addcdiv(const at::Tensor & self, const at::Tensor & tensor1, const at::Tensor & tensor2, const at::Scalar & value=1); +TORCH_API at::Tensor & addcdiv_out(at::Tensor & out, const at::Tensor & self, const at::Tensor & tensor1, const at::Tensor & tensor2, const at::Scalar & value=1); +TORCH_API at::Tensor & addcdiv_outf(const at::Tensor & self, const at::Tensor & tensor1, const at::Tensor & tensor2, const at::Scalar & value, at::Tensor & out); +TORCH_API at::Tensor & addcdiv_(at::Tensor & self, const at::Tensor & tensor1, const at::Tensor & tensor2, const at::Scalar & value=1); + +} // namespace cuda +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/addmm_native.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/addmm_native.h new file mode 100644 index 0000000000000000000000000000000000000000..b4da70b420338b3a31dd3c7f5c0c0d9aa691ce11 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/addmm_native.h @@ -0,0 +1,42 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace at { +namespace native { +struct TORCH_API structured_addmm_out_cpu : public at::meta::structured_addmm { +void impl(const at::Tensor & self, const at::Tensor & mat1, const at::Tensor & mat2, const at::Scalar & beta, const 
at::Scalar & alpha, const at::Tensor & out); +}; +struct TORCH_API structured_addmm_out_cuda : public at::meta::structured_addmm { +void impl(const at::Tensor & self, const at::Tensor & mat1, const at::Tensor & mat2, const at::Scalar & beta, const at::Scalar & alpha, const at::Tensor & out); +}; +TORCH_API at::Tensor addmm_sparse_dense_cpu(const at::Tensor & self, const at::Tensor & mat1, const at::Tensor & mat2, const at::Scalar & beta=1, const at::Scalar & alpha=1); +TORCH_API at::Tensor & addmm_out_sparse_dense_cpu(const at::Tensor & self, const at::Tensor & mat1, const at::Tensor & mat2, const at::Scalar & beta, const at::Scalar & alpha, at::Tensor & out); +TORCH_API at::Tensor & s_addmm_sparse_dense_cpu_(at::Tensor & self, const at::Tensor & mat1, const at::Tensor & mat2, const at::Scalar & beta=1, const at::Scalar & alpha=1); +TORCH_API at::Tensor addmm_sparse_dense_cuda(const at::Tensor & self, const at::Tensor & mat1, const at::Tensor & mat2, const at::Scalar & beta=1, const at::Scalar & alpha=1); +TORCH_API at::Tensor & addmm_out_sparse_dense_cuda(const at::Tensor & self, const at::Tensor & mat1, const at::Tensor & mat2, const at::Scalar & beta, const at::Scalar & alpha, at::Tensor & out); +TORCH_API at::Tensor & s_addmm_sparse_dense_cuda_(at::Tensor & self, const at::Tensor & mat1, const at::Tensor & mat2, const at::Scalar & beta=1, const at::Scalar & alpha=1); +TORCH_API at::Tensor addmm_sparse_compressed_dense(const at::Tensor & self, const at::Tensor & mat1, const at::Tensor & mat2, const at::Scalar & beta=1, const at::Scalar & alpha=1); +TORCH_API at::Tensor & addmm_out_sparse_compressed_cpu(const at::Tensor & self, const at::Tensor & mat1, const at::Tensor & mat2, const at::Scalar & beta, const at::Scalar & alpha, at::Tensor & out); +TORCH_API at::Tensor & addmm_out_sparse_compressed_cuda(const at::Tensor & self, const at::Tensor & mat1, const at::Tensor & mat2, const at::Scalar & beta, const at::Scalar & alpha, at::Tensor & out); +TORCH_API 
at::Tensor _addmm_dtype_cuda(const at::Tensor & self, const at::Tensor & mat1, const at::Tensor & mat2, at::ScalarType out_dtype, const at::Scalar & beta=1, const at::Scalar & alpha=1); +TORCH_API at::Tensor & _addmm_dtype_out_cuda(const at::Tensor & self, const at::Tensor & mat1, const at::Tensor & mat2, at::ScalarType out_dtype, const at::Scalar & beta, const at::Scalar & alpha, at::Tensor & out); +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/addmm_ops.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/addmm_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..ed7d4ff3f299962c10ebf42e4933f539e929e97b --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/addmm_ops.h @@ -0,0 +1,78 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. 
+#include + +namespace at { +namespace _ops { + + +struct TORCH_API addmm_out { + using schema = at::Tensor & (const at::Tensor &, const at::Tensor &, const at::Tensor &, const at::Scalar &, const at::Scalar &, at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::addmm"; + static constexpr const char* overload_name = "out"; + static constexpr const char* schema_str = "addmm.out(Tensor self, Tensor mat1, Tensor mat2, *, Scalar beta=1, Scalar alpha=1, Tensor(a!) out) -> Tensor(a!)"; + static at::Tensor & call(const at::Tensor & self, const at::Tensor & mat1, const at::Tensor & mat2, const at::Scalar & beta, const at::Scalar & alpha, at::Tensor & out); + static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & mat1, const at::Tensor & mat2, const at::Scalar & beta, const at::Scalar & alpha, at::Tensor & out); +}; + +struct TORCH_API addmm { + using schema = at::Tensor (const at::Tensor &, const at::Tensor &, const at::Tensor &, const at::Scalar &, const at::Scalar &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::addmm"; + static constexpr const char* overload_name = ""; + static constexpr const char* schema_str = "addmm(Tensor self, Tensor mat1, Tensor mat2, *, Scalar beta=1, Scalar alpha=1) -> Tensor"; + static at::Tensor call(const at::Tensor & self, const at::Tensor & mat1, const at::Tensor & mat2, const at::Scalar & beta, const at::Scalar & alpha); + static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & mat1, const at::Tensor & mat2, const at::Scalar & beta, const at::Scalar & alpha); +}; + +struct TORCH_API addmm_dtype { + using schema = at::Tensor (const at::Tensor &, const at::Tensor &, const at::Tensor &, at::ScalarType, const at::Scalar &, const at::Scalar &); 
+ using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::addmm"; + static constexpr const char* overload_name = "dtype"; + static constexpr const char* schema_str = "addmm.dtype(Tensor self, Tensor mat1, Tensor mat2, ScalarType out_dtype, *, Scalar beta=1, Scalar alpha=1) -> Tensor"; + static at::Tensor call(const at::Tensor & self, const at::Tensor & mat1, const at::Tensor & mat2, at::ScalarType out_dtype, const at::Scalar & beta, const at::Scalar & alpha); + static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & mat1, const at::Tensor & mat2, at::ScalarType out_dtype, const at::Scalar & beta, const at::Scalar & alpha); +}; + +struct TORCH_API addmm_dtype_out { + using schema = at::Tensor & (const at::Tensor &, const at::Tensor &, const at::Tensor &, at::ScalarType, const at::Scalar &, const at::Scalar &, at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::addmm"; + static constexpr const char* overload_name = "dtype_out"; + static constexpr const char* schema_str = "addmm.dtype_out(Tensor self, Tensor mat1, Tensor mat2, ScalarType out_dtype, *, Scalar beta=1, Scalar alpha=1, Tensor(a!) 
out) -> Tensor(a!)"; + static at::Tensor & call(const at::Tensor & self, const at::Tensor & mat1, const at::Tensor & mat2, at::ScalarType out_dtype, const at::Scalar & beta, const at::Scalar & alpha, at::Tensor & out); + static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & mat1, const at::Tensor & mat2, at::ScalarType out_dtype, const at::Scalar & beta, const at::Scalar & alpha, at::Tensor & out); +}; + +struct TORCH_API addmm_ { + using schema = at::Tensor & (at::Tensor &, const at::Tensor &, const at::Tensor &, const at::Scalar &, const at::Scalar &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::addmm_"; + static constexpr const char* overload_name = ""; + static constexpr const char* schema_str = "addmm_(Tensor(a!) self, Tensor mat1, Tensor mat2, *, Scalar beta=1, Scalar alpha=1) -> Tensor(a!)"; + static at::Tensor & call(at::Tensor & self, const at::Tensor & mat1, const at::Tensor & mat2, const at::Scalar & beta, const at::Scalar & alpha); + static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, at::Tensor & self, const at::Tensor & mat1, const at::Tensor & mat2, const at::Scalar & beta, const at::Scalar & alpha); +}; + +}} // namespace at::_ops + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/addmv_meta.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/addmv_meta.h new file mode 100644 index 0000000000000000000000000000000000000000..8df48fed3202f123cf01cd991bd8ef64c9cf716b --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/addmv_meta.h @@ -0,0 +1,32 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeMetaFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace at { +namespace meta { + +struct TORCH_API structured_addmv : public at::impl::MetaBase { + + + void meta(const at::Tensor & self, const at::Tensor & mat, const at::Tensor & vec, const at::Scalar & beta, const at::Scalar & alpha); +}; + +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/addmv_ops.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/addmv_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..c56ab77c8c1cf46119d754864b23007477a99fec --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/addmv_ops.h @@ -0,0 +1,56 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. 
+// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { +namespace _ops { + + +struct TORCH_API addmv { + using schema = at::Tensor (const at::Tensor &, const at::Tensor &, const at::Tensor &, const at::Scalar &, const at::Scalar &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::addmv"; + static constexpr const char* overload_name = ""; + static constexpr const char* schema_str = "addmv(Tensor self, Tensor mat, Tensor vec, *, Scalar beta=1, Scalar alpha=1) -> Tensor"; + static at::Tensor call(const at::Tensor & self, const at::Tensor & mat, const at::Tensor & vec, const at::Scalar & beta, const at::Scalar & alpha); + static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & mat, const at::Tensor & vec, const at::Scalar & beta, const at::Scalar & alpha); +}; + +struct TORCH_API addmv_ { + using schema = at::Tensor & (at::Tensor &, const at::Tensor &, const at::Tensor &, const at::Scalar &, const at::Scalar &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::addmv_"; + static constexpr const char* overload_name = ""; + static constexpr const char* schema_str = "addmv_(Tensor(a!) 
self, Tensor mat, Tensor vec, *, Scalar beta=1, Scalar alpha=1) -> Tensor(a!)"; + static at::Tensor & call(at::Tensor & self, const at::Tensor & mat, const at::Tensor & vec, const at::Scalar & beta, const at::Scalar & alpha); + static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, at::Tensor & self, const at::Tensor & mat, const at::Tensor & vec, const at::Scalar & beta, const at::Scalar & alpha); +}; + +struct TORCH_API addmv_out { + using schema = at::Tensor & (const at::Tensor &, const at::Tensor &, const at::Tensor &, const at::Scalar &, const at::Scalar &, at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::addmv"; + static constexpr const char* overload_name = "out"; + static constexpr const char* schema_str = "addmv.out(Tensor self, Tensor mat, Tensor vec, *, Scalar beta=1, Scalar alpha=1, Tensor(a!) out) -> Tensor(a!)"; + static at::Tensor & call(const at::Tensor & self, const at::Tensor & mat, const at::Tensor & vec, const at::Scalar & beta, const at::Scalar & alpha, at::Tensor & out); + static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & mat, const at::Tensor & vec, const at::Scalar & beta, const at::Scalar & alpha, at::Tensor & out); +}; + +}} // namespace at::_ops + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/adjoint_native.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/adjoint_native.h new file mode 100644 index 0000000000000000000000000000000000000000..f16b41dc4d0050acd9f51771585cf21af607e771 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/adjoint_native.h @@ -0,0 +1,26 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace at { +namespace native { +TORCH_API at::Tensor adjoint(const at::Tensor & self); +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/alias.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/alias.h new file mode 100644 index 0000000000000000000000000000000000000000..a4d24b44737f76521e94c3e2e27a9af561cc2741 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/alias.h @@ -0,0 +1,36 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::alias(Tensor(a) self) -> Tensor(a) +inline at::Tensor alias(const at::Tensor & self) { + return at::_ops::alias::call(self); +} + +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/alias_compositeexplicitautograd_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/alias_compositeexplicitautograd_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..e9c495b26d15911dd26637d21b1f635857087bdc --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/alias_compositeexplicitautograd_dispatch.h @@ -0,0 +1,28 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace compositeexplicitautograd { + +TORCH_API at::Tensor alias(const at::Tensor & self); + +} // namespace compositeexplicitautograd +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/amin.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/amin.h new file mode 100644 index 0000000000000000000000000000000000000000..b9d006382c04f5349358f8939d80ebd5c81b63d1 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/amin.h @@ -0,0 +1,45 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::amin(Tensor self, int[1] dim=[], bool keepdim=False) -> Tensor +inline at::Tensor amin(const at::Tensor & self, at::IntArrayRef dim={}, bool keepdim=false) { + return at::_ops::amin::call(self, dim, keepdim); +} + +// aten::amin.out(Tensor self, int[1] dim=[], bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & amin_out(at::Tensor & out, const at::Tensor & self, at::IntArrayRef dim={}, bool keepdim=false) { + return at::_ops::amin_out::call(self, dim, keepdim, out); +} +// aten::amin.out(Tensor self, int[1] dim=[], bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & amin_outf(const at::Tensor & self, at::IntArrayRef dim, bool keepdim, at::Tensor & out) { + return at::_ops::amin_out::call(self, dim, keepdim, out); +} + +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/angle_native.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/angle_native.h new file mode 100644 index 0000000000000000000000000000000000000000..4c96c857dd55c790e42882f0a76d9a7543872311 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/angle_native.h @@ -0,0 +1,29 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace at { +namespace native { +TORCH_API at::Tensor angle(const at::Tensor & self); +TORCH_API at::Tensor & angle_out(const at::Tensor & self, at::Tensor & out); +TORCH_API at::Tensor angle_sparse_csr(const at::Tensor & self); +TORCH_API at::Tensor & angle_sparse_csr_out(const at::Tensor & self, at::Tensor & out); +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/angle_ops.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/angle_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..4209716da85f17a20a46b036558c0b6155d1d87f --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/angle_ops.h @@ -0,0 +1,45 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { +namespace _ops { + + +struct TORCH_API angle { + using schema = at::Tensor (const at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::angle"; + static constexpr const char* overload_name = ""; + static constexpr const char* schema_str = "angle(Tensor self) -> Tensor"; + static at::Tensor call(const at::Tensor & self); + static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self); +}; + +struct TORCH_API angle_out { + using schema = at::Tensor & (const at::Tensor &, at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::angle"; + static constexpr const char* overload_name = "out"; + static constexpr const char* schema_str = "angle.out(Tensor self, *, Tensor(a!) 
out) -> Tensor(a!)"; + static at::Tensor & call(const at::Tensor & self, at::Tensor & out); + static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::Tensor & out); +}; + +}} // namespace at::_ops + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/any.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/any.h new file mode 100644 index 0000000000000000000000000000000000000000..23c1927e7036765249e2fea201f8b36d0529aab4 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/any.h @@ -0,0 +1,87 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::any.dim(Tensor self, int dim, bool keepdim=False) -> Tensor +inline at::Tensor any(const at::Tensor & self, int64_t dim, bool keepdim=false) { + return at::_ops::any_dim::call(self, dim, keepdim); +} + +// aten::any.dims(Tensor self, int[]? dim=None, bool keepdim=False) -> Tensor +inline at::Tensor any(const at::Tensor & self, at::OptionalIntArrayRef dim, bool keepdim=false) { + return at::_ops::any_dims::call(self, dim, keepdim); +} + +// aten::any.out(Tensor self, int dim, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & any_out(at::Tensor & out, const at::Tensor & self, int64_t dim, bool keepdim=false) { + return at::_ops::any_out::call(self, dim, keepdim, out); +} +// aten::any.out(Tensor self, int dim, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!) 
+inline at::Tensor & any_outf(const at::Tensor & self, int64_t dim, bool keepdim, at::Tensor & out) { + return at::_ops::any_out::call(self, dim, keepdim, out); +} + +// aten::any.dims_out(Tensor self, int[]? dim=None, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & any_out(at::Tensor & out, const at::Tensor & self, at::OptionalIntArrayRef dim, bool keepdim=false) { + return at::_ops::any_dims_out::call(self, dim, keepdim, out); +} +// aten::any.dims_out(Tensor self, int[]? dim=None, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & any_outf(const at::Tensor & self, at::OptionalIntArrayRef dim, bool keepdim, at::Tensor & out) { + return at::_ops::any_dims_out::call(self, dim, keepdim, out); +} + +// aten::any.dimname(Tensor self, Dimname dim, bool keepdim=False) -> Tensor +inline at::Tensor any(const at::Tensor & self, at::Dimname dim, bool keepdim=false) { + return at::_ops::any_dimname::call(self, dim, keepdim); +} + +// aten::any.dimname_out(Tensor self, Dimname dim, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & any_out(at::Tensor & out, const at::Tensor & self, at::Dimname dim, bool keepdim=false) { + return at::_ops::any_dimname_out::call(self, dim, keepdim, out); +} +// aten::any.dimname_out(Tensor self, Dimname dim, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & any_outf(const at::Tensor & self, at::Dimname dim, bool keepdim, at::Tensor & out) { + return at::_ops::any_dimname_out::call(self, dim, keepdim, out); +} + +// aten::any(Tensor self) -> Tensor +inline at::Tensor any(const at::Tensor & self) { + return at::_ops::any::call(self); +} + +// aten::any.all_out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & any_out(at::Tensor & out, const at::Tensor & self) { + return at::_ops::any_all_out::call(self, out); +} +// aten::any.all_out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) 
+inline at::Tensor & any_outf(const at::Tensor & self, at::Tensor & out) { + return at::_ops::any_all_out::call(self, out); +} + +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/any_cuda_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/any_cuda_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..95a4a60040f70ffb84d3062a607f838d5921095c --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/any_cuda_dispatch.h @@ -0,0 +1,36 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. 
+#include + +namespace at { + +namespace cuda { + +TORCH_API at::Tensor any(const at::Tensor & self, int64_t dim, bool keepdim=false); +TORCH_API at::Tensor & any_out(at::Tensor & out, const at::Tensor & self, int64_t dim, bool keepdim=false); +TORCH_API at::Tensor & any_outf(const at::Tensor & self, int64_t dim, bool keepdim, at::Tensor & out); +TORCH_API at::Tensor any(const at::Tensor & self, at::OptionalIntArrayRef dim, bool keepdim=false); +TORCH_API at::Tensor & any_out(at::Tensor & out, const at::Tensor & self, at::OptionalIntArrayRef dim, bool keepdim=false); +TORCH_API at::Tensor & any_outf(const at::Tensor & self, at::OptionalIntArrayRef dim, bool keepdim, at::Tensor & out); +TORCH_API at::Tensor any(const at::Tensor & self); +TORCH_API at::Tensor & any_out(at::Tensor & out, const at::Tensor & self); +TORCH_API at::Tensor & any_outf(const at::Tensor & self, at::Tensor & out); + +} // namespace cuda +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/arange_cpu_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/arange_cpu_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..24a34bbbf5a85cf9ecd432fd0dccc7eb27926988 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/arange_cpu_dispatch.h @@ -0,0 +1,29 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace cpu { + +TORCH_API at::Tensor & arange_out(at::Tensor & out, const at::Scalar & start, const at::Scalar & end, const at::Scalar & step); +TORCH_API at::Tensor & arange_outf(const at::Scalar & start, const at::Scalar & end, const at::Scalar & step, at::Tensor & out); + +} // namespace cpu +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/arccosh_native.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/arccosh_native.h new file mode 100644 index 0000000000000000000000000000000000000000..067c6ab35c331b610df9e8f6671280d54b9704b5 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/arccosh_native.h @@ -0,0 +1,28 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace at { +namespace native { +TORCH_API at::Tensor arccosh(const at::Tensor & self); +TORCH_API at::Tensor & arccosh_out(const at::Tensor & self, at::Tensor & out); +TORCH_API at::Tensor & arccosh_(at::Tensor & self); +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/arcsinh_compositeimplicitautograd_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/arcsinh_compositeimplicitautograd_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..137da143773699330c35ef818857487ed9f078bf --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/arcsinh_compositeimplicitautograd_dispatch.h @@ -0,0 +1,31 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace compositeimplicitautograd { + +TORCH_API at::Tensor arcsinh(const at::Tensor & self); +TORCH_API at::Tensor & arcsinh_out(at::Tensor & out, const at::Tensor & self); +TORCH_API at::Tensor & arcsinh_outf(const at::Tensor & self, at::Tensor & out); +TORCH_API at::Tensor & arcsinh_(at::Tensor & self); + +} // namespace compositeimplicitautograd +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/as_strided_compositeexplicitautogradnonfunctional_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/as_strided_compositeexplicitautogradnonfunctional_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..b19138420ea8f4b1b7078d946a218fe700acc674 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/as_strided_compositeexplicitautogradnonfunctional_dispatch.h @@ -0,0 +1,29 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace compositeexplicitautogradnonfunctional { + +TORCH_API const at::Tensor & as_strided_(const at::Tensor & self, at::IntArrayRef size, at::IntArrayRef stride, ::std::optional storage_offset=::std::nullopt); +TORCH_API const at::Tensor & as_strided__symint(const at::Tensor & self, c10::SymIntArrayRef size, c10::SymIntArrayRef stride, ::std::optional storage_offset=::std::nullopt); + +} // namespace compositeexplicitautogradnonfunctional +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/as_strided_copy.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/as_strided_copy.h new file mode 100644 index 0000000000000000000000000000000000000000..0f43a394c8b55a4fd0cf577090e69510fe9a33f1 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/as_strided_copy.h @@ -0,0 +1,97 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::as_strided_copy(Tensor self, SymInt[] size, SymInt[] stride, SymInt? storage_offset=None) -> Tensor +inline at::Tensor as_strided_copy(const at::Tensor & self, at::IntArrayRef size, at::IntArrayRef stride, ::std::optional storage_offset=::std::nullopt) { + return at::_ops::as_strided_copy::call(self, c10::fromIntArrayRefSlow(size), c10::fromIntArrayRefSlow(stride), storage_offset.has_value() ? ::std::make_optional(c10::SymInt(*storage_offset)) : ::std::nullopt); +} +namespace symint { + template >> + at::Tensor as_strided_copy(const at::Tensor & self, at::IntArrayRef size, at::IntArrayRef stride, ::std::optional storage_offset=::std::nullopt) { + return at::_ops::as_strided_copy::call(self, c10::fromIntArrayRefSlow(size), c10::fromIntArrayRefSlow(stride), storage_offset.has_value() ? ::std::make_optional(c10::SymInt(*storage_offset)) : ::std::nullopt); + } +} + +// aten::as_strided_copy(Tensor self, SymInt[] size, SymInt[] stride, SymInt? 
storage_offset=None) -> Tensor +inline at::Tensor as_strided_copy_symint(const at::Tensor & self, c10::SymIntArrayRef size, c10::SymIntArrayRef stride, ::std::optional storage_offset=::std::nullopt) { + return at::_ops::as_strided_copy::call(self, size, stride, storage_offset); +} +namespace symint { + template >> + at::Tensor as_strided_copy(const at::Tensor & self, c10::SymIntArrayRef size, c10::SymIntArrayRef stride, ::std::optional storage_offset=::std::nullopt) { + return at::_ops::as_strided_copy::call(self, size, stride, storage_offset); + } +} + +// aten::as_strided_copy.out(Tensor self, SymInt[] size, SymInt[] stride, SymInt? storage_offset=None, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & as_strided_copy_out(at::Tensor & out, const at::Tensor & self, at::IntArrayRef size, at::IntArrayRef stride, ::std::optional storage_offset=::std::nullopt) { + return at::_ops::as_strided_copy_out::call(self, c10::fromIntArrayRefSlow(size), c10::fromIntArrayRefSlow(stride), storage_offset.has_value() ? ::std::make_optional(c10::SymInt(*storage_offset)) : ::std::nullopt, out); +} +namespace symint { + template >> + at::Tensor & as_strided_copy_out(at::Tensor & out, const at::Tensor & self, at::IntArrayRef size, at::IntArrayRef stride, ::std::optional storage_offset=::std::nullopt) { + return at::_ops::as_strided_copy_out::call(self, c10::fromIntArrayRefSlow(size), c10::fromIntArrayRefSlow(stride), storage_offset.has_value() ? ::std::make_optional(c10::SymInt(*storage_offset)) : ::std::nullopt, out); + } +} + +// aten::as_strided_copy.out(Tensor self, SymInt[] size, SymInt[] stride, SymInt? storage_offset=None, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & as_strided_copy_outf(const at::Tensor & self, at::IntArrayRef size, at::IntArrayRef stride, ::std::optional storage_offset, at::Tensor & out) { + return at::_ops::as_strided_copy_out::call(self, c10::fromIntArrayRefSlow(size), c10::fromIntArrayRefSlow(stride), storage_offset.has_value() ? 
::std::make_optional(c10::SymInt(*storage_offset)) : ::std::nullopt, out); +} +namespace symint { + template >> + at::Tensor & as_strided_copy_outf(const at::Tensor & self, at::IntArrayRef size, at::IntArrayRef stride, ::std::optional storage_offset, at::Tensor & out) { + return at::_ops::as_strided_copy_out::call(self, c10::fromIntArrayRefSlow(size), c10::fromIntArrayRefSlow(stride), storage_offset.has_value() ? ::std::make_optional(c10::SymInt(*storage_offset)) : ::std::nullopt, out); + } +} + +// aten::as_strided_copy.out(Tensor self, SymInt[] size, SymInt[] stride, SymInt? storage_offset=None, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & as_strided_copy_symint_out(at::Tensor & out, const at::Tensor & self, c10::SymIntArrayRef size, c10::SymIntArrayRef stride, ::std::optional storage_offset=::std::nullopt) { + return at::_ops::as_strided_copy_out::call(self, size, stride, storage_offset, out); +} +namespace symint { + template >> + at::Tensor & as_strided_copy_out(at::Tensor & out, const at::Tensor & self, c10::SymIntArrayRef size, c10::SymIntArrayRef stride, ::std::optional storage_offset=::std::nullopt) { + return at::_ops::as_strided_copy_out::call(self, size, stride, storage_offset, out); + } +} + +// aten::as_strided_copy.out(Tensor self, SymInt[] size, SymInt[] stride, SymInt? storage_offset=None, *, Tensor(a!) out) -> Tensor(a!) 
+inline at::Tensor & as_strided_copy_symint_outf(const at::Tensor & self, c10::SymIntArrayRef size, c10::SymIntArrayRef stride, ::std::optional storage_offset, at::Tensor & out) { + return at::_ops::as_strided_copy_out::call(self, size, stride, storage_offset, out); +} +namespace symint { + template >> + at::Tensor & as_strided_copy_outf(const at::Tensor & self, c10::SymIntArrayRef size, c10::SymIntArrayRef stride, ::std::optional storage_offset, at::Tensor & out) { + return at::_ops::as_strided_copy_out::call(self, size, stride, storage_offset, out); + } +} + +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/as_strided_copy_compositeexplicitautogradnonfunctional_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/as_strided_copy_compositeexplicitautogradnonfunctional_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..2d2076519cf14c0107a7357ef4e5b03ff2f04032 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/as_strided_copy_compositeexplicitautogradnonfunctional_dispatch.h @@ -0,0 +1,29 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. 
+#include + +namespace at { + +namespace compositeexplicitautogradnonfunctional { + +TORCH_API at::Tensor as_strided_copy(const at::Tensor & self, at::IntArrayRef size, at::IntArrayRef stride, ::std::optional storage_offset=::std::nullopt); +TORCH_API at::Tensor as_strided_copy_symint(const at::Tensor & self, c10::SymIntArrayRef size, c10::SymIntArrayRef stride, ::std::optional storage_offset=::std::nullopt); + +} // namespace compositeexplicitautogradnonfunctional +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/as_strided_cpu_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/as_strided_cpu_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..385548cee575a07cecbca49545a9852166ec39c6 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/as_strided_cpu_dispatch.h @@ -0,0 +1,29 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. 
+#include + +namespace at { + +namespace cpu { + +TORCH_API at::Tensor as_strided(const at::Tensor & self, at::IntArrayRef size, at::IntArrayRef stride, ::std::optional storage_offset=::std::nullopt); +TORCH_API at::Tensor as_strided_symint(const at::Tensor & self, c10::SymIntArrayRef size, c10::SymIntArrayRef stride, ::std::optional storage_offset=::std::nullopt); + +} // namespace cpu +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/as_strided_scatter_compositeexplicitautogradnonfunctional_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/as_strided_scatter_compositeexplicitautogradnonfunctional_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..6cb5a721773e522b779c423db8b803e16c954472 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/as_strided_scatter_compositeexplicitautogradnonfunctional_dispatch.h @@ -0,0 +1,29 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. 
+#include + +namespace at { + +namespace compositeexplicitautogradnonfunctional { + +TORCH_API at::Tensor as_strided_scatter(const at::Tensor & self, const at::Tensor & src, at::IntArrayRef size, at::IntArrayRef stride, ::std::optional storage_offset=::std::nullopt); +TORCH_API at::Tensor as_strided_scatter_symint(const at::Tensor & self, const at::Tensor & src, c10::SymIntArrayRef size, c10::SymIntArrayRef stride, ::std::optional storage_offset=::std::nullopt); + +} // namespace compositeexplicitautogradnonfunctional +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/asinh_native.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/asinh_native.h new file mode 100644 index 0000000000000000000000000000000000000000..89d10c4042914f63caa58725feb42d7888fa5c44 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/asinh_native.h @@ -0,0 +1,34 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace at { +namespace native { +struct TORCH_API structured_asinh_out : public at::meta::structured_asinh { +void impl(const at::Tensor & self, const at::Tensor & out); +}; +TORCH_API at::Tensor asinh_sparse(const at::Tensor & self); +TORCH_API at::Tensor & asinh_sparse_out(const at::Tensor & self, at::Tensor & out); +TORCH_API at::Tensor & asinh_sparse_(at::Tensor & self); +TORCH_API at::Tensor asinh_sparse_csr(const at::Tensor & self); +TORCH_API at::Tensor & asinh_sparse_csr_out(const at::Tensor & self, at::Tensor & out); +TORCH_API at::Tensor & 
asinh_sparse_csr_(at::Tensor & self); +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/atan.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/atan.h new file mode 100644 index 0000000000000000000000000000000000000000..bfae5e8938a1835ef501168a380862b905f47eb7 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/atan.h @@ -0,0 +1,50 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::atan(Tensor self) -> Tensor +inline at::Tensor atan(const at::Tensor & self) { + return at::_ops::atan::call(self); +} + +// aten::atan_(Tensor(a!) self) -> Tensor(a!) +inline at::Tensor & atan_(at::Tensor & self) { + return at::_ops::atan_::call(self); +} + +// aten::atan.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & atan_out(at::Tensor & out, const at::Tensor & self) { + return at::_ops::atan_out::call(self, out); +} +// aten::atan.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & atan_outf(const at::Tensor & self, at::Tensor & out) { + return at::_ops::atan_out::call(self, out); +} + +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/atan2_meta.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/atan2_meta.h new file mode 100644 index 0000000000000000000000000000000000000000..feb3c44aeb4aa2b171bd9cff18f1290f72285949 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/atan2_meta.h @@ -0,0 +1,32 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeMetaFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace at { +namespace meta { + +struct TORCH_API structured_atan2 : public TensorIteratorBase { + + + void meta(const at::Tensor & self, const at::Tensor & other); +}; + +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/atan_ops.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/atan_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..c993cad67f9d1b95fa141f1ad60b181370c12c1e --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/atan_ops.h @@ -0,0 +1,56 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. 
+// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { +namespace _ops { + + +struct TORCH_API atan { + using schema = at::Tensor (const at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::atan"; + static constexpr const char* overload_name = ""; + static constexpr const char* schema_str = "atan(Tensor self) -> Tensor"; + static at::Tensor call(const at::Tensor & self); + static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self); +}; + +struct TORCH_API atan_ { + using schema = at::Tensor & (at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::atan_"; + static constexpr const char* overload_name = ""; + static constexpr const char* schema_str = "atan_(Tensor(a!) self) -> Tensor(a!)"; + static at::Tensor & call(at::Tensor & self); + static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, at::Tensor & self); +}; + +struct TORCH_API atan_out { + using schema = at::Tensor & (const at::Tensor &, at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::atan"; + static constexpr const char* overload_name = "out"; + static constexpr const char* schema_str = "atan.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)"; + static at::Tensor & call(const at::Tensor & self, at::Tensor & out); + static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::Tensor & out); +}; + +}} // namespace at::_ops + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/atleast_1d_native.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/atleast_1d_native.h new file mode 100644 index 0000000000000000000000000000000000000000..0057ff3f832a4da2ab6f7084bbac97f856956f2a --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/atleast_1d_native.h @@ -0,0 +1,27 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace at { +namespace native { +TORCH_API at::Tensor atleast_1d(const at::Tensor & self); +TORCH_API ::std::vector atleast_1d(at::TensorList tensors); +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/atleast_2d_native.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/atleast_2d_native.h new file mode 100644 index 0000000000000000000000000000000000000000..f17f2bf37d843a90d135025871fc6b05ac68804c --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/atleast_2d_native.h @@ -0,0 +1,27 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace at { +namespace native { +TORCH_API at::Tensor atleast_2d(const at::Tensor & self); +TORCH_API ::std::vector atleast_2d(at::TensorList tensors); +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/atleast_3d.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/atleast_3d.h new file mode 100644 index 0000000000000000000000000000000000000000..13cbb7e625b141c6caf26072f55ed967359e6c4a --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/atleast_3d.h @@ -0,0 +1,41 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::atleast_3d(Tensor self) -> Tensor +inline at::Tensor atleast_3d(const at::Tensor & self) { + return at::_ops::atleast_3d::call(self); +} + +// aten::atleast_3d.Sequence(Tensor[] tensors) -> Tensor[] +inline ::std::vector atleast_3d(at::TensorList tensors) { + return at::_ops::atleast_3d_Sequence::call(tensors); +} + +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/avg_pool2d.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/avg_pool2d.h new file mode 100644 index 0000000000000000000000000000000000000000..f7671ca6bd65381c8d6e4d42988a36bedd446189 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/avg_pool2d.h @@ -0,0 +1,45 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::avg_pool2d.out(Tensor self, int[2] kernel_size, int[2] stride=[], int[2] padding=0, bool ceil_mode=False, bool count_include_pad=True, int? divisor_override=None, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & avg_pool2d_out(at::Tensor & out, const at::Tensor & self, at::IntArrayRef kernel_size, at::IntArrayRef stride={}, at::IntArrayRef padding=0, bool ceil_mode=false, bool count_include_pad=true, ::std::optional divisor_override=::std::nullopt) { + return at::_ops::avg_pool2d_out::call(self, kernel_size, stride, padding, ceil_mode, count_include_pad, divisor_override, out); +} +// aten::avg_pool2d.out(Tensor self, int[2] kernel_size, int[2] stride=[], int[2] padding=0, bool ceil_mode=False, bool count_include_pad=True, int? divisor_override=None, *, Tensor(a!) out) -> Tensor(a!) 
+inline at::Tensor & avg_pool2d_outf(const at::Tensor & self, at::IntArrayRef kernel_size, at::IntArrayRef stride, at::IntArrayRef padding, bool ceil_mode, bool count_include_pad, ::std::optional divisor_override, at::Tensor & out) { + return at::_ops::avg_pool2d_out::call(self, kernel_size, stride, padding, ceil_mode, count_include_pad, divisor_override, out); +} + +// aten::avg_pool2d(Tensor self, int[2] kernel_size, int[2] stride=[], int[2] padding=0, bool ceil_mode=False, bool count_include_pad=True, int? divisor_override=None) -> Tensor +inline at::Tensor avg_pool2d(const at::Tensor & self, at::IntArrayRef kernel_size, at::IntArrayRef stride={}, at::IntArrayRef padding=0, bool ceil_mode=false, bool count_include_pad=true, ::std::optional divisor_override=::std::nullopt) { + return at::_ops::avg_pool2d::call(self, kernel_size, stride, padding, ceil_mode, count_include_pad, divisor_override); +} + +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/avg_pool2d_cuda_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/avg_pool2d_cuda_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..f29db18d0f31e0a14aa7503f090d29ea14446817 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/avg_pool2d_cuda_dispatch.h @@ -0,0 +1,30 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. 
+// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace cuda { + +TORCH_API at::Tensor avg_pool2d(const at::Tensor & self, at::IntArrayRef kernel_size, at::IntArrayRef stride={}, at::IntArrayRef padding=0, bool ceil_mode=false, bool count_include_pad=true, ::std::optional divisor_override=::std::nullopt); +TORCH_API at::Tensor & avg_pool2d_out(at::Tensor & out, const at::Tensor & self, at::IntArrayRef kernel_size, at::IntArrayRef stride={}, at::IntArrayRef padding=0, bool ceil_mode=false, bool count_include_pad=true, ::std::optional divisor_override=::std::nullopt); +TORCH_API at::Tensor & avg_pool2d_outf(const at::Tensor & self, at::IntArrayRef kernel_size, at::IntArrayRef stride, at::IntArrayRef padding, bool ceil_mode, bool count_include_pad, ::std::optional divisor_override, at::Tensor & out); + +} // namespace cuda +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/avg_pool3d_backward_compositeexplicitautogradnonfunctional_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/avg_pool3d_backward_compositeexplicitautogradnonfunctional_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..20e4fc4119d0df0081ec8958b2a59ce18dd2a715 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/avg_pool3d_backward_compositeexplicitautogradnonfunctional_dispatch.h @@ -0,0 +1,28 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace compositeexplicitautogradnonfunctional { + +TORCH_API at::Tensor avg_pool3d_backward(const at::Tensor & grad_output, const at::Tensor & self, at::IntArrayRef kernel_size, at::IntArrayRef stride, at::IntArrayRef padding, bool ceil_mode, bool count_include_pad, ::std::optional divisor_override); + +} // namespace compositeexplicitautogradnonfunctional +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/avg_pool3d_cpu_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/avg_pool3d_cpu_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..335df8354423966ceea626bfdfdadc7c786ad641 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/avg_pool3d_cpu_dispatch.h @@ -0,0 +1,30 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. 
+#include + +namespace at { + +namespace cpu { + +TORCH_API at::Tensor avg_pool3d(const at::Tensor & self, at::IntArrayRef kernel_size, at::IntArrayRef stride={}, at::IntArrayRef padding=0, bool ceil_mode=false, bool count_include_pad=true, ::std::optional divisor_override=::std::nullopt); +TORCH_API at::Tensor & avg_pool3d_out(at::Tensor & out, const at::Tensor & self, at::IntArrayRef kernel_size, at::IntArrayRef stride={}, at::IntArrayRef padding=0, bool ceil_mode=false, bool count_include_pad=true, ::std::optional divisor_override=::std::nullopt); +TORCH_API at::Tensor & avg_pool3d_outf(const at::Tensor & self, at::IntArrayRef kernel_size, at::IntArrayRef stride, at::IntArrayRef padding, bool ceil_mode, bool count_include_pad, ::std::optional divisor_override, at::Tensor & out); + +} // namespace cpu +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/avg_pool3d_cuda_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/avg_pool3d_cuda_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..3802fa1a0adda9a5e452dd11c46d5bbe49b4df53 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/avg_pool3d_cuda_dispatch.h @@ -0,0 +1,30 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. 
+// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace cuda { + +TORCH_API at::Tensor avg_pool3d(const at::Tensor & self, at::IntArrayRef kernel_size, at::IntArrayRef stride={}, at::IntArrayRef padding=0, bool ceil_mode=false, bool count_include_pad=true, ::std::optional divisor_override=::std::nullopt); +TORCH_API at::Tensor & avg_pool3d_out(at::Tensor & out, const at::Tensor & self, at::IntArrayRef kernel_size, at::IntArrayRef stride={}, at::IntArrayRef padding=0, bool ceil_mode=false, bool count_include_pad=true, ::std::optional divisor_override=::std::nullopt); +TORCH_API at::Tensor & avg_pool3d_outf(const at::Tensor & self, at::IntArrayRef kernel_size, at::IntArrayRef stride, at::IntArrayRef padding, bool ceil_mode, bool count_include_pad, ::std::optional divisor_override, at::Tensor & out); + +} // namespace cuda +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/baddbmm_meta.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/baddbmm_meta.h new file mode 100644 index 0000000000000000000000000000000000000000..552b076438f45350ae16e38f0294c6aab07823bf --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/baddbmm_meta.h @@ -0,0 +1,32 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeMetaFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace at { +namespace meta { + +struct TORCH_API structured_baddbmm : public at::impl::MetaBase { + + + void meta(const at::Tensor & self, const at::Tensor & batch1, const at::Tensor & batch2, const at::Scalar & beta, const at::Scalar & alpha); +}; + +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/baddbmm_ops.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/baddbmm_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..18d0abe09a470d7f6d2ad4e00535e274a5a8d750 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/baddbmm_ops.h @@ -0,0 +1,78 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. 
+// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { +namespace _ops { + + +struct TORCH_API baddbmm { + using schema = at::Tensor (const at::Tensor &, const at::Tensor &, const at::Tensor &, const at::Scalar &, const at::Scalar &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::baddbmm"; + static constexpr const char* overload_name = ""; + static constexpr const char* schema_str = "baddbmm(Tensor self, Tensor batch1, Tensor batch2, *, Scalar beta=1, Scalar alpha=1) -> Tensor"; + static at::Tensor call(const at::Tensor & self, const at::Tensor & batch1, const at::Tensor & batch2, const at::Scalar & beta, const at::Scalar & alpha); + static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & batch1, const at::Tensor & batch2, const at::Scalar & beta, const at::Scalar & alpha); +}; + +struct TORCH_API baddbmm_ { + using schema = at::Tensor & (at::Tensor &, const at::Tensor &, const at::Tensor &, const at::Scalar &, const at::Scalar &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::baddbmm_"; + static constexpr const char* overload_name = ""; + static constexpr const char* schema_str = "baddbmm_(Tensor(a!) 
self, Tensor batch1, Tensor batch2, *, Scalar beta=1, Scalar alpha=1) -> Tensor(a!)"; + static at::Tensor & call(at::Tensor & self, const at::Tensor & batch1, const at::Tensor & batch2, const at::Scalar & beta, const at::Scalar & alpha); + static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, at::Tensor & self, const at::Tensor & batch1, const at::Tensor & batch2, const at::Scalar & beta, const at::Scalar & alpha); +}; + +struct TORCH_API baddbmm_out { + using schema = at::Tensor & (const at::Tensor &, const at::Tensor &, const at::Tensor &, const at::Scalar &, const at::Scalar &, at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::baddbmm"; + static constexpr const char* overload_name = "out"; + static constexpr const char* schema_str = "baddbmm.out(Tensor self, Tensor batch1, Tensor batch2, *, Scalar beta=1, Scalar alpha=1, Tensor(a!) out) -> Tensor(a!)"; + static at::Tensor & call(const at::Tensor & self, const at::Tensor & batch1, const at::Tensor & batch2, const at::Scalar & beta, const at::Scalar & alpha, at::Tensor & out); + static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & batch1, const at::Tensor & batch2, const at::Scalar & beta, const at::Scalar & alpha, at::Tensor & out); +}; + +struct TORCH_API baddbmm_dtype { + using schema = at::Tensor (const at::Tensor &, const at::Tensor &, const at::Tensor &, at::ScalarType, const at::Scalar &, const at::Scalar &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::baddbmm"; + static constexpr const char* overload_name = "dtype"; + static constexpr const char* schema_str = "baddbmm.dtype(Tensor self, Tensor batch1, Tensor batch2, ScalarType out_dtype, *, Scalar beta=1, Scalar alpha=1) -> Tensor"; + static at::Tensor call(const at::Tensor & self, const 
at::Tensor & batch1, const at::Tensor & batch2, at::ScalarType out_dtype, const at::Scalar & beta, const at::Scalar & alpha); + static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & batch1, const at::Tensor & batch2, at::ScalarType out_dtype, const at::Scalar & beta, const at::Scalar & alpha); +}; + +struct TORCH_API baddbmm_dtype_out { + using schema = at::Tensor & (const at::Tensor &, const at::Tensor &, const at::Tensor &, at::ScalarType, const at::Scalar &, const at::Scalar &, at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::baddbmm"; + static constexpr const char* overload_name = "dtype_out"; + static constexpr const char* schema_str = "baddbmm.dtype_out(Tensor self, Tensor batch1, Tensor batch2, ScalarType out_dtype, *, Scalar beta=1, Scalar alpha=1, Tensor(a!) out) -> Tensor(a!)"; + static at::Tensor & call(const at::Tensor & self, const at::Tensor & batch1, const at::Tensor & batch2, at::ScalarType out_dtype, const at::Scalar & beta, const at::Scalar & alpha, at::Tensor & out); + static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & batch1, const at::Tensor & batch2, at::ScalarType out_dtype, const at::Scalar & beta, const at::Scalar & alpha, at::Tensor & out); +}; + +}} // namespace at::_ops + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/bartlett_window_native.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/bartlett_window_native.h new file mode 100644 index 0000000000000000000000000000000000000000..5a5ebac4c52f2f6389deb200d88bfbca27c5d5d8 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/bartlett_window_native.h @@ -0,0 +1,29 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace at { +namespace native { +TORCH_API at::Tensor bartlett_window(int64_t window_length, ::std::optional dtype={}, ::std::optional layout={}, ::std::optional device={}, ::std::optional pin_memory={}); +TORCH_API at::Tensor & bartlett_window_out(int64_t window_length, at::Tensor & out); +TORCH_API at::Tensor bartlett_window(int64_t window_length, bool periodic, ::std::optional dtype={}, ::std::optional layout={}, ::std::optional device={}, ::std::optional pin_memory={}); +TORCH_API at::Tensor & bartlett_window_periodic_out(int64_t window_length, bool periodic, at::Tensor & out); +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/batch_norm_backward.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/batch_norm_backward.h new file mode 100644 index 0000000000000000000000000000000000000000..c200005888d5365049100870fbdc17aa0acd2490 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/batch_norm_backward.h @@ -0,0 +1,36 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::batch_norm_backward(Tensor grad_out, Tensor input, Tensor weight, Tensor? running_mean, Tensor? running_var, Tensor? save_mean, Tensor? save_var, bool update, float eps, bool[3] output_mask, Tensor reserve) -> (Tensor, Tensor, Tensor) +inline ::std::tuple batch_norm_backward(const at::Tensor & grad_out, const at::Tensor & input, const at::Tensor & weight, const ::std::optional & running_mean, const ::std::optional & running_var, const ::std::optional & save_mean, const ::std::optional & save_var, bool update, double eps, ::std::array output_mask, const at::Tensor & reserve) { + return at::_ops::batch_norm_backward::call(grad_out, input, weight, running_mean, running_var, save_mean, save_var, update, eps, output_mask, reserve); +} + +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/batch_norm_elemt_ops.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/batch_norm_elemt_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..56eb03e2b603ae3a57c2685c95021847ea3446d6 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/batch_norm_elemt_ops.h @@ -0,0 +1,45 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { +namespace _ops { + + +struct TORCH_API batch_norm_elemt { + using schema = at::Tensor (const at::Tensor &, const ::std::optional &, const ::std::optional &, const at::Tensor &, const at::Tensor &, double); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::batch_norm_elemt"; + static constexpr const char* overload_name = ""; + static constexpr const char* schema_str = "batch_norm_elemt(Tensor input, Tensor? weight, Tensor? 
bias, Tensor mean, Tensor invstd, float eps) -> Tensor"; + static at::Tensor call(const at::Tensor & input, const ::std::optional & weight, const ::std::optional & bias, const at::Tensor & mean, const at::Tensor & invstd, double eps); + static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & input, const ::std::optional & weight, const ::std::optional & bias, const at::Tensor & mean, const at::Tensor & invstd, double eps); +}; + +struct TORCH_API batch_norm_elemt_out { + using schema = at::Tensor & (const at::Tensor &, const ::std::optional &, const ::std::optional &, const at::Tensor &, const at::Tensor &, double, at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::batch_norm_elemt"; + static constexpr const char* overload_name = "out"; + static constexpr const char* schema_str = "batch_norm_elemt.out(Tensor input, Tensor? weight, Tensor? bias, Tensor mean, Tensor invstd, float eps, *, Tensor(a!) out) -> Tensor(a!)"; + static at::Tensor & call(const at::Tensor & input, const ::std::optional & weight, const ::std::optional & bias, const at::Tensor & mean, const at::Tensor & invstd, double eps, at::Tensor & out); + static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & input, const ::std::optional & weight, const ::std::optional & bias, const at::Tensor & mean, const at::Tensor & invstd, double eps, at::Tensor & out); +}; + +}} // namespace at::_ops + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/batch_norm_stats_cuda_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/batch_norm_stats_cuda_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..5a3907b8cdddc185a054a5484dffc6343cd82053 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/batch_norm_stats_cuda_dispatch.h @@ -0,0 +1,28 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace cuda { + +TORCH_API ::std::tuple batch_norm_stats(const at::Tensor & input, double eps); + +} // namespace cuda +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/batch_norm_update_stats_compositeexplicitautograd_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/batch_norm_update_stats_compositeexplicitautograd_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..be0c358b5875dbf6defce627c15fe8acd2881d4b --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/batch_norm_update_stats_compositeexplicitautograd_dispatch.h @@ -0,0 +1,29 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace compositeexplicitautograd { + +TORCH_API ::std::tuple batch_norm_update_stats_out(at::Tensor & out0, at::Tensor & out1, const at::Tensor & input, const ::std::optional & running_mean, const ::std::optional & running_var, double momentum); +TORCH_API ::std::tuple batch_norm_update_stats_outf(const at::Tensor & input, const ::std::optional & running_mean, const ::std::optional & running_var, double momentum, at::Tensor & out0, at::Tensor & out1); + +} // namespace compositeexplicitautograd +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/bilinear_ops.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/bilinear_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..74ae89c9c069d95f8eaa4ca40900738aeb446896 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/bilinear_ops.h @@ -0,0 +1,34 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { +namespace _ops { + + +struct TORCH_API bilinear { + using schema = at::Tensor (const at::Tensor &, const at::Tensor &, const at::Tensor &, const ::std::optional &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::bilinear"; + static constexpr const char* overload_name = ""; + static constexpr const char* schema_str = "bilinear(Tensor input1, Tensor input2, Tensor weight, Tensor? bias=None) -> Tensor"; + static at::Tensor call(const at::Tensor & input1, const at::Tensor & input2, const at::Tensor & weight, const ::std::optional & bias); + static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & input1, const at::Tensor & input2, const at::Tensor & weight, const ::std::optional & bias); +}; + +}} // namespace at::_ops + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/binary_cross_entropy_backward.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/binary_cross_entropy_backward.h new file mode 100644 index 0000000000000000000000000000000000000000..def15556239972f5e1613226ba291b8c30d17907 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/binary_cross_entropy_backward.h @@ -0,0 +1,45 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::binary_cross_entropy_backward(Tensor grad_output, Tensor self, Tensor target, Tensor? weight=None, int reduction=Mean) -> Tensor +inline at::Tensor binary_cross_entropy_backward(const at::Tensor & grad_output, const at::Tensor & self, const at::Tensor & target, const ::std::optional & weight={}, int64_t reduction=at::Reduction::Mean) { + return at::_ops::binary_cross_entropy_backward::call(grad_output, self, target, weight, reduction); +} + +// aten::binary_cross_entropy_backward.grad_input(Tensor grad_output, Tensor self, Tensor target, Tensor? weight=None, int reduction=Mean, *, Tensor(a!) grad_input) -> Tensor(a!) +inline at::Tensor & binary_cross_entropy_backward_out(at::Tensor & grad_input, const at::Tensor & grad_output, const at::Tensor & self, const at::Tensor & target, const ::std::optional & weight={}, int64_t reduction=at::Reduction::Mean) { + return at::_ops::binary_cross_entropy_backward_grad_input::call(grad_output, self, target, weight, reduction, grad_input); +} +// aten::binary_cross_entropy_backward.grad_input(Tensor grad_output, Tensor self, Tensor target, Tensor? 
weight=None, int reduction=Mean, *, Tensor(a!) grad_input) -> Tensor(a!) +inline at::Tensor & binary_cross_entropy_backward_outf(const at::Tensor & grad_output, const at::Tensor & self, const at::Tensor & target, const ::std::optional & weight, int64_t reduction, at::Tensor & grad_input) { + return at::_ops::binary_cross_entropy_backward_grad_input::call(grad_output, self, target, weight, reduction, grad_input); +} + +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/binary_cross_entropy_backward_ops.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/binary_cross_entropy_backward_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..fb05703b0b97efbd62a36f1bbac614d5d8b3ec3c --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/binary_cross_entropy_backward_ops.h @@ -0,0 +1,45 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. 
+#include + +namespace at { +namespace _ops { + + +struct TORCH_API binary_cross_entropy_backward { + using schema = at::Tensor (const at::Tensor &, const at::Tensor &, const at::Tensor &, const ::std::optional &, int64_t); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::binary_cross_entropy_backward"; + static constexpr const char* overload_name = ""; + static constexpr const char* schema_str = "binary_cross_entropy_backward(Tensor grad_output, Tensor self, Tensor target, Tensor? weight=None, int reduction=Mean) -> Tensor"; + static at::Tensor call(const at::Tensor & grad_output, const at::Tensor & self, const at::Tensor & target, const ::std::optional & weight, int64_t reduction); + static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad_output, const at::Tensor & self, const at::Tensor & target, const ::std::optional & weight, int64_t reduction); +}; + +struct TORCH_API binary_cross_entropy_backward_grad_input { + using schema = at::Tensor & (const at::Tensor &, const at::Tensor &, const at::Tensor &, const ::std::optional &, int64_t, at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::binary_cross_entropy_backward"; + static constexpr const char* overload_name = "grad_input"; + static constexpr const char* schema_str = "binary_cross_entropy_backward.grad_input(Tensor grad_output, Tensor self, Tensor target, Tensor? weight=None, int reduction=Mean, *, Tensor(a!) 
grad_input) -> Tensor(a!)"; + static at::Tensor & call(const at::Tensor & grad_output, const at::Tensor & self, const at::Tensor & target, const ::std::optional & weight, int64_t reduction, at::Tensor & grad_input); + static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad_output, const at::Tensor & self, const at::Tensor & target, const ::std::optional & weight, int64_t reduction, at::Tensor & grad_input); +}; + +}} // namespace at::_ops + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/binary_cross_entropy_with_logits.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/binary_cross_entropy_with_logits.h new file mode 100644 index 0000000000000000000000000000000000000000..9e42832c8a4ce344351c47211cbcef81e8f86480 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/binary_cross_entropy_with_logits.h @@ -0,0 +1,45 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::binary_cross_entropy_with_logits(Tensor self, Tensor target, Tensor? weight=None, Tensor? 
pos_weight=None, int reduction=Mean) -> Tensor +inline at::Tensor binary_cross_entropy_with_logits(const at::Tensor & self, const at::Tensor & target, const ::std::optional & weight={}, const ::std::optional & pos_weight={}, int64_t reduction=at::Reduction::Mean) { + return at::_ops::binary_cross_entropy_with_logits::call(self, target, weight, pos_weight, reduction); +} + +// aten::binary_cross_entropy_with_logits.out(Tensor self, Tensor target, Tensor? weight=None, Tensor? pos_weight=None, int reduction=Mean, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & binary_cross_entropy_with_logits_out(at::Tensor & out, const at::Tensor & self, const at::Tensor & target, const ::std::optional & weight={}, const ::std::optional & pos_weight={}, int64_t reduction=at::Reduction::Mean) { + return at::_ops::binary_cross_entropy_with_logits_out::call(self, target, weight, pos_weight, reduction, out); +} +// aten::binary_cross_entropy_with_logits.out(Tensor self, Tensor target, Tensor? weight=None, Tensor? pos_weight=None, int reduction=Mean, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & binary_cross_entropy_with_logits_outf(const at::Tensor & self, const at::Tensor & target, const ::std::optional & weight, const ::std::optional & pos_weight, int64_t reduction, at::Tensor & out) { + return at::_ops::binary_cross_entropy_with_logits_out::call(self, target, weight, pos_weight, reduction, out); +} + +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/bincount_compositeexplicitautograd_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/bincount_compositeexplicitautograd_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..9ed6a84888b0406d04e28cf8d8781326a706bff7 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/bincount_compositeexplicitautograd_dispatch.h @@ -0,0 +1,31 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. 
+#include + +namespace at { + +namespace compositeexplicitautograd { + +TORCH_API at::Tensor & bincount_out(at::Tensor & out, const at::Tensor & self, const ::std::optional & weights={}, int64_t minlength=0); +TORCH_API at::Tensor & bincount_outf(const at::Tensor & self, const ::std::optional & weights, int64_t minlength, at::Tensor & out); +TORCH_API at::Tensor & bincount_symint_out(at::Tensor & out, const at::Tensor & self, const ::std::optional & weights={}, c10::SymInt minlength=0); +TORCH_API at::Tensor & bincount_symint_outf(const at::Tensor & self, const ::std::optional & weights, c10::SymInt minlength, at::Tensor & out); + +} // namespace compositeexplicitautograd +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/binomial.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/binomial.h new file mode 100644 index 0000000000000000000000000000000000000000..34eeaa547aed60de7dcee2a0909e64c5d9391dfa --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/binomial.h @@ -0,0 +1,45 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::binomial(Tensor count, Tensor prob, Generator? generator=None) -> Tensor +inline at::Tensor binomial(const at::Tensor & count, const at::Tensor & prob, ::std::optional generator=::std::nullopt) { + return at::_ops::binomial::call(count, prob, generator); +} + +// aten::binomial.out(Tensor count, Tensor prob, Generator? generator=None, *, Tensor(a!) 
out) -> Tensor(a!) +inline at::Tensor & binomial_out(at::Tensor & out, const at::Tensor & count, const at::Tensor & prob, ::std::optional generator=::std::nullopt) { + return at::_ops::binomial_out::call(count, prob, generator, out); +} +// aten::binomial.out(Tensor count, Tensor prob, Generator? generator=None, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & binomial_outf(const at::Tensor & count, const at::Tensor & prob, ::std::optional generator, at::Tensor & out) { + return at::_ops::binomial_out::call(count, prob, generator, out); +} + +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/binomial_cpu_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/binomial_cpu_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..70b0df8ec385b40c466d9a55e98cef1da5f3432e --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/binomial_cpu_dispatch.h @@ -0,0 +1,28 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. 
+#include + +namespace at { + +namespace cpu { + +TORCH_API at::Tensor binomial(const at::Tensor & count, const at::Tensor & prob, ::std::optional generator=::std::nullopt); + +} // namespace cpu +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/binomial_cuda_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/binomial_cuda_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..879ea35fe4532747ef81d3b4fac7171820fe47cb --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/binomial_cuda_dispatch.h @@ -0,0 +1,28 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace cuda { + +TORCH_API at::Tensor binomial(const at::Tensor & count, const at::Tensor & prob, ::std::optional generator=::std::nullopt); + +} // namespace cuda +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/bitwise_not_compositeexplicitautogradnonfunctional_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/bitwise_not_compositeexplicitautogradnonfunctional_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..7c2c6d5c7a73ba0af4b4940e07287374da1b1367 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/bitwise_not_compositeexplicitautogradnonfunctional_dispatch.h @@ -0,0 +1,29 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace compositeexplicitautogradnonfunctional { + +TORCH_API at::Tensor bitwise_not(const at::Tensor & self); +TORCH_API at::Tensor & bitwise_not_(at::Tensor & self); + +} // namespace compositeexplicitautogradnonfunctional +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/bitwise_not_meta_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/bitwise_not_meta_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..75841b9839dd04864c2059b096e9755768f005a6 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/bitwise_not_meta_dispatch.h @@ -0,0 +1,31 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace meta { + +TORCH_API at::Tensor bitwise_not(const at::Tensor & self); +TORCH_API at::Tensor & bitwise_not_out(at::Tensor & out, const at::Tensor & self); +TORCH_API at::Tensor & bitwise_not_outf(const at::Tensor & self, at::Tensor & out); +TORCH_API at::Tensor & bitwise_not_(at::Tensor & self); + +} // namespace meta +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/bitwise_not_native.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/bitwise_not_native.h new file mode 100644 index 0000000000000000000000000000000000000000..97c7004831c2fcfad46ae023248553f4fdc51653 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/bitwise_not_native.h @@ -0,0 +1,28 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace at { +namespace native { +struct TORCH_API structured_bitwise_not_out : public at::meta::structured_bitwise_not { +void impl(const at::Tensor & self, const at::Tensor & out); +}; +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/bitwise_xor_native.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/bitwise_xor_native.h new file mode 100644 index 0000000000000000000000000000000000000000..06f3864084b728c7fdfd932fd34bee4fe9299b8c --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/bitwise_xor_native.h @@ -0,0 +1,33 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace at { +namespace native { +struct TORCH_API structured_bitwise_xor_out : public at::meta::structured_bitwise_xor_Tensor { +void impl(const at::Tensor & self, const at::Tensor & other, const at::Tensor & out); +}; +TORCH_API at::Tensor bitwise_xor(const at::Tensor & self, const at::Scalar & other); +TORCH_API at::Tensor & bitwise_xor_out(const at::Tensor & self, const at::Scalar & other, at::Tensor & out); +TORCH_API at::Tensor & bitwise_xor_(at::Tensor & self, const at::Scalar & other); +TORCH_API at::Tensor bitwise_xor(const at::Scalar & self, const at::Tensor & other); +TORCH_API at::Tensor & bitwise_xor_Scalar_Tensor_out(const at::Scalar & self, const at::Tensor & other, at::Tensor & out); +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/blackman_window_native.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/blackman_window_native.h new file mode 100644 index 0000000000000000000000000000000000000000..713522f06bfa8203019edf22af6d12b5dadd66c6 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/blackman_window_native.h @@ -0,0 +1,29 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace at { +namespace native { +TORCH_API at::Tensor blackman_window(int64_t window_length, ::std::optional dtype={}, ::std::optional layout={}, ::std::optional device={}, ::std::optional pin_memory={}); +TORCH_API at::Tensor & blackman_window_out(int64_t window_length, at::Tensor & out); +TORCH_API at::Tensor blackman_window(int64_t window_length, bool periodic, ::std::optional dtype={}, ::std::optional layout={}, ::std::optional device={}, ::std::optional pin_memory={}); +TORCH_API at::Tensor & blackman_window_periodic_out(int64_t window_length, bool periodic, at::Tensor & out); +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/bmm_compositeexplicitautogradnonfunctional_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/bmm_compositeexplicitautogradnonfunctional_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..906ac525b4a514d85b1db3bc63b51dd8439c2edb --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/bmm_compositeexplicitautogradnonfunctional_dispatch.h @@ -0,0 +1,28 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace compositeexplicitautogradnonfunctional { + +TORCH_API at::Tensor bmm(const at::Tensor & self, const at::Tensor & mat2); + +} // namespace compositeexplicitautogradnonfunctional +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/bmm_cuda_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/bmm_cuda_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..b3e542af0134cf94d3bfc685c36ef282f614fd65 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/bmm_cuda_dispatch.h @@ -0,0 +1,33 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace cuda { + +TORCH_API at::Tensor bmm(const at::Tensor & self, const at::Tensor & mat2); +TORCH_API at::Tensor & bmm_out(at::Tensor & out, const at::Tensor & self, const at::Tensor & mat2); +TORCH_API at::Tensor & bmm_outf(const at::Tensor & self, const at::Tensor & mat2, at::Tensor & out); +TORCH_API at::Tensor bmm(const at::Tensor & self, const at::Tensor & mat2, at::ScalarType out_dtype); +TORCH_API at::Tensor & bmm_out(at::Tensor & out, const at::Tensor & self, const at::Tensor & mat2, at::ScalarType out_dtype); +TORCH_API at::Tensor & bmm_outf(const at::Tensor & self, const at::Tensor & mat2, at::ScalarType out_dtype, at::Tensor & out); + +} // namespace cuda +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/bmm_ops.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/bmm_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..c4990e1c7c986796d7b783064127bc840fa26631 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/bmm_ops.h @@ -0,0 +1,67 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { +namespace _ops { + + +struct TORCH_API bmm { + using schema = at::Tensor (const at::Tensor &, const at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::bmm"; + static constexpr const char* overload_name = ""; + static constexpr const char* schema_str = "bmm(Tensor self, Tensor mat2) -> Tensor"; + static at::Tensor call(const at::Tensor & self, const at::Tensor & mat2); + static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & mat2); +}; + +struct TORCH_API bmm_out { + using schema = at::Tensor & (const at::Tensor &, const at::Tensor &, at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::bmm"; + static constexpr const char* overload_name = "out"; + static constexpr const char* schema_str = "bmm.out(Tensor self, Tensor mat2, *, Tensor(a!) 
out) -> Tensor(a!)"; + static at::Tensor & call(const at::Tensor & self, const at::Tensor & mat2, at::Tensor & out); + static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & mat2, at::Tensor & out); +}; + +struct TORCH_API bmm_dtype { + using schema = at::Tensor (const at::Tensor &, const at::Tensor &, at::ScalarType); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::bmm"; + static constexpr const char* overload_name = "dtype"; + static constexpr const char* schema_str = "bmm.dtype(Tensor self, Tensor mat2, ScalarType out_dtype) -> Tensor"; + static at::Tensor call(const at::Tensor & self, const at::Tensor & mat2, at::ScalarType out_dtype); + static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & mat2, at::ScalarType out_dtype); +}; + +struct TORCH_API bmm_dtype_out { + using schema = at::Tensor & (const at::Tensor &, const at::Tensor &, at::ScalarType, at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::bmm"; + static constexpr const char* overload_name = "dtype_out"; + static constexpr const char* schema_str = "bmm.dtype_out(Tensor self, Tensor mat2, ScalarType out_dtype, *, Tensor(a!) out) -> Tensor(a!)"; + static at::Tensor & call(const at::Tensor & self, const at::Tensor & mat2, at::ScalarType out_dtype, at::Tensor & out); + static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & mat2, at::ScalarType out_dtype, at::Tensor & out); +}; + +}} // namespace at::_ops + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/broadcast_tensors_native.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/broadcast_tensors_native.h new file mode 100644 index 0000000000000000000000000000000000000000..7aed549c8117ecaaa2b77968a9d812c3de540751 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/broadcast_tensors_native.h @@ -0,0 +1,26 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace at { +namespace native { +TORCH_API ::std::vector broadcast_tensors(at::TensorList tensors); +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/broadcast_to_native.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/broadcast_to_native.h new file mode 100644 index 0000000000000000000000000000000000000000..60c36fda07c66b7660ae24145ad26f8aca634fdd --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/broadcast_to_native.h @@ -0,0 +1,26 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace at { +namespace native { +TORCH_API at::Tensor broadcast_to_symint(const at::Tensor & self, c10::SymIntArrayRef size); +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/cartesian_prod_native.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/cartesian_prod_native.h new file mode 100644 index 0000000000000000000000000000000000000000..2601175e5daf21d0fbc45cb281955763c9a5b7dc --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/cartesian_prod_native.h @@ -0,0 +1,26 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace at { +namespace native { +TORCH_API at::Tensor cartesian_prod(at::TensorList tensors); +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/cat_meta.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/cat_meta.h new file mode 100644 index 0000000000000000000000000000000000000000..852b4aea82296da8ffc1c57286fb7f379e2b0e6d --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/cat_meta.h @@ -0,0 +1,119 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeMetaFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace at { +namespace meta { + +struct TORCH_API structured_cat : public at::impl::MetaBase { + + template + struct TORCH_API precompute_out { + + precompute_out set_dim(int64_t value) { + static_assert(DIM == false, "dim already set"); + precompute_out ret; +ret.dim = value; +ret.valid = this->valid; +ret.all_contiguous = this->all_contiguous; +ret.all_same_dtype = this->all_same_dtype; +ret.all_same_sizes_and_stride = this->all_same_sizes_and_stride; +ret.memory_format = this->memory_format; +return ret; + } + + + precompute_out set_valid(int64_t value) { + static_assert(VALID == false, "valid already set"); + precompute_out ret; +ret.dim = this->dim; +ret.valid = value; +ret.all_contiguous = this->all_contiguous; +ret.all_same_dtype = this->all_same_dtype; +ret.all_same_sizes_and_stride = this->all_same_sizes_and_stride; +ret.memory_format = this->memory_format; +return ret; + } + + + precompute_out set_all_contiguous(bool value) { + static_assert(ALL_CONTIGUOUS == false, "all_contiguous already set"); + precompute_out ret; +ret.dim = this->dim; +ret.valid = this->valid; +ret.all_contiguous = value; +ret.all_same_dtype = this->all_same_dtype; +ret.all_same_sizes_and_stride = 
this->all_same_sizes_and_stride; +ret.memory_format = this->memory_format; +return ret; + } + + + precompute_out set_all_same_dtype(bool value) { + static_assert(ALL_SAME_DTYPE == false, "all_same_dtype already set"); + precompute_out ret; +ret.dim = this->dim; +ret.valid = this->valid; +ret.all_contiguous = this->all_contiguous; +ret.all_same_dtype = value; +ret.all_same_sizes_and_stride = this->all_same_sizes_and_stride; +ret.memory_format = this->memory_format; +return ret; + } + + + precompute_out set_all_same_sizes_and_stride(bool value) { + static_assert(ALL_SAME_SIZES_AND_STRIDE == false, "all_same_sizes_and_stride already set"); + precompute_out ret; +ret.dim = this->dim; +ret.valid = this->valid; +ret.all_contiguous = this->all_contiguous; +ret.all_same_dtype = this->all_same_dtype; +ret.all_same_sizes_and_stride = value; +ret.memory_format = this->memory_format; +return ret; + } + + + precompute_out set_memory_format(at::MemoryFormat value) { + static_assert(MEMORY_FORMAT == false, "memory_format already set"); + precompute_out ret; +ret.dim = this->dim; +ret.valid = this->valid; +ret.all_contiguous = this->all_contiguous; +ret.all_same_dtype = this->all_same_dtype; +ret.all_same_sizes_and_stride = this->all_same_sizes_and_stride; +ret.memory_format = value; +return ret; + } + + int64_t dim; +int64_t valid; +bool all_contiguous; +bool all_same_dtype; +bool all_same_sizes_and_stride; +at::MemoryFormat memory_format; + }; + using meta_return_ty = precompute_out ; + meta_return_ty meta(const at::ITensorListRef & tensors, int64_t dim); +}; + +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/cauchy_cuda_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/cauchy_cuda_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..91314c81d2faaf43b2a18054b9b65d37e7ff9884 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/cauchy_cuda_dispatch.h @@ -0,0 +1,28 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace cuda { + +TORCH_API at::Tensor & cauchy_(at::Tensor & self, double median=0, double sigma=1, ::std::optional generator=::std::nullopt); + +} // namespace cuda +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/ccol_indices_ops.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/ccol_indices_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..fee4463fc2046666c59f487f69c468ba0c252f31 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/ccol_indices_ops.h @@ -0,0 +1,34 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { +namespace _ops { + + +struct TORCH_API ccol_indices { + using schema = at::Tensor (const at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::ccol_indices"; + static constexpr const char* overload_name = ""; + static constexpr const char* schema_str = "ccol_indices(Tensor(a) self) -> Tensor(a)"; + static at::Tensor call(const at::Tensor & self); + static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self); +}; + +}} // namespace at::_ops + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/ceil_compositeexplicitautogradnonfunctional_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/ceil_compositeexplicitautogradnonfunctional_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..75fccb6252a558388dbd07148877b7a1b11f50b2 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/ceil_compositeexplicitautogradnonfunctional_dispatch.h @@ -0,0 +1,29 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace compositeexplicitautogradnonfunctional { + +TORCH_API at::Tensor ceil(const at::Tensor & self); +TORCH_API at::Tensor & ceil_(at::Tensor & self); + +} // namespace compositeexplicitautogradnonfunctional +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/ceil_meta_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/ceil_meta_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..32037c8f71fb650389bcdabc1b7018ed1720bed8 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/ceil_meta_dispatch.h @@ -0,0 +1,31 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace meta { + +TORCH_API at::Tensor ceil(const at::Tensor & self); +TORCH_API at::Tensor & ceil_out(at::Tensor & out, const at::Tensor & self); +TORCH_API at::Tensor & ceil_outf(const at::Tensor & self, at::Tensor & out); +TORCH_API at::Tensor & ceil_(at::Tensor & self); + +} // namespace meta +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/celu_ops.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/celu_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..1b409e1e21b71174824ddd9012aa27a6894f9add --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/celu_ops.h @@ -0,0 +1,56 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { +namespace _ops { + + +struct TORCH_API celu { + using schema = at::Tensor (const at::Tensor &, const at::Scalar &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::celu"; + static constexpr const char* overload_name = ""; + static constexpr const char* schema_str = "celu(Tensor self, Scalar alpha=1.0) -> Tensor"; + static at::Tensor call(const at::Tensor & self, const at::Scalar & alpha); + static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Scalar & alpha); +}; + +struct TORCH_API celu_ { + using schema = at::Tensor & (at::Tensor &, const at::Scalar &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::celu_"; + static constexpr const char* overload_name = ""; + static constexpr const char* schema_str = "celu_(Tensor(a!) 
self, Scalar alpha=1.0) -> Tensor(a!)"; + static at::Tensor & call(at::Tensor & self, const at::Scalar & alpha); + static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, at::Tensor & self, const at::Scalar & alpha); +}; + +struct TORCH_API celu_out { + using schema = at::Tensor & (const at::Tensor &, const at::Scalar &, at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::celu"; + static constexpr const char* overload_name = "out"; + static constexpr const char* schema_str = "celu.out(Tensor self, Scalar alpha=1.0, *, Tensor(a!) out) -> Tensor(a!)"; + static at::Tensor & call(const at::Tensor & self, const at::Scalar & alpha, at::Tensor & out); + static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Scalar & alpha, at::Tensor & out); +}; + +}} // namespace at::_ops + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/chalf.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/chalf.h new file mode 100644 index 0000000000000000000000000000000000000000..f7d0d155a0e9732e27907b419efab2425d41f6ac --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/chalf.h @@ -0,0 +1,32 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + + +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/chunk_native.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/chunk_native.h new file mode 100644 index 0000000000000000000000000000000000000000..4f231628d44751d96914a54cf1f6166708edc772 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/chunk_native.h @@ -0,0 +1,27 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace at { +namespace native { +TORCH_API ::std::vector chunk(const at::Tensor & self, int64_t chunks, int64_t dim=0); +TORCH_API ::std::vector chunk_nested_tensor(const at::Tensor & self, int64_t chunks, int64_t dim=0); +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/clamp_compositeexplicitautogradnonfunctional_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/clamp_compositeexplicitautogradnonfunctional_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..4be8931cb0789bd843e6a591724c60a44285a325 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/clamp_compositeexplicitautogradnonfunctional_dispatch.h @@ -0,0 +1,31 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace compositeexplicitautogradnonfunctional { + +TORCH_API at::Tensor clamp(const at::Tensor & self, const ::std::optional & min, const ::std::optional & max=::std::nullopt); +TORCH_API at::Tensor & clamp_(at::Tensor & self, const ::std::optional & min, const ::std::optional & max=::std::nullopt); +TORCH_API at::Tensor clamp(const at::Tensor & self, const ::std::optional & min={}, const ::std::optional & max={}); +TORCH_API at::Tensor & clamp_(at::Tensor & self, const ::std::optional & min={}, const ::std::optional & max={}); + +} // namespace compositeexplicitautogradnonfunctional +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/clamp_max_compositeexplicitautogradnonfunctional_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/clamp_max_compositeexplicitautogradnonfunctional_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..66f43a2146f981d7214c5d33c0003eb436a7c6d7 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/clamp_max_compositeexplicitautogradnonfunctional_dispatch.h @@ -0,0 +1,31 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace compositeexplicitautogradnonfunctional { + +TORCH_API at::Tensor clamp_max(const at::Tensor & self, const at::Scalar & max); +TORCH_API at::Tensor & clamp_max_(at::Tensor & self, const at::Scalar & max); +TORCH_API at::Tensor clamp_max(const at::Tensor & self, const at::Tensor & max); +TORCH_API at::Tensor & clamp_max_(at::Tensor & self, const at::Tensor & max); + +} // namespace compositeexplicitautogradnonfunctional +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/clamp_meta_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/clamp_meta_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..222b69f1b236e3cb28301e3fa54b1432f4895629 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/clamp_meta_dispatch.h @@ -0,0 +1,35 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. 
+#include + +namespace at { + +namespace meta { + +TORCH_API at::Tensor clamp(const at::Tensor & self, const ::std::optional & min, const ::std::optional & max=::std::nullopt); +TORCH_API at::Tensor & clamp_out(at::Tensor & out, const at::Tensor & self, const ::std::optional & min, const ::std::optional & max=::std::nullopt); +TORCH_API at::Tensor & clamp_outf(const at::Tensor & self, const ::std::optional & min, const ::std::optional & max, at::Tensor & out); +TORCH_API at::Tensor & clamp_(at::Tensor & self, const ::std::optional & min, const ::std::optional & max=::std::nullopt); +TORCH_API at::Tensor clamp(const at::Tensor & self, const ::std::optional & min={}, const ::std::optional & max={}); +TORCH_API at::Tensor & clamp_out(at::Tensor & out, const at::Tensor & self, const ::std::optional & min={}, const ::std::optional & max={}); +TORCH_API at::Tensor & clamp_outf(const at::Tensor & self, const ::std::optional & min, const ::std::optional & max, at::Tensor & out); +TORCH_API at::Tensor & clamp_(at::Tensor & self, const ::std::optional & min={}, const ::std::optional & max={}); + +} // namespace meta +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/clamp_min_cuda_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/clamp_min_cuda_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..741373242dc24decbe3062b9a9eb943315f48068 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/clamp_min_cuda_dispatch.h @@ -0,0 +1,35 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. 
+#include + +namespace at { + +namespace cuda { + +TORCH_API at::Tensor clamp_min(const at::Tensor & self, const at::Scalar & min); +TORCH_API at::Tensor & clamp_min_out(at::Tensor & out, const at::Tensor & self, const at::Scalar & min); +TORCH_API at::Tensor & clamp_min_outf(const at::Tensor & self, const at::Scalar & min, at::Tensor & out); +TORCH_API at::Tensor & clamp_min_(at::Tensor & self, const at::Scalar & min); +TORCH_API at::Tensor clamp_min(const at::Tensor & self, const at::Tensor & min); +TORCH_API at::Tensor & clamp_min_out(at::Tensor & out, const at::Tensor & self, const at::Tensor & min); +TORCH_API at::Tensor & clamp_min_outf(const at::Tensor & self, const at::Tensor & min, at::Tensor & out); +TORCH_API at::Tensor & clamp_min_(at::Tensor & self, const at::Tensor & min); + +} // namespace cuda +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/clamp_min_meta.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/clamp_min_meta.h new file mode 100644 index 0000000000000000000000000000000000000000..81d76c9003c53f22422732af03b1737fcbd27e70 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/clamp_min_meta.h @@ -0,0 +1,37 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeMetaFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace at { +namespace meta { + +struct TORCH_API structured_clamp_min : public TensorIteratorBase { + + + void meta(const at::Tensor & self, const at::Scalar & min); +}; +struct TORCH_API structured_clamp_min_Tensor : public TensorIteratorBase { + 
+ + void meta(const at::Tensor & self, const at::Tensor & min); +}; + +} // namespace meta +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/clone_native.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/clone_native.h new file mode 100644 index 0000000000000000000000000000000000000000..330342a69c249c6e90980421d189ff46fb169a56 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/clone_native.h @@ -0,0 +1,32 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace at { +namespace native { +TORCH_API at::Tensor clone(const at::Tensor & self, ::std::optional memory_format=::std::nullopt); +TORCH_API at::Tensor & clone_out(const at::Tensor & self, ::std::optional memory_format, at::Tensor & out); +TORCH_API at::Tensor clone_nested(const at::Tensor & self, ::std::optional memory_format=::std::nullopt); +TORCH_API at::Tensor clone_sparse(const at::Tensor & self, ::std::optional memory_format=::std::nullopt); +TORCH_API at::Tensor clone_sparse_compressed(const at::Tensor & self, ::std::optional memory_format=::std::nullopt); +TORCH_API at::Tensor mkldnn_clone(const at::Tensor & self, ::std::optional memory_format=::std::nullopt); +TORCH_API at::Tensor quantized_clone(const at::Tensor & self, ::std::optional memory_format=::std::nullopt); +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/col2im_cuda_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/col2im_cuda_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..c0bd850942970317d7a5925ee38883c26eb59337 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/col2im_cuda_dispatch.h @@ -0,0 +1,33 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. 
+#include + +namespace at { + +namespace cuda { + +TORCH_API at::Tensor col2im(const at::Tensor & self, at::IntArrayRef output_size, at::IntArrayRef kernel_size, at::IntArrayRef dilation, at::IntArrayRef padding, at::IntArrayRef stride); +TORCH_API at::Tensor col2im_symint(const at::Tensor & self, c10::SymIntArrayRef output_size, at::IntArrayRef kernel_size, at::IntArrayRef dilation, at::IntArrayRef padding, at::IntArrayRef stride); +TORCH_API at::Tensor & col2im_out(at::Tensor & out, const at::Tensor & self, at::IntArrayRef output_size, at::IntArrayRef kernel_size, at::IntArrayRef dilation, at::IntArrayRef padding, at::IntArrayRef stride); +TORCH_API at::Tensor & col2im_outf(const at::Tensor & self, at::IntArrayRef output_size, at::IntArrayRef kernel_size, at::IntArrayRef dilation, at::IntArrayRef padding, at::IntArrayRef stride, at::Tensor & out); +TORCH_API at::Tensor & col2im_symint_out(at::Tensor & out, const at::Tensor & self, c10::SymIntArrayRef output_size, at::IntArrayRef kernel_size, at::IntArrayRef dilation, at::IntArrayRef padding, at::IntArrayRef stride); +TORCH_API at::Tensor & col2im_symint_outf(const at::Tensor & self, c10::SymIntArrayRef output_size, at::IntArrayRef kernel_size, at::IntArrayRef dilation, at::IntArrayRef padding, at::IntArrayRef stride, at::Tensor & out); + +} // namespace cuda +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/concat_ops.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/concat_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..c9e952b2450c73d119002779fa6c575b2e760ece --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/concat_ops.h @@ -0,0 +1,67 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { +namespace _ops { + + +struct TORCH_API concat { + using schema = at::Tensor (at::TensorList, int64_t); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::concat"; + static constexpr const char* overload_name = ""; + static constexpr const char* schema_str = "concat(Tensor[] tensors, int dim=0) -> Tensor"; + static at::Tensor call(at::TensorList tensors, int64_t dim); + static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, at::TensorList tensors, int64_t dim); +}; + +struct TORCH_API concat_out { + using schema = at::Tensor & (at::TensorList, int64_t, at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::concat"; + static constexpr const char* overload_name = "out"; + static constexpr const char* schema_str = "concat.out(Tensor[] tensors, int dim=0, *, Tensor(a!) 
out) -> Tensor(a!)"; + static at::Tensor & call(at::TensorList tensors, int64_t dim, at::Tensor & out); + static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, at::TensorList tensors, int64_t dim, at::Tensor & out); +}; + +struct TORCH_API concat_names { + using schema = at::Tensor (at::TensorList, at::Dimname); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::concat"; + static constexpr const char* overload_name = "names"; + static constexpr const char* schema_str = "concat.names(Tensor[] tensors, Dimname dim) -> Tensor"; + static at::Tensor call(at::TensorList tensors, at::Dimname dim); + static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, at::TensorList tensors, at::Dimname dim); +}; + +struct TORCH_API concat_names_out { + using schema = at::Tensor & (at::TensorList, at::Dimname, at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::concat"; + static constexpr const char* overload_name = "names_out"; + static constexpr const char* schema_str = "concat.names_out(Tensor[] tensors, Dimname dim, *, Tensor(a!) out) -> Tensor(a!)"; + static at::Tensor & call(at::TensorList tensors, at::Dimname dim, at::Tensor & out); + static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, at::TensorList tensors, at::Dimname dim, at::Tensor & out); +}; + +}} // namespace at::_ops + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/concatenate_ops.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/concatenate_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..cc25dc3e2086be166b5bb3c35840d499e36be449 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/concatenate_ops.h @@ -0,0 +1,67 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { +namespace _ops { + + +struct TORCH_API concatenate { + using schema = at::Tensor (at::TensorList, int64_t); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::concatenate"; + static constexpr const char* overload_name = ""; + static constexpr const char* schema_str = "concatenate(Tensor[] tensors, int dim=0) -> Tensor"; + static at::Tensor call(at::TensorList tensors, int64_t dim); + static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, at::TensorList tensors, int64_t dim); +}; + +struct TORCH_API concatenate_out { + using schema = at::Tensor & (at::TensorList, int64_t, at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::concatenate"; + static constexpr const char* overload_name = "out"; + static constexpr const char* schema_str = "concatenate.out(Tensor[] tensors, int dim=0, *, Tensor(a!) 
out) -> Tensor(a!)"; + static at::Tensor & call(at::TensorList tensors, int64_t dim, at::Tensor & out); + static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, at::TensorList tensors, int64_t dim, at::Tensor & out); +}; + +struct TORCH_API concatenate_names { + using schema = at::Tensor (at::TensorList, at::Dimname); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::concatenate"; + static constexpr const char* overload_name = "names"; + static constexpr const char* schema_str = "concatenate.names(Tensor[] tensors, Dimname dim) -> Tensor"; + static at::Tensor call(at::TensorList tensors, at::Dimname dim); + static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, at::TensorList tensors, at::Dimname dim); +}; + +struct TORCH_API concatenate_names_out { + using schema = at::Tensor & (at::TensorList, at::Dimname, at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::concatenate"; + static constexpr const char* overload_name = "names_out"; + static constexpr const char* schema_str = "concatenate.names_out(Tensor[] tensors, Dimname dim, *, Tensor(a!) out) -> Tensor(a!)"; + static at::Tensor & call(at::TensorList tensors, at::Dimname dim, at::Tensor & out); + static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, at::TensorList tensors, at::Dimname dim, at::Tensor & out); +}; + +}} // namespace at::_ops + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/conj_ops.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/conj_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..ef366ceaaeb9005e63ad90257051683047b2ae07 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/conj_ops.h @@ -0,0 +1,34 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { +namespace _ops { + + +struct TORCH_API conj { + using schema = at::Tensor (const at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::conj"; + static constexpr const char* overload_name = ""; + static constexpr const char* schema_str = "conj(Tensor(a) self) -> Tensor(a)"; + static at::Tensor call(const at::Tensor & self); + static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self); +}; + +}} // namespace at::_ops + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/conv_depthwise3d_compositeexplicitautograd_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/conv_depthwise3d_compositeexplicitautograd_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..0e2a61f11b9a6461d67dd4ea3aa58b2fc02ddd0a --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/conv_depthwise3d_compositeexplicitautograd_dispatch.h @@ -0,0 +1,31 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. 
+#include + +namespace at { + +namespace compositeexplicitautograd { + +TORCH_API at::Tensor & conv_depthwise3d_out(at::Tensor & out, const at::Tensor & self, const at::Tensor & weight, at::IntArrayRef kernel_size, const ::std::optional & bias, at::IntArrayRef stride, at::IntArrayRef padding, at::IntArrayRef dilation); +TORCH_API at::Tensor & conv_depthwise3d_outf(const at::Tensor & self, const at::Tensor & weight, at::IntArrayRef kernel_size, const ::std::optional & bias, at::IntArrayRef stride, at::IntArrayRef padding, at::IntArrayRef dilation, at::Tensor & out); +TORCH_API at::Tensor & conv_depthwise3d_symint_out(at::Tensor & out, const at::Tensor & self, const at::Tensor & weight, c10::SymIntArrayRef kernel_size, const ::std::optional & bias, c10::SymIntArrayRef stride, c10::SymIntArrayRef padding, c10::SymIntArrayRef dilation); +TORCH_API at::Tensor & conv_depthwise3d_symint_outf(const at::Tensor & self, const at::Tensor & weight, c10::SymIntArrayRef kernel_size, const ::std::optional & bias, c10::SymIntArrayRef stride, c10::SymIntArrayRef padding, c10::SymIntArrayRef dilation, at::Tensor & out); + +} // namespace compositeexplicitautograd +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/conv_tbc_backward_ops.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/conv_tbc_backward_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..098f63214ca4b5301bf9ccb99924ab823460c50b --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/conv_tbc_backward_ops.h @@ -0,0 +1,34 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { +namespace _ops { + + +struct TORCH_API conv_tbc_backward { + using schema = ::std::tuple (const at::Tensor &, const at::Tensor &, const at::Tensor &, const at::Tensor &, int64_t); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::conv_tbc_backward"; + static constexpr const char* overload_name = ""; + static constexpr const char* schema_str = "conv_tbc_backward(Tensor self, Tensor input, Tensor weight, Tensor bias, int pad) -> (Tensor, Tensor, Tensor)"; + static ::std::tuple call(const at::Tensor & self, const at::Tensor & input, const at::Tensor & weight, const at::Tensor & bias, int64_t pad); + static ::std::tuple redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & input, const at::Tensor & weight, const at::Tensor & bias, int64_t pad); +}; + +}} // namespace at::_ops + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or 
TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/conv_transpose2d.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/conv_transpose2d.h new file mode 100644 index 0000000000000000000000000000000000000000..04ef9e280238a8ab81655cc34eedade4f672e1de --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/conv_transpose2d.h @@ -0,0 +1,53 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::conv_transpose2d.input(Tensor input, Tensor weight, Tensor? bias=None, SymInt[2] stride=1, SymInt[2] padding=0, SymInt[2] output_padding=0, SymInt groups=1, SymInt[2] dilation=1) -> Tensor +inline at::Tensor conv_transpose2d(const at::Tensor & input, const at::Tensor & weight, const ::std::optional & bias={}, at::IntArrayRef stride=1, at::IntArrayRef padding=0, at::IntArrayRef output_padding=0, int64_t groups=1, at::IntArrayRef dilation=1) { + return at::_ops::conv_transpose2d_input::call(input, weight, bias, c10::fromIntArrayRefSlow(stride), c10::fromIntArrayRefSlow(padding), c10::fromIntArrayRefSlow(output_padding), groups, c10::fromIntArrayRefSlow(dilation)); +} +namespace symint { + template >> + at::Tensor conv_transpose2d(const at::Tensor & input, const at::Tensor & weight, const ::std::optional & bias={}, at::IntArrayRef stride=1, at::IntArrayRef padding=0, at::IntArrayRef output_padding=0, int64_t groups=1, at::IntArrayRef dilation=1) { + return at::_ops::conv_transpose2d_input::call(input, weight, bias, c10::fromIntArrayRefSlow(stride), c10::fromIntArrayRefSlow(padding), 
c10::fromIntArrayRefSlow(output_padding), groups, c10::fromIntArrayRefSlow(dilation)); + } +} + +// aten::conv_transpose2d.input(Tensor input, Tensor weight, Tensor? bias=None, SymInt[2] stride=1, SymInt[2] padding=0, SymInt[2] output_padding=0, SymInt groups=1, SymInt[2] dilation=1) -> Tensor +inline at::Tensor conv_transpose2d_symint(const at::Tensor & input, const at::Tensor & weight, const ::std::optional & bias={}, c10::SymIntArrayRef stride=c10::SymInt(1), c10::SymIntArrayRef padding=c10::SymInt(0), c10::SymIntArrayRef output_padding=c10::SymInt(0), c10::SymInt groups=1, c10::SymIntArrayRef dilation=c10::SymInt(1)) { + return at::_ops::conv_transpose2d_input::call(input, weight, bias, stride, padding, output_padding, groups, dilation); +} +namespace symint { + template >> + at::Tensor conv_transpose2d(const at::Tensor & input, const at::Tensor & weight, const ::std::optional & bias={}, c10::SymIntArrayRef stride=c10::SymInt(1), c10::SymIntArrayRef padding=c10::SymInt(0), c10::SymIntArrayRef output_padding=c10::SymInt(0), c10::SymInt groups=1, c10::SymIntArrayRef dilation=c10::SymInt(1)) { + return at::_ops::conv_transpose2d_input::call(input, weight, bias, stride, padding, output_padding, groups, dilation); + } +} + +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/conv_transpose2d_compositeimplicitautograd_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/conv_transpose2d_compositeimplicitautograd_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..db117ec4d6fe7608e9c6062da2331098dc34ec66 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/conv_transpose2d_compositeimplicitautograd_dispatch.h @@ -0,0 +1,29 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. 
+#include + +namespace at { + +namespace compositeimplicitautograd { + +TORCH_API at::Tensor conv_transpose2d(const at::Tensor & input, const at::Tensor & weight, const ::std::optional & bias={}, at::IntArrayRef stride=1, at::IntArrayRef padding=0, at::IntArrayRef output_padding=0, int64_t groups=1, at::IntArrayRef dilation=1); +TORCH_API at::Tensor conv_transpose2d_symint(const at::Tensor & input, const at::Tensor & weight, const ::std::optional & bias={}, c10::SymIntArrayRef stride=c10::SymInt(1), c10::SymIntArrayRef padding=c10::SymInt(0), c10::SymIntArrayRef output_padding=c10::SymInt(0), c10::SymInt groups=1, c10::SymIntArrayRef dilation=c10::SymInt(1)); + +} // namespace compositeimplicitautograd +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/conv_transpose3d_native.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/conv_transpose3d_native.h new file mode 100644 index 0000000000000000000000000000000000000000..d7f39759ec774528bb7cfc82f2d8d17eafaa692d --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/conv_transpose3d_native.h @@ -0,0 +1,26 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace at { +namespace native { +TORCH_API at::Tensor conv_transpose3d_symint(const at::Tensor & input, const at::Tensor & weight, const ::std::optional & bias={}, c10::SymIntArrayRef stride=c10::SymInt(1), c10::SymIntArrayRef padding=c10::SymInt(0), c10::SymIntArrayRef output_padding=c10::SymInt(0), c10::SymInt groups=1, 
c10::SymIntArrayRef dilation=c10::SymInt(1)); +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/convolution_native.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/convolution_native.h new file mode 100644 index 0000000000000000000000000000000000000000..056b9615260f91646f3b10cd3c376d85439c1b1f --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/convolution_native.h @@ -0,0 +1,27 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace at { +namespace native { +TORCH_API at::Tensor convolution(const at::Tensor & input, const at::Tensor & weight, const ::std::optional & bias, at::IntArrayRef stride, at::IntArrayRef padding, at::IntArrayRef dilation, bool transposed, at::IntArrayRef output_padding, int64_t groups); +TORCH_API at::Tensor & convolution_out_symint(const at::Tensor & input, const at::Tensor & weight, const ::std::optional & bias, c10::SymIntArrayRef stride, c10::SymIntArrayRef padding, c10::SymIntArrayRef dilation, bool transposed, c10::SymIntArrayRef output_padding, c10::SymInt groups, at::Tensor & out); +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/convolution_overrideable_ops.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/convolution_overrideable_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..4be5a335445b01c6a7dafe761006f45b9f1096fc --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/convolution_overrideable_ops.h @@ -0,0 +1,45 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { +namespace _ops { + + +struct TORCH_API convolution_overrideable { + using schema = at::Tensor (const at::Tensor &, const at::Tensor &, const ::std::optional &, c10::SymIntArrayRef, c10::SymIntArrayRef, c10::SymIntArrayRef, bool, c10::SymIntArrayRef, c10::SymInt); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::convolution_overrideable"; + static constexpr const char* overload_name = ""; + static constexpr const char* schema_str = "convolution_overrideable(Tensor input, Tensor weight, Tensor? 
bias, SymInt[] stride, SymInt[] padding, SymInt[] dilation, bool transposed, SymInt[] output_padding, SymInt groups) -> Tensor"; + static at::Tensor call(const at::Tensor & input, const at::Tensor & weight, const ::std::optional & bias, c10::SymIntArrayRef stride, c10::SymIntArrayRef padding, c10::SymIntArrayRef dilation, bool transposed, c10::SymIntArrayRef output_padding, c10::SymInt groups); + static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & input, const at::Tensor & weight, const ::std::optional & bias, c10::SymIntArrayRef stride, c10::SymIntArrayRef padding, c10::SymIntArrayRef dilation, bool transposed, c10::SymIntArrayRef output_padding, c10::SymInt groups); +}; + +struct TORCH_API convolution_overrideable_out { + using schema = at::Tensor & (const at::Tensor &, const at::Tensor &, const ::std::optional &, c10::SymIntArrayRef, c10::SymIntArrayRef, c10::SymIntArrayRef, bool, c10::SymIntArrayRef, c10::SymInt, at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::convolution_overrideable"; + static constexpr const char* overload_name = "out"; + static constexpr const char* schema_str = "convolution_overrideable.out(Tensor input, Tensor weight, Tensor? bias, SymInt[] stride, SymInt[] padding, SymInt[] dilation, bool transposed, SymInt[] output_padding, SymInt groups, *, Tensor(a!) 
out) -> Tensor(a!)"; + static at::Tensor & call(const at::Tensor & input, const at::Tensor & weight, const ::std::optional & bias, c10::SymIntArrayRef stride, c10::SymIntArrayRef padding, c10::SymIntArrayRef dilation, bool transposed, c10::SymIntArrayRef output_padding, c10::SymInt groups, at::Tensor & out); + static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & input, const at::Tensor & weight, const ::std::optional & bias, c10::SymIntArrayRef stride, c10::SymIntArrayRef padding, c10::SymIntArrayRef dilation, bool transposed, c10::SymIntArrayRef output_padding, c10::SymInt groups, at::Tensor & out); +}; + +}} // namespace at::_ops + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/copy_sparse_to_sparse_compositeexplicitautograd_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/copy_sparse_to_sparse_compositeexplicitautograd_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..4fbf45bc80d43266dccda6dc279369431083744c --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/copy_sparse_to_sparse_compositeexplicitautograd_dispatch.h @@ -0,0 +1,30 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. 
+// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace compositeexplicitautograd { + +TORCH_API at::Tensor copy_sparse_to_sparse(const at::Tensor & self, const at::Tensor & src, bool non_blocking=false); +TORCH_API at::Tensor & copy_sparse_to_sparse_out(at::Tensor & out, const at::Tensor & self, const at::Tensor & src, bool non_blocking=false); +TORCH_API at::Tensor & copy_sparse_to_sparse_outf(const at::Tensor & self, const at::Tensor & src, bool non_blocking, at::Tensor & out); + +} // namespace compositeexplicitautograd +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/cosh_cpu_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/cosh_cpu_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..f93e4933703e687ab40a53cdc6fa1aedeab145c1 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/cosh_cpu_dispatch.h @@ -0,0 +1,31 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. 
+#include + +namespace at { + +namespace cpu { + +TORCH_API at::Tensor cosh(const at::Tensor & self); +TORCH_API at::Tensor & cosh_out(at::Tensor & out, const at::Tensor & self); +TORCH_API at::Tensor & cosh_outf(const at::Tensor & self, at::Tensor & out); +TORCH_API at::Tensor & cosh_(at::Tensor & self); + +} // namespace cpu +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/cosh_meta_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/cosh_meta_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..283aefb828d046a47e74d46f0efabf553b1b27dd --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/cosh_meta_dispatch.h @@ -0,0 +1,31 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. 
+#include + +namespace at { + +namespace meta { + +TORCH_API at::Tensor cosh(const at::Tensor & self); +TORCH_API at::Tensor & cosh_out(at::Tensor & out, const at::Tensor & self); +TORCH_API at::Tensor & cosh_outf(const at::Tensor & self, at::Tensor & out); +TORCH_API at::Tensor & cosh_(at::Tensor & self); + +} // namespace meta +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/cosine_similarity_native.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/cosine_similarity_native.h new file mode 100644 index 0000000000000000000000000000000000000000..87bc92c466fb251ff76225b3e967bcb44330bacd --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/cosine_similarity_native.h @@ -0,0 +1,26 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace at { +namespace native { +TORCH_API at::Tensor cosine_similarity(const at::Tensor & x1, const at::Tensor & x2, int64_t dim=1, double eps=1e-08); +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/count_nonzero_native.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/count_nonzero_native.h new file mode 100644 index 0000000000000000000000000000000000000000..f3aa9a7ff1a5670a6a294a8e47f228e8d376d0b3 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/count_nonzero_native.h @@ -0,0 +1,30 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace at { +namespace native { +TORCH_API at::Tensor & count_nonzero_dim_IntList_out(const at::Tensor & self, at::IntArrayRef dim, at::Tensor & out); +TORCH_API at::Tensor count_nonzero_cpu(const at::Tensor & self, at::IntArrayRef dim); +TORCH_API at::Tensor count_nonzero_cuda(const at::Tensor & self, at::IntArrayRef dim); +TORCH_API at::Tensor count_nonzero(const at::Tensor & self, ::std::optional dim=::std::nullopt); +TORCH_API at::Tensor & count_nonzero_out(const at::Tensor & self, ::std::optional dim, at::Tensor & out); +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/cov.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/cov.h new file mode 100644 index 0000000000000000000000000000000000000000..bf109864c17de3123a1126a749d918e6c4eb949e --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/cov.h @@ -0,0 +1,36 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::cov(Tensor self, *, int correction=1, Tensor? fweights=None, Tensor? aweights=None) -> Tensor +inline at::Tensor cov(const at::Tensor & self, int64_t correction=1, const ::std::optional & fweights={}, const ::std::optional & aweights={}) { + return at::_ops::cov::call(self, correction, fweights, aweights); +} + +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/cov_ops.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/cov_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..ead826c68fb4c4c9c4ccd5263316d5ba318c142a --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/cov_ops.h @@ -0,0 +1,34 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. 
+// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { +namespace _ops { + + +struct TORCH_API cov { + using schema = at::Tensor (const at::Tensor &, int64_t, const ::std::optional &, const ::std::optional &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::cov"; + static constexpr const char* overload_name = ""; + static constexpr const char* schema_str = "cov(Tensor self, *, int correction=1, Tensor? fweights=None, Tensor? aweights=None) -> Tensor"; + static at::Tensor call(const at::Tensor & self, int64_t correction, const ::std::optional & fweights, const ::std::optional & aweights); + static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, int64_t correction, const ::std::optional & fweights, const ::std::optional & aweights); +}; + +}} // namespace at::_ops + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/cross_ops.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/cross_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..fd531409d7e209675ef4df47a7a8eb071adb51bd --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/cross_ops.h @@ -0,0 +1,45 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. 
+// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { +namespace _ops { + + +struct TORCH_API cross_out { + using schema = at::Tensor & (const at::Tensor &, const at::Tensor &, ::std::optional, at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::cross"; + static constexpr const char* overload_name = "out"; + static constexpr const char* schema_str = "cross.out(Tensor self, Tensor other, int? dim=None, *, Tensor(a!) out) -> Tensor(a!)"; + static at::Tensor & call(const at::Tensor & self, const at::Tensor & other, ::std::optional dim, at::Tensor & out); + static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & other, ::std::optional dim, at::Tensor & out); +}; + +struct TORCH_API cross { + using schema = at::Tensor (const at::Tensor &, const at::Tensor &, ::std::optional); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::cross"; + static constexpr const char* overload_name = ""; + static constexpr const char* schema_str = "cross(Tensor self, Tensor other, int? dim=None) -> Tensor"; + static at::Tensor call(const at::Tensor & self, const at::Tensor & other, ::std::optional dim); + static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & other, ::std::optional dim); +}; + +}} // namespace at::_ops + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/cudnn_convolution_transpose_compositeexplicitautograd_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/cudnn_convolution_transpose_compositeexplicitautograd_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..43302599583ecf763fc64465a9d0f9c689e4b320 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/cudnn_convolution_transpose_compositeexplicitautograd_dispatch.h @@ -0,0 +1,31 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. 
+#include + +namespace at { + +namespace compositeexplicitautograd { + +TORCH_API at::Tensor & cudnn_convolution_transpose_out(at::Tensor & out, const at::Tensor & self, const at::Tensor & weight, at::IntArrayRef padding, at::IntArrayRef output_padding, at::IntArrayRef stride, at::IntArrayRef dilation, int64_t groups, bool benchmark, bool deterministic, bool allow_tf32); +TORCH_API at::Tensor & cudnn_convolution_transpose_outf(const at::Tensor & self, const at::Tensor & weight, at::IntArrayRef padding, at::IntArrayRef output_padding, at::IntArrayRef stride, at::IntArrayRef dilation, int64_t groups, bool benchmark, bool deterministic, bool allow_tf32, at::Tensor & out); +TORCH_API at::Tensor & cudnn_convolution_transpose_symint_out(at::Tensor & out, const at::Tensor & self, const at::Tensor & weight, c10::SymIntArrayRef padding, c10::SymIntArrayRef output_padding, c10::SymIntArrayRef stride, c10::SymIntArrayRef dilation, c10::SymInt groups, bool benchmark, bool deterministic, bool allow_tf32); +TORCH_API at::Tensor & cudnn_convolution_transpose_symint_outf(const at::Tensor & self, const at::Tensor & weight, c10::SymIntArrayRef padding, c10::SymIntArrayRef output_padding, c10::SymIntArrayRef stride, c10::SymIntArrayRef dilation, c10::SymInt groups, bool benchmark, bool deterministic, bool allow_tf32, at::Tensor & out); + +} // namespace compositeexplicitautograd +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/cudnn_convolution_transpose_ops.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/cudnn_convolution_transpose_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..1edd5a8a89ec70e7c3adfdfa1f740abbc8123464 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/cudnn_convolution_transpose_ops.h @@ -0,0 +1,45 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { +namespace _ops { + + +struct TORCH_API cudnn_convolution_transpose { + using schema = at::Tensor (const at::Tensor &, const at::Tensor &, c10::SymIntArrayRef, c10::SymIntArrayRef, c10::SymIntArrayRef, c10::SymIntArrayRef, c10::SymInt, bool, bool, bool); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::cudnn_convolution_transpose"; + static constexpr const char* overload_name = ""; + static constexpr const char* schema_str = "cudnn_convolution_transpose(Tensor self, Tensor weight, SymInt[] padding, SymInt[] output_padding, SymInt[] stride, SymInt[] dilation, SymInt groups, bool benchmark, bool deterministic, bool allow_tf32) -> Tensor"; + static at::Tensor call(const at::Tensor & self, const at::Tensor & weight, c10::SymIntArrayRef padding, c10::SymIntArrayRef output_padding, c10::SymIntArrayRef stride, c10::SymIntArrayRef dilation, c10::SymInt groups, bool benchmark, bool 
deterministic, bool allow_tf32); + static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & weight, c10::SymIntArrayRef padding, c10::SymIntArrayRef output_padding, c10::SymIntArrayRef stride, c10::SymIntArrayRef dilation, c10::SymInt groups, bool benchmark, bool deterministic, bool allow_tf32); +}; + +struct TORCH_API cudnn_convolution_transpose_out { + using schema = at::Tensor & (const at::Tensor &, const at::Tensor &, c10::SymIntArrayRef, c10::SymIntArrayRef, c10::SymIntArrayRef, c10::SymIntArrayRef, c10::SymInt, bool, bool, bool, at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::cudnn_convolution_transpose"; + static constexpr const char* overload_name = "out"; + static constexpr const char* schema_str = "cudnn_convolution_transpose.out(Tensor self, Tensor weight, SymInt[] padding, SymInt[] output_padding, SymInt[] stride, SymInt[] dilation, SymInt groups, bool benchmark, bool deterministic, bool allow_tf32, *, Tensor(a!) out) -> Tensor(a!)"; + static at::Tensor & call(const at::Tensor & self, const at::Tensor & weight, c10::SymIntArrayRef padding, c10::SymIntArrayRef output_padding, c10::SymIntArrayRef stride, c10::SymIntArrayRef dilation, c10::SymInt groups, bool benchmark, bool deterministic, bool allow_tf32, at::Tensor & out); + static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & weight, c10::SymIntArrayRef padding, c10::SymIntArrayRef output_padding, c10::SymIntArrayRef stride, c10::SymIntArrayRef dilation, c10::SymInt groups, bool benchmark, bool deterministic, bool allow_tf32, at::Tensor & out); +}; + +}} // namespace at::_ops + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/cudnn_grid_sampler_compositeexplicitautograd_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/cudnn_grid_sampler_compositeexplicitautograd_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..9c728c43ee71d19c441bd6cabeb619ecb37ecd3b --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/cudnn_grid_sampler_compositeexplicitautograd_dispatch.h @@ -0,0 +1,29 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace compositeexplicitautograd { + +TORCH_API at::Tensor & cudnn_grid_sampler_out(at::Tensor & out, const at::Tensor & self, const at::Tensor & grid); +TORCH_API at::Tensor & cudnn_grid_sampler_outf(const at::Tensor & self, const at::Tensor & grid, at::Tensor & out); + +} // namespace compositeexplicitautograd +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/cudnn_grid_sampler_cuda_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/cudnn_grid_sampler_cuda_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..d856e4d1a6ed6f69b6755453de6b39c82f08ad97 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/cudnn_grid_sampler_cuda_dispatch.h @@ -0,0 +1,28 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace cuda { + +TORCH_API at::Tensor cudnn_grid_sampler(const at::Tensor & self, const at::Tensor & grid); + +} // namespace cuda +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/cumprod_backward_native.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/cumprod_backward_native.h new file mode 100644 index 0000000000000000000000000000000000000000..5c31dde88c8cc6ff08ef57849a8b91fa994a07dc --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/cumprod_backward_native.h @@ -0,0 +1,26 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace at { +namespace native { +TORCH_API at::Tensor cumprod_backward(const at::Tensor & grad, const at::Tensor & input, int64_t dim, const at::Tensor & output); +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/deg2rad.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/deg2rad.h new file mode 100644 index 0000000000000000000000000000000000000000..717b49de62543603a775729fef5bea29cb4230df --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/deg2rad.h @@ -0,0 +1,50 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::deg2rad(Tensor self) -> Tensor +inline at::Tensor deg2rad(const at::Tensor & self) { + return at::_ops::deg2rad::call(self); +} + +// aten::deg2rad_(Tensor(a!) self) -> Tensor(a!) +inline at::Tensor & deg2rad_(at::Tensor & self) { + return at::_ops::deg2rad_::call(self); +} + +// aten::deg2rad.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & deg2rad_out(at::Tensor & out, const at::Tensor & self) { + return at::_ops::deg2rad_out::call(self, out); +} +// aten::deg2rad.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & deg2rad_outf(const at::Tensor & self, at::Tensor & out) { + return at::_ops::deg2rad_out::call(self, out); +} + +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/deg2rad_native.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/deg2rad_native.h new file mode 100644 index 0000000000000000000000000000000000000000..ac9bda3e7365dac016a04475e29a4b5548c8e10d --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/deg2rad_native.h @@ -0,0 +1,34 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace at { +namespace native { +TORCH_API at::Tensor deg2rad(const at::Tensor & self); +TORCH_API at::Tensor & deg2rad_out(const at::Tensor & self, at::Tensor & out); +TORCH_API at::Tensor & deg2rad_(at::Tensor & self); +TORCH_API at::Tensor deg2rad_sparse(const at::Tensor & self); +TORCH_API at::Tensor & deg2rad_sparse_out(const at::Tensor & self, at::Tensor & out); +TORCH_API at::Tensor & deg2rad_sparse_(at::Tensor & self); +TORCH_API at::Tensor deg2rad_sparse_csr(const at::Tensor & self); +TORCH_API at::Tensor & deg2rad_sparse_csr_out(const at::Tensor & self, at::Tensor & out); +TORCH_API at::Tensor & deg2rad_sparse_csr_(at::Tensor & self); +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/dense_dim_native.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/dense_dim_native.h new file mode 100644 index 0000000000000000000000000000000000000000..800e91b31c537c654ddbe8391e8b2a688049d1bd --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/dense_dim_native.h @@ -0,0 +1,28 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace at { +namespace native { +TORCH_API int64_t dense_dim_default(const at::Tensor & self); +TORCH_API int64_t dense_dim_sparse(const at::Tensor & self); +TORCH_API int64_t dense_dim_sparse_csr(const at::Tensor & self); +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/det_ops.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/det_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..d7713245056e10c21701b9a4febf6065ccd0575c --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/det_ops.h @@ -0,0 +1,34 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. 
+// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { +namespace _ops { + + +struct TORCH_API det { + using schema = at::Tensor (const at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::det"; + static constexpr const char* overload_name = ""; + static constexpr const char* schema_str = "det(Tensor self) -> Tensor"; + static at::Tensor call(const at::Tensor & self); + static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self); +}; + +}} // namespace at::_ops + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/detach_copy_native.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/detach_copy_native.h new file mode 100644 index 0000000000000000000000000000000000000000..0acd2c8da3462a7e359f9c58ab13e33c14f8aea6 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/detach_copy_native.h @@ -0,0 +1,27 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace at { +namespace native { +TORCH_API at::Tensor & detach_copy_out(const at::Tensor & self, at::Tensor & out); +TORCH_API at::Tensor detach_copy(const at::Tensor & self); +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/diag_embed_native.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/diag_embed_native.h new file mode 100644 index 0000000000000000000000000000000000000000..fe6aac3dd72c83fdff53d9b54fa0b9b7c69fa987 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/diag_embed_native.h @@ -0,0 +1,27 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace at { +namespace native { +TORCH_API at::Tensor & diag_embed_out(const at::Tensor & self, int64_t offset, int64_t dim1, int64_t dim2, at::Tensor & out); +TORCH_API at::Tensor diag_embed(const at::Tensor & self, int64_t offset=0, int64_t dim1=-2, int64_t dim2=-1); +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/diagflat.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/diagflat.h new file mode 100644 index 0000000000000000000000000000000000000000..cc2144ed54647796567398e1773453bcfbff11f8 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/diagflat.h @@ -0,0 +1,36 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::diagflat(Tensor self, int offset=0) -> Tensor +inline at::Tensor diagflat(const at::Tensor & self, int64_t offset=0) { + return at::_ops::diagflat::call(self, offset); +} + +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/diagflat_compositeimplicitautograd_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/diagflat_compositeimplicitautograd_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..d7f089c31b9a7a2bc910787e1e2a11b9f3ce16a6 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/diagflat_compositeimplicitautograd_dispatch.h @@ -0,0 +1,28 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace compositeimplicitautograd { + +TORCH_API at::Tensor diagflat(const at::Tensor & self, int64_t offset=0); + +} // namespace compositeimplicitautograd +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/diagonal.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/diagonal.h new file mode 100644 index 0000000000000000000000000000000000000000..735807d0e05e68cb2971a618c9284766359320c0 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/diagonal.h @@ -0,0 +1,41 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::diagonal(Tensor(a) self, int offset=0, int dim1=0, int dim2=1) -> Tensor(a) +inline at::Tensor diagonal(const at::Tensor & self, int64_t offset=0, int64_t dim1=0, int64_t dim2=1) { + return at::_ops::diagonal::call(self, offset, dim1, dim2); +} + +// aten::diagonal.Dimname(Tensor(a) self, *, Dimname outdim, Dimname dim1, Dimname dim2, int offset=0) -> Tensor(a) +inline at::Tensor diagonal(const at::Tensor & self, at::Dimname outdim, at::Dimname dim1, at::Dimname dim2, int64_t offset=0) { + return at::_ops::diagonal_Dimname::call(self, outdim, dim1, dim2, offset); +} + +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/diagonal_scatter_compositeexplicitautograd_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/diagonal_scatter_compositeexplicitautograd_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..5392c9721e9270260bad7c9bca9d8d3c8b7a7a69 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/diagonal_scatter_compositeexplicitautograd_dispatch.h @@ -0,0 +1,29 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace compositeexplicitautograd { + +TORCH_API at::Tensor & diagonal_scatter_out(at::Tensor & out, const at::Tensor & self, const at::Tensor & src, int64_t offset=0, int64_t dim1=0, int64_t dim2=1); +TORCH_API at::Tensor & diagonal_scatter_outf(const at::Tensor & self, const at::Tensor & src, int64_t offset, int64_t dim1, int64_t dim2, at::Tensor & out); + +} // namespace compositeexplicitautograd +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/digamma_ops.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/digamma_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..69a2a0bd0a08e3846ca308ca1ee0dc9f2c06c630 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/digamma_ops.h @@ -0,0 +1,56 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { +namespace _ops { + + +struct TORCH_API digamma_ { + using schema = at::Tensor & (at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::digamma_"; + static constexpr const char* overload_name = ""; + static constexpr const char* schema_str = "digamma_(Tensor(a!) self) -> Tensor(a!)"; + static at::Tensor & call(at::Tensor & self); + static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, at::Tensor & self); +}; + +struct TORCH_API digamma_out { + using schema = at::Tensor & (const at::Tensor &, at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::digamma"; + static constexpr const char* overload_name = "out"; + static constexpr const char* schema_str = "digamma.out(Tensor self, *, Tensor(a!) 
out) -> Tensor(a!)"; + static at::Tensor & call(const at::Tensor & self, at::Tensor & out); + static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::Tensor & out); +}; + +struct TORCH_API digamma { + using schema = at::Tensor (const at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::digamma"; + static constexpr const char* overload_name = ""; + static constexpr const char* schema_str = "digamma(Tensor self) -> Tensor"; + static at::Tensor call(const at::Tensor & self); + static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self); +}; + +}} // namespace at::_ops + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/dist_native.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/dist_native.h new file mode 100644 index 0000000000000000000000000000000000000000..18f35fb315fed3ccb409103b56e218bbf99c1b90 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/dist_native.h @@ -0,0 +1,27 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace at { +namespace native { +TORCH_API at::Tensor dist(const at::Tensor & self, const at::Tensor & other, const at::Scalar & p=2); +TORCH_API at::Tensor & dist_out(const at::Tensor & self, const at::Tensor & other, const at::Scalar & p, at::Tensor & out); +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either 
TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/dot.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/dot.h new file mode 100644 index 0000000000000000000000000000000000000000..6ea8c35534d0902371f8f13362fee421c293bca0 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/dot.h @@ -0,0 +1,45 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::dot(Tensor self, Tensor tensor) -> Tensor +inline at::Tensor dot(const at::Tensor & self, const at::Tensor & tensor) { + return at::_ops::dot::call(self, tensor); +} + +// aten::dot.out(Tensor self, Tensor tensor, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & dot_out(at::Tensor & out, const at::Tensor & self, const at::Tensor & tensor) { + return at::_ops::dot_out::call(self, tensor, out); +} +// aten::dot.out(Tensor self, Tensor tensor, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & dot_outf(const at::Tensor & self, const at::Tensor & tensor, at::Tensor & out) { + return at::_ops::dot_out::call(self, tensor, out); +} + +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/dsplit_native.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/dsplit_native.h new file mode 100644 index 0000000000000000000000000000000000000000..b0d6c3faed375e5b7269278b1bd9e8d9aaaea7f4 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/dsplit_native.h @@ -0,0 +1,27 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace at { +namespace native { +TORCH_API ::std::vector dsplit(const at::Tensor & self, int64_t sections); +TORCH_API ::std::vector dsplit(const at::Tensor & self, at::IntArrayRef indices); +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/elu_backward_compositeexplicitautogradnonfunctional_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/elu_backward_compositeexplicitautogradnonfunctional_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..2f5fc16279306cac7921d3676d1f94f7aa115c03 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/elu_backward_compositeexplicitautogradnonfunctional_dispatch.h @@ -0,0 +1,28 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace compositeexplicitautogradnonfunctional { + +TORCH_API at::Tensor elu_backward(const at::Tensor & grad_output, const at::Scalar & alpha, const at::Scalar & scale, const at::Scalar & input_scale, bool is_result, const at::Tensor & self_or_result); + +} // namespace compositeexplicitautogradnonfunctional +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/elu_meta_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/elu_meta_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..a6ea1a9b60c8f126f9c793f0e4f64049c550bd9b --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/elu_meta_dispatch.h @@ -0,0 +1,31 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace meta { + +TORCH_API at::Tensor elu(const at::Tensor & self, const at::Scalar & alpha=1, const at::Scalar & scale=1, const at::Scalar & input_scale=1); +TORCH_API at::Tensor & elu_out(at::Tensor & out, const at::Tensor & self, const at::Scalar & alpha=1, const at::Scalar & scale=1, const at::Scalar & input_scale=1); +TORCH_API at::Tensor & elu_outf(const at::Tensor & self, const at::Scalar & alpha, const at::Scalar & scale, const at::Scalar & input_scale, at::Tensor & out); +TORCH_API at::Tensor & elu_(at::Tensor & self, const at::Scalar & alpha=1, const at::Scalar & scale=1, const at::Scalar & input_scale=1); + +} // namespace meta +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/embedding_bag_compositeimplicitautograd_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/embedding_bag_compositeimplicitautograd_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..cd819d0be0b7e71b3ffedf44dfb23aeab1a81a74 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/embedding_bag_compositeimplicitautograd_dispatch.h @@ -0,0 +1,29 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace compositeimplicitautograd { + +TORCH_API ::std::tuple embedding_bag(const at::Tensor & weight, const at::Tensor & indices, const at::Tensor & offsets, bool scale_grad_by_freq=false, int64_t mode=0, bool sparse=false, const ::std::optional & per_sample_weights={}, bool include_last_offset=false); +TORCH_API ::std::tuple embedding_bag(const at::Tensor & weight, const at::Tensor & indices, const at::Tensor & offsets, bool scale_grad_by_freq, int64_t mode, bool sparse, const ::std::optional & per_sample_weights, bool include_last_offset, ::std::optional padding_idx); + +} // namespace compositeimplicitautograd +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/embedding_compositeexplicitautograd_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/embedding_compositeexplicitautograd_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..21713e5d3a9f9fd9dcd69e96c59494e5f26c2e61 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/embedding_compositeexplicitautograd_dispatch.h @@ -0,0 +1,33 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. 
+#include + +namespace at { + +namespace compositeexplicitautograd { + +TORCH_API at::Tensor embedding(const at::Tensor & weight, const at::Tensor & indices, int64_t padding_idx=-1, bool scale_grad_by_freq=false, bool sparse=false); +TORCH_API at::Tensor embedding_symint(const at::Tensor & weight, const at::Tensor & indices, c10::SymInt padding_idx=-1, bool scale_grad_by_freq=false, bool sparse=false); +TORCH_API at::Tensor & embedding_out(at::Tensor & out, const at::Tensor & weight, const at::Tensor & indices, int64_t padding_idx=-1, bool scale_grad_by_freq=false, bool sparse=false); +TORCH_API at::Tensor & embedding_outf(const at::Tensor & weight, const at::Tensor & indices, int64_t padding_idx, bool scale_grad_by_freq, bool sparse, at::Tensor & out); +TORCH_API at::Tensor & embedding_symint_out(at::Tensor & out, const at::Tensor & weight, const at::Tensor & indices, c10::SymInt padding_idx=-1, bool scale_grad_by_freq=false, bool sparse=false); +TORCH_API at::Tensor & embedding_symint_outf(const at::Tensor & weight, const at::Tensor & indices, c10::SymInt padding_idx, bool scale_grad_by_freq, bool sparse, at::Tensor & out); + +} // namespace compositeexplicitautograd +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/embedding_dense_backward_cpu_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/embedding_dense_backward_cpu_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..3d2b5e43f834809d27a3eac122abc30aab84e672 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/embedding_dense_backward_cpu_dispatch.h @@ -0,0 +1,29 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace cpu { + +TORCH_API at::Tensor embedding_dense_backward(const at::Tensor & grad_output, const at::Tensor & indices, int64_t num_weights, int64_t padding_idx, bool scale_grad_by_freq); +TORCH_API at::Tensor embedding_dense_backward_symint(const at::Tensor & grad_output, const at::Tensor & indices, c10::SymInt num_weights, c10::SymInt padding_idx, bool scale_grad_by_freq); + +} // namespace cpu +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/embedding_dense_backward_cuda_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/embedding_dense_backward_cuda_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..bc92ebcd6a0def6aec7ebbe1ab423c50f147274d --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/embedding_dense_backward_cuda_dispatch.h @@ -0,0 +1,29 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace cuda { + +TORCH_API at::Tensor embedding_dense_backward(const at::Tensor & grad_output, const at::Tensor & indices, int64_t num_weights, int64_t padding_idx, bool scale_grad_by_freq); +TORCH_API at::Tensor embedding_dense_backward_symint(const at::Tensor & grad_output, const at::Tensor & indices, c10::SymInt num_weights, c10::SymInt padding_idx, bool scale_grad_by_freq); + +} // namespace cuda +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/empty_like.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/empty_like.h new file mode 100644 index 0000000000000000000000000000000000000000..7b8982b69d3496f5ddc131277b4b8397f4fef01c --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/empty_like.h @@ -0,0 +1,49 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::empty_like(Tensor self, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, MemoryFormat? memory_format=None) -> Tensor +inline at::Tensor empty_like(const at::Tensor & self, at::TensorOptions options={}, ::std::optional memory_format=::std::nullopt) { + return at::_ops::empty_like::call(self, c10::optTypeMetaToScalarType(options.dtype_opt()), options.layout_opt(), options.device_opt(), options.pinned_memory_opt(), c10::impl::check_tensor_options_and_extract_memory_format(options, memory_format)); +} +// aten::empty_like(Tensor self, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, MemoryFormat? memory_format=None) -> Tensor +inline at::Tensor empty_like(const at::Tensor & self, ::std::optional dtype, ::std::optional layout, ::std::optional device, ::std::optional pin_memory, ::std::optional memory_format) { + return at::_ops::empty_like::call(self, dtype, layout, device, pin_memory, memory_format); +} + +// aten::empty_like.out(Tensor self, *, MemoryFormat? memory_format=None, Tensor(a!) out) -> Tensor(a!) 
+inline at::Tensor & empty_like_out(at::Tensor & out, const at::Tensor & self, ::std::optional memory_format=::std::nullopt) { + return at::_ops::empty_like_out::call(self, memory_format, out); +} +// aten::empty_like.out(Tensor self, *, MemoryFormat? memory_format=None, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & empty_like_outf(const at::Tensor & self, ::std::optional memory_format, at::Tensor & out) { + return at::_ops::empty_like_out::call(self, memory_format, out); +} + +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/empty_native.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/empty_native.h new file mode 100644 index 0000000000000000000000000000000000000000..980e4eba6da46cd20ea1a4a0b956a96ea48902c7 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/empty_native.h @@ -0,0 +1,37 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace at { +namespace native { +TORCH_API at::Tensor empty_names(at::IntArrayRef size, ::std::optional names, ::std::optional dtype={}, ::std::optional layout={}, ::std::optional device={}, ::std::optional pin_memory={}, ::std::optional memory_format=::std::nullopt); +TORCH_API at::Tensor & empty_names_out(at::IntArrayRef size, ::std::optional names, ::std::optional memory_format, at::Tensor & out); +TORCH_API at::Tensor & empty_out(at::IntArrayRef size, ::std::optional memory_format, at::Tensor & out); +TORCH_API at::Tensor empty_cpu(at::IntArrayRef size, ::std::optional dtype={}, ::std::optional layout={}, 
::std::optional device={}, ::std::optional pin_memory={}, ::std::optional memory_format=::std::nullopt); +TORCH_API at::Tensor empty_cuda(at::IntArrayRef size, ::std::optional dtype={}, ::std::optional layout={}, ::std::optional device={}, ::std::optional pin_memory={}, ::std::optional memory_format=::std::nullopt); +TORCH_API at::Tensor empty_meta_symint(c10::SymIntArrayRef size, ::std::optional dtype={}, ::std::optional layout={}, ::std::optional device={}, ::std::optional pin_memory={}, ::std::optional memory_format=::std::nullopt); +TORCH_API at::Tensor empty_sparse(at::IntArrayRef size, ::std::optional dtype={}, ::std::optional layout={}, ::std::optional device={}, ::std::optional pin_memory={}, ::std::optional memory_format=::std::nullopt); +TORCH_API at::Tensor empty_sparse_compressed(at::IntArrayRef size, ::std::optional dtype={}, ::std::optional layout={}, ::std::optional device={}, ::std::optional pin_memory={}, ::std::optional memory_format=::std::nullopt); +TORCH_API at::Tensor empty_sparse_compressed_symint(c10::SymIntArrayRef size, ::std::optional dtype={}, ::std::optional layout={}, ::std::optional device={}, ::std::optional pin_memory={}, ::std::optional memory_format=::std::nullopt); +TORCH_API at::Tensor empty_sparse_symint(c10::SymIntArrayRef size, ::std::optional dtype={}, ::std::optional layout={}, ::std::optional device={}, ::std::optional pin_memory={}, ::std::optional memory_format=::std::nullopt); +TORCH_API at::Tensor empty_mkldnn(at::IntArrayRef size, ::std::optional dtype={}, ::std::optional layout={}, ::std::optional device={}, ::std::optional pin_memory={}, ::std::optional memory_format=::std::nullopt); +TORCH_API at::Tensor empty_unknown_quantized(at::IntArrayRef size, ::std::optional dtype={}, ::std::optional layout={}, ::std::optional device={}, ::std::optional pin_memory={}, ::std::optional memory_format=::std::nullopt); +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either 
TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/empty_quantized_ops.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/empty_quantized_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..67383f4182e3e68c8a4585a8321ca5d2b60b4d88 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/empty_quantized_ops.h @@ -0,0 +1,45 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { +namespace _ops { + + +struct TORCH_API empty_quantized { + using schema = at::Tensor (at::IntArrayRef, const at::Tensor &, ::std::optional, ::std::optional, ::std::optional, ::std::optional, ::std::optional); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::empty_quantized"; + static constexpr const char* overload_name = ""; + static constexpr const char* schema_str = "empty_quantized(int[] size, Tensor qtensor, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, MemoryFormat? 
memory_format=None) -> Tensor"; + static at::Tensor call(at::IntArrayRef size, const at::Tensor & qtensor, ::std::optional dtype, ::std::optional layout, ::std::optional device, ::std::optional pin_memory, ::std::optional memory_format); + static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, at::IntArrayRef size, const at::Tensor & qtensor, ::std::optional dtype, ::std::optional layout, ::std::optional device, ::std::optional pin_memory, ::std::optional memory_format); +}; + +struct TORCH_API empty_quantized_out { + using schema = at::Tensor & (at::IntArrayRef, const at::Tensor &, ::std::optional, at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::empty_quantized"; + static constexpr const char* overload_name = "out"; + static constexpr const char* schema_str = "empty_quantized.out(int[] size, Tensor qtensor, *, MemoryFormat? memory_format=None, Tensor(a!) out) -> Tensor(a!)"; + static at::Tensor & call(at::IntArrayRef size, const at::Tensor & qtensor, ::std::optional memory_format, at::Tensor & out); + static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, at::IntArrayRef size, const at::Tensor & qtensor, ::std::optional memory_format, at::Tensor & out); +}; + +}} // namespace at::_ops + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/erfinv_cuda_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/erfinv_cuda_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..39c08c8b876b7adde4a22ce6eaa180d0ee2beac0 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/erfinv_cuda_dispatch.h @@ -0,0 +1,31 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace cuda { + +TORCH_API at::Tensor erfinv(const at::Tensor & self); +TORCH_API at::Tensor & erfinv_out(at::Tensor & out, const at::Tensor & self); +TORCH_API at::Tensor & erfinv_outf(const at::Tensor & self, at::Tensor & out); +TORCH_API at::Tensor & erfinv_(at::Tensor & self); + +} // namespace cuda +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/erfinv_ops.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/erfinv_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..bfef84efa59cd02e3627d1992ce4e4198760198c --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/erfinv_ops.h @@ -0,0 +1,56 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { +namespace _ops { + + +struct TORCH_API erfinv { + using schema = at::Tensor (const at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::erfinv"; + static constexpr const char* overload_name = ""; + static constexpr const char* schema_str = "erfinv(Tensor self) -> Tensor"; + static at::Tensor call(const at::Tensor & self); + static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self); +}; + +struct TORCH_API erfinv_ { + using schema = at::Tensor & (at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::erfinv_"; + static constexpr const char* overload_name = ""; + static constexpr const char* schema_str = "erfinv_(Tensor(a!) 
self) -> Tensor(a!)"; + static at::Tensor & call(at::Tensor & self); + static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, at::Tensor & self); +}; + +struct TORCH_API erfinv_out { + using schema = at::Tensor & (const at::Tensor &, at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::erfinv"; + static constexpr const char* overload_name = "out"; + static constexpr const char* schema_str = "erfinv.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)"; + static at::Tensor & call(const at::Tensor & self, at::Tensor & out); + static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::Tensor & out); +}; + +}} // namespace at::_ops + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/exp2_cpu_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/exp2_cpu_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..81c20d6e30fe5396ad1e0e38400bd0b84522adf2 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/exp2_cpu_dispatch.h @@ -0,0 +1,31 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. 
+// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace cpu { + +TORCH_API at::Tensor exp2(const at::Tensor & self); +TORCH_API at::Tensor & exp2_out(at::Tensor & out, const at::Tensor & self); +TORCH_API at::Tensor & exp2_outf(const at::Tensor & self, at::Tensor & out); +TORCH_API at::Tensor & exp2_(at::Tensor & self); + +} // namespace cpu +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/exp_cpu_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/exp_cpu_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..ce09b1be18ae3e7d45413c0b99613a333a1b5549 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/exp_cpu_dispatch.h @@ -0,0 +1,31 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. 
+#include + +namespace at { + +namespace cpu { + +TORCH_API at::Tensor exp(const at::Tensor & self); +TORCH_API at::Tensor & exp_out(at::Tensor & out, const at::Tensor & self); +TORCH_API at::Tensor & exp_outf(const at::Tensor & self, at::Tensor & out); +TORCH_API at::Tensor & exp_(at::Tensor & self); + +} // namespace cpu +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/exp_meta_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/exp_meta_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..af60ecafab90e9221ecc1bcf37d99840026c7cdb --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/exp_meta_dispatch.h @@ -0,0 +1,31 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. 
+#include + +namespace at { + +namespace meta { + +TORCH_API at::Tensor exp(const at::Tensor & self); +TORCH_API at::Tensor & exp_out(at::Tensor & out, const at::Tensor & self); +TORCH_API at::Tensor & exp_outf(const at::Tensor & self, at::Tensor & out); +TORCH_API at::Tensor & exp_(at::Tensor & self); + +} // namespace meta +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/expand_copy_ops.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/expand_copy_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..581df8040dd9513700019cee8f345edd61652336 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/expand_copy_ops.h @@ -0,0 +1,45 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. 
+#include + +namespace at { +namespace _ops { + + +struct TORCH_API expand_copy { + using schema = at::Tensor (const at::Tensor &, c10::SymIntArrayRef, bool); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::expand_copy"; + static constexpr const char* overload_name = ""; + static constexpr const char* schema_str = "expand_copy(Tensor self, SymInt[] size, *, bool implicit=False) -> Tensor"; + static at::Tensor call(const at::Tensor & self, c10::SymIntArrayRef size, bool implicit); + static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, c10::SymIntArrayRef size, bool implicit); +}; + +struct TORCH_API expand_copy_out { + using schema = at::Tensor & (const at::Tensor &, c10::SymIntArrayRef, bool, at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::expand_copy"; + static constexpr const char* overload_name = "out"; + static constexpr const char* schema_str = "expand_copy.out(Tensor self, SymInt[] size, *, bool implicit=False, Tensor(a!) out) -> Tensor(a!)"; + static at::Tensor & call(const at::Tensor & self, c10::SymIntArrayRef size, bool implicit, at::Tensor & out); + static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, c10::SymIntArrayRef size, bool implicit, at::Tensor & out); +}; + +}} // namespace at::_ops + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/expm1_cpu_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/expm1_cpu_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..0964acdd234bfb461002aa5e2a762c87aea08e35 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/expm1_cpu_dispatch.h @@ -0,0 +1,31 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace cpu { + +TORCH_API at::Tensor expm1(const at::Tensor & self); +TORCH_API at::Tensor & expm1_out(at::Tensor & out, const at::Tensor & self); +TORCH_API at::Tensor & expm1_outf(const at::Tensor & self, at::Tensor & out); +TORCH_API at::Tensor & expm1_(at::Tensor & self); + +} // namespace cpu +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/fake_quantize_per_channel_affine_native.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/fake_quantize_per_channel_affine_native.h new file mode 100644 index 0000000000000000000000000000000000000000..4bfe980765965238ab75a62d21e1e29654b882e9 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/fake_quantize_per_channel_affine_native.h @@ -0,0 +1,26 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace at { +namespace native { +TORCH_API at::Tensor fake_quantize_per_channel_affine(const at::Tensor & self, const at::Tensor & scale, const at::Tensor & zero_point, int64_t axis, int64_t quant_min, int64_t quant_max); +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/fake_quantize_per_tensor_affine_cachemask_backward_compositeimplicitautograd_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/fake_quantize_per_tensor_affine_cachemask_backward_compositeimplicitautograd_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..ff4058232fb05b34ede10e0fab94fe5fd9eddf26 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/fake_quantize_per_tensor_affine_cachemask_backward_compositeimplicitautograd_dispatch.h @@ -0,0 +1,28 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace compositeimplicitautograd { + +TORCH_API at::Tensor fake_quantize_per_tensor_affine_cachemask_backward(const at::Tensor & grad, const at::Tensor & mask); + +} // namespace compositeimplicitautograd +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/fake_quantize_per_tensor_affine_cachemask_backward_ops.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/fake_quantize_per_tensor_affine_cachemask_backward_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..8d2f3ddfb40da53685803a233d3f4dedc119694a --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/fake_quantize_per_tensor_affine_cachemask_backward_ops.h @@ -0,0 +1,34 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { +namespace _ops { + + +struct TORCH_API fake_quantize_per_tensor_affine_cachemask_backward { + using schema = at::Tensor (const at::Tensor &, const at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::fake_quantize_per_tensor_affine_cachemask_backward"; + static constexpr const char* overload_name = ""; + static constexpr const char* schema_str = "fake_quantize_per_tensor_affine_cachemask_backward(Tensor grad, Tensor mask) -> Tensor"; + static at::Tensor call(const at::Tensor & grad, const at::Tensor & mask); + static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad, const at::Tensor & mask); +}; + +}} // namespace at::_ops + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/fake_quantize_per_tensor_affine_compositeimplicitautograd_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/fake_quantize_per_tensor_affine_compositeimplicitautograd_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..8979505e2c05a6d54f77cb6df814a9f7b3395b29 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/fake_quantize_per_tensor_affine_compositeimplicitautograd_dispatch.h @@ -0,0 +1,29 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace compositeimplicitautograd { + +TORCH_API at::Tensor fake_quantize_per_tensor_affine(const at::Tensor & self, double scale, int64_t zero_point, int64_t quant_min, int64_t quant_max); +TORCH_API at::Tensor fake_quantize_per_tensor_affine(const at::Tensor & self, const at::Tensor & scale, const at::Tensor & zero_point, int64_t quant_min, int64_t quant_max); + +} // namespace compositeimplicitautograd +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/fake_quantize_per_tensor_affine_native.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/fake_quantize_per_tensor_affine_native.h new file mode 100644 index 0000000000000000000000000000000000000000..28d81d24da0e4de38ed313f2557adeccb4cfccc5 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/fake_quantize_per_tensor_affine_native.h @@ -0,0 +1,27 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace at { +namespace native { +TORCH_API at::Tensor fake_quantize_per_tensor_affine(const at::Tensor & self, double scale, int64_t zero_point, int64_t quant_min, int64_t quant_max); +TORCH_API at::Tensor fake_quantize_per_tensor_affine(const at::Tensor & self, const at::Tensor & scale, const at::Tensor & zero_point, int64_t quant_min, int64_t quant_max); +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/fbgemm_linear_fp16_weight_fp32_activation.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/fbgemm_linear_fp16_weight_fp32_activation.h new file mode 100644 index 0000000000000000000000000000000000000000..e6a9e94d85d885bd05800ecb08f6898c34ab4f1c --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/fbgemm_linear_fp16_weight_fp32_activation.h @@ -0,0 +1,41 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::fbgemm_linear_fp16_weight_fp32_activation(Tensor input, Tensor packed_weight, Tensor? bias) -> Tensor +inline at::Tensor fbgemm_linear_fp16_weight_fp32_activation(const at::Tensor & input, const at::Tensor & packed_weight, const ::std::optional & bias) { + return at::_ops::fbgemm_linear_fp16_weight_fp32_activation::call(input, packed_weight, bias); +} + +// aten::fbgemm_linear_fp16_weight_fp32_activation.out(Tensor input, Tensor packed_weight, Tensor? bias, Tensor(a!) output) -> Tensor +inline at::Tensor fbgemm_linear_fp16_weight_fp32_activation(const at::Tensor & input, const at::Tensor & packed_weight, const ::std::optional & bias, at::Tensor & output) { + return at::_ops::fbgemm_linear_fp16_weight_fp32_activation_out::call(input, packed_weight, bias, output); +} + +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/fbgemm_linear_fp16_weight_fp32_activation_compositeimplicitautograd_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/fbgemm_linear_fp16_weight_fp32_activation_compositeimplicitautograd_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..86e2fd43781caf736111ac9d4d0fd8a6e7003cb6 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/fbgemm_linear_fp16_weight_fp32_activation_compositeimplicitautograd_dispatch.h @@ -0,0 +1,29 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace compositeimplicitautograd { + +TORCH_API at::Tensor fbgemm_linear_fp16_weight_fp32_activation(const at::Tensor & input, const at::Tensor & packed_weight, const ::std::optional & bias); +TORCH_API at::Tensor fbgemm_linear_fp16_weight_fp32_activation(const at::Tensor & input, const at::Tensor & packed_weight, const ::std::optional & bias, at::Tensor & output); + +} // namespace compositeimplicitautograd +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/fbgemm_linear_int8_weight_compositeimplicitautograd_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/fbgemm_linear_int8_weight_compositeimplicitautograd_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..c2ad51544fb2f90b36625760d19445bf7bfc6552 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/fbgemm_linear_int8_weight_compositeimplicitautograd_dispatch.h @@ -0,0 +1,28 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace compositeimplicitautograd { + +TORCH_API at::Tensor fbgemm_linear_int8_weight(const at::Tensor & input, const at::Tensor & weight, const at::Tensor & packed, const at::Tensor & col_offsets, const at::Scalar & weight_scale, const at::Scalar & weight_zero_point, const at::Tensor & bias); + +} // namespace compositeimplicitautograd +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/fbgemm_linear_int8_weight_fp32_activation.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/fbgemm_linear_int8_weight_fp32_activation.h new file mode 100644 index 0000000000000000000000000000000000000000..3c47277ef771aef0358daa7b496992f26be68f83 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/fbgemm_linear_int8_weight_fp32_activation.h @@ -0,0 +1,36 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::fbgemm_linear_int8_weight_fp32_activation(Tensor input, Tensor weight, Tensor packed, Tensor col_offsets, Scalar weight_scale, Scalar weight_zero_point, Tensor bias) -> Tensor +inline at::Tensor fbgemm_linear_int8_weight_fp32_activation(const at::Tensor & input, const at::Tensor & weight, const at::Tensor & packed, const at::Tensor & col_offsets, const at::Scalar & weight_scale, const at::Scalar & weight_zero_point, const at::Tensor & bias) { + return at::_ops::fbgemm_linear_int8_weight_fp32_activation::call(input, weight, packed, col_offsets, weight_scale, weight_zero_point, bias); +} + +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/fbgemm_pack_gemm_matrix_fp16_ops.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/fbgemm_pack_gemm_matrix_fp16_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..d948c208033b013bc7e524a59e6b280b588d12b2 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/fbgemm_pack_gemm_matrix_fp16_ops.h @@ -0,0 +1,34 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { +namespace _ops { + + +struct TORCH_API fbgemm_pack_gemm_matrix_fp16 { + using schema = at::Tensor (const at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::fbgemm_pack_gemm_matrix_fp16"; + static constexpr const char* overload_name = ""; + static constexpr const char* schema_str = "fbgemm_pack_gemm_matrix_fp16(Tensor input) -> Tensor"; + static at::Tensor call(const at::Tensor & input); + static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & input); +}; + +}} // namespace at::_ops + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/fbgemm_pack_quantized_matrix_native.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/fbgemm_pack_quantized_matrix_native.h new file mode 100644 index 0000000000000000000000000000000000000000..5a7cec4862145d98830fb8ea80ca7fa7377521dd --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/fbgemm_pack_quantized_matrix_native.h @@ -0,0 +1,27 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace at { +namespace native { +TORCH_API at::Tensor fbgemm_pack_quantized_matrix(const at::Tensor & input); +TORCH_API at::Tensor fbgemm_pack_quantized_matrix(const at::Tensor & input, int64_t K, int64_t N); +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/feature_alpha_dropout_native.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/feature_alpha_dropout_native.h new file mode 100644 index 0000000000000000000000000000000000000000..718b676ef5933630bacb8e5c09bb7e35462973d9 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/feature_alpha_dropout_native.h @@ -0,0 +1,27 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace at { +namespace native { +TORCH_API at::Tensor feature_alpha_dropout(const at::Tensor & input, double p, bool train); +TORCH_API at::Tensor & feature_alpha_dropout_(at::Tensor & self, double p, bool train); +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/feature_dropout_ops.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/feature_dropout_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..958d13e61d6650d02a6df5cf68e62b46c852ef19 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/feature_dropout_ops.h @@ -0,0 +1,45 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. 
+// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { +namespace _ops { + + +struct TORCH_API feature_dropout { + using schema = at::Tensor (const at::Tensor &, double, bool); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::feature_dropout"; + static constexpr const char* overload_name = ""; + static constexpr const char* schema_str = "feature_dropout(Tensor input, float p, bool train) -> Tensor"; + static at::Tensor call(const at::Tensor & input, double p, bool train); + static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & input, double p, bool train); +}; + +struct TORCH_API feature_dropout_ { + using schema = at::Tensor & (at::Tensor &, double, bool); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::feature_dropout_"; + static constexpr const char* overload_name = ""; + static constexpr const char* schema_str = "feature_dropout_(Tensor(a!) self, float p, bool train) -> Tensor(a!)"; + static at::Tensor & call(at::Tensor & self, double p, bool train); + static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, at::Tensor & self, double p, bool train); +}; + +}} // namespace at::_ops + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/fft_fft_compositeimplicitautograd_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/fft_fft_compositeimplicitautograd_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..9a16adcc54e01f8b63f407c8f8168312b4c154d1 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/fft_fft_compositeimplicitautograd_dispatch.h @@ -0,0 +1,33 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. 
+#include + +namespace at { + +namespace compositeimplicitautograd { + +TORCH_API at::Tensor fft_fft(const at::Tensor & self, ::std::optional n=::std::nullopt, int64_t dim=-1, ::std::optional norm=::std::nullopt); +TORCH_API at::Tensor fft_fft_symint(const at::Tensor & self, ::std::optional n=::std::nullopt, int64_t dim=-1, ::std::optional norm=::std::nullopt); +TORCH_API at::Tensor & fft_fft_out(at::Tensor & out, const at::Tensor & self, ::std::optional n=::std::nullopt, int64_t dim=-1, ::std::optional norm=::std::nullopt); +TORCH_API at::Tensor & fft_fft_outf(const at::Tensor & self, ::std::optional n, int64_t dim, ::std::optional norm, at::Tensor & out); +TORCH_API at::Tensor & fft_fft_symint_out(at::Tensor & out, const at::Tensor & self, ::std::optional n=::std::nullopt, int64_t dim=-1, ::std::optional norm=::std::nullopt); +TORCH_API at::Tensor & fft_fft_symint_outf(const at::Tensor & self, ::std::optional n, int64_t dim, ::std::optional norm, at::Tensor & out); + +} // namespace compositeimplicitautograd +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/fft_fftshift_native.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/fft_fftshift_native.h new file mode 100644 index 0000000000000000000000000000000000000000..ff6019ee718a61dbc339696f9eb50548a853740a --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/fft_fftshift_native.h @@ -0,0 +1,26 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace at { +namespace native { +TORCH_API at::Tensor fft_fftshift(const at::Tensor & self, at::OptionalIntArrayRef dim=::std::nullopt); +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/fft_ifft_ops.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/fft_ifft_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..8215d4f7225352dfff71e5a750f2e7bb9f50bba0 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/fft_ifft_ops.h @@ -0,0 +1,45 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. 
+// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { +namespace _ops { + + +struct TORCH_API fft_ifft { + using schema = at::Tensor (const at::Tensor &, ::std::optional, int64_t, ::std::optional); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::fft_ifft"; + static constexpr const char* overload_name = ""; + static constexpr const char* schema_str = "fft_ifft(Tensor self, SymInt? n=None, int dim=-1, str? norm=None) -> Tensor"; + static at::Tensor call(const at::Tensor & self, ::std::optional n, int64_t dim, ::std::optional norm); + static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, ::std::optional n, int64_t dim, ::std::optional norm); +}; + +struct TORCH_API fft_ifft_out { + using schema = at::Tensor & (const at::Tensor &, ::std::optional, int64_t, ::std::optional, at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::fft_ifft"; + static constexpr const char* overload_name = "out"; + static constexpr const char* schema_str = "fft_ifft.out(Tensor self, SymInt? n=None, int dim=-1, str? norm=None, *, Tensor(a!) out) -> Tensor(a!)"; + static at::Tensor & call(const at::Tensor & self, ::std::optional n, int64_t dim, ::std::optional norm, at::Tensor & out); + static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, ::std::optional n, int64_t dim, ::std::optional norm, at::Tensor & out); +}; + +}} // namespace at::_ops + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/fft_ihfft2_ops.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/fft_ihfft2_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..aadc93a59ef11617e7361c5d3e92b431a1964241 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/fft_ihfft2_ops.h @@ -0,0 +1,45 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { +namespace _ops { + + +struct TORCH_API fft_ihfft2 { + using schema = at::Tensor (const at::Tensor &, at::OptionalSymIntArrayRef, at::IntArrayRef, ::std::optional); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::fft_ihfft2"; + static constexpr const char* overload_name = ""; + static constexpr const char* schema_str = "fft_ihfft2(Tensor self, SymInt[1]? s=None, int[1] dim=[-2,-1], str? 
norm=None) -> Tensor"; + static at::Tensor call(const at::Tensor & self, at::OptionalSymIntArrayRef s, at::IntArrayRef dim, ::std::optional norm); + static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::OptionalSymIntArrayRef s, at::IntArrayRef dim, ::std::optional norm); +}; + +struct TORCH_API fft_ihfft2_out { + using schema = at::Tensor & (const at::Tensor &, at::OptionalSymIntArrayRef, at::IntArrayRef, ::std::optional, at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::fft_ihfft2"; + static constexpr const char* overload_name = "out"; + static constexpr const char* schema_str = "fft_ihfft2.out(Tensor self, SymInt[1]? s=None, int[1] dim=[-2,-1], str? norm=None, *, Tensor(a!) out) -> Tensor(a!)"; + static at::Tensor & call(const at::Tensor & self, at::OptionalSymIntArrayRef s, at::IntArrayRef dim, ::std::optional norm, at::Tensor & out); + static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::OptionalSymIntArrayRef s, at::IntArrayRef dim, ::std::optional norm, at::Tensor & out); +}; + +}} // namespace at::_ops + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/fft_irfft2.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/fft_irfft2.h new file mode 100644 index 0000000000000000000000000000000000000000..023eecf1e816c9230adcd8645a71a2de15da30d3 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/fft_irfft2.h @@ -0,0 +1,97 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::fft_irfft2(Tensor self, SymInt[1]? s=None, int[1] dim=[-2,-1], str? norm=None) -> Tensor +inline at::Tensor fft_irfft2(const at::Tensor & self, at::OptionalIntArrayRef s=::std::nullopt, at::IntArrayRef dim={-2,-1}, ::std::optional norm=::std::nullopt) { + return at::_ops::fft_irfft2::call(self, s.has_value() ? ::std::make_optional(c10::fromIntArrayRefSlow(*s)) : ::std::nullopt, dim, norm); +} +namespace symint { + template >> + at::Tensor fft_irfft2(const at::Tensor & self, at::OptionalIntArrayRef s=::std::nullopt, at::IntArrayRef dim={-2,-1}, ::std::optional norm=::std::nullopt) { + return at::_ops::fft_irfft2::call(self, s.has_value() ? ::std::make_optional(c10::fromIntArrayRefSlow(*s)) : ::std::nullopt, dim, norm); + } +} + +// aten::fft_irfft2(Tensor self, SymInt[1]? s=None, int[1] dim=[-2,-1], str? 
norm=None) -> Tensor +inline at::Tensor fft_irfft2_symint(const at::Tensor & self, at::OptionalSymIntArrayRef s=::std::nullopt, at::IntArrayRef dim={-2,-1}, ::std::optional norm=::std::nullopt) { + return at::_ops::fft_irfft2::call(self, s, dim, norm); +} +namespace symint { + template >> + at::Tensor fft_irfft2(const at::Tensor & self, at::OptionalSymIntArrayRef s=::std::nullopt, at::IntArrayRef dim={-2,-1}, ::std::optional norm=::std::nullopt) { + return at::_ops::fft_irfft2::call(self, s, dim, norm); + } +} + +// aten::fft_irfft2.out(Tensor self, SymInt[1]? s=None, int[1] dim=[-2,-1], str? norm=None, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & fft_irfft2_out(at::Tensor & out, const at::Tensor & self, at::OptionalIntArrayRef s=::std::nullopt, at::IntArrayRef dim={-2,-1}, ::std::optional norm=::std::nullopt) { + return at::_ops::fft_irfft2_out::call(self, s.has_value() ? ::std::make_optional(c10::fromIntArrayRefSlow(*s)) : ::std::nullopt, dim, norm, out); +} +namespace symint { + template >> + at::Tensor & fft_irfft2_out(at::Tensor & out, const at::Tensor & self, at::OptionalIntArrayRef s=::std::nullopt, at::IntArrayRef dim={-2,-1}, ::std::optional norm=::std::nullopt) { + return at::_ops::fft_irfft2_out::call(self, s.has_value() ? ::std::make_optional(c10::fromIntArrayRefSlow(*s)) : ::std::nullopt, dim, norm, out); + } +} + +// aten::fft_irfft2.out(Tensor self, SymInt[1]? s=None, int[1] dim=[-2,-1], str? norm=None, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & fft_irfft2_outf(const at::Tensor & self, at::OptionalIntArrayRef s, at::IntArrayRef dim, ::std::optional norm, at::Tensor & out) { + return at::_ops::fft_irfft2_out::call(self, s.has_value() ? 
::std::make_optional(c10::fromIntArrayRefSlow(*s)) : ::std::nullopt, dim, norm, out); +} +namespace symint { + template >> + at::Tensor & fft_irfft2_outf(const at::Tensor & self, at::OptionalIntArrayRef s, at::IntArrayRef dim, ::std::optional norm, at::Tensor & out) { + return at::_ops::fft_irfft2_out::call(self, s.has_value() ? ::std::make_optional(c10::fromIntArrayRefSlow(*s)) : ::std::nullopt, dim, norm, out); + } +} + +// aten::fft_irfft2.out(Tensor self, SymInt[1]? s=None, int[1] dim=[-2,-1], str? norm=None, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & fft_irfft2_symint_out(at::Tensor & out, const at::Tensor & self, at::OptionalSymIntArrayRef s=::std::nullopt, at::IntArrayRef dim={-2,-1}, ::std::optional norm=::std::nullopt) { + return at::_ops::fft_irfft2_out::call(self, s, dim, norm, out); +} +namespace symint { + template >> + at::Tensor & fft_irfft2_out(at::Tensor & out, const at::Tensor & self, at::OptionalSymIntArrayRef s=::std::nullopt, at::IntArrayRef dim={-2,-1}, ::std::optional norm=::std::nullopt) { + return at::_ops::fft_irfft2_out::call(self, s, dim, norm, out); + } +} + +// aten::fft_irfft2.out(Tensor self, SymInt[1]? s=None, int[1] dim=[-2,-1], str? norm=None, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & fft_irfft2_symint_outf(const at::Tensor & self, at::OptionalSymIntArrayRef s, at::IntArrayRef dim, ::std::optional norm, at::Tensor & out) { + return at::_ops::fft_irfft2_out::call(self, s, dim, norm, out); +} +namespace symint { + template >> + at::Tensor & fft_irfft2_outf(const at::Tensor & self, at::OptionalSymIntArrayRef s, at::IntArrayRef dim, ::std::optional norm, at::Tensor & out) { + return at::_ops::fft_irfft2_out::call(self, s, dim, norm, out); + } +} + +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/fft_irfft2_compositeimplicitautograd_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/fft_irfft2_compositeimplicitautograd_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..e03c4034fc8e6c0a155d4ba3ee9aef3430d01122 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/fft_irfft2_compositeimplicitautograd_dispatch.h @@ -0,0 +1,33 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. 
+#include + +namespace at { + +namespace compositeimplicitautograd { + +TORCH_API at::Tensor fft_irfft2(const at::Tensor & self, at::OptionalIntArrayRef s=::std::nullopt, at::IntArrayRef dim={-2,-1}, ::std::optional norm=::std::nullopt); +TORCH_API at::Tensor fft_irfft2_symint(const at::Tensor & self, at::OptionalSymIntArrayRef s=::std::nullopt, at::IntArrayRef dim={-2,-1}, ::std::optional norm=::std::nullopt); +TORCH_API at::Tensor & fft_irfft2_out(at::Tensor & out, const at::Tensor & self, at::OptionalIntArrayRef s=::std::nullopt, at::IntArrayRef dim={-2,-1}, ::std::optional norm=::std::nullopt); +TORCH_API at::Tensor & fft_irfft2_outf(const at::Tensor & self, at::OptionalIntArrayRef s, at::IntArrayRef dim, ::std::optional norm, at::Tensor & out); +TORCH_API at::Tensor & fft_irfft2_symint_out(at::Tensor & out, const at::Tensor & self, at::OptionalSymIntArrayRef s=::std::nullopt, at::IntArrayRef dim={-2,-1}, ::std::optional norm=::std::nullopt); +TORCH_API at::Tensor & fft_irfft2_symint_outf(const at::Tensor & self, at::OptionalSymIntArrayRef s, at::IntArrayRef dim, ::std::optional norm, at::Tensor & out); + +} // namespace compositeimplicitautograd +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/fill_ops.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/fill_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..2a178007ae0bef0a0d7d4fe86c03ee38b4570cfc --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/fill_ops.h @@ -0,0 +1,89 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { +namespace _ops { + + +struct TORCH_API fill_Scalar { + using schema = at::Tensor (const at::Tensor &, const at::Scalar &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::fill"; + static constexpr const char* overload_name = "Scalar"; + static constexpr const char* schema_str = "fill.Scalar(Tensor self, Scalar value) -> Tensor"; + static at::Tensor call(const at::Tensor & self, const at::Scalar & value); + static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Scalar & value); +}; + +struct TORCH_API fill_Tensor { + using schema = at::Tensor (const at::Tensor &, const at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::fill"; + static constexpr const char* overload_name = "Tensor"; + static constexpr const char* schema_str = "fill.Tensor(Tensor self, Tensor value) -> Tensor"; + static at::Tensor 
call(const at::Tensor & self, const at::Tensor & value); + static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & value); +}; + +struct TORCH_API fill__Scalar { + using schema = at::Tensor & (at::Tensor &, const at::Scalar &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::fill_"; + static constexpr const char* overload_name = "Scalar"; + static constexpr const char* schema_str = "fill_.Scalar(Tensor(a!) self, Scalar value) -> Tensor(a!)"; + static at::Tensor & call(at::Tensor & self, const at::Scalar & value); + static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, at::Tensor & self, const at::Scalar & value); +}; + +struct TORCH_API fill__Tensor { + using schema = at::Tensor & (at::Tensor &, const at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::fill_"; + static constexpr const char* overload_name = "Tensor"; + static constexpr const char* schema_str = "fill_.Tensor(Tensor(a!) self, Tensor value) -> Tensor(a!)"; + static at::Tensor & call(at::Tensor & self, const at::Tensor & value); + static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, at::Tensor & self, const at::Tensor & value); +}; + +struct TORCH_API fill_Scalar_out { + using schema = at::Tensor & (const at::Tensor &, const at::Scalar &, at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::fill"; + static constexpr const char* overload_name = "Scalar_out"; + static constexpr const char* schema_str = "fill.Scalar_out(Tensor self, Scalar value, *, Tensor(a!) 
out) -> Tensor(a!)"; + static at::Tensor & call(const at::Tensor & self, const at::Scalar & value, at::Tensor & out); + static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Scalar & value, at::Tensor & out); +}; + +struct TORCH_API fill_Tensor_out { + using schema = at::Tensor & (const at::Tensor &, const at::Tensor &, at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::fill"; + static constexpr const char* overload_name = "Tensor_out"; + static constexpr const char* schema_str = "fill.Tensor_out(Tensor self, Tensor value, *, Tensor(a!) out) -> Tensor(a!)"; + static at::Tensor & call(const at::Tensor & self, const at::Tensor & value, at::Tensor & out); + static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & value, at::Tensor & out); +}; + +}} // namespace at::_ops + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/fix.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/fix.h new file mode 100644 index 0000000000000000000000000000000000000000..3c917af164bc17888292e56ce03af3b138657cef --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/fix.h @@ -0,0 +1,50 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::fix(Tensor self) -> Tensor +inline at::Tensor fix(const at::Tensor & self) { + return at::_ops::fix::call(self); +} + +// aten::fix_(Tensor(a!) self) -> Tensor(a!) +inline at::Tensor & fix_(at::Tensor & self) { + return at::_ops::fix_::call(self); +} + +// aten::fix.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & fix_out(at::Tensor & out, const at::Tensor & self) { + return at::_ops::fix_out::call(self, out); +} +// aten::fix.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & fix_outf(const at::Tensor & self, at::Tensor & out) { + return at::_ops::fix_out::call(self, out); +} + +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/flatten.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/flatten.h new file mode 100644 index 0000000000000000000000000000000000000000..d89b63e12551068056994cec53f6a5db909d90ba --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/flatten.h @@ -0,0 +1,51 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::flatten.using_ints(Tensor(a) self, int start_dim=0, int end_dim=-1) -> Tensor(a) +inline at::Tensor flatten(const at::Tensor & self, int64_t start_dim=0, int64_t end_dim=-1) { + return at::_ops::flatten_using_ints::call(self, start_dim, end_dim); +} + +// aten::flatten.named_out_dim(Tensor(a) self, int start_dim, int end_dim, Dimname out_dim) -> Tensor(a) +inline at::Tensor flatten(const at::Tensor & self, int64_t start_dim, int64_t end_dim, at::Dimname out_dim) { + return at::_ops::flatten_named_out_dim::call(self, start_dim, end_dim, out_dim); +} + +// aten::flatten.using_names(Tensor(a) self, Dimname start_dim, Dimname end_dim, Dimname out_dim) -> Tensor(a) +inline at::Tensor flatten(const at::Tensor & self, at::Dimname start_dim, at::Dimname end_dim, at::Dimname out_dim) { + return at::_ops::flatten_using_names::call(self, start_dim, end_dim, out_dim); +} + +// aten::flatten.DimnameList(Tensor(a) self, Dimname[] dims, Dimname out_dim) -> Tensor(a) +inline at::Tensor flatten(const at::Tensor & self, at::DimnameList dims, at::Dimname out_dim) { + return at::_ops::flatten_DimnameList::call(self, dims, out_dim); +} + +} + +#else +#error "This file should 
not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/flatten_dense_tensors.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/flatten_dense_tensors.h new file mode 100644 index 0000000000000000000000000000000000000000..ed32a029343bb11c746d11121fc5b0500f6827fa --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/flatten_dense_tensors.h @@ -0,0 +1,36 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::flatten_dense_tensors(Tensor[] tensors) -> Tensor +inline at::Tensor flatten_dense_tensors(at::TensorList tensors) { + return at::_ops::flatten_dense_tensors::call(tensors); +} + +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/flatten_dense_tensors_ops.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/flatten_dense_tensors_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..c8c79927f6ff5d172aed07ec7fa1603861cad471 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/flatten_dense_tensors_ops.h @@ -0,0 +1,34 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { +namespace _ops { + + +struct TORCH_API flatten_dense_tensors { + using schema = at::Tensor (at::TensorList); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::flatten_dense_tensors"; + static constexpr const char* overload_name = ""; + static constexpr const char* schema_str = "flatten_dense_tensors(Tensor[] tensors) -> Tensor"; + static at::Tensor call(at::TensorList tensors); + static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, at::TensorList tensors); +}; + +}} // namespace at::_ops + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/fliplr_ops.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/fliplr_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..9996687ec8659611de6293a87efed24bace6d0ac --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/fliplr_ops.h @@ -0,0 +1,34 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { +namespace _ops { + + +struct TORCH_API fliplr { + using schema = at::Tensor (const at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::fliplr"; + static constexpr const char* overload_name = ""; + static constexpr const char* schema_str = "fliplr(Tensor self) -> Tensor"; + static at::Tensor call(const at::Tensor & self); + static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self); +}; + +}} // namespace at::_ops + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/flipud.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/flipud.h new file mode 100644 index 0000000000000000000000000000000000000000..69b8fe8be6a5f9937f0b386e65c0da324c079e25 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/flipud.h @@ -0,0 +1,36 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::flipud(Tensor self) -> Tensor +inline at::Tensor flipud(const at::Tensor & self) { + return at::_ops::flipud::call(self); +} + +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/float_power_ops.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/float_power_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..219150037a88b12134da227eb244d9beb06f35e7 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/float_power_ops.h @@ -0,0 +1,111 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. 
+// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { +namespace _ops { + + +struct TORCH_API float_power_Tensor_Tensor_out { + using schema = at::Tensor & (const at::Tensor &, const at::Tensor &, at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::float_power"; + static constexpr const char* overload_name = "Tensor_Tensor_out"; + static constexpr const char* schema_str = "float_power.Tensor_Tensor_out(Tensor self, Tensor exponent, *, Tensor(a!) out) -> Tensor(a!)"; + static at::Tensor & call(const at::Tensor & self, const at::Tensor & exponent, at::Tensor & out); + static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & exponent, at::Tensor & out); +}; + +struct TORCH_API float_power_Tensor_Tensor { + using schema = at::Tensor (const at::Tensor &, const at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::float_power"; + static constexpr const char* overload_name = "Tensor_Tensor"; + static constexpr const char* schema_str = "float_power.Tensor_Tensor(Tensor self, Tensor exponent) -> Tensor"; + static at::Tensor call(const at::Tensor & self, const at::Tensor & exponent); + static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & exponent); +}; + +struct TORCH_API float_power_Scalar_out { + using schema = at::Tensor & (const at::Scalar &, const at::Tensor &, at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::float_power"; + static constexpr const char* overload_name = "Scalar_out"; + static constexpr const char* schema_str = "float_power.Scalar_out(Scalar self, Tensor exponent, *, Tensor(a!) 
out) -> Tensor(a!)"; + static at::Tensor & call(const at::Scalar & self, const at::Tensor & exponent, at::Tensor & out); + static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, const at::Scalar & self, const at::Tensor & exponent, at::Tensor & out); +}; + +struct TORCH_API float_power_Scalar { + using schema = at::Tensor (const at::Scalar &, const at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::float_power"; + static constexpr const char* overload_name = "Scalar"; + static constexpr const char* schema_str = "float_power.Scalar(Scalar self, Tensor exponent) -> Tensor"; + static at::Tensor call(const at::Scalar & self, const at::Tensor & exponent); + static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Scalar & self, const at::Tensor & exponent); +}; + +struct TORCH_API float_power_Tensor_Scalar_out { + using schema = at::Tensor & (const at::Tensor &, const at::Scalar &, at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::float_power"; + static constexpr const char* overload_name = "Tensor_Scalar_out"; + static constexpr const char* schema_str = "float_power.Tensor_Scalar_out(Tensor self, Scalar exponent, *, Tensor(a!) 
out) -> Tensor(a!)"; + static at::Tensor & call(const at::Tensor & self, const at::Scalar & exponent, at::Tensor & out); + static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Scalar & exponent, at::Tensor & out); +}; + +struct TORCH_API float_power_Tensor_Scalar { + using schema = at::Tensor (const at::Tensor &, const at::Scalar &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::float_power"; + static constexpr const char* overload_name = "Tensor_Scalar"; + static constexpr const char* schema_str = "float_power.Tensor_Scalar(Tensor self, Scalar exponent) -> Tensor"; + static at::Tensor call(const at::Tensor & self, const at::Scalar & exponent); + static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Scalar & exponent); +}; + +struct TORCH_API float_power__Scalar { + using schema = at::Tensor & (at::Tensor &, const at::Scalar &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::float_power_"; + static constexpr const char* overload_name = "Scalar"; + static constexpr const char* schema_str = "float_power_.Scalar(Tensor(a!) self, Scalar exponent) -> Tensor(a!)"; + static at::Tensor & call(at::Tensor & self, const at::Scalar & exponent); + static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, at::Tensor & self, const at::Scalar & exponent); +}; + +struct TORCH_API float_power__Tensor { + using schema = at::Tensor & (at::Tensor &, const at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::float_power_"; + static constexpr const char* overload_name = "Tensor"; + static constexpr const char* schema_str = "float_power_.Tensor(Tensor(a!) 
self, Tensor exponent) -> Tensor(a!)"; + static at::Tensor & call(at::Tensor & self, const at::Tensor & exponent); + static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, at::Tensor & self, const at::Tensor & exponent); +}; + +}} // namespace at::_ops + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/floor_divide_cuda_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/floor_divide_cuda_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..e2b21b83d0f7e6ba903dc5ac21a6edb21b264d35 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/floor_divide_cuda_dispatch.h @@ -0,0 +1,31 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. 
+#include + +namespace at { + +namespace cuda { + +TORCH_API at::Tensor floor_divide(const at::Tensor & self, const at::Tensor & other); +TORCH_API at::Tensor & floor_divide_out(at::Tensor & out, const at::Tensor & self, const at::Tensor & other); +TORCH_API at::Tensor & floor_divide_outf(const at::Tensor & self, const at::Tensor & other, at::Tensor & out); +TORCH_API at::Tensor & floor_divide_(at::Tensor & self, const at::Tensor & other); + +} // namespace cuda +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/floor_meta.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/floor_meta.h new file mode 100644 index 0000000000000000000000000000000000000000..85eac190169e8af81718122ae87133f354fcbd1c --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/floor_meta.h @@ -0,0 +1,32 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeMetaFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace at { +namespace meta { + +struct TORCH_API structured_floor : public TensorIteratorBase { + + + void meta(const at::Tensor & self); +}; + +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/fmax_meta.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/fmax_meta.h new file mode 100644 index 0000000000000000000000000000000000000000..d0a657a67dc9f860da8592bb9f78a065d9a6aa5a --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/fmax_meta.h @@ -0,0 +1,32 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeMetaFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace at { +namespace meta { + +struct TORCH_API structured_fmax : public TensorIteratorBase { + + + void meta(const at::Tensor & self, const at::Tensor & other); +}; + +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/fmin_cpu_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/fmin_cpu_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..ed6300a0310481d6ac8ad158d95eece9966416b6 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/fmin_cpu_dispatch.h @@ -0,0 +1,30 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace cpu { + +TORCH_API at::Tensor fmin(const at::Tensor & self, const at::Tensor & other); +TORCH_API at::Tensor & fmin_out(at::Tensor & out, const at::Tensor & self, const at::Tensor & other); +TORCH_API at::Tensor & fmin_outf(const at::Tensor & self, const at::Tensor & other, at::Tensor & out); + +} // namespace cpu +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/frac_cpu_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/frac_cpu_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..a08bed99a125b47ad617c45d1a24aed93db85366 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/frac_cpu_dispatch.h @@ -0,0 +1,31 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace cpu { + +TORCH_API at::Tensor frac(const at::Tensor & self); +TORCH_API at::Tensor & frac_out(at::Tensor & out, const at::Tensor & self); +TORCH_API at::Tensor & frac_outf(const at::Tensor & self, at::Tensor & out); +TORCH_API at::Tensor & frac_(at::Tensor & self); + +} // namespace cpu +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/fractional_max_pool2d_backward.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/fractional_max_pool2d_backward.h new file mode 100644 index 0000000000000000000000000000000000000000..814915ee21a8bfa8adb5d7696933cddfe292f2b6 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/fractional_max_pool2d_backward.h @@ -0,0 +1,45 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::fractional_max_pool2d_backward.grad_input(Tensor grad_output, Tensor self, int[2] kernel_size, int[2] output_size, Tensor indices, *, Tensor(a!) grad_input) -> Tensor(a!) +inline at::Tensor & fractional_max_pool2d_backward_out(at::Tensor & grad_input, const at::Tensor & grad_output, const at::Tensor & self, at::IntArrayRef kernel_size, at::IntArrayRef output_size, const at::Tensor & indices) { + return at::_ops::fractional_max_pool2d_backward_grad_input::call(grad_output, self, kernel_size, output_size, indices, grad_input); +} +// aten::fractional_max_pool2d_backward.grad_input(Tensor grad_output, Tensor self, int[2] kernel_size, int[2] output_size, Tensor indices, *, Tensor(a!) grad_input) -> Tensor(a!) 
+inline at::Tensor & fractional_max_pool2d_backward_outf(const at::Tensor & grad_output, const at::Tensor & self, at::IntArrayRef kernel_size, at::IntArrayRef output_size, const at::Tensor & indices, at::Tensor & grad_input) { + return at::_ops::fractional_max_pool2d_backward_grad_input::call(grad_output, self, kernel_size, output_size, indices, grad_input); +} + +// aten::fractional_max_pool2d_backward(Tensor grad_output, Tensor self, int[2] kernel_size, int[2] output_size, Tensor indices) -> Tensor +inline at::Tensor fractional_max_pool2d_backward(const at::Tensor & grad_output, const at::Tensor & self, at::IntArrayRef kernel_size, at::IntArrayRef output_size, const at::Tensor & indices) { + return at::_ops::fractional_max_pool2d_backward::call(grad_output, self, kernel_size, output_size, indices); +} + +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/frexp.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/frexp.h new file mode 100644 index 0000000000000000000000000000000000000000..22b55f210854605a04cef310f1a2909da9e076fb --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/frexp.h @@ -0,0 +1,45 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::frexp.Tensor(Tensor self) -> (Tensor mantissa, Tensor exponent) +inline ::std::tuple frexp(const at::Tensor & self) { + return at::_ops::frexp_Tensor::call(self); +} + +// aten::frexp.Tensor_out(Tensor self, *, Tensor(a!) mantissa, Tensor(b!) 
exponent) -> (Tensor(a!) mantissa, Tensor(b!) exponent) +inline ::std::tuple frexp_out(at::Tensor & mantissa, at::Tensor & exponent, const at::Tensor & self) { + return at::_ops::frexp_Tensor_out::call(self, mantissa, exponent); +} +// aten::frexp.Tensor_out(Tensor self, *, Tensor(a!) mantissa, Tensor(b!) exponent) -> (Tensor(a!) mantissa, Tensor(b!) exponent) +inline ::std::tuple frexp_outf(const at::Tensor & self, at::Tensor & mantissa, at::Tensor & exponent) { + return at::_ops::frexp_Tensor_out::call(self, mantissa, exponent); +} + +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/frexp_cuda_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/frexp_cuda_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..34605083b590d1af48cf60e14495ae282daa2f35 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/frexp_cuda_dispatch.h @@ -0,0 +1,29 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. 
+#include + +namespace at { + +namespace cuda { + +TORCH_API ::std::tuple frexp_out(at::Tensor & mantissa, at::Tensor & exponent, const at::Tensor & self); +TORCH_API ::std::tuple frexp_outf(const at::Tensor & self, at::Tensor & mantissa, at::Tensor & exponent); + +} // namespace cuda +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/from_blob.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/from_blob.h new file mode 100644 index 0000000000000000000000000000000000000000..aa2ff74cf3557c6cdc5aa3e08e475f0c0cebdf21 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/from_blob.h @@ -0,0 +1,172 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +#include + +namespace at { + +namespace detail { + +inline void noopDelete(void* /*unused*/) {} + +} // namespace detail + +/// Provides a fluent API to construct tensors from external data. +/// +/// The fluent API can be used instead of `from_blob` functions in case the +/// required set of parameters does not align with the existing overloads. +/// +/// at::Tensor tensor = at::for_blob(data, sizes) +/// .strides(strides) +/// .context(context, [](void *ctx) { delete static_cast(ctx); +/// }) .options(...) 
.make_tensor(); +/// +class TORCH_API TensorMaker { + friend TensorMaker for_blob(void* data, IntArrayRef sizes) noexcept; + + public: + using ContextDeleter = DeleterFnPtr; + + TensorMaker& strides(OptionalIntArrayRef value) noexcept { + strides_ = value; + + return *this; + } + + TensorMaker& storage_offset(std::optional value) noexcept { + storage_offset_ = value; + + return *this; + } + + TensorMaker& deleter(std::function value) noexcept { + deleter_ = std::move(value); + + return *this; + } + + TensorMaker& context(void* value, ContextDeleter deleter = nullptr) noexcept { + ctx_ = std::unique_ptr{ + value, deleter != nullptr ? deleter : detail::noopDelete}; + + return *this; + } + + TensorMaker& target_device(std::optional value) noexcept { + device_ = value; + + return *this; + } + + TensorMaker& options(TensorOptions value) noexcept { + opts_ = value; + + return *this; + } + + TensorMaker& resizeable_storage() noexcept { + resizeable_ = true; + + return *this; + } + + TensorMaker& allocator(c10::Allocator* allocator) noexcept { + allocator_ = allocator; + + return *this; + } + + Tensor make_tensor(); + + private: + explicit TensorMaker(void* data, IntArrayRef sizes) noexcept + : data_{data}, sizes_{sizes} {} + + std::size_t computeStorageSize() const noexcept; + + DataPtr makeDataPtrFromDeleter() noexcept; + + DataPtr makeDataPtrFromContext() noexcept; + + IntArrayRef makeTempSizes() const noexcept; + + void* data_; + IntArrayRef sizes_; + OptionalIntArrayRef strides_; + std::optional storage_offset_; + std::function deleter_; + std::unique_ptr ctx_{nullptr, detail::noopDelete}; + std::optional device_; + TensorOptions opts_; + bool resizeable_{}; + c10::Allocator* allocator_{}; +}; + +inline TensorMaker for_blob(void* data, IntArrayRef sizes) noexcept { + return TensorMaker{data, sizes}; +} + +inline Tensor from_blob( + void* data, + IntArrayRef sizes, + IntArrayRef strides, + const std::function& deleter, + const TensorOptions& options = {}, + const 
std::optional target_device = std::nullopt) { + return for_blob(data, sizes) + .strides(strides) + .deleter(deleter) + .options(options) + .target_device(target_device) + .make_tensor(); +} + +inline Tensor from_blob( + void* data, + IntArrayRef sizes, + IntArrayRef strides, + int64_t storage_offset, + const std::function& deleter, + const TensorOptions& options = {}, + const std::optional target_device = std::nullopt) { + return for_blob(data, sizes) + .strides(strides) + .storage_offset(storage_offset) + .deleter(deleter) + .options(options) + .target_device(target_device) + .make_tensor(); +} + +inline Tensor from_blob( + void* data, + IntArrayRef sizes, + std::function deleter, + const TensorOptions& options = {}, + const std::optional target_device = std::nullopt) { + return for_blob(data, sizes) + .deleter(std::move(deleter)) + .options(options) + .target_device(target_device) + .make_tensor(); +} + +inline Tensor from_blob( + void* data, + IntArrayRef sizes, + IntArrayRef strides, + const TensorOptions& options = {}) { + return for_blob(data, sizes).strides(strides).options(options).make_tensor(); +} + +inline Tensor from_blob( + void* data, + IntArrayRef sizes, + const TensorOptions& options = {}) { + return for_blob(data, sizes).options(options).make_tensor(); +} + +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/from_file_cpu_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/from_file_cpu_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..f912176ff80ad4d197211e029585329b48cfa541 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/from_file_cpu_dispatch.h @@ -0,0 +1,29 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace cpu { + +TORCH_API at::Tensor from_file(c10::string_view filename, ::std::optional shared=::std::nullopt, ::std::optional size=0, at::TensorOptions options={}); +TORCH_API at::Tensor from_file(c10::string_view filename, ::std::optional shared, ::std::optional size, ::std::optional dtype, ::std::optional layout, ::std::optional device, ::std::optional pin_memory); + +} // namespace cpu +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/gather_compositeexplicitautogradnonfunctional_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/gather_compositeexplicitautogradnonfunctional_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..2938c5e62e0737ec7e0c9dcf46c57c8ad1af73ac --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/gather_compositeexplicitautogradnonfunctional_dispatch.h @@ -0,0 +1,28 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace compositeexplicitautogradnonfunctional { + +TORCH_API at::Tensor gather(const at::Tensor & self, int64_t dim, const at::Tensor & index, bool sparse_grad=false); + +} // namespace compositeexplicitautogradnonfunctional +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/gather_cuda_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/gather_cuda_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..e238c6e8b180f7bbf711a58620e369c4ae3bd5ab --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/gather_cuda_dispatch.h @@ -0,0 +1,30 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace cuda { + +TORCH_API at::Tensor gather(const at::Tensor & self, int64_t dim, const at::Tensor & index, bool sparse_grad=false); +TORCH_API at::Tensor & gather_out(at::Tensor & out, const at::Tensor & self, int64_t dim, const at::Tensor & index, bool sparse_grad=false); +TORCH_API at::Tensor & gather_outf(const at::Tensor & self, int64_t dim, const at::Tensor & index, bool sparse_grad, at::Tensor & out); + +} // namespace cuda +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/gcd.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/gcd.h new file mode 100644 index 0000000000000000000000000000000000000000..12a67566d116712e92509bc1064b85e6cecde599 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/gcd.h @@ -0,0 +1,50 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::gcd.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & gcd_out(at::Tensor & out, const at::Tensor & self, const at::Tensor & other) { + return at::_ops::gcd_out::call(self, other, out); +} +// aten::gcd.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & gcd_outf(const at::Tensor & self, const at::Tensor & other, at::Tensor & out) { + return at::_ops::gcd_out::call(self, other, out); +} + +// aten::gcd(Tensor self, Tensor other) -> Tensor +inline at::Tensor gcd(const at::Tensor & self, const at::Tensor & other) { + return at::_ops::gcd::call(self, other); +} + +// aten::gcd_(Tensor(a!) self, Tensor other) -> Tensor(a!) +inline at::Tensor & gcd_(at::Tensor & self, const at::Tensor & other) { + return at::_ops::gcd_::call(self, other); +} + +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/gcd_compositeexplicitautogradnonfunctional_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/gcd_compositeexplicitautogradnonfunctional_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..7be92c5049ece3c632895a1c4ed67de848dea8af --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/gcd_compositeexplicitautogradnonfunctional_dispatch.h @@ -0,0 +1,29 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace compositeexplicitautogradnonfunctional { + +TORCH_API at::Tensor gcd(const at::Tensor & self, const at::Tensor & other); +TORCH_API at::Tensor & gcd_(at::Tensor & self, const at::Tensor & other); + +} // namespace compositeexplicitautogradnonfunctional +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/ge_meta.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/ge_meta.h new file mode 100644 index 0000000000000000000000000000000000000000..06ff45e596f489f0d344b239a01be0aa6f4f5808 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/ge_meta.h @@ -0,0 +1,37 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeMetaFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace at { +namespace meta { + +struct TORCH_API structured_ge_Scalar : public TensorIteratorBase { + + + void meta(const at::Tensor & self, const at::Scalar & other); +}; +struct TORCH_API structured_ge_Tensor : public TensorIteratorBase { + + + void meta(const at::Tensor & self, const at::Tensor & other); +}; + +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/gelu_backward.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/gelu_backward.h new file mode 100644 index 0000000000000000000000000000000000000000..d05a76c3b5c04dd8fc85eabf11f6108441738663 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/gelu_backward.h @@ -0,0 +1,45 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::gelu_backward.grad_input(Tensor grad_output, Tensor self, *, str approximate='none', Tensor(a!) grad_input) -> Tensor(a!) +inline at::Tensor & gelu_backward_out(at::Tensor & grad_input, const at::Tensor & grad_output, const at::Tensor & self, c10::string_view approximate="none") { + return at::_ops::gelu_backward_grad_input::call(grad_output, self, approximate, grad_input); +} +// aten::gelu_backward.grad_input(Tensor grad_output, Tensor self, *, str approximate='none', Tensor(a!) grad_input) -> Tensor(a!) 
+inline at::Tensor & gelu_backward_outf(const at::Tensor & grad_output, const at::Tensor & self, c10::string_view approximate, at::Tensor & grad_input) { + return at::_ops::gelu_backward_grad_input::call(grad_output, self, approximate, grad_input); +} + +// aten::gelu_backward(Tensor grad_output, Tensor self, *, str approximate='none') -> Tensor +inline at::Tensor gelu_backward(const at::Tensor & grad_output, const at::Tensor & self, c10::string_view approximate="none") { + return at::_ops::gelu_backward::call(grad_output, self, approximate); +} + +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/gelu_backward_compositeexplicitautogradnonfunctional_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/gelu_backward_compositeexplicitautogradnonfunctional_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..f16ee55207fe5144374f0d90d7c9c00098e179f4 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/gelu_backward_compositeexplicitautogradnonfunctional_dispatch.h @@ -0,0 +1,28 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. 
+#include + +namespace at { + +namespace compositeexplicitautogradnonfunctional { + +TORCH_API at::Tensor gelu_backward(const at::Tensor & grad_output, const at::Tensor & self, c10::string_view approximate="none"); + +} // namespace compositeexplicitautogradnonfunctional +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/geometric_compositeexplicitautograd_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/geometric_compositeexplicitautograd_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..44d0e03a88a5a47b0cf8d6e1d4dd0500535b5259 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/geometric_compositeexplicitautograd_dispatch.h @@ -0,0 +1,30 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. 
+#include + +namespace at { + +namespace compositeexplicitautograd { + +TORCH_API at::Tensor geometric(const at::Tensor & self, double p, ::std::optional generator=::std::nullopt); +TORCH_API at::Tensor & geometric_out(at::Tensor & out, const at::Tensor & self, double p, ::std::optional generator=::std::nullopt); +TORCH_API at::Tensor & geometric_outf(const at::Tensor & self, double p, ::std::optional generator, at::Tensor & out); + +} // namespace compositeexplicitautograd +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/glu_backward_jvp.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/glu_backward_jvp.h new file mode 100644 index 0000000000000000000000000000000000000000..70862c6182ef346df544f28c5446df64105bc1ad --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/glu_backward_jvp.h @@ -0,0 +1,45 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::glu_backward_jvp(Tensor grad_x, Tensor grad_glu, Tensor x, Tensor dgrad_glu, Tensor dx, int dim) -> Tensor +inline at::Tensor glu_backward_jvp(const at::Tensor & grad_x, const at::Tensor & grad_glu, const at::Tensor & x, const at::Tensor & dgrad_glu, const at::Tensor & dx, int64_t dim) { + return at::_ops::glu_backward_jvp::call(grad_x, grad_glu, x, dgrad_glu, dx, dim); +} + +// aten::glu_backward_jvp.out(Tensor grad_x, Tensor grad_glu, Tensor x, Tensor dgrad_glu, Tensor dx, int dim, *, Tensor(a!) out) -> Tensor(a!) 
+inline at::Tensor & glu_backward_jvp_out(at::Tensor & out, const at::Tensor & grad_x, const at::Tensor & grad_glu, const at::Tensor & x, const at::Tensor & dgrad_glu, const at::Tensor & dx, int64_t dim) { + return at::_ops::glu_backward_jvp_out::call(grad_x, grad_glu, x, dgrad_glu, dx, dim, out); +} +// aten::glu_backward_jvp.out(Tensor grad_x, Tensor grad_glu, Tensor x, Tensor dgrad_glu, Tensor dx, int dim, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & glu_backward_jvp_outf(const at::Tensor & grad_x, const at::Tensor & grad_glu, const at::Tensor & x, const at::Tensor & dgrad_glu, const at::Tensor & dx, int64_t dim, at::Tensor & out) { + return at::_ops::glu_backward_jvp_out::call(grad_x, grad_glu, x, dgrad_glu, dx, dim, out); +} + +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/glu_backward_ops.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/glu_backward_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..ef97a4d58e1f5f749fed6e54102a19b84b89774e --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/glu_backward_ops.h @@ -0,0 +1,45 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. 
+#include + +namespace at { +namespace _ops { + + +struct TORCH_API glu_backward_grad_input { + using schema = at::Tensor & (const at::Tensor &, const at::Tensor &, int64_t, at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::glu_backward"; + static constexpr const char* overload_name = "grad_input"; + static constexpr const char* schema_str = "glu_backward.grad_input(Tensor grad_output, Tensor self, int dim, *, Tensor(a!) grad_input) -> Tensor(a!)"; + static at::Tensor & call(const at::Tensor & grad_output, const at::Tensor & self, int64_t dim, at::Tensor & grad_input); + static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad_output, const at::Tensor & self, int64_t dim, at::Tensor & grad_input); +}; + +struct TORCH_API glu_backward { + using schema = at::Tensor (const at::Tensor &, const at::Tensor &, int64_t); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::glu_backward"; + static constexpr const char* overload_name = ""; + static constexpr const char* schema_str = "glu_backward(Tensor grad_output, Tensor self, int dim) -> Tensor"; + static at::Tensor call(const at::Tensor & grad_output, const at::Tensor & self, int64_t dim); + static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad_output, const at::Tensor & self, int64_t dim); +}; + +}} // namespace at::_ops + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/glu_cpu_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/glu_cpu_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..dd74bb73e29bf7b051c2bb30b7cf4531c0bc571b --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/glu_cpu_dispatch.h @@ -0,0 +1,30 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace cpu { + +TORCH_API at::Tensor glu(const at::Tensor & self, int64_t dim=-1); +TORCH_API at::Tensor & glu_out(at::Tensor & out, const at::Tensor & self, int64_t dim=-1); +TORCH_API at::Tensor & glu_outf(const at::Tensor & self, int64_t dim, at::Tensor & out); + +} // namespace cpu +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/glu_jvp_native.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/glu_jvp_native.h new file mode 100644 index 0000000000000000000000000000000000000000..9a9f41651f87ea1220457197a400d69fe5e53bbd --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/glu_jvp_native.h @@ -0,0 +1,27 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace at { +namespace native { +TORCH_API at::Tensor & glu_jvp_out(const at::Tensor & glu, const at::Tensor & x, const at::Tensor & dx, int64_t dim, at::Tensor & out); +TORCH_API at::Tensor glu_jvp(const at::Tensor & glu, const at::Tensor & x, const at::Tensor & dx, int64_t dim); +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/glu_jvp_ops.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/glu_jvp_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..f1666e15e5a59e3e38043ec6a66e8367d4d758ac --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/glu_jvp_ops.h @@ -0,0 +1,45 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. 
+// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { +namespace _ops { + + +struct TORCH_API glu_jvp { + using schema = at::Tensor (const at::Tensor &, const at::Tensor &, const at::Tensor &, int64_t); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::glu_jvp"; + static constexpr const char* overload_name = ""; + static constexpr const char* schema_str = "glu_jvp(Tensor glu, Tensor x, Tensor dx, int dim) -> Tensor"; + static at::Tensor call(const at::Tensor & glu, const at::Tensor & x, const at::Tensor & dx, int64_t dim); + static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & glu, const at::Tensor & x, const at::Tensor & dx, int64_t dim); +}; + +struct TORCH_API glu_jvp_out { + using schema = at::Tensor & (const at::Tensor &, const at::Tensor &, const at::Tensor &, int64_t, at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::glu_jvp"; + static constexpr const char* overload_name = "out"; + static constexpr const char* schema_str = "glu_jvp.out(Tensor glu, Tensor x, Tensor dx, int dim, *, Tensor(a!) out) -> Tensor(a!)"; + static at::Tensor & call(const at::Tensor & glu, const at::Tensor & x, const at::Tensor & dx, int64_t dim, at::Tensor & out); + static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & glu, const at::Tensor & x, const at::Tensor & dx, int64_t dim, at::Tensor & out); +}; + +}} // namespace at::_ops + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/grid_sampler_2d_backward_native.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/grid_sampler_2d_backward_native.h new file mode 100644 index 0000000000000000000000000000000000000000..1ffda6d63ff0543a0c33801f1163e44be2f0d70e --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/grid_sampler_2d_backward_native.h @@ -0,0 +1,28 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace at { +namespace native { +TORCH_API ::std::tuple grid_sampler_2d_backward_out(const at::Tensor & grad_output, const at::Tensor & input, const at::Tensor & grid, int64_t interpolation_mode, int64_t padding_mode, bool align_corners, ::std::array output_mask, at::Tensor & out0, at::Tensor & out1); +TORCH_API ::std::tuple grid_sampler_2d_backward_cpu(const at::Tensor & grad_output, const at::Tensor & input, const at::Tensor & grid, int64_t interpolation_mode, int64_t padding_mode, bool align_corners, ::std::array output_mask); +TORCH_API ::std::tuple grid_sampler_2d_backward_cuda(const at::Tensor & grad_output, const at::Tensor & input, const at::Tensor & grid, int64_t interpolation_mode, int64_t padding_mode, bool align_corners, ::std::array output_mask); +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/grid_sampler_2d_backward_ops.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/grid_sampler_2d_backward_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..3954f2e24dccf31f52aef1fa893eba5bd7132abf --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/grid_sampler_2d_backward_ops.h @@ -0,0 +1,45 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { +namespace _ops { + + +struct TORCH_API grid_sampler_2d_backward { + using schema = ::std::tuple (const at::Tensor &, const at::Tensor &, const at::Tensor &, int64_t, int64_t, bool, ::std::array); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::grid_sampler_2d_backward"; + static constexpr const char* overload_name = ""; + static constexpr const char* schema_str = "grid_sampler_2d_backward(Tensor grad_output, Tensor input, Tensor grid, int interpolation_mode, int padding_mode, bool align_corners, bool[2] output_mask) -> (Tensor, Tensor)"; + static ::std::tuple call(const at::Tensor & grad_output, const at::Tensor & input, const at::Tensor & grid, int64_t interpolation_mode, int64_t padding_mode, bool align_corners, ::std::array output_mask); + static ::std::tuple redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad_output, const at::Tensor & input, const 
at::Tensor & grid, int64_t interpolation_mode, int64_t padding_mode, bool align_corners, ::std::array output_mask); +}; + +struct TORCH_API grid_sampler_2d_backward_out { + using schema = ::std::tuple (const at::Tensor &, const at::Tensor &, const at::Tensor &, int64_t, int64_t, bool, ::std::array, at::Tensor &, at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::grid_sampler_2d_backward"; + static constexpr const char* overload_name = "out"; + static constexpr const char* schema_str = "grid_sampler_2d_backward.out(Tensor grad_output, Tensor input, Tensor grid, int interpolation_mode, int padding_mode, bool align_corners, bool[2] output_mask, *, Tensor(a!) out0, Tensor(b!) out1) -> (Tensor(a!), Tensor(b!))"; + static ::std::tuple call(const at::Tensor & grad_output, const at::Tensor & input, const at::Tensor & grid, int64_t interpolation_mode, int64_t padding_mode, bool align_corners, ::std::array output_mask, at::Tensor & out0, at::Tensor & out1); + static ::std::tuple redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad_output, const at::Tensor & input, const at::Tensor & grid, int64_t interpolation_mode, int64_t padding_mode, bool align_corners, ::std::array output_mask, at::Tensor & out0, at::Tensor & out1); +}; + +}} // namespace at::_ops + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/grid_sampler_3d_backward_compositeexplicitautograd_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/grid_sampler_3d_backward_compositeexplicitautograd_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..6e416e7b0be2c0ee4edf360d857267c79c491df3 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/grid_sampler_3d_backward_compositeexplicitautograd_dispatch.h @@ -0,0 +1,29 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. 
+#include + +namespace at { + +namespace compositeexplicitautograd { + +TORCH_API ::std::tuple grid_sampler_3d_backward_out(at::Tensor & out0, at::Tensor & out1, const at::Tensor & grad_output, const at::Tensor & input, const at::Tensor & grid, int64_t interpolation_mode, int64_t padding_mode, bool align_corners, ::std::array output_mask); +TORCH_API ::std::tuple grid_sampler_3d_backward_outf(const at::Tensor & grad_output, const at::Tensor & input, const at::Tensor & grid, int64_t interpolation_mode, int64_t padding_mode, bool align_corners, ::std::array output_mask, at::Tensor & out0, at::Tensor & out1); + +} // namespace compositeexplicitautograd +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/grid_sampler_3d_backward_ops.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/grid_sampler_3d_backward_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..03cd082c92d082da19008d03b02a5528e5c4704a --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/grid_sampler_3d_backward_ops.h @@ -0,0 +1,45 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. 
+#include + +namespace at { +namespace _ops { + + +struct TORCH_API grid_sampler_3d_backward { + using schema = ::std::tuple (const at::Tensor &, const at::Tensor &, const at::Tensor &, int64_t, int64_t, bool, ::std::array); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::grid_sampler_3d_backward"; + static constexpr const char* overload_name = ""; + static constexpr const char* schema_str = "grid_sampler_3d_backward(Tensor grad_output, Tensor input, Tensor grid, int interpolation_mode, int padding_mode, bool align_corners, bool[2] output_mask) -> (Tensor, Tensor)"; + static ::std::tuple call(const at::Tensor & grad_output, const at::Tensor & input, const at::Tensor & grid, int64_t interpolation_mode, int64_t padding_mode, bool align_corners, ::std::array output_mask); + static ::std::tuple redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad_output, const at::Tensor & input, const at::Tensor & grid, int64_t interpolation_mode, int64_t padding_mode, bool align_corners, ::std::array output_mask); +}; + +struct TORCH_API grid_sampler_3d_backward_out { + using schema = ::std::tuple (const at::Tensor &, const at::Tensor &, const at::Tensor &, int64_t, int64_t, bool, ::std::array, at::Tensor &, at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::grid_sampler_3d_backward"; + static constexpr const char* overload_name = "out"; + static constexpr const char* schema_str = "grid_sampler_3d_backward.out(Tensor grad_output, Tensor input, Tensor grid, int interpolation_mode, int padding_mode, bool align_corners, bool[2] output_mask, *, Tensor(a!) out0, Tensor(b!) 
out1) -> (Tensor(a!), Tensor(b!))"; + static ::std::tuple call(const at::Tensor & grad_output, const at::Tensor & input, const at::Tensor & grid, int64_t interpolation_mode, int64_t padding_mode, bool align_corners, ::std::array output_mask, at::Tensor & out0, at::Tensor & out1); + static ::std::tuple redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad_output, const at::Tensor & input, const at::Tensor & grid, int64_t interpolation_mode, int64_t padding_mode, bool align_corners, ::std::array output_mask, at::Tensor & out0, at::Tensor & out1); +}; + +}} // namespace at::_ops + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/grid_sampler_compositeimplicitautograd_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/grid_sampler_compositeimplicitautograd_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..b6432269a09d1def017226ac14fd0739c05aa144 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/grid_sampler_compositeimplicitautograd_dispatch.h @@ -0,0 +1,28 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. 
+#include + +namespace at { + +namespace compositeimplicitautograd { + +TORCH_API at::Tensor grid_sampler(const at::Tensor & input, const at::Tensor & grid, int64_t interpolation_mode, int64_t padding_mode, bool align_corners); + +} // namespace compositeimplicitautograd +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/gt_compositeexplicitautogradnonfunctional_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/gt_compositeexplicitautogradnonfunctional_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..c38a6121c6ebb79f14bcd0738d2a7f647fc3956d --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/gt_compositeexplicitautogradnonfunctional_dispatch.h @@ -0,0 +1,31 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. 
+#include + +namespace at { + +namespace compositeexplicitautogradnonfunctional { + +TORCH_API at::Tensor gt(const at::Tensor & self, const at::Scalar & other); +TORCH_API at::Tensor & gt_(at::Tensor & self, const at::Scalar & other); +TORCH_API at::Tensor gt(const at::Tensor & self, const at::Tensor & other); +TORCH_API at::Tensor & gt_(at::Tensor & self, const at::Tensor & other); + +} // namespace compositeexplicitautogradnonfunctional +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/gt_native.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/gt_native.h new file mode 100644 index 0000000000000000000000000000000000000000..f37e32f12787c015e52c7db3f936d25d35928702 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/gt_native.h @@ -0,0 +1,36 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace at { +namespace native { +struct TORCH_API structured_gt_Scalar_out : public at::meta::structured_gt_Scalar { +void impl(const at::Tensor & self, const at::Scalar & other, const at::Tensor & out); +}; +TORCH_API at::Tensor gt_scalar_nested(const at::Tensor & self, const at::Scalar & other); +TORCH_API at::Tensor gt_quantized_cpu(const at::Tensor & self, const at::Scalar & other); +TORCH_API at::Tensor & gt_out_quantized_cpu(const at::Tensor & self, const at::Scalar & other, at::Tensor & out); +struct TORCH_API structured_gt_Tensor_out : public at::meta::structured_gt_Tensor { +void impl(const at::Tensor & self, const at::Tensor & 
other, const at::Tensor & out); +}; +TORCH_API at::Tensor gt_quantized_cpu(const at::Tensor & self, const at::Tensor & other); +TORCH_API at::Tensor & gt_out_quantized_cpu(const at::Tensor & self, const at::Tensor & other, at::Tensor & out); +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/hardshrink_backward_meta_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/hardshrink_backward_meta_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..6dc597d85aa87e01bfd55aae23e4414d44d6ca24 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/hardshrink_backward_meta_dispatch.h @@ -0,0 +1,30 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. 
+#include + +namespace at { + +namespace meta { + +TORCH_API at::Tensor hardshrink_backward(const at::Tensor & grad_out, const at::Tensor & self, const at::Scalar & lambd); +TORCH_API at::Tensor & hardshrink_backward_out(at::Tensor & grad_input, const at::Tensor & grad_out, const at::Tensor & self, const at::Scalar & lambd); +TORCH_API at::Tensor & hardshrink_backward_outf(const at::Tensor & grad_out, const at::Tensor & self, const at::Scalar & lambd, at::Tensor & grad_input); + +} // namespace meta +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/hardsigmoid_meta.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/hardsigmoid_meta.h new file mode 100644 index 0000000000000000000000000000000000000000..e7fff6efbcb4c84d0c4429621f6c6d47e284e835 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/hardsigmoid_meta.h @@ -0,0 +1,32 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeMetaFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace at { +namespace meta { + +struct TORCH_API structured_hardsigmoid : public TensorIteratorBase { + + + void meta(const at::Tensor & self); +}; + +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/hardswish_backward.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/hardswish_backward.h new file mode 100644 index 0000000000000000000000000000000000000000..a6539197a2750b62b7d74fab7eeb586f65362d3c --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/hardswish_backward.h @@ -0,0 +1,45 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::hardswish_backward(Tensor grad_output, Tensor self) -> Tensor +inline at::Tensor hardswish_backward(const at::Tensor & grad_output, const at::Tensor & self) { + return at::_ops::hardswish_backward::call(grad_output, self); +} + +// aten::hardswish_backward.out(Tensor grad_output, Tensor self, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & hardswish_backward_out(at::Tensor & out, const at::Tensor & grad_output, const at::Tensor & self) { + return at::_ops::hardswish_backward_out::call(grad_output, self, out); +} +// aten::hardswish_backward.out(Tensor grad_output, Tensor self, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & hardswish_backward_outf(const at::Tensor & grad_output, const at::Tensor & self, at::Tensor & out) { + return at::_ops::hardswish_backward_out::call(grad_output, self, out); +} + +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/hardswish_backward_cpu_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/hardswish_backward_cpu_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..a52f8a80932e8b70c47e75c5bac73971a385bc7e --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/hardswish_backward_cpu_dispatch.h @@ -0,0 +1,28 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace cpu { + +TORCH_API at::Tensor hardswish_backward(const at::Tensor & grad_output, const at::Tensor & self); + +} // namespace cpu +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/hardtanh_backward_cuda_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/hardtanh_backward_cuda_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..f0cdc655726ab4e6f6fd107752870b2697951674 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/hardtanh_backward_cuda_dispatch.h @@ -0,0 +1,30 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace cuda { + +TORCH_API at::Tensor hardtanh_backward(const at::Tensor & grad_output, const at::Tensor & self, const at::Scalar & min_val, const at::Scalar & max_val); +TORCH_API at::Tensor & hardtanh_backward_out(at::Tensor & grad_input, const at::Tensor & grad_output, const at::Tensor & self, const at::Scalar & min_val, const at::Scalar & max_val); +TORCH_API at::Tensor & hardtanh_backward_outf(const at::Tensor & grad_output, const at::Tensor & self, const at::Scalar & min_val, const at::Scalar & max_val, at::Tensor & grad_input); + +} // namespace cuda +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/hash_tensor_cpu_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/hash_tensor_cpu_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..517e3f05b8545e1a2a4b08eb990fde600993257c --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/hash_tensor_cpu_dispatch.h @@ -0,0 +1,30 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace cpu { + +TORCH_API at::Tensor hash_tensor(const at::Tensor & self, at::IntArrayRef dim={}, bool keepdim=false, int64_t mode=0); +TORCH_API at::Tensor & hash_tensor_out(at::Tensor & out, const at::Tensor & self, at::IntArrayRef dim={}, bool keepdim=false, int64_t mode=0); +TORCH_API at::Tensor & hash_tensor_outf(const at::Tensor & self, at::IntArrayRef dim, bool keepdim, int64_t mode, at::Tensor & out); + +} // namespace cpu +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/heaviside_meta.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/heaviside_meta.h new file mode 100644 index 0000000000000000000000000000000000000000..c32b4439bbf5fce5c7dd043a3fa1be1e51a511bd --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/heaviside_meta.h @@ -0,0 +1,32 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeMetaFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace at { +namespace meta { + +struct TORCH_API structured_heaviside : public TensorIteratorBase { + + + void meta(const at::Tensor & self, const at::Tensor & values); +}; + +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/histogram_native.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/histogram_native.h new file mode 100644 index 0000000000000000000000000000000000000000..3d92573759460c4b12e9036c9673e5a394980137 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/histogram_native.h @@ -0,0 +1,29 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace at { +namespace native { +TORCH_API ::std::tuple histogram(const at::Tensor & self, const at::Tensor & bins, const ::std::optional & weight={}, bool density=false); +TORCH_API ::std::tuple histogram_out(const at::Tensor & self, const at::Tensor & bins, const ::std::optional & weight, bool density, at::Tensor & hist, at::Tensor & bin_edges); +TORCH_API ::std::tuple histogram(const at::Tensor & self, int64_t bins=100, ::std::optional> range=::std::nullopt, const ::std::optional & weight={}, bool density=false); +TORCH_API ::std::tuple histogram_out(const at::Tensor & self, int64_t bins, ::std::optional> range, const ::std::optional & weight, bool density, at::Tensor & hist, at::Tensor & bin_edges); +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/hsplit.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/hsplit.h new file mode 100644 index 0000000000000000000000000000000000000000..9239df9a2785276c3ec6d2de7df6cc05b5a81f24 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/hsplit.h @@ -0,0 +1,41 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::hsplit.int(Tensor(a -> *) self, int sections) -> Tensor(a)[] +inline ::std::vector hsplit(const at::Tensor & self, int64_t sections) { + return at::_ops::hsplit_int::call(self, sections); +} + +// aten::hsplit.array(Tensor(a -> *) self, int[] indices) -> Tensor(a)[] +inline ::std::vector hsplit(const at::Tensor & self, at::IntArrayRef indices) { + return at::_ops::hsplit_array::call(self, indices); +} + +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/hsplit_ops.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/hsplit_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..2a209e5d64c4e3afb8afe164dd2f6e2e2dc87292 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/hsplit_ops.h @@ -0,0 +1,45 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { +namespace _ops { + + +struct TORCH_API hsplit_int { + using schema = ::std::vector (const at::Tensor &, int64_t); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::hsplit"; + static constexpr const char* overload_name = "int"; + static constexpr const char* schema_str = "hsplit.int(Tensor(a -> *) self, int sections) -> Tensor(a)[]"; + static ::std::vector call(const at::Tensor & self, int64_t sections); + static ::std::vector redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, int64_t sections); +}; + +struct TORCH_API hsplit_array { + using schema = ::std::vector (const at::Tensor &, at::IntArrayRef); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::hsplit"; + static constexpr const char* overload_name = "array"; + static constexpr const char* schema_str = "hsplit.array(Tensor(a -> *) self, int[] indices) -> Tensor(a)[]"; + 
static ::std::vector call(const at::Tensor & self, at::IntArrayRef indices); + static ::std::vector redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::IntArrayRef indices); +}; + +}} // namespace at::_ops + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/hspmm_ops.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/hspmm_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..cc51ecf489bd19bcd6b58c57feac6a4ece349a04 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/hspmm_ops.h @@ -0,0 +1,45 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { +namespace _ops { + + +struct TORCH_API hspmm_out { + using schema = at::Tensor & (const at::Tensor &, const at::Tensor &, at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::hspmm"; + static constexpr const char* overload_name = "out"; + static constexpr const char* schema_str = "hspmm.out(Tensor mat1, Tensor mat2, *, Tensor(a!) 
out) -> Tensor(a!)"; + static at::Tensor & call(const at::Tensor & mat1, const at::Tensor & mat2, at::Tensor & out); + static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & mat1, const at::Tensor & mat2, at::Tensor & out); +}; + +struct TORCH_API hspmm { + using schema = at::Tensor (const at::Tensor &, const at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::hspmm"; + static constexpr const char* overload_name = ""; + static constexpr const char* schema_str = "hspmm(Tensor mat1, Tensor mat2) -> Tensor"; + static at::Tensor call(const at::Tensor & mat1, const at::Tensor & mat2); + static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & mat1, const at::Tensor & mat2); +}; + +}} // namespace at::_ops + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/huber_loss.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/huber_loss.h new file mode 100644 index 0000000000000000000000000000000000000000..f9afd897f7092f2f66e8052ef7827bec18d6399f --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/huber_loss.h @@ -0,0 +1,45 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::huber_loss.out(Tensor self, Tensor target, int reduction=Mean, float delta=1.0, *, Tensor(a!) out) -> Tensor(a!) 
+inline at::Tensor & huber_loss_out(at::Tensor & out, const at::Tensor & self, const at::Tensor & target, int64_t reduction=at::Reduction::Mean, double delta=1.0) { + return at::_ops::huber_loss_out::call(self, target, reduction, delta, out); +} +// aten::huber_loss.out(Tensor self, Tensor target, int reduction=Mean, float delta=1.0, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & huber_loss_outf(const at::Tensor & self, const at::Tensor & target, int64_t reduction, double delta, at::Tensor & out) { + return at::_ops::huber_loss_out::call(self, target, reduction, delta, out); +} + +// aten::huber_loss(Tensor self, Tensor target, int reduction=Mean, float delta=1.0) -> Tensor +inline at::Tensor huber_loss(const at::Tensor & self, const at::Tensor & target, int64_t reduction=at::Reduction::Mean, double delta=1.0) { + return at::_ops::huber_loss::call(self, target, reduction, delta); +} + +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/i0_compositeexplicitautogradnonfunctional_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/i0_compositeexplicitautogradnonfunctional_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..a5b8ef18587d3f8d59d13f0c0fe1295da5d9ad76 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/i0_compositeexplicitautogradnonfunctional_dispatch.h @@ -0,0 +1,29 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace compositeexplicitautogradnonfunctional { + +TORCH_API at::Tensor i0(const at::Tensor & self); +TORCH_API at::Tensor & i0_(at::Tensor & self); + +} // namespace compositeexplicitautogradnonfunctional +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/igammac_compositeexplicitautogradnonfunctional_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/igammac_compositeexplicitautogradnonfunctional_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..8982137ca94681f40c48b2336a686e362377021c --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/igammac_compositeexplicitautogradnonfunctional_dispatch.h @@ -0,0 +1,29 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace compositeexplicitautogradnonfunctional { + +TORCH_API at::Tensor igammac(const at::Tensor & self, const at::Tensor & other); +TORCH_API at::Tensor & igammac_(at::Tensor & self, const at::Tensor & other); + +} // namespace compositeexplicitautogradnonfunctional +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/im2col_cpu_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/im2col_cpu_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..4f2255e5e4b9d7c19dde5af1e90dab15048e4001 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/im2col_cpu_dispatch.h @@ -0,0 +1,30 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace cpu { + +TORCH_API at::Tensor im2col(const at::Tensor & self, at::IntArrayRef kernel_size, at::IntArrayRef dilation, at::IntArrayRef padding, at::IntArrayRef stride); +TORCH_API at::Tensor & im2col_out(at::Tensor & out, const at::Tensor & self, at::IntArrayRef kernel_size, at::IntArrayRef dilation, at::IntArrayRef padding, at::IntArrayRef stride); +TORCH_API at::Tensor & im2col_outf(const at::Tensor & self, at::IntArrayRef kernel_size, at::IntArrayRef dilation, at::IntArrayRef padding, at::IntArrayRef stride, at::Tensor & out); + +} // namespace cpu +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/index_add.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/index_add.h new file mode 100644 index 0000000000000000000000000000000000000000..5b183a2659666b1d2085e57ea3dba12cd3893cec --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/index_add.h @@ -0,0 +1,50 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::index_add.out(Tensor self, int dim, Tensor index, Tensor source, *, Scalar alpha=1, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & index_add_out(at::Tensor & out, const at::Tensor & self, int64_t dim, const at::Tensor & index, const at::Tensor & source, const at::Scalar & alpha=1) { + return at::_ops::index_add_out::call(self, dim, index, source, alpha, out); +} +// aten::index_add.out(Tensor self, int dim, Tensor index, Tensor source, *, Scalar alpha=1, Tensor(a!) out) -> Tensor(a!) 
+inline at::Tensor & index_add_outf(const at::Tensor & self, int64_t dim, const at::Tensor & index, const at::Tensor & source, const at::Scalar & alpha, at::Tensor & out) { + return at::_ops::index_add_out::call(self, dim, index, source, alpha, out); +} + +// aten::index_add(Tensor self, int dim, Tensor index, Tensor source, *, Scalar alpha=1) -> Tensor +inline at::Tensor index_add(const at::Tensor & self, int64_t dim, const at::Tensor & index, const at::Tensor & source, const at::Scalar & alpha=1) { + return at::_ops::index_add::call(self, dim, index, source, alpha); +} + +// aten::index_add.dimname(Tensor self, Dimname dim, Tensor index, Tensor source, *, Scalar alpha=1) -> Tensor +inline at::Tensor index_add(const at::Tensor & self, at::Dimname dim, const at::Tensor & index, const at::Tensor & source, const at::Scalar & alpha=1) { + return at::_ops::index_add_dimname::call(self, dim, index, source, alpha); +} + +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/index_add_cuda_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/index_add_cuda_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..8e3b767548c213164d1d4da355fd35570ee11146 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/index_add_cuda_dispatch.h @@ -0,0 +1,31 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace cuda { + +TORCH_API at::Tensor index_add(const at::Tensor & self, int64_t dim, const at::Tensor & index, const at::Tensor & source, const at::Scalar & alpha=1); +TORCH_API at::Tensor & index_add_out(at::Tensor & out, const at::Tensor & self, int64_t dim, const at::Tensor & index, const at::Tensor & source, const at::Scalar & alpha=1); +TORCH_API at::Tensor & index_add_outf(const at::Tensor & self, int64_t dim, const at::Tensor & index, const at::Tensor & source, const at::Scalar & alpha, at::Tensor & out); +TORCH_API at::Tensor & index_add_(at::Tensor & self, int64_t dim, const at::Tensor & index, const at::Tensor & source, const at::Scalar & alpha=1); + +} // namespace cuda +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/index_copy.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/index_copy.h new file mode 100644 index 0000000000000000000000000000000000000000..965e9dcf569bff46609544f1b7b97b9d71f598d7 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/index_copy.h @@ -0,0 +1,50 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::index_copy.out(Tensor self, int dim, Tensor index, Tensor source, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & index_copy_out(at::Tensor & out, const at::Tensor & self, int64_t dim, const at::Tensor & index, const at::Tensor & source) { + return at::_ops::index_copy_out::call(self, dim, index, source, out); +} +// aten::index_copy.out(Tensor self, int dim, Tensor index, Tensor source, *, Tensor(a!) out) -> Tensor(a!) 
+inline at::Tensor & index_copy_outf(const at::Tensor & self, int64_t dim, const at::Tensor & index, const at::Tensor & source, at::Tensor & out) { + return at::_ops::index_copy_out::call(self, dim, index, source, out); +} + +// aten::index_copy(Tensor self, int dim, Tensor index, Tensor source) -> Tensor +inline at::Tensor index_copy(const at::Tensor & self, int64_t dim, const at::Tensor & index, const at::Tensor & source) { + return at::_ops::index_copy::call(self, dim, index, source); +} + +// aten::index_copy.dimname(Tensor self, Dimname dim, Tensor index, Tensor source) -> Tensor +inline at::Tensor index_copy(const at::Tensor & self, at::Dimname dim, const at::Tensor & index, const at::Tensor & source) { + return at::_ops::index_copy_dimname::call(self, dim, index, source); +} + +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/index_select_compositeimplicitautograd_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/index_select_compositeimplicitautograd_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..0344fb0328be55999266ca9b9e0833133403de77 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/index_select_compositeimplicitautograd_dispatch.h @@ -0,0 +1,30 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. 
+// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace compositeimplicitautograd { + +TORCH_API at::Tensor index_select(const at::Tensor & self, at::Dimname dim, const at::Tensor & index); +TORCH_API at::Tensor & index_select_out(at::Tensor & out, const at::Tensor & self, at::Dimname dim, const at::Tensor & index); +TORCH_API at::Tensor & index_select_outf(const at::Tensor & self, at::Dimname dim, const at::Tensor & index, at::Tensor & out); + +} // namespace compositeimplicitautograd +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/index_select_cuda_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/index_select_cuda_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..6bf02283793f1b07310f1783149f6e1ac846143f --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/index_select_cuda_dispatch.h @@ -0,0 +1,30 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. 
+#include + +namespace at { + +namespace cuda { + +TORCH_API at::Tensor index_select(const at::Tensor & self, int64_t dim, const at::Tensor & index); +TORCH_API at::Tensor & index_select_out(at::Tensor & out, const at::Tensor & self, int64_t dim, const at::Tensor & index); +TORCH_API at::Tensor & index_select_outf(const at::Tensor & self, int64_t dim, const at::Tensor & index, at::Tensor & out); + +} // namespace cuda +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/index_select_native.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/index_select_native.h new file mode 100644 index 0000000000000000000000000000000000000000..3df244a9f544d493da23a90adfb5aeb2f5b30305 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/index_select_native.h @@ -0,0 +1,35 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace at { +namespace native { +TORCH_API at::Tensor index_select_cpu_(const at::Tensor & self, int64_t dim, const at::Tensor & index); +TORCH_API at::Tensor & index_select_out_cpu_(const at::Tensor & self, int64_t dim, const at::Tensor & index, at::Tensor & out); +TORCH_API at::Tensor index_select_cuda(const at::Tensor & self, int64_t dim, const at::Tensor & index); +TORCH_API at::Tensor & index_select_out_cuda(const at::Tensor & self, int64_t dim, const at::Tensor & index, at::Tensor & out); +TORCH_API at::Tensor index_select_sparse_cpu(const at::Tensor & self, int64_t dim, const at::Tensor & index); +TORCH_API at::Tensor 
index_select_sparse_cuda(const at::Tensor & self, int64_t dim, const at::Tensor & index); +TORCH_API at::Tensor index_select_quantized_cpu_(const at::Tensor & self, int64_t dim, const at::Tensor & index); +TORCH_API at::Tensor index_select_quantized_cuda(const at::Tensor & self, int64_t dim, const at::Tensor & index); +TORCH_API at::Tensor index_select(const at::Tensor & self, at::Dimname dim, const at::Tensor & index); +TORCH_API at::Tensor & index_select_out(const at::Tensor & self, at::Dimname dim, const at::Tensor & index, at::Tensor & out); +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/indices_compositeexplicitautograd_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/indices_compositeexplicitautograd_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..0022a75353d1822f26d4f6d4b7ee2d6e14949aa2 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/indices_compositeexplicitautograd_dispatch.h @@ -0,0 +1,28 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. 
+#include + +namespace at { + +namespace compositeexplicitautograd { + +TORCH_API at::Tensor indices(const at::Tensor & self); + +} // namespace compositeexplicitautograd +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/infinitely_differentiable_gelu_backward_compositeimplicitautograd_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/infinitely_differentiable_gelu_backward_compositeimplicitautograd_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..b24561cf68aa63638f9835d85fafa8229255e4e5 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/infinitely_differentiable_gelu_backward_compositeimplicitautograd_dispatch.h @@ -0,0 +1,28 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace compositeimplicitautograd { + +TORCH_API at::Tensor infinitely_differentiable_gelu_backward(const at::Tensor & grad, const at::Tensor & self); + +} // namespace compositeimplicitautograd +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/int_repr_compositeexplicitautograd_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/int_repr_compositeexplicitautograd_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..188db1c1994b510b88c4381c7dd59243fe19013e --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/int_repr_compositeexplicitautograd_dispatch.h @@ -0,0 +1,29 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace compositeexplicitautograd { + +TORCH_API at::Tensor & int_repr_out(at::Tensor & out, const at::Tensor & self); +TORCH_API at::Tensor & int_repr_outf(const at::Tensor & self, at::Tensor & out); + +} // namespace compositeexplicitautograd +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/inverse.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/inverse.h new file mode 100644 index 0000000000000000000000000000000000000000..4fa54e59b1c1b3c31afa204821896d5226641d3b --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/inverse.h @@ -0,0 +1,45 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::inverse(Tensor self) -> Tensor +inline at::Tensor inverse(const at::Tensor & self) { + return at::_ops::inverse::call(self); +} + +// aten::inverse.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & inverse_out(at::Tensor & out, const at::Tensor & self) { + return at::_ops::inverse_out::call(self, out); +} +// aten::inverse.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & inverse_outf(const at::Tensor & self, at::Tensor & out) { + return at::_ops::inverse_out::call(self, out); +} + +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/is_coalesced_compositeexplicitautograd_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/is_coalesced_compositeexplicitautograd_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..26e8edbd8245f90a02750b7822b9e7a449555a8b --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/is_coalesced_compositeexplicitautograd_dispatch.h @@ -0,0 +1,28 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace compositeexplicitautograd { + +TORCH_API bool is_coalesced(const at::Tensor & self); + +} // namespace compositeexplicitautograd +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/is_floating_point_compositeimplicitautograd_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/is_floating_point_compositeimplicitautograd_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..4b3d2b27813586ec3978bf7b4397efcf601b4cca --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/is_floating_point_compositeimplicitautograd_dispatch.h @@ -0,0 +1,28 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace compositeimplicitautograd { + +TORCH_API bool is_floating_point(const at::Tensor & self); + +} // namespace compositeimplicitautograd +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/is_inference_ops.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/is_inference_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..8759c7f6b87405f3bcc8595581e327f84539d9b0 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/is_inference_ops.h @@ -0,0 +1,34 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { +namespace _ops { + + +struct TORCH_API is_inference { + using schema = bool (const at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::is_inference"; + static constexpr const char* overload_name = ""; + static constexpr const char* schema_str = "is_inference(Tensor self) -> bool"; + static bool call(const at::Tensor & self); + static bool redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self); +}; + +}} // namespace at::_ops + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/is_leaf_ops.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/is_leaf_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..1bcb22e66f57d9c718f1a05947e519884c222148 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/is_leaf_ops.h @@ -0,0 +1,34 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { +namespace _ops { + + +struct TORCH_API is_leaf { + using schema = bool (const at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::is_leaf"; + static constexpr const char* overload_name = ""; + static constexpr const char* schema_str = "is_leaf(Tensor self) -> bool"; + static bool call(const at::Tensor & self); + static bool redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self); +}; + +}} // namespace at::_ops + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/isfinite_native.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/isfinite_native.h new file mode 100644 index 0000000000000000000000000000000000000000..318a706b2b09cc58386b9a950b7fbbdf2a1ba194 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/isfinite_native.h @@ -0,0 +1,26 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace at { +namespace native { +TORCH_API at::Tensor isfinite(const at::Tensor & self); +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/isin_native.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/isin_native.h new file mode 100644 index 0000000000000000000000000000000000000000..e0b8bb6b7509189ba7cfeee57ae615b76a43352d --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/isin_native.h @@ -0,0 +1,34 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace at { +namespace native { +struct TORCH_API structured_isin_Tensor_Tensor_out : public at::meta::structured_isin_Tensor_Tensor { +void impl(const at::Tensor & elements, const at::Tensor & test_elements, bool assume_unique, bool invert, const at::Tensor & out); +}; +struct TORCH_API structured_isin_Tensor_Scalar_out : public at::meta::structured_isin_Tensor_Scalar { +void impl(const at::Tensor & elements, const at::Scalar & test_element, bool assume_unique, bool invert, const at::Tensor & out); +}; +struct TORCH_API structured_isin_Scalar_Tensor_out : public at::meta::structured_isin_Scalar_Tensor { +void impl(const at::Scalar & element, const at::Tensor & test_elements, bool assume_unique, bool invert, const at::Tensor & out); +}; +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/isnan_ops.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/isnan_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..02f17f66c851dc2070e21109644f831eff134f27 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/isnan_ops.h @@ -0,0 +1,45 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { +namespace _ops { + + +struct TORCH_API isnan { + using schema = at::Tensor (const at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::isnan"; + static constexpr const char* overload_name = ""; + static constexpr const char* schema_str = "isnan(Tensor self) -> Tensor"; + static at::Tensor call(const at::Tensor & self); + static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self); +}; + +struct TORCH_API isnan_out { + using schema = at::Tensor & (const at::Tensor &, at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::isnan"; + static constexpr const char* overload_name = "out"; + static constexpr const char* schema_str = "isnan.out(Tensor self, *, Tensor(a!) 
out) -> Tensor(a!)"; + static at::Tensor & call(const at::Tensor & self, at::Tensor & out); + static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::Tensor & out); +}; + +}} // namespace at::_ops + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/isneginf_ops.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/isneginf_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..ee502e46a50bf9eba4efebd7bcf2d78064bf5858 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/isneginf_ops.h @@ -0,0 +1,45 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. 
+#include + +namespace at { +namespace _ops { + + +struct TORCH_API isneginf { + using schema = at::Tensor (const at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::isneginf"; + static constexpr const char* overload_name = ""; + static constexpr const char* schema_str = "isneginf(Tensor self) -> Tensor"; + static at::Tensor call(const at::Tensor & self); + static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self); +}; + +struct TORCH_API isneginf_out { + using schema = at::Tensor & (const at::Tensor &, at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::isneginf"; + static constexpr const char* overload_name = "out"; + static constexpr const char* schema_str = "isneginf.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)"; + static at::Tensor & call(const at::Tensor & self, at::Tensor & out); + static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::Tensor & out); +}; + +}} // namespace at::_ops + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/isposinf_native.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/isposinf_native.h new file mode 100644 index 0000000000000000000000000000000000000000..a389e107dca6fe2796fb306c8830e96752f55fff --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/isposinf_native.h @@ -0,0 +1,33 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace at { +namespace native { +struct TORCH_API structured_isposinf_out : public at::meta::structured_isposinf { +void impl(const at::Tensor & self, const at::Tensor & out); +}; +TORCH_API at::Tensor NestedTensor_isposinf(const at::Tensor & self); +TORCH_API at::Tensor isposinf_sparse(const at::Tensor & self); +TORCH_API at::Tensor & isposinf_sparse_out(const at::Tensor & self, at::Tensor & out); +TORCH_API at::Tensor isposinf_sparse_csr(const at::Tensor & self); +TORCH_API at::Tensor & isposinf_sparse_csr_out(const at::Tensor & self, at::Tensor & out); +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/kron.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/kron.h new file mode 100644 index 0000000000000000000000000000000000000000..b4a689c2f9a698cf63fcfe617c4de5fd94842823 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/kron.h @@ -0,0 +1,45 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::kron(Tensor self, Tensor other) -> Tensor +inline at::Tensor kron(const at::Tensor & self, const at::Tensor & other) { + return at::_ops::kron::call(self, other); +} + +// aten::kron.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & kron_out(at::Tensor & out, const at::Tensor & self, const at::Tensor & other) { + return at::_ops::kron_out::call(self, other, out); +} +// aten::kron.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & kron_outf(const at::Tensor & self, const at::Tensor & other, at::Tensor & out) { + return at::_ops::kron_out::call(self, other, out); +} + +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/kthvalue_cpu_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/kthvalue_cpu_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..cf2c4630c0c5b7abe65f12acbd3e41d42991e864 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/kthvalue_cpu_dispatch.h @@ -0,0 +1,31 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace cpu { + +TORCH_API ::std::tuple kthvalue_out(at::Tensor & values, at::Tensor & indices, const at::Tensor & self, int64_t k, int64_t dim=-1, bool keepdim=false); +TORCH_API ::std::tuple kthvalue_outf(const at::Tensor & self, int64_t k, int64_t dim, bool keepdim, at::Tensor & values, at::Tensor & indices); +TORCH_API ::std::tuple kthvalue_symint_out(at::Tensor & values, at::Tensor & indices, const at::Tensor & self, c10::SymInt k, int64_t dim=-1, bool keepdim=false); +TORCH_API ::std::tuple kthvalue_symint_outf(const at::Tensor & self, c10::SymInt k, int64_t dim, bool keepdim, at::Tensor & values, at::Tensor & indices); + +} // namespace cpu +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/kthvalue_ops.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/kthvalue_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..895466421feddb32d6d753e53f6fc63bb4a149d0 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/kthvalue_ops.h @@ -0,0 +1,67 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { +namespace _ops { + + +struct TORCH_API kthvalue { + using schema = ::std::tuple (const at::Tensor &, c10::SymInt, int64_t, bool); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::kthvalue"; + static constexpr const char* overload_name = ""; + static constexpr const char* schema_str = "kthvalue(Tensor self, SymInt k, int dim=-1, bool keepdim=False) -> (Tensor values, Tensor indices)"; + static ::std::tuple call(const at::Tensor & self, c10::SymInt k, int64_t dim, bool keepdim); + static ::std::tuple redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, c10::SymInt k, int64_t dim, bool keepdim); +}; + +struct TORCH_API kthvalue_values { + using schema = ::std::tuple (const at::Tensor &, c10::SymInt, int64_t, bool, at::Tensor &, at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::kthvalue"; + static constexpr 
const char* overload_name = "values"; + static constexpr const char* schema_str = "kthvalue.values(Tensor self, SymInt k, int dim=-1, bool keepdim=False, *, Tensor(a!) values, Tensor(b!) indices) -> (Tensor(a!) values, Tensor(b!) indices)"; + static ::std::tuple call(const at::Tensor & self, c10::SymInt k, int64_t dim, bool keepdim, at::Tensor & values, at::Tensor & indices); + static ::std::tuple redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, c10::SymInt k, int64_t dim, bool keepdim, at::Tensor & values, at::Tensor & indices); +}; + +struct TORCH_API kthvalue_dimname { + using schema = ::std::tuple (const at::Tensor &, c10::SymInt, at::Dimname, bool); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::kthvalue"; + static constexpr const char* overload_name = "dimname"; + static constexpr const char* schema_str = "kthvalue.dimname(Tensor self, SymInt k, Dimname dim, bool keepdim=False) -> (Tensor values, Tensor indices)"; + static ::std::tuple call(const at::Tensor & self, c10::SymInt k, at::Dimname dim, bool keepdim); + static ::std::tuple redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, c10::SymInt k, at::Dimname dim, bool keepdim); +}; + +struct TORCH_API kthvalue_dimname_out { + using schema = ::std::tuple (const at::Tensor &, c10::SymInt, at::Dimname, bool, at::Tensor &, at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::kthvalue"; + static constexpr const char* overload_name = "dimname_out"; + static constexpr const char* schema_str = "kthvalue.dimname_out(Tensor self, SymInt k, Dimname dim, bool keepdim=False, *, Tensor(a!) values, Tensor(b!) indices) -> (Tensor(a!) values, Tensor(b!) 
indices)"; + static ::std::tuple call(const at::Tensor & self, c10::SymInt k, at::Dimname dim, bool keepdim, at::Tensor & values, at::Tensor & indices); + static ::std::tuple redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, c10::SymInt k, at::Dimname dim, bool keepdim, at::Tensor & values, at::Tensor & indices); +}; + +}} // namespace at::_ops + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/ldexp_compositeimplicitautograd_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/ldexp_compositeimplicitautograd_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..40914e8661c887f8a5f8dab0104efb5d2ead776f --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/ldexp_compositeimplicitautograd_dispatch.h @@ -0,0 +1,31 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. 
+#include + +namespace at { + +namespace compositeimplicitautograd { + +TORCH_API at::Tensor ldexp(const at::Tensor & self, const at::Tensor & other); +TORCH_API at::Tensor & ldexp_out(at::Tensor & out, const at::Tensor & self, const at::Tensor & other); +TORCH_API at::Tensor & ldexp_outf(const at::Tensor & self, const at::Tensor & other, at::Tensor & out); +TORCH_API at::Tensor & ldexp_(at::Tensor & self, const at::Tensor & other); + +} // namespace compositeimplicitautograd +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/le.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/le.h new file mode 100644 index 0000000000000000000000000000000000000000..d8cb95b007c766aa23c3e24135fb2e73d8e407f3 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/le.h @@ -0,0 +1,59 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::le.Scalar_out(Tensor self, Scalar other, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & le_out(at::Tensor & out, const at::Tensor & self, const at::Scalar & other) { + return at::_ops::le_Scalar_out::call(self, other, out); +} +// aten::le.Scalar_out(Tensor self, Scalar other, *, Tensor(a!) out) -> Tensor(a!) 
+inline at::Tensor & le_outf(const at::Tensor & self, const at::Scalar & other, at::Tensor & out) { + return at::_ops::le_Scalar_out::call(self, other, out); +} + +// aten::le.Scalar(Tensor self, Scalar other) -> Tensor +inline at::Tensor le(const at::Tensor & self, const at::Scalar & other) { + return at::_ops::le_Scalar::call(self, other); +} + +// aten::le.Tensor_out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & le_out(at::Tensor & out, const at::Tensor & self, const at::Tensor & other) { + return at::_ops::le_Tensor_out::call(self, other, out); +} +// aten::le.Tensor_out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & le_outf(const at::Tensor & self, const at::Tensor & other, at::Tensor & out) { + return at::_ops::le_Tensor_out::call(self, other, out); +} + +// aten::le.Tensor(Tensor self, Tensor other) -> Tensor +inline at::Tensor le(const at::Tensor & self, const at::Tensor & other) { + return at::_ops::le_Tensor::call(self, other); +} + +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/le_cuda_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/le_cuda_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..6a2d604df85df4dfdea2c169a76a7d067dd0a18e --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/le_cuda_dispatch.h @@ -0,0 +1,35 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. 
+#include + +namespace at { + +namespace cuda { + +TORCH_API at::Tensor le(const at::Tensor & self, const at::Scalar & other); +TORCH_API at::Tensor & le_out(at::Tensor & out, const at::Tensor & self, const at::Scalar & other); +TORCH_API at::Tensor & le_outf(const at::Tensor & self, const at::Scalar & other, at::Tensor & out); +TORCH_API at::Tensor & le_(at::Tensor & self, const at::Scalar & other); +TORCH_API at::Tensor le(const at::Tensor & self, const at::Tensor & other); +TORCH_API at::Tensor & le_out(at::Tensor & out, const at::Tensor & self, const at::Tensor & other); +TORCH_API at::Tensor & le_outf(const at::Tensor & self, const at::Tensor & other, at::Tensor & out); +TORCH_API at::Tensor & le_(at::Tensor & self, const at::Tensor & other); + +} // namespace cuda +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/le_meta.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/le_meta.h new file mode 100644 index 0000000000000000000000000000000000000000..06ddf8a83cc953355e7852d41f87f5c36b4a29c3 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/le_meta.h @@ -0,0 +1,37 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeMetaFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace at { +namespace meta { + +struct TORCH_API structured_le_Scalar : public TensorIteratorBase { + + + void meta(const at::Tensor & self, const at::Scalar & other); +}; +struct TORCH_API structured_le_Tensor : public TensorIteratorBase { + + + void meta(const at::Tensor & self, const at::Tensor & other); 
+}; + +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/le_native.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/le_native.h new file mode 100644 index 0000000000000000000000000000000000000000..e9bdccfdd8a305448ced0e21165ecc054110d857 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/le_native.h @@ -0,0 +1,35 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace at { +namespace native { +struct TORCH_API structured_le_Scalar_out : public at::meta::structured_le_Scalar { +void impl(const at::Tensor & self, const at::Scalar & other, const at::Tensor & out); +}; +TORCH_API at::Tensor le_quantized_cpu(const at::Tensor & self, const at::Scalar & other); +TORCH_API at::Tensor & le_out_quantized_cpu(const at::Tensor & self, const at::Scalar & other, at::Tensor & out); +struct TORCH_API structured_le_Tensor_out : public at::meta::structured_le_Tensor { +void impl(const at::Tensor & self, const at::Tensor & other, const at::Tensor & out); +}; +TORCH_API at::Tensor le_quantized_cpu(const at::Tensor & self, const at::Tensor & other); +TORCH_API at::Tensor & le_out_quantized_cpu(const at::Tensor & self, const at::Tensor & other, at::Tensor & out); +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/leaky_relu_backward_cuda_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/leaky_relu_backward_cuda_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..07ef30f88c8cc904c8799d2690387a1d1e304d77 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/leaky_relu_backward_cuda_dispatch.h @@ -0,0 +1,30 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace cuda { + +TORCH_API at::Tensor leaky_relu_backward(const at::Tensor & grad_output, const at::Tensor & self, const at::Scalar & negative_slope, bool self_is_result); +TORCH_API at::Tensor & leaky_relu_backward_out(at::Tensor & grad_input, const at::Tensor & grad_output, const at::Tensor & self, const at::Scalar & negative_slope, bool self_is_result); +TORCH_API at::Tensor & leaky_relu_backward_outf(const at::Tensor & grad_output, const at::Tensor & self, const at::Scalar & negative_slope, bool self_is_result, at::Tensor & grad_input); + +} // namespace cuda +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/leaky_relu_meta.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/leaky_relu_meta.h new file mode 100644 index 0000000000000000000000000000000000000000..0f0d6e14ccc384357cbb078dbcd38f5339179da8 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/leaky_relu_meta.h @@ -0,0 +1,32 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeMetaFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace at { +namespace meta { + +struct TORCH_API structured_leaky_relu : public TensorIteratorBase { + + + void meta(const at::Tensor & self, const at::Scalar & negative_slope); +}; + +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/less_equal_compositeimplicitautograd_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/less_equal_compositeimplicitautograd_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..609c300aee040daea066866817d71e990bdce7db --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/less_equal_compositeimplicitautograd_dispatch.h @@ -0,0 +1,35 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. 
+#include + +namespace at { + +namespace compositeimplicitautograd { + +TORCH_API at::Tensor less_equal(const at::Tensor & self, const at::Scalar & other); +TORCH_API at::Tensor & less_equal_out(at::Tensor & out, const at::Tensor & self, const at::Scalar & other); +TORCH_API at::Tensor & less_equal_outf(const at::Tensor & self, const at::Scalar & other, at::Tensor & out); +TORCH_API at::Tensor & less_equal_(at::Tensor & self, const at::Scalar & other); +TORCH_API at::Tensor less_equal(const at::Tensor & self, const at::Tensor & other); +TORCH_API at::Tensor & less_equal_out(at::Tensor & out, const at::Tensor & self, const at::Tensor & other); +TORCH_API at::Tensor & less_equal_outf(const at::Tensor & self, const at::Tensor & other, at::Tensor & out); +TORCH_API at::Tensor & less_equal_(at::Tensor & self, const at::Tensor & other); + +} // namespace compositeimplicitautograd +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/linalg_cholesky_ex_cpu_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/linalg_cholesky_ex_cpu_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..c559fdf200341af4c473aaa9399a78de2addd9bb --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/linalg_cholesky_ex_cpu_dispatch.h @@ -0,0 +1,30 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace cpu { + +TORCH_API ::std::tuple linalg_cholesky_ex(const at::Tensor & self, bool upper=false, bool check_errors=false); +TORCH_API ::std::tuple linalg_cholesky_ex_out(at::Tensor & L, at::Tensor & info, const at::Tensor & self, bool upper=false, bool check_errors=false); +TORCH_API ::std::tuple linalg_cholesky_ex_outf(const at::Tensor & self, bool upper, bool check_errors, at::Tensor & L, at::Tensor & info); + +} // namespace cpu +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/linalg_cholesky_ex_ops.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/linalg_cholesky_ex_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..2852f9a52d35dc55c401085cb1742bfee77a8a67 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/linalg_cholesky_ex_ops.h @@ -0,0 +1,45 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { +namespace _ops { + + +struct TORCH_API linalg_cholesky_ex { + using schema = ::std::tuple (const at::Tensor &, bool, bool); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::linalg_cholesky_ex"; + static constexpr const char* overload_name = ""; + static constexpr const char* schema_str = "linalg_cholesky_ex(Tensor self, *, bool upper=False, bool check_errors=False) -> (Tensor L, Tensor info)"; + static ::std::tuple call(const at::Tensor & self, bool upper, bool check_errors); + static ::std::tuple redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, bool upper, bool check_errors); +}; + +struct TORCH_API linalg_cholesky_ex_L { + using schema = ::std::tuple (const at::Tensor &, bool, bool, at::Tensor &, at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::linalg_cholesky_ex"; + 
static constexpr const char* overload_name = "L"; + static constexpr const char* schema_str = "linalg_cholesky_ex.L(Tensor self, *, bool upper=False, bool check_errors=False, Tensor(a!) L, Tensor(b!) info) -> (Tensor(a!) L, Tensor(b!) info)"; + static ::std::tuple call(const at::Tensor & self, bool upper, bool check_errors, at::Tensor & L, at::Tensor & info); + static ::std::tuple redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, bool upper, bool check_errors, at::Tensor & L, at::Tensor & info); +}; + +}} // namespace at::_ops + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/linalg_cond_compositeimplicitautograd_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/linalg_cond_compositeimplicitautograd_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..707e142ab188c67d8e5c00f21d972ec331bc178a --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/linalg_cond_compositeimplicitautograd_dispatch.h @@ -0,0 +1,33 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. 
+#include + +namespace at { + +namespace compositeimplicitautograd { + +TORCH_API at::Tensor linalg_cond(const at::Tensor & self, const ::std::optional & p=::std::nullopt); +TORCH_API at::Tensor & linalg_cond_out(at::Tensor & out, const at::Tensor & self, const ::std::optional & p=::std::nullopt); +TORCH_API at::Tensor & linalg_cond_outf(const at::Tensor & self, const ::std::optional & p, at::Tensor & out); +TORCH_API at::Tensor linalg_cond(const at::Tensor & self, c10::string_view p); +TORCH_API at::Tensor & linalg_cond_out(at::Tensor & out, const at::Tensor & self, c10::string_view p); +TORCH_API at::Tensor & linalg_cond_outf(const at::Tensor & self, c10::string_view p, at::Tensor & out); + +} // namespace compositeimplicitautograd +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/linalg_cross_meta_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/linalg_cross_meta_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..13ee1be502af31f5c1a194e2936b4bfa3f3db803 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/linalg_cross_meta_dispatch.h @@ -0,0 +1,30 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. 
+// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace meta { + +TORCH_API at::Tensor linalg_cross(const at::Tensor & self, const at::Tensor & other, int64_t dim=-1); +TORCH_API at::Tensor & linalg_cross_out(at::Tensor & out, const at::Tensor & self, const at::Tensor & other, int64_t dim=-1); +TORCH_API at::Tensor & linalg_cross_outf(const at::Tensor & self, const at::Tensor & other, int64_t dim, at::Tensor & out); + +} // namespace meta +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/linalg_det.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/linalg_det.h new file mode 100644 index 0000000000000000000000000000000000000000..7121a2922f083105618084322c1a9a164d69fe1b --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/linalg_det.h @@ -0,0 +1,45 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::linalg_det(Tensor A) -> Tensor +inline at::Tensor linalg_det(const at::Tensor & A) { + return at::_ops::linalg_det::call(A); +} + +// aten::linalg_det.out(Tensor A, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & linalg_det_out(at::Tensor & out, const at::Tensor & A) { + return at::_ops::linalg_det_out::call(A, out); +} +// aten::linalg_det.out(Tensor A, *, Tensor(a!) out) -> Tensor(a!) 
+inline at::Tensor & linalg_det_outf(const at::Tensor & A, at::Tensor & out) { + return at::_ops::linalg_det_out::call(A, out); +} + +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/linalg_eig_cpu_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/linalg_eig_cpu_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..c0a0fa5bb6db19163d9625b03015d185144a2c4f --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/linalg_eig_cpu_dispatch.h @@ -0,0 +1,30 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace cpu { + +TORCH_API ::std::tuple linalg_eig(const at::Tensor & self); +TORCH_API ::std::tuple linalg_eig_out(at::Tensor & eigenvalues, at::Tensor & eigenvectors, const at::Tensor & self); +TORCH_API ::std::tuple linalg_eig_outf(const at::Tensor & self, at::Tensor & eigenvalues, at::Tensor & eigenvectors); + +} // namespace cpu +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/linalg_eig_ops.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/linalg_eig_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..7ecfc5673ab65e45818701bca883081416f2d526 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/linalg_eig_ops.h @@ -0,0 +1,45 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { +namespace _ops { + + +struct TORCH_API linalg_eig { + using schema = ::std::tuple (const at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::linalg_eig"; + static constexpr const char* overload_name = ""; + static constexpr const char* schema_str = "linalg_eig(Tensor self) -> (Tensor eigenvalues, Tensor eigenvectors)"; + static ::std::tuple call(const at::Tensor & self); + static ::std::tuple redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self); +}; + +struct TORCH_API linalg_eig_out { + using schema = ::std::tuple (const at::Tensor &, at::Tensor &, at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::linalg_eig"; + static constexpr const char* overload_name = "out"; + static constexpr const char* schema_str = "linalg_eig.out(Tensor self, *, Tensor(a!) eigenvalues, Tensor(b!) 
eigenvectors) -> (Tensor(a!) eigenvalues, Tensor(b!) eigenvectors)"; + static ::std::tuple call(const at::Tensor & self, at::Tensor & eigenvalues, at::Tensor & eigenvectors); + static ::std::tuple redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::Tensor & eigenvalues, at::Tensor & eigenvectors); +}; + +}} // namespace at::_ops + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/linalg_eigh_native.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/linalg_eigh_native.h new file mode 100644 index 0000000000000000000000000000000000000000..0755332f4fc6cadebbdf61638619584d603f7400 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/linalg_eigh_native.h @@ -0,0 +1,27 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace at { +namespace native { +TORCH_API ::std::tuple linalg_eigh(const at::Tensor & self, c10::string_view UPLO="L"); +TORCH_API ::std::tuple linalg_eigh_out(const at::Tensor & self, c10::string_view UPLO, at::Tensor & eigvals, at::Tensor & eigvecs); +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/linalg_householder_product_cuda_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/linalg_householder_product_cuda_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..d830b2033bc422b77c72b8ea7900c08ee48a80c4 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/linalg_householder_product_cuda_dispatch.h @@ -0,0 +1,30 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace cuda { + +TORCH_API at::Tensor linalg_householder_product(const at::Tensor & input, const at::Tensor & tau); +TORCH_API at::Tensor & linalg_householder_product_out(at::Tensor & out, const at::Tensor & input, const at::Tensor & tau); +TORCH_API at::Tensor & linalg_householder_product_outf(const at::Tensor & input, const at::Tensor & tau, at::Tensor & out); + +} // namespace cuda +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/linalg_inv_ex.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/linalg_inv_ex.h new file mode 100644 index 0000000000000000000000000000000000000000..d2cacef7c42eabe4b91aa5491259968dff797c56 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/linalg_inv_ex.h @@ -0,0 +1,45 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::linalg_inv_ex(Tensor A, *, bool check_errors=False) -> (Tensor inverse, Tensor info) +inline ::std::tuple linalg_inv_ex(const at::Tensor & A, bool check_errors=false) { + return at::_ops::linalg_inv_ex::call(A, check_errors); +} + +// aten::linalg_inv_ex.inverse(Tensor A, *, bool check_errors=False, Tensor(a!) inverse, Tensor(b!) info) -> (Tensor(a!) inverse, Tensor(b!) info) +inline ::std::tuple linalg_inv_ex_out(at::Tensor & inverse, at::Tensor & info, const at::Tensor & A, bool check_errors=false) { + return at::_ops::linalg_inv_ex_inverse::call(A, check_errors, inverse, info); +} +// aten::linalg_inv_ex.inverse(Tensor A, *, bool check_errors=False, Tensor(a!) inverse, Tensor(b!) info) -> (Tensor(a!) inverse, Tensor(b!) info) +inline ::std::tuple linalg_inv_ex_outf(const at::Tensor & A, bool check_errors, at::Tensor & inverse, at::Tensor & info) { + return at::_ops::linalg_inv_ex_inverse::call(A, check_errors, inverse, info); +} + +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/linalg_inv_ops.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/linalg_inv_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..c4a88c0324c4ed89e245a0fdad461b309dd94367 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/linalg_inv_ops.h @@ -0,0 +1,45 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { +namespace _ops { + + +struct TORCH_API linalg_inv { + using schema = at::Tensor (const at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::linalg_inv"; + static constexpr const char* overload_name = ""; + static constexpr const char* schema_str = "linalg_inv(Tensor A) -> Tensor"; + static at::Tensor call(const at::Tensor & A); + static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & A); +}; + +struct TORCH_API linalg_inv_out { + using schema = at::Tensor & (const at::Tensor &, at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::linalg_inv"; + static constexpr const char* overload_name = "out"; + static constexpr const char* schema_str = "linalg_inv.out(Tensor A, *, Tensor(a!) 
out) -> Tensor(a!)"; + static at::Tensor & call(const at::Tensor & A, at::Tensor & out); + static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & A, at::Tensor & out); +}; + +}} // namespace at::_ops + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/linalg_ldl_solve_cuda_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/linalg_ldl_solve_cuda_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..20f951810cc43e6e636cc5b10adc27886eb058d2 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/linalg_ldl_solve_cuda_dispatch.h @@ -0,0 +1,30 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. 
+#include + +namespace at { + +namespace cuda { + +TORCH_API at::Tensor linalg_ldl_solve(const at::Tensor & LD, const at::Tensor & pivots, const at::Tensor & B, bool hermitian=false); +TORCH_API at::Tensor & linalg_ldl_solve_out(at::Tensor & out, const at::Tensor & LD, const at::Tensor & pivots, const at::Tensor & B, bool hermitian=false); +TORCH_API at::Tensor & linalg_ldl_solve_outf(const at::Tensor & LD, const at::Tensor & pivots, const at::Tensor & B, bool hermitian, at::Tensor & out); + +} // namespace cuda +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/linalg_lu_factor_ex.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/linalg_lu_factor_ex.h new file mode 100644 index 0000000000000000000000000000000000000000..596fb4a281eb626f2c8dac783bc1103e1f920174 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/linalg_lu_factor_ex.h @@ -0,0 +1,45 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::linalg_lu_factor_ex(Tensor A, *, bool pivot=True, bool check_errors=False) -> (Tensor LU, Tensor pivots, Tensor info) +inline ::std::tuple linalg_lu_factor_ex(const at::Tensor & A, bool pivot=true, bool check_errors=false) { + return at::_ops::linalg_lu_factor_ex::call(A, pivot, check_errors); +} + +// aten::linalg_lu_factor_ex.out(Tensor A, *, bool pivot=True, bool check_errors=False, Tensor(a!) LU, Tensor(b!) pivots, Tensor(c!) info) -> (Tensor(a!) LU, Tensor(b!) 
pivots, Tensor(c!) info) +inline ::std::tuple linalg_lu_factor_ex_out(at::Tensor & LU, at::Tensor & pivots, at::Tensor & info, const at::Tensor & A, bool pivot=true, bool check_errors=false) { + return at::_ops::linalg_lu_factor_ex_out::call(A, pivot, check_errors, LU, pivots, info); +} +// aten::linalg_lu_factor_ex.out(Tensor A, *, bool pivot=True, bool check_errors=False, Tensor(a!) LU, Tensor(b!) pivots, Tensor(c!) info) -> (Tensor(a!) LU, Tensor(b!) pivots, Tensor(c!) info) +inline ::std::tuple linalg_lu_factor_ex_outf(const at::Tensor & A, bool pivot, bool check_errors, at::Tensor & LU, at::Tensor & pivots, at::Tensor & info) { + return at::_ops::linalg_lu_factor_ex_out::call(A, pivot, check_errors, LU, pivots, info); +} + +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/linalg_lu_factor_ex_meta.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/linalg_lu_factor_ex_meta.h new file mode 100644 index 0000000000000000000000000000000000000000..47195dca4ac69bf24720dc37a2edf08b3cae6227 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/linalg_lu_factor_ex_meta.h @@ -0,0 +1,32 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeMetaFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace at { +namespace meta { + +struct TORCH_API structured_linalg_lu_factor_ex : public at::impl::MetaBase { + + + void meta(const at::Tensor & A, bool pivot, bool check_errors); +}; + +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or 
TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/linalg_lu_factor_ex_ops.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/linalg_lu_factor_ex_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..c88a0f8f3b2439d63297908f880e709922ed6a0a --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/linalg_lu_factor_ex_ops.h @@ -0,0 +1,45 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. 
+#include + +namespace at { +namespace _ops { + + +struct TORCH_API linalg_lu_factor_ex { + using schema = ::std::tuple (const at::Tensor &, bool, bool); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::linalg_lu_factor_ex"; + static constexpr const char* overload_name = ""; + static constexpr const char* schema_str = "linalg_lu_factor_ex(Tensor A, *, bool pivot=True, bool check_errors=False) -> (Tensor LU, Tensor pivots, Tensor info)"; + static ::std::tuple call(const at::Tensor & A, bool pivot, bool check_errors); + static ::std::tuple redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & A, bool pivot, bool check_errors); +}; + +struct TORCH_API linalg_lu_factor_ex_out { + using schema = ::std::tuple (const at::Tensor &, bool, bool, at::Tensor &, at::Tensor &, at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::linalg_lu_factor_ex"; + static constexpr const char* overload_name = "out"; + static constexpr const char* schema_str = "linalg_lu_factor_ex.out(Tensor A, *, bool pivot=True, bool check_errors=False, Tensor(a!) LU, Tensor(b!) pivots, Tensor(c!) info) -> (Tensor(a!) LU, Tensor(b!) pivots, Tensor(c!) info)"; + static ::std::tuple call(const at::Tensor & A, bool pivot, bool check_errors, at::Tensor & LU, at::Tensor & pivots, at::Tensor & info); + static ::std::tuple redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & A, bool pivot, bool check_errors, at::Tensor & LU, at::Tensor & pivots, at::Tensor & info); +}; + +}} // namespace at::_ops + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/linalg_lu_solve_compositeexplicitautogradnonfunctional_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/linalg_lu_solve_compositeexplicitautogradnonfunctional_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..38a55b08e3d38aee1d1171f790092d0f5196b67c --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/linalg_lu_solve_compositeexplicitautogradnonfunctional_dispatch.h @@ -0,0 +1,28 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace compositeexplicitautogradnonfunctional { + +TORCH_API at::Tensor linalg_lu_solve(const at::Tensor & LU, const at::Tensor & pivots, const at::Tensor & B, bool left=true, bool adjoint=false); + +} // namespace compositeexplicitautogradnonfunctional +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/linalg_matrix_exp_native.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/linalg_matrix_exp_native.h new file mode 100644 index 0000000000000000000000000000000000000000..dc562d898c9bef25c41b7241368a0c46bc226c4e --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/linalg_matrix_exp_native.h @@ -0,0 +1,27 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace at { +namespace native { +TORCH_API at::Tensor & linalg_matrix_exp_out(const at::Tensor & self, at::Tensor & out); +TORCH_API at::Tensor linalg_matrix_exp(const at::Tensor & self); +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/linalg_qr_meta_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/linalg_qr_meta_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..5cbfe8d1e219d3dd35b4f2fd854c05532223647a --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/linalg_qr_meta_dispatch.h @@ -0,0 +1,30 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace meta { + +TORCH_API ::std::tuple linalg_qr(const at::Tensor & A, c10::string_view mode="reduced"); +TORCH_API ::std::tuple linalg_qr_out(at::Tensor & Q, at::Tensor & R, const at::Tensor & A, c10::string_view mode="reduced"); +TORCH_API ::std::tuple linalg_qr_outf(const at::Tensor & A, c10::string_view mode, at::Tensor & Q, at::Tensor & R); + +} // namespace meta +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/linalg_qr_ops.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/linalg_qr_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..26e9eea0b27f31b012e98c7435a135a2fbc54518 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/linalg_qr_ops.h @@ -0,0 +1,45 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { +namespace _ops { + + +struct TORCH_API linalg_qr { + using schema = ::std::tuple (const at::Tensor &, c10::string_view); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::linalg_qr"; + static constexpr const char* overload_name = ""; + static constexpr const char* schema_str = "linalg_qr(Tensor A, str mode='reduced') -> (Tensor Q, Tensor R)"; + static ::std::tuple call(const at::Tensor & A, c10::string_view mode); + static ::std::tuple redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & A, c10::string_view mode); +}; + +struct TORCH_API linalg_qr_out { + using schema = ::std::tuple (const at::Tensor &, c10::string_view, at::Tensor &, at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::linalg_qr"; + static constexpr const char* overload_name = "out"; + static constexpr const char* schema_str = 
"linalg_qr.out(Tensor A, str mode='reduced', *, Tensor(a!) Q, Tensor(b!) R) -> (Tensor(a!) Q, Tensor(b!) R)"; + static ::std::tuple call(const at::Tensor & A, c10::string_view mode, at::Tensor & Q, at::Tensor & R); + static ::std::tuple redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & A, c10::string_view mode, at::Tensor & Q, at::Tensor & R); +}; + +}} // namespace at::_ops + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/linalg_solve_ex_compositeimplicitautograd_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/linalg_solve_ex_compositeimplicitautograd_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..802d657ffc3028c967cdb269996dbb92f8d80f07 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/linalg_solve_ex_compositeimplicitautograd_dispatch.h @@ -0,0 +1,30 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. 
+#include + +namespace at { + +namespace compositeimplicitautograd { + +TORCH_API ::std::tuple linalg_solve_ex(const at::Tensor & A, const at::Tensor & B, bool left=true, bool check_errors=false); +TORCH_API ::std::tuple linalg_solve_ex_out(at::Tensor & result, at::Tensor & info, const at::Tensor & A, const at::Tensor & B, bool left=true, bool check_errors=false); +TORCH_API ::std::tuple linalg_solve_ex_outf(const at::Tensor & A, const at::Tensor & B, bool left, bool check_errors, at::Tensor & result, at::Tensor & info); + +} // namespace compositeimplicitautograd +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/linalg_solve_triangular.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/linalg_solve_triangular.h new file mode 100644 index 0000000000000000000000000000000000000000..65302752c92320fca592cf89ad38b14a89c43031 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/linalg_solve_triangular.h @@ -0,0 +1,45 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::linalg_solve_triangular.out(Tensor self, Tensor B, *, bool upper, bool left=True, bool unitriangular=False, Tensor(a!) out) -> Tensor(a!) 
+inline at::Tensor & linalg_solve_triangular_out(at::Tensor & out, const at::Tensor & self, const at::Tensor & B, bool upper, bool left=true, bool unitriangular=false) { + return at::_ops::linalg_solve_triangular_out::call(self, B, upper, left, unitriangular, out); +} +// aten::linalg_solve_triangular.out(Tensor self, Tensor B, *, bool upper, bool left=True, bool unitriangular=False, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & linalg_solve_triangular_outf(const at::Tensor & self, const at::Tensor & B, bool upper, bool left, bool unitriangular, at::Tensor & out) { + return at::_ops::linalg_solve_triangular_out::call(self, B, upper, left, unitriangular, out); +} + +// aten::linalg_solve_triangular(Tensor self, Tensor B, *, bool upper, bool left=True, bool unitriangular=False) -> Tensor +inline at::Tensor linalg_solve_triangular(const at::Tensor & self, const at::Tensor & B, bool upper, bool left=true, bool unitriangular=false) { + return at::_ops::linalg_solve_triangular::call(self, B, upper, left, unitriangular); +} + +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/linalg_vander_compositeimplicitautograd_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/linalg_vander_compositeimplicitautograd_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..5c44fddb60852d70ef049e91f8ec41be9052c759 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/linalg_vander_compositeimplicitautograd_dispatch.h @@ -0,0 +1,29 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace compositeimplicitautograd { + +TORCH_API at::Tensor linalg_vander(const at::Tensor & x, ::std::optional N=::std::nullopt); +TORCH_API at::Tensor linalg_vander_symint(const at::Tensor & x, ::std::optional N=::std::nullopt); + +} // namespace compositeimplicitautograd +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/linalg_vector_norm.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/linalg_vector_norm.h new file mode 100644 index 0000000000000000000000000000000000000000..77c85ebb64dc26118d3fd731e37ba8155a73c06f --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/linalg_vector_norm.h @@ -0,0 +1,45 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::linalg_vector_norm(Tensor self, Scalar ord=2, int[1]? dim=None, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor +inline at::Tensor linalg_vector_norm(const at::Tensor & self, const at::Scalar & ord=2, at::OptionalIntArrayRef dim=::std::nullopt, bool keepdim=false, ::std::optional dtype=::std::nullopt) { + return at::_ops::linalg_vector_norm::call(self, ord, dim, keepdim, dtype); +} + +// aten::linalg_vector_norm.out(Tensor self, Scalar ord=2, int[1]? dim=None, bool keepdim=False, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & linalg_vector_norm_out(at::Tensor & out, const at::Tensor & self, const at::Scalar & ord=2, at::OptionalIntArrayRef dim=::std::nullopt, bool keepdim=false, ::std::optional dtype=::std::nullopt) { + return at::_ops::linalg_vector_norm_out::call(self, ord, dim, keepdim, dtype, out); +} +// aten::linalg_vector_norm.out(Tensor self, Scalar ord=2, int[1]? dim=None, bool keepdim=False, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!) 
+inline at::Tensor & linalg_vector_norm_outf(const at::Tensor & self, const at::Scalar & ord, at::OptionalIntArrayRef dim, bool keepdim, ::std::optional dtype, at::Tensor & out) { + return at::_ops::linalg_vector_norm_out::call(self, ord, dim, keepdim, dtype, out); +} + +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/linear_native.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/linear_native.h new file mode 100644 index 0000000000000000000000000000000000000000..4af6aa47faf6a782527270e967cc00cfcf0ec7d8 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/linear_native.h @@ -0,0 +1,28 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace at { +namespace native { +TORCH_API at::Tensor linear(const at::Tensor & input, const at::Tensor & weight, const ::std::optional & bias={}); +TORCH_API at::Tensor & linear_out(const at::Tensor & input, const at::Tensor & weight, const ::std::optional & bias, at::Tensor & out); +TORCH_API at::Tensor nested_linear(const at::Tensor & input, const at::Tensor & weight, const ::std::optional & bias={}); +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/linear_ops.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/linear_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..70ce7e0ad76578be8bec51b69b49f1d078ace17f --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/linear_ops.h @@ -0,0 +1,45 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { +namespace _ops { + + +struct TORCH_API linear { + using schema = at::Tensor (const at::Tensor &, const at::Tensor &, const ::std::optional &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::linear"; + static constexpr const char* overload_name = ""; + static constexpr const char* schema_str = "linear(Tensor input, Tensor weight, Tensor? 
bias=None) -> Tensor"; + static at::Tensor call(const at::Tensor & input, const at::Tensor & weight, const ::std::optional & bias); + static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & input, const at::Tensor & weight, const ::std::optional & bias); +}; + +struct TORCH_API linear_out { + using schema = at::Tensor & (const at::Tensor &, const at::Tensor &, const ::std::optional &, at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::linear"; + static constexpr const char* overload_name = "out"; + static constexpr const char* schema_str = "linear.out(Tensor input, Tensor weight, Tensor? bias=None, *, Tensor(a!) out) -> Tensor(a!)"; + static at::Tensor & call(const at::Tensor & input, const at::Tensor & weight, const ::std::optional & bias, at::Tensor & out); + static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & input, const at::Tensor & weight, const ::std::optional & bias, at::Tensor & out); +}; + +}} // namespace at::_ops + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/linspace_cuda_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/linspace_cuda_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..04e1219c0356c9f0a729533f219e511dcfe48611 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/linspace_cuda_dispatch.h @@ -0,0 +1,29 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace cuda { + +TORCH_API at::Tensor & linspace_out(at::Tensor & out, const at::Scalar & start, const at::Scalar & end, int64_t steps); +TORCH_API at::Tensor & linspace_outf(const at::Scalar & start, const at::Scalar & end, int64_t steps, at::Tensor & out); + +} // namespace cuda +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/linspace_ops.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/linspace_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..38386ff8cfe8af30828a631816ab2c380afb79c4 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/linspace_ops.h @@ -0,0 +1,111 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { +namespace _ops { + + +struct TORCH_API linspace { + using schema = at::Tensor (const at::Scalar &, const at::Scalar &, int64_t, ::std::optional, ::std::optional, ::std::optional, ::std::optional); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::linspace"; + static constexpr const char* overload_name = ""; + static constexpr const char* schema_str = "linspace(Scalar start, Scalar end, int steps, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? 
pin_memory=None) -> Tensor"; + static at::Tensor call(const at::Scalar & start, const at::Scalar & end, int64_t steps, ::std::optional dtype, ::std::optional layout, ::std::optional device, ::std::optional pin_memory); + static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Scalar & start, const at::Scalar & end, int64_t steps, ::std::optional dtype, ::std::optional layout, ::std::optional device, ::std::optional pin_memory); +}; + +struct TORCH_API linspace_Tensor_Tensor { + using schema = at::Tensor (const at::Tensor &, const at::Tensor &, int64_t, ::std::optional, ::std::optional, ::std::optional, ::std::optional); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::linspace"; + static constexpr const char* overload_name = "Tensor_Tensor"; + static constexpr const char* schema_str = "linspace.Tensor_Tensor(Tensor start, Tensor end, int steps, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? 
pin_memory=None) -> Tensor"; + static at::Tensor call(const at::Tensor & start, const at::Tensor & end, int64_t steps, ::std::optional dtype, ::std::optional layout, ::std::optional device, ::std::optional pin_memory); + static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & start, const at::Tensor & end, int64_t steps, ::std::optional dtype, ::std::optional layout, ::std::optional device, ::std::optional pin_memory); +}; + +struct TORCH_API linspace_Tensor_Scalar { + using schema = at::Tensor (const at::Tensor &, const at::Scalar &, int64_t, ::std::optional, ::std::optional, ::std::optional, ::std::optional); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::linspace"; + static constexpr const char* overload_name = "Tensor_Scalar"; + static constexpr const char* schema_str = "linspace.Tensor_Scalar(Tensor start, Scalar end, int steps, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? 
pin_memory=None) -> Tensor"; + static at::Tensor call(const at::Tensor & start, const at::Scalar & end, int64_t steps, ::std::optional dtype, ::std::optional layout, ::std::optional device, ::std::optional pin_memory); + static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & start, const at::Scalar & end, int64_t steps, ::std::optional dtype, ::std::optional layout, ::std::optional device, ::std::optional pin_memory); +}; + +struct TORCH_API linspace_Scalar_Tensor { + using schema = at::Tensor (const at::Scalar &, const at::Tensor &, int64_t, ::std::optional, ::std::optional, ::std::optional, ::std::optional); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::linspace"; + static constexpr const char* overload_name = "Scalar_Tensor"; + static constexpr const char* schema_str = "linspace.Scalar_Tensor(Scalar start, Tensor end, int steps, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor"; + static at::Tensor call(const at::Scalar & start, const at::Tensor & end, int64_t steps, ::std::optional dtype, ::std::optional layout, ::std::optional device, ::std::optional pin_memory); + static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Scalar & start, const at::Tensor & end, int64_t steps, ::std::optional dtype, ::std::optional layout, ::std::optional device, ::std::optional pin_memory); +}; + +struct TORCH_API linspace_out { + using schema = at::Tensor & (const at::Scalar &, const at::Scalar &, int64_t, at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::linspace"; + static constexpr const char* overload_name = "out"; + static constexpr const char* schema_str = "linspace.out(Scalar start, Scalar end, int steps, *, Tensor(a!) 
out) -> Tensor(a!)"; + static at::Tensor & call(const at::Scalar & start, const at::Scalar & end, int64_t steps, at::Tensor & out); + static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, const at::Scalar & start, const at::Scalar & end, int64_t steps, at::Tensor & out); +}; + +struct TORCH_API linspace_Tensor_Tensor_out { + using schema = at::Tensor & (const at::Tensor &, const at::Tensor &, int64_t, at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::linspace"; + static constexpr const char* overload_name = "Tensor_Tensor_out"; + static constexpr const char* schema_str = "linspace.Tensor_Tensor_out(Tensor start, Tensor end, int steps, *, Tensor(a!) out) -> Tensor(a!)"; + static at::Tensor & call(const at::Tensor & start, const at::Tensor & end, int64_t steps, at::Tensor & out); + static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & start, const at::Tensor & end, int64_t steps, at::Tensor & out); +}; + +struct TORCH_API linspace_Tensor_Scalar_out { + using schema = at::Tensor & (const at::Tensor &, const at::Scalar &, int64_t, at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::linspace"; + static constexpr const char* overload_name = "Tensor_Scalar_out"; + static constexpr const char* schema_str = "linspace.Tensor_Scalar_out(Tensor start, Scalar end, int steps, *, Tensor(a!) 
out) -> Tensor(a!)"; + static at::Tensor & call(const at::Tensor & start, const at::Scalar & end, int64_t steps, at::Tensor & out); + static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & start, const at::Scalar & end, int64_t steps, at::Tensor & out); +}; + +struct TORCH_API linspace_Scalar_Tensor_out { + using schema = at::Tensor & (const at::Scalar &, const at::Tensor &, int64_t, at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::linspace"; + static constexpr const char* overload_name = "Scalar_Tensor_out"; + static constexpr const char* schema_str = "linspace.Scalar_Tensor_out(Scalar start, Tensor end, int steps, *, Tensor(a!) out) -> Tensor(a!)"; + static at::Tensor & call(const at::Scalar & start, const at::Tensor & end, int64_t steps, at::Tensor & out); + static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, const at::Scalar & start, const at::Tensor & end, int64_t steps, at::Tensor & out); +}; + +}} // namespace at::_ops + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/log1p_cuda_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/log1p_cuda_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..c3b4492ee9f3993e6ee02ea34b37c683a66ac4a6 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/log1p_cuda_dispatch.h @@ -0,0 +1,31 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace cuda { + +TORCH_API at::Tensor log1p(const at::Tensor & self); +TORCH_API at::Tensor & log1p_out(at::Tensor & out, const at::Tensor & self); +TORCH_API at::Tensor & log1p_outf(const at::Tensor & self, at::Tensor & out); +TORCH_API at::Tensor & log1p_(at::Tensor & self); + +} // namespace cuda +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/log_compositeexplicitautogradnonfunctional_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/log_compositeexplicitautogradnonfunctional_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..243d004fdbae71876df890a7d82d2b33c1e7f941 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/log_compositeexplicitautogradnonfunctional_dispatch.h @@ -0,0 +1,29 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace compositeexplicitautogradnonfunctional { + +TORCH_API at::Tensor log(const at::Tensor & self); +TORCH_API at::Tensor & log_(at::Tensor & self); + +} // namespace compositeexplicitautogradnonfunctional +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/log_meta_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/log_meta_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..5d90cb1da6e4c6bba3c7b211270c1b6323e43fe2 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/log_meta_dispatch.h @@ -0,0 +1,31 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace meta { + +TORCH_API at::Tensor log(const at::Tensor & self); +TORCH_API at::Tensor & log_out(at::Tensor & out, const at::Tensor & self); +TORCH_API at::Tensor & log_outf(const at::Tensor & self, at::Tensor & out); +TORCH_API at::Tensor & log_(at::Tensor & self); + +} // namespace meta +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/log_normal_cuda_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/log_normal_cuda_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..df780a6a1ce45523a72dd9feb3e4e15147d6301b --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/log_normal_cuda_dispatch.h @@ -0,0 +1,28 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace cuda { + +TORCH_API at::Tensor & log_normal_(at::Tensor & self, double mean=1, double std=2, ::std::optional generator=::std::nullopt); + +} // namespace cuda +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/log_softmax.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/log_softmax.h new file mode 100644 index 0000000000000000000000000000000000000000..f0f4595d6f6cebfce18ef5c4ef1b0aed89ff1032 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/log_softmax.h @@ -0,0 +1,50 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::log_softmax.int(Tensor self, int dim, ScalarType? dtype=None) -> Tensor +inline at::Tensor log_softmax(const at::Tensor & self, int64_t dim, ::std::optional dtype=::std::nullopt) { + return at::_ops::log_softmax_int::call(self, dim, dtype); +} + +// aten::log_softmax.int_out(Tensor self, int dim, ScalarType? dtype=None, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & log_softmax_out(at::Tensor & out, const at::Tensor & self, int64_t dim, ::std::optional dtype=::std::nullopt) { + return at::_ops::log_softmax_int_out::call(self, dim, dtype, out); +} +// aten::log_softmax.int_out(Tensor self, int dim, ScalarType? dtype=None, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & log_softmax_outf(const at::Tensor & self, int64_t dim, ::std::optional dtype, at::Tensor & out) { + return at::_ops::log_softmax_int_out::call(self, dim, dtype, out); +} + +// aten::log_softmax.Dimname(Tensor self, Dimname dim, *, ScalarType? 
dtype=None) -> Tensor +inline at::Tensor log_softmax(const at::Tensor & self, at::Dimname dim, ::std::optional dtype=::std::nullopt) { + return at::_ops::log_softmax_Dimname::call(self, dim, dtype); +} + +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/log_softmax_compositeexplicitautograd_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/log_softmax_compositeexplicitautograd_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..40066708f61a85f4c1caf7d1f2862d630c2578ff --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/log_softmax_compositeexplicitautograd_dispatch.h @@ -0,0 +1,29 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. 
+#include + +namespace at { + +namespace compositeexplicitautograd { + +TORCH_API at::Tensor & log_softmax_out(at::Tensor & out, const at::Tensor & self, int64_t dim, ::std::optional dtype=::std::nullopt); +TORCH_API at::Tensor & log_softmax_outf(const at::Tensor & self, int64_t dim, ::std::optional dtype, at::Tensor & out); + +} // namespace compositeexplicitautograd +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/logaddexp2_cpu_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/logaddexp2_cpu_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..2a0502d79031fd0cdeb6cf72e2700f8cef890115 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/logaddexp2_cpu_dispatch.h @@ -0,0 +1,30 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. 
+#include + +namespace at { + +namespace cpu { + +TORCH_API at::Tensor logaddexp2(const at::Tensor & self, const at::Tensor & other); +TORCH_API at::Tensor & logaddexp2_out(at::Tensor & out, const at::Tensor & self, const at::Tensor & other); +TORCH_API at::Tensor & logaddexp2_outf(const at::Tensor & self, const at::Tensor & other, at::Tensor & out); + +} // namespace cpu +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/logaddexp2_meta_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/logaddexp2_meta_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..a82090c3d80abf60be679baa606476d2580b329d --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/logaddexp2_meta_dispatch.h @@ -0,0 +1,30 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. 
+#include + +namespace at { + +namespace meta { + +TORCH_API at::Tensor logaddexp2(const at::Tensor & self, const at::Tensor & other); +TORCH_API at::Tensor & logaddexp2_out(at::Tensor & out, const at::Tensor & self, const at::Tensor & other); +TORCH_API at::Tensor & logaddexp2_outf(const at::Tensor & self, const at::Tensor & other, at::Tensor & out); + +} // namespace meta +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/logcumsumexp.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/logcumsumexp.h new file mode 100644 index 0000000000000000000000000000000000000000..38bd065af34fef814349283c1584b66df350449d --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/logcumsumexp.h @@ -0,0 +1,59 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::logcumsumexp(Tensor self, int dim) -> Tensor +inline at::Tensor logcumsumexp(const at::Tensor & self, int64_t dim) { + return at::_ops::logcumsumexp::call(self, dim); +} + +// aten::logcumsumexp.out(Tensor self, int dim, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & logcumsumexp_out(at::Tensor & out, const at::Tensor & self, int64_t dim) { + return at::_ops::logcumsumexp_out::call(self, dim, out); +} +// aten::logcumsumexp.out(Tensor self, int dim, *, Tensor(a!) out) -> Tensor(a!) 
+inline at::Tensor & logcumsumexp_outf(const at::Tensor & self, int64_t dim, at::Tensor & out) { + return at::_ops::logcumsumexp_out::call(self, dim, out); +} + +// aten::logcumsumexp.dimname(Tensor self, Dimname dim) -> Tensor +inline at::Tensor logcumsumexp(const at::Tensor & self, at::Dimname dim) { + return at::_ops::logcumsumexp_dimname::call(self, dim); +} + +// aten::logcumsumexp.dimname_out(Tensor self, Dimname dim, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & logcumsumexp_out(at::Tensor & out, const at::Tensor & self, at::Dimname dim) { + return at::_ops::logcumsumexp_dimname_out::call(self, dim, out); +} +// aten::logcumsumexp.dimname_out(Tensor self, Dimname dim, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & logcumsumexp_outf(const at::Tensor & self, at::Dimname dim, at::Tensor & out) { + return at::_ops::logcumsumexp_dimname_out::call(self, dim, out); +} + +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/logdet_compositeimplicitautograd_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/logdet_compositeimplicitautograd_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..58442e1279a4ef42018597380eafd0981278c699 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/logdet_compositeimplicitautograd_dispatch.h @@ -0,0 +1,28 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace compositeimplicitautograd { + +TORCH_API at::Tensor logdet(const at::Tensor & self); + +} // namespace compositeimplicitautograd +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/logical_not_cpu_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/logical_not_cpu_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..819366780c69cea6582c64b901f9f29e459a8b51 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/logical_not_cpu_dispatch.h @@ -0,0 +1,29 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace cpu { + +TORCH_API at::Tensor & logical_not_out(at::Tensor & out, const at::Tensor & self); +TORCH_API at::Tensor & logical_not_outf(const at::Tensor & self, at::Tensor & out); + +} // namespace cpu +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/logit_backward_native.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/logit_backward_native.h new file mode 100644 index 0000000000000000000000000000000000000000..43492ebe60e9259aedfb618d252393c005f3810e --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/logit_backward_native.h @@ -0,0 +1,28 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace at { +namespace native { +struct TORCH_API structured_logit_backward_out : public at::meta::structured_logit_backward { +void impl(const at::Tensor & grad_output, const at::Tensor & self, ::std::optional eps, const at::Tensor & grad_input); +}; +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/logit_ops.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/logit_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..fa426ce05eb9522fe700c7b3572a6eaf4bb5353b --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/logit_ops.h @@ -0,0 +1,56 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. 
+// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { +namespace _ops { + + +struct TORCH_API logit { + using schema = at::Tensor (const at::Tensor &, ::std::optional); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::logit"; + static constexpr const char* overload_name = ""; + static constexpr const char* schema_str = "logit(Tensor self, float? eps=None) -> Tensor"; + static at::Tensor call(const at::Tensor & self, ::std::optional eps); + static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, ::std::optional eps); +}; + +struct TORCH_API logit_ { + using schema = at::Tensor & (at::Tensor &, ::std::optional); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::logit_"; + static constexpr const char* overload_name = ""; + static constexpr const char* schema_str = "logit_(Tensor(a!) self, float? eps=None) -> Tensor(a!)"; + static at::Tensor & call(at::Tensor & self, ::std::optional eps); + static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, at::Tensor & self, ::std::optional eps); +}; + +struct TORCH_API logit_out { + using schema = at::Tensor & (const at::Tensor &, ::std::optional, at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::logit"; + static constexpr const char* overload_name = "out"; + static constexpr const char* schema_str = "logit.out(Tensor self, float? eps=None, *, Tensor(a!) 
out) -> Tensor(a!)"; + static at::Tensor & call(const at::Tensor & self, ::std::optional eps, at::Tensor & out); + static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, ::std::optional eps, at::Tensor & out); +}; + +}} // namespace at::_ops + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/logspace_cpu_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/logspace_cpu_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..8552951b6e282d4acef105889566d69c98de73dc --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/logspace_cpu_dispatch.h @@ -0,0 +1,29 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. 
+#include + +namespace at { + +namespace cpu { + +TORCH_API at::Tensor & logspace_out(at::Tensor & out, const at::Scalar & start, const at::Scalar & end, int64_t steps, double base=10.0); +TORCH_API at::Tensor & logspace_outf(const at::Scalar & start, const at::Scalar & end, int64_t steps, double base, at::Tensor & out); + +} // namespace cpu +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/logsumexp_compositeimplicitautograd_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/logsumexp_compositeimplicitautograd_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..d66a1906385fa3dac13903eba6112e82173bb07a --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/logsumexp_compositeimplicitautograd_dispatch.h @@ -0,0 +1,30 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. 
+#include + +namespace at { + +namespace compositeimplicitautograd { + +TORCH_API at::Tensor logsumexp(const at::Tensor & self, at::DimnameList dim, bool keepdim=false); +TORCH_API at::Tensor & logsumexp_out(at::Tensor & out, const at::Tensor & self, at::DimnameList dim, bool keepdim=false); +TORCH_API at::Tensor & logsumexp_outf(const at::Tensor & self, at::DimnameList dim, bool keepdim, at::Tensor & out); + +} // namespace compositeimplicitautograd +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/lshift_compositeexplicitautograd_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/lshift_compositeexplicitautograd_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..c4b6d4306cb2d789313a5f9ef2b80a2a471864ab --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/lshift_compositeexplicitautograd_dispatch.h @@ -0,0 +1,31 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. 
+#include + +namespace at { + +namespace compositeexplicitautograd { + +TORCH_API at::Tensor & __lshift___out(at::Tensor & out, const at::Tensor & self, const at::Scalar & other); +TORCH_API at::Tensor & __lshift___outf(const at::Tensor & self, const at::Scalar & other, at::Tensor & out); +TORCH_API at::Tensor & __lshift___out(at::Tensor & out, const at::Tensor & self, const at::Tensor & other); +TORCH_API at::Tensor & __lshift___outf(const at::Tensor & self, const at::Tensor & other, at::Tensor & out); + +} // namespace compositeexplicitautograd +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/lu_solve_compositeimplicitautograd_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/lu_solve_compositeimplicitautograd_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..e15a25c12e713c201608e6f308eeea5644863a95 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/lu_solve_compositeimplicitautograd_dispatch.h @@ -0,0 +1,30 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. 
+#include + +namespace at { + +namespace compositeimplicitautograd { + +TORCH_API at::Tensor lu_solve(const at::Tensor & self, const at::Tensor & LU_data, const at::Tensor & LU_pivots); +TORCH_API at::Tensor & lu_solve_out(at::Tensor & out, const at::Tensor & self, const at::Tensor & LU_data, const at::Tensor & LU_pivots); +TORCH_API at::Tensor & lu_solve_outf(const at::Tensor & self, const at::Tensor & LU_data, const at::Tensor & LU_pivots, at::Tensor & out); + +} // namespace compositeimplicitautograd +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/lu_unpack_compositeexplicitautogradnonfunctional_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/lu_unpack_compositeexplicitautogradnonfunctional_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..e9ece371a9c7eefe9f2b80efda795d3248d024e5 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/lu_unpack_compositeexplicitautogradnonfunctional_dispatch.h @@ -0,0 +1,28 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. 
+#include + +namespace at { + +namespace compositeexplicitautogradnonfunctional { + +TORCH_API ::std::tuple lu_unpack(const at::Tensor & LU_data, const at::Tensor & LU_pivots, bool unpack_data=true, bool unpack_pivots=true); + +} // namespace compositeexplicitautogradnonfunctional +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/masked_fill_cuda_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/masked_fill_cuda_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..ba6cb70cc3e2a6570e1f4633a1e38ffce5657b43 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/masked_fill_cuda_dispatch.h @@ -0,0 +1,29 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace cuda { + +TORCH_API at::Tensor & masked_fill_(at::Tensor & self, const at::Tensor & mask, const at::Scalar & value); +TORCH_API at::Tensor & masked_fill_(at::Tensor & self, const at::Tensor & mask, const at::Tensor & value); + +} // namespace cuda +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/masked_scatter_cuda_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/masked_scatter_cuda_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..ee2338a13b4a02ca999367fb5ff8fc1464084b58 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/masked_scatter_cuda_dispatch.h @@ -0,0 +1,28 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace cuda { + +TORCH_API at::Tensor & masked_scatter_(at::Tensor & self, const at::Tensor & mask, const at::Tensor & source); + +} // namespace cuda +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/matmul_compositeimplicitautograd_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/matmul_compositeimplicitautograd_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..5703aed0a370c3e3ff8e460e40dacb2cdcd9552b --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/matmul_compositeimplicitautograd_dispatch.h @@ -0,0 +1,30 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace compositeimplicitautograd { + +TORCH_API at::Tensor matmul(const at::Tensor & self, const at::Tensor & other); +TORCH_API at::Tensor & matmul_out(at::Tensor & out, const at::Tensor & self, const at::Tensor & other); +TORCH_API at::Tensor & matmul_outf(const at::Tensor & self, const at::Tensor & other, at::Tensor & out); + +} // namespace compositeimplicitautograd +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/matmul_native.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/matmul_native.h new file mode 100644 index 0000000000000000000000000000000000000000..4ef287e48b34e273e8df3a51da50ccc1634ca873 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/matmul_native.h @@ -0,0 +1,29 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace at { +namespace native { +TORCH_API at::Tensor matmul(const at::Tensor & self, const at::Tensor & other); +TORCH_API at::Tensor & matmul_out(const at::Tensor & self, const at::Tensor & other, at::Tensor & out); +TORCH_API at::Tensor matmul_nested(const at::Tensor & self, const at::Tensor & other); +TORCH_API at::Tensor & matmul_out_nested(const at::Tensor & self, const at::Tensor & other, at::Tensor & out); +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/matrix_H.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/matrix_H.h new file mode 100644 index 0000000000000000000000000000000000000000..11efff31192701882f21d5a26fc10987ee5f572b --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/matrix_H.h @@ -0,0 +1,32 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + + +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/matrix_H_compositeimplicitautograd_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/matrix_H_compositeimplicitautograd_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..fb998b9b729ef80bf11ace1c4b8214d57fba611f --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/matrix_H_compositeimplicitautograd_dispatch.h @@ -0,0 +1,28 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. 
+// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace compositeimplicitautograd { + +TORCH_API at::Tensor matrix_H(const at::Tensor & self); + +} // namespace compositeimplicitautograd +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/matrix_exp_compositeimplicitautograd_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/matrix_exp_compositeimplicitautograd_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..77e82d3d3c74e7381bcfc7f18f3cabf52e36b676 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/matrix_exp_compositeimplicitautograd_dispatch.h @@ -0,0 +1,28 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace compositeimplicitautograd { + +TORCH_API at::Tensor matrix_exp(const at::Tensor & self); + +} // namespace compositeimplicitautograd +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/matrix_power.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/matrix_power.h new file mode 100644 index 0000000000000000000000000000000000000000..37f2f26354b0679784f1dad9f6505f579166a926 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/matrix_power.h @@ -0,0 +1,45 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::matrix_power(Tensor self, int n) -> Tensor +inline at::Tensor matrix_power(const at::Tensor & self, int64_t n) { + return at::_ops::matrix_power::call(self, n); +} + +// aten::matrix_power.out(Tensor self, int n, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & matrix_power_out(at::Tensor & out, const at::Tensor & self, int64_t n) { + return at::_ops::matrix_power_out::call(self, n, out); +} +// aten::matrix_power.out(Tensor self, int n, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & matrix_power_outf(const at::Tensor & self, int64_t n, at::Tensor & out) { + return at::_ops::matrix_power_out::call(self, n, out); +} + +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/max_compositeexplicitautogradnonfunctional_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/max_compositeexplicitautogradnonfunctional_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..9553f183d4b7d164df8519e020f8a00182ea69f2 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/max_compositeexplicitautogradnonfunctional_dispatch.h @@ -0,0 +1,28 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace compositeexplicitautogradnonfunctional { + +TORCH_API ::std::tuple max(const at::Tensor & self, int64_t dim, bool keepdim=false); + +} // namespace compositeexplicitautogradnonfunctional +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/max_pool1d_compositeimplicitautograd_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/max_pool1d_compositeimplicitautograd_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..60ac4a01828a2db5c84024765558082cf44c0f64 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/max_pool1d_compositeimplicitautograd_dispatch.h @@ -0,0 +1,28 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace compositeimplicitautograd { + +TORCH_API at::Tensor max_pool1d(const at::Tensor & self, at::IntArrayRef kernel_size, at::IntArrayRef stride={}, at::IntArrayRef padding=0, at::IntArrayRef dilation=1, bool ceil_mode=false); + +} // namespace compositeimplicitautograd +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/max_pool2d_with_indices_backward_compositeexplicitautogradnonfunctional_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/max_pool2d_with_indices_backward_compositeexplicitautogradnonfunctional_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..d7b2ebd178c8fb4173978f739d2e8a0568537384 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/max_pool2d_with_indices_backward_compositeexplicitautogradnonfunctional_dispatch.h @@ -0,0 +1,28 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace compositeexplicitautogradnonfunctional { + +TORCH_API at::Tensor max_pool2d_with_indices_backward(const at::Tensor & grad_output, const at::Tensor & self, at::IntArrayRef kernel_size, at::IntArrayRef stride, at::IntArrayRef padding, at::IntArrayRef dilation, bool ceil_mode, const at::Tensor & indices); + +} // namespace compositeexplicitautogradnonfunctional +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/max_pool2d_with_indices_cpu_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/max_pool2d_with_indices_cpu_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..47c47f4872d7408cc443c56bcc32c589604ad73b --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/max_pool2d_with_indices_cpu_dispatch.h @@ -0,0 +1,30 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. 
+#include + +namespace at { + +namespace cpu { + +TORCH_API ::std::tuple max_pool2d_with_indices(const at::Tensor & self, at::IntArrayRef kernel_size, at::IntArrayRef stride={}, at::IntArrayRef padding=0, at::IntArrayRef dilation=1, bool ceil_mode=false); +TORCH_API ::std::tuple max_pool2d_with_indices_out(at::Tensor & out, at::Tensor & indices, const at::Tensor & self, at::IntArrayRef kernel_size, at::IntArrayRef stride={}, at::IntArrayRef padding=0, at::IntArrayRef dilation=1, bool ceil_mode=false); +TORCH_API ::std::tuple max_pool2d_with_indices_outf(const at::Tensor & self, at::IntArrayRef kernel_size, at::IntArrayRef stride, at::IntArrayRef padding, at::IntArrayRef dilation, bool ceil_mode, at::Tensor & out, at::Tensor & indices); + +} // namespace cpu +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/maximum_cpu_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/maximum_cpu_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..240825ac4ed896f6f6e63dc7f33e9f8811aeb5c4 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/maximum_cpu_dispatch.h @@ -0,0 +1,30 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. 
+// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace cpu { + +TORCH_API at::Tensor maximum(const at::Tensor & self, const at::Tensor & other); +TORCH_API at::Tensor & maximum_out(at::Tensor & out, const at::Tensor & self, const at::Tensor & other); +TORCH_API at::Tensor & maximum_outf(const at::Tensor & self, const at::Tensor & other, at::Tensor & out); + +} // namespace cpu +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/mean_cpu_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/mean_cpu_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..fd8b1963efe6396bde2217caa8200a53d1a6473e --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/mean_cpu_dispatch.h @@ -0,0 +1,30 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. 
+#include + +namespace at { + +namespace cpu { + +TORCH_API at::Tensor mean(const at::Tensor & self, at::OptionalIntArrayRef dim, bool keepdim=false, ::std::optional dtype=::std::nullopt); +TORCH_API at::Tensor & mean_out(at::Tensor & out, const at::Tensor & self, at::OptionalIntArrayRef dim, bool keepdim=false, ::std::optional dtype=::std::nullopt); +TORCH_API at::Tensor & mean_outf(const at::Tensor & self, at::OptionalIntArrayRef dim, bool keepdim, ::std::optional dtype, at::Tensor & out); + +} // namespace cpu +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/mean_native.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/mean_native.h new file mode 100644 index 0000000000000000000000000000000000000000..fb8b34a3a8527e41133c8e6d5293dde599e8a112 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/mean_native.h @@ -0,0 +1,34 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace at { +namespace native { +TORCH_API at::Tensor mean(const at::Tensor & self, ::std::optional dtype=::std::nullopt); +TORCH_API at::Tensor & mean_dtype_out(const at::Tensor & self, ::std::optional dtype, at::Tensor & out); +struct TORCH_API structured_mean_out : public at::meta::structured_mean_dim { +void impl(const at::Tensor & self, at::OptionalIntArrayRef dim, bool keepdim, ::std::optional dtype, const at::Tensor & out); +}; +TORCH_API at::Tensor mean_quantized_cpu(const at::Tensor & self, at::OptionalIntArrayRef dim, bool keepdim=false, 
::std::optional dtype=::std::nullopt); +TORCH_API at::Tensor & mean_out_quantized_cpu(const at::Tensor & self, at::OptionalIntArrayRef dim, bool keepdim, ::std::optional dtype, at::Tensor & out); +TORCH_API at::Tensor mean(const at::Tensor & self, at::DimnameList dim, bool keepdim=false, ::std::optional dtype=::std::nullopt); +TORCH_API at::Tensor & mean_out(const at::Tensor & self, at::DimnameList dim, bool keepdim, ::std::optional dtype, at::Tensor & out); +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/median.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/median.h new file mode 100644 index 0000000000000000000000000000000000000000..d223addd2b27bda52d4b3ac16b5cb80b9d7834b9 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/median.h @@ -0,0 +1,73 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::median(Tensor self) -> Tensor +inline at::Tensor median(const at::Tensor & self) { + return at::_ops::median::call(self); +} + +// aten::median.dim(Tensor self, int dim, bool keepdim=False) -> (Tensor values, Tensor indices) +inline ::std::tuple median(const at::Tensor & self, int64_t dim, bool keepdim=false) { + return at::_ops::median_dim::call(self, dim, keepdim); +} + +// aten::median.dim_values(Tensor self, int dim, bool keepdim=False, *, Tensor(a!) values, Tensor(b!) indices) -> (Tensor(a!) values, Tensor(b!) 
indices) +inline ::std::tuple median_out(at::Tensor & values, at::Tensor & indices, const at::Tensor & self, int64_t dim, bool keepdim=false) { + return at::_ops::median_dim_values::call(self, dim, keepdim, values, indices); +} +// aten::median.dim_values(Tensor self, int dim, bool keepdim=False, *, Tensor(a!) values, Tensor(b!) indices) -> (Tensor(a!) values, Tensor(b!) indices) +inline ::std::tuple median_outf(const at::Tensor & self, int64_t dim, bool keepdim, at::Tensor & values, at::Tensor & indices) { + return at::_ops::median_dim_values::call(self, dim, keepdim, values, indices); +} + +// aten::median.names_dim(Tensor self, Dimname dim, bool keepdim=False) -> (Tensor values, Tensor indices) +inline ::std::tuple median(const at::Tensor & self, at::Dimname dim, bool keepdim=false) { + return at::_ops::median_names_dim::call(self, dim, keepdim); +} + +// aten::median.names_dim_values(Tensor self, Dimname dim, bool keepdim=False, *, Tensor(a!) values, Tensor(b!) indices) -> (Tensor(a!) values, Tensor(b!) indices) +inline ::std::tuple median_out(at::Tensor & values, at::Tensor & indices, const at::Tensor & self, at::Dimname dim, bool keepdim=false) { + return at::_ops::median_names_dim_values::call(self, dim, keepdim, values, indices); +} +// aten::median.names_dim_values(Tensor self, Dimname dim, bool keepdim=False, *, Tensor(a!) values, Tensor(b!) indices) -> (Tensor(a!) values, Tensor(b!) indices) +inline ::std::tuple median_outf(const at::Tensor & self, at::Dimname dim, bool keepdim, at::Tensor & values, at::Tensor & indices) { + return at::_ops::median_names_dim_values::call(self, dim, keepdim, values, indices); +} + +// aten::median.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & median_out(at::Tensor & out, const at::Tensor & self) { + return at::_ops::median_out::call(self, out); +} +// aten::median.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) 
+inline at::Tensor & median_outf(const at::Tensor & self, at::Tensor & out) { + return at::_ops::median_out::call(self, out); +} + +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/median_native.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/median_native.h new file mode 100644 index 0000000000000000000000000000000000000000..c11ed2cbcfdd794f58454ee98479f3f2e2f05afd --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/median_native.h @@ -0,0 +1,33 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace at { +namespace native { +TORCH_API at::Tensor & median_out(const at::Tensor & self, at::Tensor & out); +TORCH_API at::Tensor median_cpu(const at::Tensor & self); +TORCH_API at::Tensor median_cuda(const at::Tensor & self); +TORCH_API ::std::tuple median(const at::Tensor & self, int64_t dim, bool keepdim=false); +TORCH_API ::std::tuple median_out_cpu(const at::Tensor & self, int64_t dim, bool keepdim, at::Tensor & values, at::Tensor & indices); +TORCH_API ::std::tuple median_out_cuda(const at::Tensor & self, int64_t dim, bool keepdim, at::Tensor & values, at::Tensor & indices); +TORCH_API ::std::tuple median(const at::Tensor & self, at::Dimname dim, bool keepdim=false); +TORCH_API ::std::tuple median_out(const at::Tensor & self, at::Dimname dim, bool keepdim, at::Tensor & values, at::Tensor & indices); +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is 
defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/median_ops.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/median_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..47b96ce1288dd84f0d74fea75e207fcd160424c3 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/median_ops.h @@ -0,0 +1,89 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { +namespace _ops { + + +struct TORCH_API median { + using schema = at::Tensor (const at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::median"; + static constexpr const char* overload_name = ""; + static constexpr const char* schema_str = "median(Tensor self) -> Tensor"; + static at::Tensor call(const at::Tensor & self); + static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self); +}; + +struct TORCH_API median_dim { + using schema = ::std::tuple (const at::Tensor &, int64_t, bool); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::median"; + static constexpr const char* overload_name = "dim"; + static constexpr const char* schema_str = "median.dim(Tensor self, int dim, bool keepdim=False) -> (Tensor values, Tensor indices)"; + static ::std::tuple call(const at::Tensor & self, int64_t dim, bool 
keepdim); + static ::std::tuple redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, int64_t dim, bool keepdim); +}; + +struct TORCH_API median_dim_values { + using schema = ::std::tuple (const at::Tensor &, int64_t, bool, at::Tensor &, at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::median"; + static constexpr const char* overload_name = "dim_values"; + static constexpr const char* schema_str = "median.dim_values(Tensor self, int dim, bool keepdim=False, *, Tensor(a!) values, Tensor(b!) indices) -> (Tensor(a!) values, Tensor(b!) indices)"; + static ::std::tuple call(const at::Tensor & self, int64_t dim, bool keepdim, at::Tensor & values, at::Tensor & indices); + static ::std::tuple redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, int64_t dim, bool keepdim, at::Tensor & values, at::Tensor & indices); +}; + +struct TORCH_API median_names_dim { + using schema = ::std::tuple (const at::Tensor &, at::Dimname, bool); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::median"; + static constexpr const char* overload_name = "names_dim"; + static constexpr const char* schema_str = "median.names_dim(Tensor self, Dimname dim, bool keepdim=False) -> (Tensor values, Tensor indices)"; + static ::std::tuple call(const at::Tensor & self, at::Dimname dim, bool keepdim); + static ::std::tuple redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::Dimname dim, bool keepdim); +}; + +struct TORCH_API median_names_dim_values { + using schema = ::std::tuple (const at::Tensor &, at::Dimname, bool, at::Tensor &, at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::median"; + static constexpr const char* overload_name = 
"names_dim_values"; + static constexpr const char* schema_str = "median.names_dim_values(Tensor self, Dimname dim, bool keepdim=False, *, Tensor(a!) values, Tensor(b!) indices) -> (Tensor(a!) values, Tensor(b!) indices)"; + static ::std::tuple call(const at::Tensor & self, at::Dimname dim, bool keepdim, at::Tensor & values, at::Tensor & indices); + static ::std::tuple redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::Dimname dim, bool keepdim, at::Tensor & values, at::Tensor & indices); +}; + +struct TORCH_API median_out { + using schema = at::Tensor & (const at::Tensor &, at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::median"; + static constexpr const char* overload_name = "out"; + static constexpr const char* schema_str = "median.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)"; + static at::Tensor & call(const at::Tensor & self, at::Tensor & out); + static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::Tensor & out); +}; + +}} // namespace at::_ops + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/min_native.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/min_native.h new file mode 100644 index 0000000000000000000000000000000000000000..a69824da016224e56214b693603bbfec224a4aa8 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/min_native.h @@ -0,0 +1,37 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace at { +namespace native { +struct TORCH_API structured_min_out : public at::meta::structured_min_dim { +void impl(const at::Tensor & self, int64_t dim, bool keepdim, const at::Tensor & min, const at::Tensor & min_indices); +}; +TORCH_API ::std::tuple qmin(const at::Tensor & self, int64_t dim, bool keepdim=false); +TORCH_API ::std::tuple min(const at::Tensor & self, at::Dimname dim, bool keepdim=false); +TORCH_API ::std::tuple min_out(const at::Tensor & self, at::Dimname dim, bool keepdim, at::Tensor & min, at::Tensor & min_indices); +TORCH_API at::Tensor min(const at::Tensor & self); +TORCH_API at::Tensor & min_unary_out(const at::Tensor & self, at::Tensor & out); +TORCH_API at::Tensor min_quantized_cpu(const at::Tensor & self); +TORCH_API at::Tensor & min_quantized_unary_out(const at::Tensor & self, at::Tensor & out); +TORCH_API at::Tensor min(const at::Tensor & self, const at::Tensor & other); +TORCH_API at::Tensor & min_out(const at::Tensor & self, const at::Tensor & other, at::Tensor & out); +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/min_ops.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/min_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..2ef6621381e6624e6bf2ff465dc95fb24c7d1aa6 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/min_ops.h @@ -0,0 +1,111 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { +namespace _ops { + + +struct TORCH_API min_dim { + using schema = ::std::tuple (const at::Tensor &, int64_t, bool); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::min"; + static constexpr const char* overload_name = "dim"; + static constexpr const char* schema_str = "min.dim(Tensor self, int dim, bool keepdim=False) -> (Tensor values, Tensor indices)"; + static ::std::tuple call(const at::Tensor & self, int64_t dim, bool keepdim); + static ::std::tuple redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, int64_t dim, bool keepdim); +}; + +struct TORCH_API min_dim_min { + using schema = ::std::tuple (const at::Tensor &, int64_t, bool, at::Tensor &, at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::min"; + static constexpr const char* overload_name = "dim_min"; + static constexpr const char* schema_str = 
"min.dim_min(Tensor self, int dim, bool keepdim=False, *, Tensor(a!) min, Tensor(b!) min_indices) -> (Tensor(a!) values, Tensor(b!) indices)"; + static ::std::tuple call(const at::Tensor & self, int64_t dim, bool keepdim, at::Tensor & min, at::Tensor & min_indices); + static ::std::tuple redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, int64_t dim, bool keepdim, at::Tensor & min, at::Tensor & min_indices); +}; + +struct TORCH_API min_names_dim { + using schema = ::std::tuple (const at::Tensor &, at::Dimname, bool); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::min"; + static constexpr const char* overload_name = "names_dim"; + static constexpr const char* schema_str = "min.names_dim(Tensor self, Dimname dim, bool keepdim=False) -> (Tensor values, Tensor indices)"; + static ::std::tuple call(const at::Tensor & self, at::Dimname dim, bool keepdim); + static ::std::tuple redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::Dimname dim, bool keepdim); +}; + +struct TORCH_API min_names_dim_min { + using schema = ::std::tuple (const at::Tensor &, at::Dimname, bool, at::Tensor &, at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::min"; + static constexpr const char* overload_name = "names_dim_min"; + static constexpr const char* schema_str = "min.names_dim_min(Tensor self, Dimname dim, bool keepdim=False, *, Tensor(a!) min, Tensor(b!) min_indices) -> (Tensor(a!) values, Tensor(b!) 
indices)"; + static ::std::tuple call(const at::Tensor & self, at::Dimname dim, bool keepdim, at::Tensor & min, at::Tensor & min_indices); + static ::std::tuple redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::Dimname dim, bool keepdim, at::Tensor & min, at::Tensor & min_indices); +}; + +struct TORCH_API min { + using schema = at::Tensor (const at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::min"; + static constexpr const char* overload_name = ""; + static constexpr const char* schema_str = "min(Tensor self) -> Tensor"; + static at::Tensor call(const at::Tensor & self); + static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self); +}; + +struct TORCH_API min_unary_out { + using schema = at::Tensor & (const at::Tensor &, at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::min"; + static constexpr const char* overload_name = "unary_out"; + static constexpr const char* schema_str = "min.unary_out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)"; + static at::Tensor & call(const at::Tensor & self, at::Tensor & out); + static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::Tensor & out); +}; + +struct TORCH_API min_out { + using schema = at::Tensor & (const at::Tensor &, const at::Tensor &, at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::min"; + static constexpr const char* overload_name = "out"; + static constexpr const char* schema_str = "min.out(Tensor self, Tensor other, *, Tensor(a!) 
out) -> Tensor(a!)"; + static at::Tensor & call(const at::Tensor & self, const at::Tensor & other, at::Tensor & out); + static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & other, at::Tensor & out); +}; + +struct TORCH_API min_other { + using schema = at::Tensor (const at::Tensor &, const at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::min"; + static constexpr const char* overload_name = "other"; + static constexpr const char* schema_str = "min.other(Tensor self, Tensor other) -> Tensor"; + static at::Tensor call(const at::Tensor & self, const at::Tensor & other); + static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & other); +}; + +}} // namespace at::_ops + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/miopen_batch_norm_native.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/miopen_batch_norm_native.h new file mode 100644 index 0000000000000000000000000000000000000000..b656be52beb44303a09c7b94279202630ffb78f7 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/miopen_batch_norm_native.h @@ -0,0 +1,27 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace at { +namespace native { +TORCH_API ::std::tuple miopen_batch_norm_out(const at::Tensor & input, const at::Tensor & weight, const ::std::optional & bias, const ::std::optional & running_mean, const ::std::optional & running_var, bool training, double exponential_average_factor, double epsilon, at::Tensor & out0, at::Tensor & out1, at::Tensor & out2); +TORCH_API ::std::tuple miopen_batch_norm(const at::Tensor & input, const at::Tensor & weight, const ::std::optional & bias, const ::std::optional & running_mean, const ::std::optional & running_var, bool training, double exponential_average_factor, double epsilon); +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/miopen_convolution_add_relu_cuda_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/miopen_convolution_add_relu_cuda_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..15eb0aa29d74e40d5bd2f9699e6da064a50591bd --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/miopen_convolution_add_relu_cuda_dispatch.h @@ -0,0 +1,29 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace cuda { + +TORCH_API at::Tensor miopen_convolution_add_relu(const at::Tensor & self, const at::Tensor & weight, const at::Tensor & z, const ::std::optional & alpha, const ::std::optional & bias, at::IntArrayRef stride, at::IntArrayRef padding, at::IntArrayRef dilation, int64_t groups); +TORCH_API at::Tensor miopen_convolution_add_relu_symint(const at::Tensor & self, const at::Tensor & weight, const at::Tensor & z, const ::std::optional & alpha, const ::std::optional & bias, c10::SymIntArrayRef stride, c10::SymIntArrayRef padding, c10::SymIntArrayRef dilation, c10::SymInt groups); + +} // namespace cuda +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/miopen_convolution_relu_ops.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/miopen_convolution_relu_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..137eba207341a54439ca3cf1b18f603b1e339626 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/miopen_convolution_relu_ops.h @@ -0,0 +1,34 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { +namespace _ops { + + +struct TORCH_API miopen_convolution_relu { + using schema = at::Tensor (const at::Tensor &, const at::Tensor &, const ::std::optional &, c10::SymIntArrayRef, c10::SymIntArrayRef, c10::SymIntArrayRef, c10::SymInt); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::miopen_convolution_relu"; + static constexpr const char* overload_name = ""; + static constexpr const char* schema_str = "miopen_convolution_relu(Tensor self, Tensor weight, Tensor? 
bias, SymInt[] stride, SymInt[] padding, SymInt[] dilation, SymInt groups) -> Tensor"; + static at::Tensor call(const at::Tensor & self, const at::Tensor & weight, const ::std::optional & bias, c10::SymIntArrayRef stride, c10::SymIntArrayRef padding, c10::SymIntArrayRef dilation, c10::SymInt groups); + static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & weight, const ::std::optional & bias, c10::SymIntArrayRef stride, c10::SymIntArrayRef padding, c10::SymIntArrayRef dilation, c10::SymInt groups); +}; + +}} // namespace at::_ops + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/miopen_depthwise_convolution_native.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/miopen_depthwise_convolution_native.h new file mode 100644 index 0000000000000000000000000000000000000000..3938591bfc106f4468e619e71b74de522f50d652 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/miopen_depthwise_convolution_native.h @@ -0,0 +1,27 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace at { +namespace native { +TORCH_API at::Tensor & miopen_depthwise_convolution_out_symint(const at::Tensor & self, const at::Tensor & weight, const ::std::optional & bias, c10::SymIntArrayRef padding, c10::SymIntArrayRef stride, c10::SymIntArrayRef dilation, c10::SymInt groups, bool benchmark, bool deterministic, at::Tensor & out); +TORCH_API at::Tensor miopen_depthwise_convolution(const at::Tensor & self, const at::Tensor & weight, 
const ::std::optional & bias, at::IntArrayRef padding, at::IntArrayRef stride, at::IntArrayRef dilation, int64_t groups, bool benchmark, bool deterministic); +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/miopen_rnn_backward_cuda_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/miopen_rnn_backward_cuda_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..fc509d4fa1427358e487add4b4babde7760688c8 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/miopen_rnn_backward_cuda_dispatch.h @@ -0,0 +1,28 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. 
+#include + +namespace at { + +namespace cuda { + +TORCH_API ::std::tuple> miopen_rnn_backward(const at::Tensor & input, at::TensorList weight, int64_t weight_stride0, const at::Tensor & weight_buf, const at::Tensor & hx, const ::std::optional & cx, const at::Tensor & output, const ::std::optional & grad_output, const ::std::optional & grad_hy, const ::std::optional & grad_cy, int64_t mode, int64_t hidden_size, int64_t num_layers, bool batch_first, double dropout, bool train, bool bidirectional, at::IntArrayRef batch_sizes, const ::std::optional & dropout_state, const at::Tensor & reserve, ::std::array output_mask); + +} // namespace cuda +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/miopen_rnn_compositeexplicitautograd_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/miopen_rnn_compositeexplicitautograd_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..f497b5b8ca41bc1d7a9d98df441092c025f50630 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/miopen_rnn_compositeexplicitautograd_dispatch.h @@ -0,0 +1,29 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. 
+#include + +namespace at { + +namespace compositeexplicitautograd { + +TORCH_API ::std::tuple miopen_rnn_out(at::Tensor & out0, at::Tensor & out1, at::Tensor & out2, at::Tensor & out3, at::Tensor & out4, const at::Tensor & input, at::TensorList weight, int64_t weight_stride0, const at::Tensor & hx, const ::std::optional & cx, int64_t mode, int64_t hidden_size, int64_t num_layers, bool batch_first, double dropout, bool train, bool bidirectional, at::IntArrayRef batch_sizes, const ::std::optional & dropout_state); +TORCH_API ::std::tuple miopen_rnn_outf(const at::Tensor & input, at::TensorList weight, int64_t weight_stride0, const at::Tensor & hx, const ::std::optional & cx, int64_t mode, int64_t hidden_size, int64_t num_layers, bool batch_first, double dropout, bool train, bool bidirectional, at::IntArrayRef batch_sizes, const ::std::optional & dropout_state, at::Tensor & out0, at::Tensor & out1, at::Tensor & out2, at::Tensor & out3, at::Tensor & out4); + +} // namespace compositeexplicitautograd +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/mish_backward_compositeimplicitautograd_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/mish_backward_compositeimplicitautograd_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..0a5d456fbaf21dd9d6886907eb89985118b07377 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/mish_backward_compositeimplicitautograd_dispatch.h @@ -0,0 +1,28 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace compositeimplicitautograd { + +TORCH_API at::Tensor mish_backward(const at::Tensor & grad_output, const at::Tensor & self); + +} // namespace compositeimplicitautograd +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/mkldnn_adaptive_avg_pool2d_backward.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/mkldnn_adaptive_avg_pool2d_backward.h new file mode 100644 index 0000000000000000000000000000000000000000..5d1bee44121dcccfdfb99f701c1a1f57a528b88b --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/mkldnn_adaptive_avg_pool2d_backward.h @@ -0,0 +1,45 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::mkldnn_adaptive_avg_pool2d_backward(Tensor grad_output, Tensor self) -> Tensor +inline at::Tensor mkldnn_adaptive_avg_pool2d_backward(const at::Tensor & grad_output, const at::Tensor & self) { + return at::_ops::mkldnn_adaptive_avg_pool2d_backward::call(grad_output, self); +} + +// aten::mkldnn_adaptive_avg_pool2d_backward.out(Tensor grad_output, Tensor self, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & mkldnn_adaptive_avg_pool2d_backward_out(at::Tensor & out, const at::Tensor & grad_output, const at::Tensor & self) { + return at::_ops::mkldnn_adaptive_avg_pool2d_backward_out::call(grad_output, self, out); +} +// aten::mkldnn_adaptive_avg_pool2d_backward.out(Tensor grad_output, Tensor self, *, Tensor(a!) out) -> Tensor(a!) 
+inline at::Tensor & mkldnn_adaptive_avg_pool2d_backward_outf(const at::Tensor & grad_output, const at::Tensor & self, at::Tensor & out) { + return at::_ops::mkldnn_adaptive_avg_pool2d_backward_out::call(grad_output, self, out); +} + +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/mkldnn_adaptive_avg_pool2d_backward_ops.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/mkldnn_adaptive_avg_pool2d_backward_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..3125938f98b5cca430916902a0e7f8fee69c2e26 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/mkldnn_adaptive_avg_pool2d_backward_ops.h @@ -0,0 +1,45 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. 
+#include + +namespace at { +namespace _ops { + + +struct TORCH_API mkldnn_adaptive_avg_pool2d_backward { + using schema = at::Tensor (const at::Tensor &, const at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::mkldnn_adaptive_avg_pool2d_backward"; + static constexpr const char* overload_name = ""; + static constexpr const char* schema_str = "mkldnn_adaptive_avg_pool2d_backward(Tensor grad_output, Tensor self) -> Tensor"; + static at::Tensor call(const at::Tensor & grad_output, const at::Tensor & self); + static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad_output, const at::Tensor & self); +}; + +struct TORCH_API mkldnn_adaptive_avg_pool2d_backward_out { + using schema = at::Tensor & (const at::Tensor &, const at::Tensor &, at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::mkldnn_adaptive_avg_pool2d_backward"; + static constexpr const char* overload_name = "out"; + static constexpr const char* schema_str = "mkldnn_adaptive_avg_pool2d_backward.out(Tensor grad_output, Tensor self, *, Tensor(a!) out) -> Tensor(a!)"; + static at::Tensor & call(const at::Tensor & grad_output, const at::Tensor & self, at::Tensor & out); + static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad_output, const at::Tensor & self, at::Tensor & out); +}; + +}} // namespace at::_ops + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/mkldnn_linear_backward_input_native.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/mkldnn_linear_backward_input_native.h new file mode 100644 index 0000000000000000000000000000000000000000..15c32557261103be1fbaf1264f6e496671486fc9 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/mkldnn_linear_backward_input_native.h @@ -0,0 +1,27 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace at { +namespace native { +TORCH_API at::Tensor & mkldnn_linear_backward_input_out(at::IntArrayRef input_size, const at::Tensor & grad_output, const at::Tensor & weight, at::Tensor & out); +TORCH_API at::Tensor mkldnn_linear_backward_input(at::IntArrayRef input_size, const at::Tensor & grad_output, const at::Tensor & weight); +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/mkldnn_max_pool2d_backward_native.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/mkldnn_max_pool2d_backward_native.h new file mode 100644 index 0000000000000000000000000000000000000000..55ba9a65851936d4fd7855811a69319256148996 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/mkldnn_max_pool2d_backward_native.h @@ -0,0 +1,27 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace at { +namespace native { +TORCH_API at::Tensor & mkldnn_max_pool2d_backward_out(const at::Tensor & grad_output, const at::Tensor & output, const at::Tensor & input, at::IntArrayRef kernel_size, at::IntArrayRef stride, at::IntArrayRef padding, at::IntArrayRef dilation, bool ceil_mode, at::Tensor & out); +TORCH_API at::Tensor mkldnn_max_pool2d_backward(const at::Tensor & grad_output, const at::Tensor & output, const at::Tensor & input, at::IntArrayRef kernel_size, at::IntArrayRef stride={}, at::IntArrayRef padding=0, at::IntArrayRef dilation=1, bool ceil_mode=false); +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/mkldnn_reorder_conv3d_weight.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/mkldnn_reorder_conv3d_weight.h new file mode 100644 index 0000000000000000000000000000000000000000..a99274afc925dd73a67ed0a0cae58026f2fdc899 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/mkldnn_reorder_conv3d_weight.h @@ -0,0 +1,97 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::mkldnn_reorder_conv3d_weight(Tensor self, SymInt[3] padding=0, SymInt[3] stride=1, SymInt[3] dilation=1, SymInt groups=1, SymInt[]? input_size=None) -> Tensor +inline at::Tensor mkldnn_reorder_conv3d_weight(const at::Tensor & self, at::IntArrayRef padding=0, at::IntArrayRef stride=1, at::IntArrayRef dilation=1, int64_t groups=1, at::OptionalIntArrayRef input_size=::std::nullopt) { + return at::_ops::mkldnn_reorder_conv3d_weight::call(self, c10::fromIntArrayRefSlow(padding), c10::fromIntArrayRefSlow(stride), c10::fromIntArrayRefSlow(dilation), groups, input_size.has_value() ? 
::std::make_optional(c10::fromIntArrayRefSlow(*input_size)) : ::std::nullopt); +} +namespace symint { + template >> + at::Tensor mkldnn_reorder_conv3d_weight(const at::Tensor & self, at::IntArrayRef padding=0, at::IntArrayRef stride=1, at::IntArrayRef dilation=1, int64_t groups=1, at::OptionalIntArrayRef input_size=::std::nullopt) { + return at::_ops::mkldnn_reorder_conv3d_weight::call(self, c10::fromIntArrayRefSlow(padding), c10::fromIntArrayRefSlow(stride), c10::fromIntArrayRefSlow(dilation), groups, input_size.has_value() ? ::std::make_optional(c10::fromIntArrayRefSlow(*input_size)) : ::std::nullopt); + } +} + +// aten::mkldnn_reorder_conv3d_weight(Tensor self, SymInt[3] padding=0, SymInt[3] stride=1, SymInt[3] dilation=1, SymInt groups=1, SymInt[]? input_size=None) -> Tensor +inline at::Tensor mkldnn_reorder_conv3d_weight_symint(const at::Tensor & self, c10::SymIntArrayRef padding=c10::SymInt(0), c10::SymIntArrayRef stride=c10::SymInt(1), c10::SymIntArrayRef dilation=c10::SymInt(1), c10::SymInt groups=1, at::OptionalSymIntArrayRef input_size=::std::nullopt) { + return at::_ops::mkldnn_reorder_conv3d_weight::call(self, padding, stride, dilation, groups, input_size); +} +namespace symint { + template >> + at::Tensor mkldnn_reorder_conv3d_weight(const at::Tensor & self, c10::SymIntArrayRef padding=c10::SymInt(0), c10::SymIntArrayRef stride=c10::SymInt(1), c10::SymIntArrayRef dilation=c10::SymInt(1), c10::SymInt groups=1, at::OptionalSymIntArrayRef input_size=::std::nullopt) { + return at::_ops::mkldnn_reorder_conv3d_weight::call(self, padding, stride, dilation, groups, input_size); + } +} + +// aten::mkldnn_reorder_conv3d_weight.out(Tensor self, SymInt[3] padding=0, SymInt[3] stride=1, SymInt[3] dilation=1, SymInt groups=1, SymInt[]? input_size=None, *, Tensor(a!) out) -> Tensor(a!) 
+inline at::Tensor & mkldnn_reorder_conv3d_weight_out(at::Tensor & out, const at::Tensor & self, at::IntArrayRef padding=0, at::IntArrayRef stride=1, at::IntArrayRef dilation=1, int64_t groups=1, at::OptionalIntArrayRef input_size=::std::nullopt) { + return at::_ops::mkldnn_reorder_conv3d_weight_out::call(self, c10::fromIntArrayRefSlow(padding), c10::fromIntArrayRefSlow(stride), c10::fromIntArrayRefSlow(dilation), groups, input_size.has_value() ? ::std::make_optional(c10::fromIntArrayRefSlow(*input_size)) : ::std::nullopt, out); +} +namespace symint { + template >> + at::Tensor & mkldnn_reorder_conv3d_weight_out(at::Tensor & out, const at::Tensor & self, at::IntArrayRef padding=0, at::IntArrayRef stride=1, at::IntArrayRef dilation=1, int64_t groups=1, at::OptionalIntArrayRef input_size=::std::nullopt) { + return at::_ops::mkldnn_reorder_conv3d_weight_out::call(self, c10::fromIntArrayRefSlow(padding), c10::fromIntArrayRefSlow(stride), c10::fromIntArrayRefSlow(dilation), groups, input_size.has_value() ? ::std::make_optional(c10::fromIntArrayRefSlow(*input_size)) : ::std::nullopt, out); + } +} + +// aten::mkldnn_reorder_conv3d_weight.out(Tensor self, SymInt[3] padding=0, SymInt[3] stride=1, SymInt[3] dilation=1, SymInt groups=1, SymInt[]? input_size=None, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & mkldnn_reorder_conv3d_weight_outf(const at::Tensor & self, at::IntArrayRef padding, at::IntArrayRef stride, at::IntArrayRef dilation, int64_t groups, at::OptionalIntArrayRef input_size, at::Tensor & out) { + return at::_ops::mkldnn_reorder_conv3d_weight_out::call(self, c10::fromIntArrayRefSlow(padding), c10::fromIntArrayRefSlow(stride), c10::fromIntArrayRefSlow(dilation), groups, input_size.has_value() ? 
::std::make_optional(c10::fromIntArrayRefSlow(*input_size)) : ::std::nullopt, out); +} +namespace symint { + template >> + at::Tensor & mkldnn_reorder_conv3d_weight_outf(const at::Tensor & self, at::IntArrayRef padding, at::IntArrayRef stride, at::IntArrayRef dilation, int64_t groups, at::OptionalIntArrayRef input_size, at::Tensor & out) { + return at::_ops::mkldnn_reorder_conv3d_weight_out::call(self, c10::fromIntArrayRefSlow(padding), c10::fromIntArrayRefSlow(stride), c10::fromIntArrayRefSlow(dilation), groups, input_size.has_value() ? ::std::make_optional(c10::fromIntArrayRefSlow(*input_size)) : ::std::nullopt, out); + } +} + +// aten::mkldnn_reorder_conv3d_weight.out(Tensor self, SymInt[3] padding=0, SymInt[3] stride=1, SymInt[3] dilation=1, SymInt groups=1, SymInt[]? input_size=None, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & mkldnn_reorder_conv3d_weight_symint_out(at::Tensor & out, const at::Tensor & self, c10::SymIntArrayRef padding=c10::SymInt(0), c10::SymIntArrayRef stride=c10::SymInt(1), c10::SymIntArrayRef dilation=c10::SymInt(1), c10::SymInt groups=1, at::OptionalSymIntArrayRef input_size=::std::nullopt) { + return at::_ops::mkldnn_reorder_conv3d_weight_out::call(self, padding, stride, dilation, groups, input_size, out); +} +namespace symint { + template >> + at::Tensor & mkldnn_reorder_conv3d_weight_out(at::Tensor & out, const at::Tensor & self, c10::SymIntArrayRef padding=c10::SymInt(0), c10::SymIntArrayRef stride=c10::SymInt(1), c10::SymIntArrayRef dilation=c10::SymInt(1), c10::SymInt groups=1, at::OptionalSymIntArrayRef input_size=::std::nullopt) { + return at::_ops::mkldnn_reorder_conv3d_weight_out::call(self, padding, stride, dilation, groups, input_size, out); + } +} + +// aten::mkldnn_reorder_conv3d_weight.out(Tensor self, SymInt[3] padding=0, SymInt[3] stride=1, SymInt[3] dilation=1, SymInt groups=1, SymInt[]? input_size=None, *, Tensor(a!) out) -> Tensor(a!) 
+inline at::Tensor & mkldnn_reorder_conv3d_weight_symint_outf(const at::Tensor & self, c10::SymIntArrayRef padding, c10::SymIntArrayRef stride, c10::SymIntArrayRef dilation, c10::SymInt groups, at::OptionalSymIntArrayRef input_size, at::Tensor & out) { + return at::_ops::mkldnn_reorder_conv3d_weight_out::call(self, padding, stride, dilation, groups, input_size, out); +} +namespace symint { + template >> + at::Tensor & mkldnn_reorder_conv3d_weight_outf(const at::Tensor & self, c10::SymIntArrayRef padding, c10::SymIntArrayRef stride, c10::SymIntArrayRef dilation, c10::SymInt groups, at::OptionalSymIntArrayRef input_size, at::Tensor & out) { + return at::_ops::mkldnn_reorder_conv3d_weight_out::call(self, padding, stride, dilation, groups, input_size, out); + } +} + +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/mm_cpu_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/mm_cpu_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..6a157f36ff12025a5fff065548f84c11d588ae16 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/mm_cpu_dispatch.h @@ -0,0 +1,30 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. 
+// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace cpu { + +TORCH_API at::Tensor mm(const at::Tensor & self, const at::Tensor & mat2); +TORCH_API at::Tensor & mm_out(at::Tensor & out, const at::Tensor & self, const at::Tensor & mat2); +TORCH_API at::Tensor & mm_outf(const at::Tensor & self, const at::Tensor & mat2, at::Tensor & out); + +} // namespace cpu +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/mode_compositeimplicitautograd_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/mode_compositeimplicitautograd_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..7747614a9331d7ae96eff54f71406f561e3ec752 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/mode_compositeimplicitautograd_dispatch.h @@ -0,0 +1,30 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. 
+#include + +namespace at { + +namespace compositeimplicitautograd { + +TORCH_API ::std::tuple mode(const at::Tensor & self, at::Dimname dim, bool keepdim=false); +TORCH_API ::std::tuple mode_out(at::Tensor & values, at::Tensor & indices, const at::Tensor & self, at::Dimname dim, bool keepdim=false); +TORCH_API ::std::tuple mode_outf(const at::Tensor & self, at::Dimname dim, bool keepdim, at::Tensor & values, at::Tensor & indices); + +} // namespace compositeimplicitautograd +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/mode_cuda_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/mode_cuda_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..f7456dd2b16ca70de364ac102a53371b9b06ffd1 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/mode_cuda_dispatch.h @@ -0,0 +1,28 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. 
+#include + +namespace at { + +namespace cuda { + +TORCH_API ::std::tuple mode(const at::Tensor & self, int64_t dim=-1, bool keepdim=false); + +} // namespace cuda +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/mode_native.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/mode_native.h new file mode 100644 index 0000000000000000000000000000000000000000..74169f11d519a492c2479553524d6f021415a3df --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/mode_native.h @@ -0,0 +1,29 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace at { +namespace native { +TORCH_API ::std::tuple mode_out(const at::Tensor & self, int64_t dim, bool keepdim, at::Tensor & values, at::Tensor & indices); +TORCH_API ::std::tuple mode(const at::Tensor & self, int64_t dim=-1, bool keepdim=false); +TORCH_API ::std::tuple mode(const at::Tensor & self, at::Dimname dim, bool keepdim=false); +TORCH_API ::std::tuple mode_out(const at::Tensor & self, at::Dimname dim, bool keepdim, at::Tensor & values, at::Tensor & indices); +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/moveaxis.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/moveaxis.h new file mode 100644 index 0000000000000000000000000000000000000000..b5b06d803bb8541acae9ce8bb7114f3a3d6a649a --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/moveaxis.h @@ -0,0 +1,41 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::moveaxis.intlist(Tensor(a) self, int[] source, int[] destination) -> Tensor(a) +inline at::Tensor moveaxis(const at::Tensor & self, at::IntArrayRef source, at::IntArrayRef destination) { + return at::_ops::moveaxis_intlist::call(self, source, destination); +} + +// aten::moveaxis.int(Tensor(a) self, int source, int destination) -> Tensor(a) +inline at::Tensor moveaxis(const at::Tensor & self, int64_t source, int64_t destination) { + return at::_ops::moveaxis_int::call(self, source, destination); +} + +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/mse_loss_backward_cuda_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/mse_loss_backward_cuda_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..f9856cf26e8a926b628df7fac2163d441b8d97e9 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/mse_loss_backward_cuda_dispatch.h @@ -0,0 +1,30 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace cuda { + +TORCH_API at::Tensor mse_loss_backward(const at::Tensor & grad_output, const at::Tensor & self, const at::Tensor & target, int64_t reduction); +TORCH_API at::Tensor & mse_loss_backward_out(at::Tensor & grad_input, const at::Tensor & grad_output, const at::Tensor & self, const at::Tensor & target, int64_t reduction); +TORCH_API at::Tensor & mse_loss_backward_outf(const at::Tensor & grad_output, const at::Tensor & self, const at::Tensor & target, int64_t reduction, at::Tensor & grad_input); + +} // namespace cuda +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/mse_loss_backward_native.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/mse_loss_backward_native.h new file mode 100644 index 0000000000000000000000000000000000000000..98974d69e234cd57a541975f362b3ce78a6e10c2 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/mse_loss_backward_native.h @@ -0,0 +1,27 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace at { +namespace native { +TORCH_API at::Tensor mse_loss_backward(const at::Tensor & grad_output, const at::Tensor & self, const at::Tensor & target, int64_t reduction); +TORCH_API at::Tensor & mse_loss_backward_out(const at::Tensor & grad_output, const at::Tensor & self, const at::Tensor & target, int64_t reduction, at::Tensor & grad_input); +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/mse_loss_cpu_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/mse_loss_cpu_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..9ca90b822f06ef50c53bf657bae743ac0f923c89 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/mse_loss_cpu_dispatch.h @@ -0,0 +1,30 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace cpu { + +TORCH_API at::Tensor mse_loss(const at::Tensor & self, const at::Tensor & target, int64_t reduction=at::Reduction::Mean); +TORCH_API at::Tensor & mse_loss_out(at::Tensor & out, const at::Tensor & self, const at::Tensor & target, int64_t reduction=at::Reduction::Mean); +TORCH_API at::Tensor & mse_loss_outf(const at::Tensor & self, const at::Tensor & target, int64_t reduction, at::Tensor & out); + +} // namespace cpu +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/mse_loss_ops.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/mse_loss_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..8fd3e3282629b466e2e57f4e430a393675f422cf --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/mse_loss_ops.h @@ -0,0 +1,45 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { +namespace _ops { + + +struct TORCH_API mse_loss_out { + using schema = at::Tensor & (const at::Tensor &, const at::Tensor &, int64_t, at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::mse_loss"; + static constexpr const char* overload_name = "out"; + static constexpr const char* schema_str = "mse_loss.out(Tensor self, Tensor target, int reduction=Mean, *, Tensor(a!) 
out) -> Tensor(a!)"; + static at::Tensor & call(const at::Tensor & self, const at::Tensor & target, int64_t reduction, at::Tensor & out); + static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & target, int64_t reduction, at::Tensor & out); +}; + +struct TORCH_API mse_loss { + using schema = at::Tensor (const at::Tensor &, const at::Tensor &, int64_t); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::mse_loss"; + static constexpr const char* overload_name = ""; + static constexpr const char* schema_str = "mse_loss(Tensor self, Tensor target, int reduction=Mean) -> Tensor"; + static at::Tensor call(const at::Tensor & self, const at::Tensor & target, int64_t reduction); + static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & target, int64_t reduction); +}; + +}} // namespace at::_ops + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/msort_compositeimplicitautograd_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/msort_compositeimplicitautograd_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..8ac51a9dfd305d228b53642866b5b5ca39a70f11 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/msort_compositeimplicitautograd_dispatch.h @@ -0,0 +1,30 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace compositeimplicitautograd { + +TORCH_API at::Tensor msort(const at::Tensor & self); +TORCH_API at::Tensor & msort_out(at::Tensor & out, const at::Tensor & self); +TORCH_API at::Tensor & msort_outf(const at::Tensor & self, at::Tensor & out); + +} // namespace compositeimplicitautograd +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/multilabel_margin_loss_backward.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/multilabel_margin_loss_backward.h new file mode 100644 index 0000000000000000000000000000000000000000..2f17b3eb5501545d10314cd59f8df7765a253e76 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/multilabel_margin_loss_backward.h @@ -0,0 +1,45 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::multilabel_margin_loss_backward.grad_input(Tensor grad_output, Tensor self, Tensor target, int reduction, Tensor is_target, *, Tensor(a!) grad_input) -> Tensor(a!) +inline at::Tensor & multilabel_margin_loss_backward_out(at::Tensor & grad_input, const at::Tensor & grad_output, const at::Tensor & self, const at::Tensor & target, int64_t reduction, const at::Tensor & is_target) { + return at::_ops::multilabel_margin_loss_backward_grad_input::call(grad_output, self, target, reduction, is_target, grad_input); +} +// aten::multilabel_margin_loss_backward.grad_input(Tensor grad_output, Tensor self, Tensor target, int reduction, Tensor is_target, *, Tensor(a!) grad_input) -> Tensor(a!) 
+inline at::Tensor & multilabel_margin_loss_backward_outf(const at::Tensor & grad_output, const at::Tensor & self, const at::Tensor & target, int64_t reduction, const at::Tensor & is_target, at::Tensor & grad_input) { + return at::_ops::multilabel_margin_loss_backward_grad_input::call(grad_output, self, target, reduction, is_target, grad_input); +} + +// aten::multilabel_margin_loss_backward(Tensor grad_output, Tensor self, Tensor target, int reduction, Tensor is_target) -> Tensor +inline at::Tensor multilabel_margin_loss_backward(const at::Tensor & grad_output, const at::Tensor & self, const at::Tensor & target, int64_t reduction, const at::Tensor & is_target) { + return at::_ops::multilabel_margin_loss_backward::call(grad_output, self, target, reduction, is_target); +} + +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/multilabel_margin_loss_forward_cpu_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/multilabel_margin_loss_forward_cpu_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..c05f570d1b1559a92b5d752727eb4edf995515ca --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/multilabel_margin_loss_forward_cpu_dispatch.h @@ -0,0 +1,30 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. 
+// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace cpu { + +TORCH_API ::std::tuple multilabel_margin_loss_forward(const at::Tensor & self, const at::Tensor & target, int64_t reduction); +TORCH_API ::std::tuple multilabel_margin_loss_forward_out(at::Tensor & output, at::Tensor & is_target, const at::Tensor & self, const at::Tensor & target, int64_t reduction); +TORCH_API ::std::tuple multilabel_margin_loss_forward_outf(const at::Tensor & self, const at::Tensor & target, int64_t reduction, at::Tensor & output, at::Tensor & is_target); + +} // namespace cpu +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/mvlgamma_cuda_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/mvlgamma_cuda_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..632e2d27322b50735d1585f548a70de09e3349e6 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/mvlgamma_cuda_dispatch.h @@ -0,0 +1,29 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. 
+#include + +namespace at { + +namespace cuda { + +TORCH_API at::Tensor & mvlgamma_out(at::Tensor & out, const at::Tensor & self, int64_t p); +TORCH_API at::Tensor & mvlgamma_outf(const at::Tensor & self, int64_t p, at::Tensor & out); + +} // namespace cuda +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/mvlgamma_native.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/mvlgamma_native.h new file mode 100644 index 0000000000000000000000000000000000000000..b9c6ef27fe6bd312836fcc317ef427e505464ba5 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/mvlgamma_native.h @@ -0,0 +1,28 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace at { +namespace native { +TORCH_API at::Tensor mvlgamma(const at::Tensor & self, int64_t p); +TORCH_API at::Tensor & mvlgamma_(at::Tensor & self, int64_t p); +TORCH_API at::Tensor & mvlgamma_out(const at::Tensor & self, int64_t p, at::Tensor & out); +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/nan_to_num_compositeexplicitautograd_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/nan_to_num_compositeexplicitautograd_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..ff79b532251d375f5ad83ffde4d2d012e7287c2c --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/nan_to_num_compositeexplicitautograd_dispatch.h @@ -0,0 +1,29 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace compositeexplicitautograd { + +TORCH_API at::Tensor nan_to_num(const at::Tensor & self, ::std::optional nan=::std::nullopt, ::std::optional posinf=::std::nullopt, ::std::optional neginf=::std::nullopt); +TORCH_API at::Tensor & nan_to_num_(at::Tensor & self, ::std::optional nan=::std::nullopt, ::std::optional posinf=::std::nullopt, ::std::optional neginf=::std::nullopt); + +} // namespace compositeexplicitautograd +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/nan_to_num_cuda_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/nan_to_num_cuda_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..f9138eb842b6cfb380143fff9ee40b9a4a5f28d5 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/nan_to_num_cuda_dispatch.h @@ -0,0 +1,29 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace cuda { + +TORCH_API at::Tensor & nan_to_num_out(at::Tensor & out, const at::Tensor & self, ::std::optional nan=::std::nullopt, ::std::optional posinf=::std::nullopt, ::std::optional neginf=::std::nullopt); +TORCH_API at::Tensor & nan_to_num_outf(const at::Tensor & self, ::std::optional nan, ::std::optional posinf, ::std::optional neginf, at::Tensor & out); + +} // namespace cuda +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/nanmean_compositeimplicitautograd_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/nanmean_compositeimplicitautograd_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..d462705129f912b3f5b0f85fe61dcde72386f3cf --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/nanmean_compositeimplicitautograd_dispatch.h @@ -0,0 +1,30 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace compositeimplicitautograd { + +TORCH_API at::Tensor nanmean(const at::Tensor & self, at::OptionalIntArrayRef dim=::std::nullopt, bool keepdim=false, ::std::optional dtype=::std::nullopt); +TORCH_API at::Tensor & nanmean_out(at::Tensor & out, const at::Tensor & self, at::OptionalIntArrayRef dim=::std::nullopt, bool keepdim=false, ::std::optional dtype=::std::nullopt); +TORCH_API at::Tensor & nanmean_outf(const at::Tensor & self, at::OptionalIntArrayRef dim, bool keepdim, ::std::optional dtype, at::Tensor & out); + +} // namespace compositeimplicitautograd +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/nansum.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/nansum.h new file mode 100644 index 0000000000000000000000000000000000000000..899f24cc1bb55429941782889bc8ca07ff36dc5d --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/nansum.h @@ -0,0 +1,45 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::nansum(Tensor self, int[1]? dim=None, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor +inline at::Tensor nansum(const at::Tensor & self, at::OptionalIntArrayRef dim=::std::nullopt, bool keepdim=false, ::std::optional dtype=::std::nullopt) { + return at::_ops::nansum::call(self, dim, keepdim, dtype); +} + +// aten::nansum.out(Tensor self, int[1]? dim=None, bool keepdim=False, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & nansum_out(at::Tensor & out, const at::Tensor & self, at::OptionalIntArrayRef dim=::std::nullopt, bool keepdim=false, ::std::optional dtype=::std::nullopt) { + return at::_ops::nansum_out::call(self, dim, keepdim, dtype, out); +} +// aten::nansum.out(Tensor self, int[1]? dim=None, bool keepdim=False, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & nansum_outf(const at::Tensor & self, at::OptionalIntArrayRef dim, bool keepdim, ::std::optional dtype, at::Tensor & out) { + return at::_ops::nansum_out::call(self, dim, keepdim, dtype, out); +} + +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/narrow_copy.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/narrow_copy.h new file mode 100644 index 0000000000000000000000000000000000000000..7e9e4f4df60d5b8e748897094422364f1d7efe56 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/narrow_copy.h @@ -0,0 +1,97 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::narrow_copy(Tensor self, int dim, SymInt start, SymInt length) -> Tensor +inline at::Tensor narrow_copy(const at::Tensor & self, int64_t dim, int64_t start, int64_t length) { + return at::_ops::narrow_copy::call(self, dim, start, length); +} +namespace symint { + template >> + at::Tensor narrow_copy(const at::Tensor & self, int64_t dim, int64_t start, int64_t length) { + return at::_ops::narrow_copy::call(self, dim, start, length); + } +} + +// aten::narrow_copy(Tensor self, int dim, SymInt start, SymInt length) -> Tensor +inline at::Tensor narrow_copy_symint(const at::Tensor & self, int64_t dim, c10::SymInt start, c10::SymInt length) { + return at::_ops::narrow_copy::call(self, dim, start, length); +} +namespace symint { + template >> + at::Tensor narrow_copy(const at::Tensor & self, int64_t dim, c10::SymInt start, c10::SymInt length) { + return at::_ops::narrow_copy::call(self, dim, start, length); + } +} + +// aten::narrow_copy.out(Tensor self, int dim, SymInt start, SymInt length, *, Tensor(a!) out) -> Tensor(a!) 
+inline at::Tensor & narrow_copy_out(at::Tensor & out, const at::Tensor & self, int64_t dim, int64_t start, int64_t length) { + return at::_ops::narrow_copy_out::call(self, dim, start, length, out); +} +namespace symint { + template >> + at::Tensor & narrow_copy_out(at::Tensor & out, const at::Tensor & self, int64_t dim, int64_t start, int64_t length) { + return at::_ops::narrow_copy_out::call(self, dim, start, length, out); + } +} + +// aten::narrow_copy.out(Tensor self, int dim, SymInt start, SymInt length, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & narrow_copy_outf(const at::Tensor & self, int64_t dim, int64_t start, int64_t length, at::Tensor & out) { + return at::_ops::narrow_copy_out::call(self, dim, start, length, out); +} +namespace symint { + template >> + at::Tensor & narrow_copy_outf(const at::Tensor & self, int64_t dim, int64_t start, int64_t length, at::Tensor & out) { + return at::_ops::narrow_copy_out::call(self, dim, start, length, out); + } +} + +// aten::narrow_copy.out(Tensor self, int dim, SymInt start, SymInt length, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & narrow_copy_symint_out(at::Tensor & out, const at::Tensor & self, int64_t dim, c10::SymInt start, c10::SymInt length) { + return at::_ops::narrow_copy_out::call(self, dim, start, length, out); +} +namespace symint { + template >> + at::Tensor & narrow_copy_out(at::Tensor & out, const at::Tensor & self, int64_t dim, c10::SymInt start, c10::SymInt length) { + return at::_ops::narrow_copy_out::call(self, dim, start, length, out); + } +} + +// aten::narrow_copy.out(Tensor self, int dim, SymInt start, SymInt length, *, Tensor(a!) out) -> Tensor(a!) 
+inline at::Tensor & narrow_copy_symint_outf(const at::Tensor & self, int64_t dim, c10::SymInt start, c10::SymInt length, at::Tensor & out) { + return at::_ops::narrow_copy_out::call(self, dim, start, length, out); +} +namespace symint { + template >> + at::Tensor & narrow_copy_outf(const at::Tensor & self, int64_t dim, c10::SymInt start, c10::SymInt length, at::Tensor & out) { + return at::_ops::narrow_copy_out::call(self, dim, start, length, out); + } +} + +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/narrow_copy_native.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/narrow_copy_native.h new file mode 100644 index 0000000000000000000000000000000000000000..d4e3273485581aed6bfd2cebb71b711395112129 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/narrow_copy_native.h @@ -0,0 +1,29 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace at { +namespace native { +TORCH_API at::Tensor narrow_copy_dense_cpu(const at::Tensor & self, int64_t dim, int64_t start, int64_t length); +TORCH_API at::Tensor & narrow_copy_dense_cpu_out(const at::Tensor & self, int64_t dim, int64_t start, int64_t length, at::Tensor & out); +TORCH_API at::Tensor narrow_copy_sparse(const at::Tensor & self, int64_t dim, int64_t start, int64_t length); +TORCH_API at::Tensor narrow_copy_dense_symint(const at::Tensor & self, int64_t dim, c10::SymInt start, c10::SymInt length); +} // namespace native +} // namespace at + +#else +#error "This file should not be included when 
either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/narrow_copy_ops.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/narrow_copy_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..2b570ae75a2edeb57b015585befd97123fdabb7d --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/narrow_copy_ops.h @@ -0,0 +1,45 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. 
+#include + +namespace at { +namespace _ops { + + +struct TORCH_API narrow_copy { + using schema = at::Tensor (const at::Tensor &, int64_t, c10::SymInt, c10::SymInt); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::narrow_copy"; + static constexpr const char* overload_name = ""; + static constexpr const char* schema_str = "narrow_copy(Tensor self, int dim, SymInt start, SymInt length) -> Tensor"; + static at::Tensor call(const at::Tensor & self, int64_t dim, c10::SymInt start, c10::SymInt length); + static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, int64_t dim, c10::SymInt start, c10::SymInt length); +}; + +struct TORCH_API narrow_copy_out { + using schema = at::Tensor & (const at::Tensor &, int64_t, c10::SymInt, c10::SymInt, at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::narrow_copy"; + static constexpr const char* overload_name = "out"; + static constexpr const char* schema_str = "narrow_copy.out(Tensor self, int dim, SymInt start, SymInt length, *, Tensor(a!) out) -> Tensor(a!)"; + static at::Tensor & call(const at::Tensor & self, int64_t dim, c10::SymInt start, c10::SymInt length, at::Tensor & out); + static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, int64_t dim, c10::SymInt start, c10::SymInt length, at::Tensor & out); +}; + +}} // namespace at::_ops + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/native_batch_norm_backward_cuda_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/native_batch_norm_backward_cuda_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..a64c03d85e02f3a345540f14983ce480bf669853 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/native_batch_norm_backward_cuda_dispatch.h @@ -0,0 +1,28 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace cuda { + +TORCH_API ::std::tuple native_batch_norm_backward(const at::Tensor & grad_out, const at::Tensor & input, const ::std::optional & weight, const ::std::optional & running_mean, const ::std::optional & running_var, const ::std::optional & save_mean, const ::std::optional & save_invstd, bool train, double eps, ::std::array output_mask); + +} // namespace cuda +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/native_batch_norm_cpu_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/native_batch_norm_cpu_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..cc0a4395d1799186f4af12ff02815ea5256570a4 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/native_batch_norm_cpu_dispatch.h @@ -0,0 +1,30 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. 
+#include + +namespace at { + +namespace cpu { + +TORCH_API ::std::tuple native_batch_norm(const at::Tensor & input, const ::std::optional & weight, const ::std::optional & bias, const ::std::optional & running_mean, const ::std::optional & running_var, bool training, double momentum, double eps); +TORCH_API ::std::tuple native_batch_norm_out(at::Tensor & out, at::Tensor & save_mean, at::Tensor & save_invstd, const at::Tensor & input, const ::std::optional & weight, const ::std::optional & bias, const ::std::optional & running_mean, const ::std::optional & running_var, bool training, double momentum, double eps); +TORCH_API ::std::tuple native_batch_norm_outf(const at::Tensor & input, const ::std::optional & weight, const ::std::optional & bias, const ::std::optional & running_mean, const ::std::optional & running_var, bool training, double momentum, double eps, at::Tensor & out, at::Tensor & save_mean, at::Tensor & save_invstd); + +} // namespace cpu +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/native_channel_shuffle.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/native_channel_shuffle.h new file mode 100644 index 0000000000000000000000000000000000000000..9c1c4311ddbfa43a7f271fe5d8b136f3edf3ca90 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/native_channel_shuffle.h @@ -0,0 +1,53 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::native_channel_shuffle(Tensor self, SymInt groups) -> Tensor +inline at::Tensor native_channel_shuffle(const at::Tensor & self, int64_t groups) { + return at::_ops::native_channel_shuffle::call(self, groups); +} +namespace symint { + template >> + at::Tensor native_channel_shuffle(const at::Tensor & self, int64_t groups) { + return at::_ops::native_channel_shuffle::call(self, groups); + } +} + +// aten::native_channel_shuffle(Tensor self, SymInt groups) -> Tensor +inline at::Tensor native_channel_shuffle_symint(const at::Tensor & self, c10::SymInt groups) { + return at::_ops::native_channel_shuffle::call(self, groups); +} +namespace symint { + template >> + at::Tensor native_channel_shuffle(const at::Tensor & self, c10::SymInt groups) { + return at::_ops::native_channel_shuffle::call(self, groups); + } +} + +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/native_dropout_native.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/native_dropout_native.h new file mode 100644 index 0000000000000000000000000000000000000000..c9a4debaf6a4bc3f8026f7d03274225c8d1e8af4 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/native_dropout_native.h @@ -0,0 +1,29 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace at { +namespace native { +TORCH_API ::std::tuple native_dropout_out(const at::Tensor & input, double p, ::std::optional train, at::Tensor & out0, at::Tensor & out1); +TORCH_API ::std::tuple native_dropout_cpu(const at::Tensor & input, double p, ::std::optional train); +TORCH_API ::std::tuple native_dropout_cuda(const at::Tensor & input, double p, ::std::optional train); +TORCH_API ::std::tuple native_dropout_nested(const at::Tensor & input, double p, ::std::optional train); +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/native_layer_norm_cpu_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/native_layer_norm_cpu_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..728689d5dfed6f4979d479064aaaf06c4ad10bec --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/native_layer_norm_cpu_dispatch.h @@ -0,0 +1,29 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace cpu { + +TORCH_API ::std::tuple native_layer_norm(const at::Tensor & input, at::IntArrayRef normalized_shape, const ::std::optional & weight, const ::std::optional & bias, double eps); +TORCH_API ::std::tuple native_layer_norm_symint(const at::Tensor & input, c10::SymIntArrayRef normalized_shape, const ::std::optional & weight, const ::std::optional & bias, double eps); + +} // namespace cpu +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/native_layer_norm_cuda_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/native_layer_norm_cuda_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..84746cd44549a68d5122f1b297d50b247647fedd --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/native_layer_norm_cuda_dispatch.h @@ -0,0 +1,29 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace cuda { + +TORCH_API ::std::tuple native_layer_norm(const at::Tensor & input, at::IntArrayRef normalized_shape, const ::std::optional & weight, const ::std::optional & bias, double eps); +TORCH_API ::std::tuple native_layer_norm_symint(const at::Tensor & input, c10::SymIntArrayRef normalized_shape, const ::std::optional & weight, const ::std::optional & bias, double eps); + +} // namespace cuda +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/native_layer_norm_ops.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/native_layer_norm_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..08c49417f494a87d33b943cb305d9c5339ebbc9e --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/native_layer_norm_ops.h @@ -0,0 +1,45 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { +namespace _ops { + + +struct TORCH_API native_layer_norm { + using schema = ::std::tuple (const at::Tensor &, c10::SymIntArrayRef, const ::std::optional &, const ::std::optional &, double); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::native_layer_norm"; + static constexpr const char* overload_name = ""; + static constexpr const char* schema_str = "native_layer_norm(Tensor input, SymInt[] normalized_shape, Tensor? weight, Tensor? 
bias, float eps) -> (Tensor, Tensor, Tensor)"; + static ::std::tuple call(const at::Tensor & input, c10::SymIntArrayRef normalized_shape, const ::std::optional & weight, const ::std::optional & bias, double eps); + static ::std::tuple redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & input, c10::SymIntArrayRef normalized_shape, const ::std::optional & weight, const ::std::optional & bias, double eps); +}; + +struct TORCH_API native_layer_norm_out { + using schema = ::std::tuple (const at::Tensor &, c10::SymIntArrayRef, const ::std::optional &, const ::std::optional &, double, at::Tensor &, at::Tensor &, at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::native_layer_norm"; + static constexpr const char* overload_name = "out"; + static constexpr const char* schema_str = "native_layer_norm.out(Tensor input, SymInt[] normalized_shape, Tensor? weight, Tensor? bias, float eps, *, Tensor(a!) out0, Tensor(b!) out1, Tensor(c!) out2) -> (Tensor(a!), Tensor(b!), Tensor(c!))"; + static ::std::tuple call(const at::Tensor & input, c10::SymIntArrayRef normalized_shape, const ::std::optional & weight, const ::std::optional & bias, double eps, at::Tensor & out0, at::Tensor & out1, at::Tensor & out2); + static ::std::tuple redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & input, c10::SymIntArrayRef normalized_shape, const ::std::optional & weight, const ::std::optional & bias, double eps, at::Tensor & out0, at::Tensor & out1, at::Tensor & out2); +}; + +}} // namespace at::_ops + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/ne_compositeexplicitautogradnonfunctional_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/ne_compositeexplicitautogradnonfunctional_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..65930b64f36629d6282628bd0697b5bec551bc65 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/ne_compositeexplicitautogradnonfunctional_dispatch.h @@ -0,0 +1,31 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace compositeexplicitautogradnonfunctional { + +TORCH_API at::Tensor ne(const at::Tensor & self, const at::Scalar & other); +TORCH_API at::Tensor & ne_(at::Tensor & self, const at::Scalar & other); +TORCH_API at::Tensor ne(const at::Tensor & self, const at::Tensor & other); +TORCH_API at::Tensor & ne_(at::Tensor & self, const at::Tensor & other); + +} // namespace compositeexplicitautogradnonfunctional +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/ne_meta.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/ne_meta.h new file mode 100644 index 0000000000000000000000000000000000000000..1cc60b9ff6c7436190751fd930db32a966d620f3 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/ne_meta.h @@ -0,0 +1,37 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeMetaFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace at { +namespace meta { + +struct TORCH_API structured_ne_Scalar : public TensorIteratorBase { + + + void meta(const at::Tensor & self, const at::Scalar & other); +}; +struct TORCH_API structured_ne_Tensor : public TensorIteratorBase { + + + void meta(const at::Tensor & self, const at::Tensor & other); +}; + +} // namespace meta +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/nested_to_padded_tensor_ops.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/nested_to_padded_tensor_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..91a196bf8d27f38cc3e0f678137ad7cfb3b62596 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/nested_to_padded_tensor_ops.h @@ -0,0 +1,34 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { +namespace _ops { + + +struct TORCH_API nested_to_padded_tensor { + using schema = at::Tensor (const at::Tensor &, double, at::OptionalIntArrayRef); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::nested_to_padded_tensor"; + static constexpr const char* overload_name = ""; + static constexpr const char* schema_str = "nested_to_padded_tensor(Tensor self, float padding, int[]? output_size=None) -> Tensor"; + static at::Tensor call(const at::Tensor & self, double padding, at::OptionalIntArrayRef output_size); + static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, double padding, at::OptionalIntArrayRef output_size); +}; + +}} // namespace at::_ops + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/new_empty_native.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/new_empty_native.h new file mode 100644 index 0000000000000000000000000000000000000000..0104bb07aad2a66588ad9195b78e1e3b55a588bf --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/new_empty_native.h @@ -0,0 +1,27 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace at { +namespace native { +TORCH_API at::Tensor new_empty_symint(const at::Tensor & self, c10::SymIntArrayRef size, ::std::optional dtype={}, ::std::optional layout={}, ::std::optional device={}, ::std::optional pin_memory={}); +TORCH_API at::Tensor & new_empty_out_symint(const at::Tensor & self, c10::SymIntArrayRef size, at::Tensor & out); +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/nll_loss2d_forward_ops.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/nll_loss2d_forward_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..5b935cdac5561593d0f6be7f94289e570db09d8f --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/nll_loss2d_forward_ops.h @@ -0,0 +1,45 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { +namespace _ops { + + +struct TORCH_API nll_loss2d_forward_output { + using schema = ::std::tuple (const at::Tensor &, const at::Tensor &, const ::std::optional &, int64_t, c10::SymInt, at::Tensor &, at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::nll_loss2d_forward"; + static constexpr const char* overload_name = "output"; + static constexpr const char* schema_str = "nll_loss2d_forward.output(Tensor self, Tensor target, Tensor? weight, int reduction, SymInt ignore_index, *, Tensor(a!) output, Tensor(b!) 
total_weight) -> (Tensor(a!), Tensor(b!))"; + static ::std::tuple call(const at::Tensor & self, const at::Tensor & target, const ::std::optional & weight, int64_t reduction, c10::SymInt ignore_index, at::Tensor & output, at::Tensor & total_weight); + static ::std::tuple redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & target, const ::std::optional & weight, int64_t reduction, c10::SymInt ignore_index, at::Tensor & output, at::Tensor & total_weight); +}; + +struct TORCH_API nll_loss2d_forward { + using schema = ::std::tuple (const at::Tensor &, const at::Tensor &, const ::std::optional &, int64_t, c10::SymInt); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::nll_loss2d_forward"; + static constexpr const char* overload_name = ""; + static constexpr const char* schema_str = "nll_loss2d_forward(Tensor self, Tensor target, Tensor? weight, int reduction, SymInt ignore_index) -> (Tensor output, Tensor total_weight)"; + static ::std::tuple call(const at::Tensor & self, const at::Tensor & target, const ::std::optional & weight, int64_t reduction, c10::SymInt ignore_index); + static ::std::tuple redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & target, const ::std::optional & weight, int64_t reduction, c10::SymInt ignore_index); +}; + +}} // namespace at::_ops + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/nll_loss2d_native.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/nll_loss2d_native.h new file mode 100644 index 0000000000000000000000000000000000000000..059c54ce91f57de1371ae86d8142a4b8fcfa040d --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/nll_loss2d_native.h @@ -0,0 +1,27 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace at { +namespace native { +TORCH_API at::Tensor nll_loss2d_symint(const at::Tensor & self, const at::Tensor & target, const ::std::optional & weight={}, int64_t reduction=at::Reduction::Mean, c10::SymInt ignore_index=-100); +TORCH_API at::Tensor & nll_loss2d_out(const at::Tensor & self, const at::Tensor & target, const ::std::optional & weight, int64_t reduction, int64_t ignore_index, at::Tensor & out); +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/nll_loss2d_ops.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/nll_loss2d_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..4f7842e8ae3513daa7f600882e840e76fc565235 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/nll_loss2d_ops.h @@ -0,0 +1,45 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { +namespace _ops { + + +struct TORCH_API nll_loss2d_out { + using schema = at::Tensor & (const at::Tensor &, const at::Tensor &, const ::std::optional &, int64_t, c10::SymInt, at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::nll_loss2d"; + static constexpr const char* overload_name = "out"; + static constexpr const char* schema_str = "nll_loss2d.out(Tensor self, Tensor target, Tensor? weight=None, int reduction=Mean, SymInt ignore_index=-100, *, Tensor(a!) 
out) -> Tensor(a!)"; + static at::Tensor & call(const at::Tensor & self, const at::Tensor & target, const ::std::optional & weight, int64_t reduction, c10::SymInt ignore_index, at::Tensor & out); + static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & target, const ::std::optional & weight, int64_t reduction, c10::SymInt ignore_index, at::Tensor & out); +}; + +struct TORCH_API nll_loss2d { + using schema = at::Tensor (const at::Tensor &, const at::Tensor &, const ::std::optional &, int64_t, c10::SymInt); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::nll_loss2d"; + static constexpr const char* overload_name = ""; + static constexpr const char* schema_str = "nll_loss2d(Tensor self, Tensor target, Tensor? weight=None, int reduction=Mean, SymInt ignore_index=-100) -> Tensor"; + static at::Tensor call(const at::Tensor & self, const at::Tensor & target, const ::std::optional & weight, int64_t reduction, c10::SymInt ignore_index); + static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & target, const ::std::optional & weight, int64_t reduction, c10::SymInt ignore_index); +}; + +}} // namespace at::_ops + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/nll_loss_backward_ops.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/nll_loss_backward_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..7f45037b5bc79394c9f616633392ded79c428e0f --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/nll_loss_backward_ops.h @@ -0,0 +1,45 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { +namespace _ops { + + +struct TORCH_API nll_loss_backward_grad_input { + using schema = at::Tensor & (const at::Tensor &, const at::Tensor &, const at::Tensor &, const ::std::optional &, int64_t, c10::SymInt, const at::Tensor &, at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::nll_loss_backward"; + static constexpr const char* overload_name = "grad_input"; + static constexpr const char* schema_str = "nll_loss_backward.grad_input(Tensor grad_output, Tensor self, Tensor target, Tensor? weight, int reduction, SymInt ignore_index, Tensor total_weight, *, Tensor(a!) 
grad_input) -> Tensor(a!)"; + static at::Tensor & call(const at::Tensor & grad_output, const at::Tensor & self, const at::Tensor & target, const ::std::optional & weight, int64_t reduction, c10::SymInt ignore_index, const at::Tensor & total_weight, at::Tensor & grad_input); + static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad_output, const at::Tensor & self, const at::Tensor & target, const ::std::optional & weight, int64_t reduction, c10::SymInt ignore_index, const at::Tensor & total_weight, at::Tensor & grad_input); +}; + +struct TORCH_API nll_loss_backward { + using schema = at::Tensor (const at::Tensor &, const at::Tensor &, const at::Tensor &, const ::std::optional &, int64_t, c10::SymInt, const at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::nll_loss_backward"; + static constexpr const char* overload_name = ""; + static constexpr const char* schema_str = "nll_loss_backward(Tensor grad_output, Tensor self, Tensor target, Tensor? weight, int reduction, SymInt ignore_index, Tensor total_weight) -> Tensor"; + static at::Tensor call(const at::Tensor & grad_output, const at::Tensor & self, const at::Tensor & target, const ::std::optional & weight, int64_t reduction, c10::SymInt ignore_index, const at::Tensor & total_weight); + static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad_output, const at::Tensor & self, const at::Tensor & target, const ::std::optional & weight, int64_t reduction, c10::SymInt ignore_index, const at::Tensor & total_weight); +}; + +}} // namespace at::_ops + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/nll_loss_ops.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/nll_loss_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..9f87c9a235589e28fb5992320344d2d604f1af8b --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/nll_loss_ops.h @@ -0,0 +1,45 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { +namespace _ops { + + +struct TORCH_API nll_loss_out { + using schema = at::Tensor & (const at::Tensor &, const at::Tensor &, const ::std::optional &, int64_t, c10::SymInt, at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::nll_loss"; + static constexpr const char* overload_name = "out"; + static constexpr const char* schema_str = "nll_loss.out(Tensor self, Tensor target, Tensor? weight=None, int reduction=Mean, SymInt ignore_index=-100, *, Tensor(a!) 
out) -> Tensor(a!)"; + static at::Tensor & call(const at::Tensor & self, const at::Tensor & target, const ::std::optional & weight, int64_t reduction, c10::SymInt ignore_index, at::Tensor & out); + static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & target, const ::std::optional & weight, int64_t reduction, c10::SymInt ignore_index, at::Tensor & out); +}; + +struct TORCH_API nll_loss { + using schema = at::Tensor (const at::Tensor &, const at::Tensor &, const ::std::optional &, int64_t, c10::SymInt); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::nll_loss"; + static constexpr const char* overload_name = ""; + static constexpr const char* schema_str = "nll_loss(Tensor self, Tensor target, Tensor? weight=None, int reduction=Mean, SymInt ignore_index=-100) -> Tensor"; + static at::Tensor call(const at::Tensor & self, const at::Tensor & target, const ::std::optional & weight, int64_t reduction, c10::SymInt ignore_index); + static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & target, const ::std::optional & weight, int64_t reduction, c10::SymInt ignore_index); +}; + +}} // namespace at::_ops + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/norm_compositeexplicitautograd_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/norm_compositeexplicitautograd_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..35cff1f22c5a7bc14e3b1fa2e5da76932328eb3b --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/norm_compositeexplicitautograd_dispatch.h @@ -0,0 +1,33 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. 
+#include + +namespace at { + +namespace compositeexplicitautograd { + +TORCH_API at::Tensor norm(const at::Tensor & self, const ::std::optional & p, at::ScalarType dtype); +TORCH_API at::Tensor & norm_out(at::Tensor & out, const at::Tensor & self, const ::std::optional & p, at::ScalarType dtype); +TORCH_API at::Tensor & norm_outf(const at::Tensor & self, const ::std::optional & p, at::ScalarType dtype, at::Tensor & out); +TORCH_API at::Tensor norm(const at::Tensor & self, const at::Scalar & p=2); +TORCH_API at::Tensor & norm_out(at::Tensor & out, const at::Tensor & self, const at::Scalar & p=2); +TORCH_API at::Tensor & norm_outf(const at::Tensor & self, const at::Scalar & p, at::Tensor & out); + +} // namespace compositeexplicitautograd +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/norm_compositeimplicitautograd_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/norm_compositeimplicitautograd_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..a9aab1309a9a344841d92f6a7aa729424535f13c --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/norm_compositeimplicitautograd_dispatch.h @@ -0,0 +1,33 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. 
+// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace compositeimplicitautograd { + +TORCH_API at::Tensor norm(const at::Tensor & self, const ::std::optional & p, at::DimnameList dim, bool keepdim, at::ScalarType dtype); +TORCH_API at::Tensor & norm_out(at::Tensor & out, const at::Tensor & self, const ::std::optional & p, at::DimnameList dim, bool keepdim, at::ScalarType dtype); +TORCH_API at::Tensor & norm_outf(const at::Tensor & self, const ::std::optional & p, at::DimnameList dim, bool keepdim, at::ScalarType dtype, at::Tensor & out); +TORCH_API at::Tensor norm(const at::Tensor & self, const ::std::optional & p, at::DimnameList dim, bool keepdim=false); +TORCH_API at::Tensor & norm_out(at::Tensor & out, const at::Tensor & self, const ::std::optional & p, at::DimnameList dim, bool keepdim=false); +TORCH_API at::Tensor & norm_outf(const at::Tensor & self, const ::std::optional & p, at::DimnameList dim, bool keepdim, at::Tensor & out); + +} // namespace compositeimplicitautograd +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/normal_cpu_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/normal_cpu_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..b5426a7138902d6e34014068a1f313bcbc0ec320 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/normal_cpu_dispatch.h @@ -0,0 +1,37 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. 
+#include + +namespace at { + +namespace cpu { + +TORCH_API at::Tensor & normal_(at::Tensor & self, double mean=0, double std=1, ::std::optional generator=::std::nullopt); +TORCH_API at::Tensor normal(const at::Tensor & mean, double std=1, ::std::optional generator=::std::nullopt); +TORCH_API at::Tensor & normal_out(at::Tensor & out, const at::Tensor & mean, double std=1, ::std::optional generator=::std::nullopt); +TORCH_API at::Tensor & normal_outf(const at::Tensor & mean, double std, ::std::optional generator, at::Tensor & out); +TORCH_API at::Tensor normal(double mean, const at::Tensor & std, ::std::optional generator=::std::nullopt); +TORCH_API at::Tensor & normal_out(at::Tensor & out, double mean, const at::Tensor & std, ::std::optional generator=::std::nullopt); +TORCH_API at::Tensor & normal_outf(double mean, const at::Tensor & std, ::std::optional generator, at::Tensor & out); +TORCH_API at::Tensor normal(const at::Tensor & mean, const at::Tensor & std, ::std::optional generator=::std::nullopt); +TORCH_API at::Tensor & normal_out(at::Tensor & out, const at::Tensor & mean, const at::Tensor & std, ::std::optional generator=::std::nullopt); +TORCH_API at::Tensor & normal_outf(const at::Tensor & mean, const at::Tensor & std, ::std::optional generator, at::Tensor & out); + +} // namespace cpu +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/ones_like_compositeexplicitautograd_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/ones_like_compositeexplicitautograd_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..647998ecceb6687e60b8be1603da28b25563e60a --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/ones_like_compositeexplicitautograd_dispatch.h @@ -0,0 +1,31 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. 
+#include + +namespace at { + +namespace compositeexplicitautograd { + +TORCH_API at::Tensor ones_like(const at::Tensor & self, at::TensorOptions options={}, ::std::optional memory_format=::std::nullopt); +TORCH_API at::Tensor ones_like(const at::Tensor & self, ::std::optional dtype, ::std::optional layout, ::std::optional device, ::std::optional pin_memory, ::std::optional memory_format); +TORCH_API at::Tensor & ones_like_out(at::Tensor & out, const at::Tensor & self, ::std::optional memory_format=::std::nullopt); +TORCH_API at::Tensor & ones_like_outf(const at::Tensor & self, ::std::optional memory_format, at::Tensor & out); + +} // namespace compositeexplicitautograd +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/output_nr_ops.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/output_nr_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..929fe45bd2d036fac48ce62717b2580eece3d60b --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/output_nr_ops.h @@ -0,0 +1,34 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. 
+#include + +namespace at { +namespace _ops { + + +struct TORCH_API output_nr { + using schema = int64_t (const at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::output_nr"; + static constexpr const char* overload_name = ""; + static constexpr const char* schema_str = "output_nr(Tensor self) -> int"; + static int64_t call(const at::Tensor & self); + static int64_t redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self); +}; + +}} // namespace at::_ops + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/pin_memory_ops.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/pin_memory_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..0d88e78a9928b805ab5f82a5647f4392043098e7 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/pin_memory_ops.h @@ -0,0 +1,34 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. 
+#include + +namespace at { +namespace _ops { + + +struct TORCH_API pin_memory { + using schema = at::Tensor (const at::Tensor &, ::std::optional); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::pin_memory"; + static constexpr const char* overload_name = ""; + static constexpr const char* schema_str = "pin_memory(Tensor(a) self, Device? device=None) -> Tensor(a)"; + static at::Tensor call(const at::Tensor & self, ::std::optional device); + static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, ::std::optional device); +}; + +}} // namespace at::_ops + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/pixel_unshuffle.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/pixel_unshuffle.h new file mode 100644 index 0000000000000000000000000000000000000000..056da18946de69c9ef0b623f98d4e74a32584ff5 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/pixel_unshuffle.h @@ -0,0 +1,45 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::pixel_unshuffle(Tensor self, int downscale_factor) -> Tensor +inline at::Tensor pixel_unshuffle(const at::Tensor & self, int64_t downscale_factor) { + return at::_ops::pixel_unshuffle::call(self, downscale_factor); +} + +// aten::pixel_unshuffle.out(Tensor self, int downscale_factor, *, Tensor(a!) out) -> Tensor(a!) 
+inline at::Tensor & pixel_unshuffle_out(at::Tensor & out, const at::Tensor & self, int64_t downscale_factor) { + return at::_ops::pixel_unshuffle_out::call(self, downscale_factor, out); +} +// aten::pixel_unshuffle.out(Tensor self, int downscale_factor, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & pixel_unshuffle_outf(const at::Tensor & self, int64_t downscale_factor, at::Tensor & out) { + return at::_ops::pixel_unshuffle_out::call(self, downscale_factor, out); +} + +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/poisson_compositeexplicitautograd_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/poisson_compositeexplicitautograd_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..4a1e7e202e07cabbc542b6ece25bc621dd1c3add --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/poisson_compositeexplicitautograd_dispatch.h @@ -0,0 +1,29 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. 
+#include + +namespace at { + +namespace compositeexplicitautograd { + +TORCH_API at::Tensor & poisson_out(at::Tensor & out, const at::Tensor & self, ::std::optional generator=::std::nullopt); +TORCH_API at::Tensor & poisson_outf(const at::Tensor & self, ::std::optional generator, at::Tensor & out); + +} // namespace compositeexplicitautograd +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/polar_cpu_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/polar_cpu_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..cca96a403edc4ef42e7d40c34c8797761d405150 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/polar_cpu_dispatch.h @@ -0,0 +1,29 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. 
+#include + +namespace at { + +namespace cpu { + +TORCH_API at::Tensor & polar_out(at::Tensor & out, const at::Tensor & abs, const at::Tensor & angle); +TORCH_API at::Tensor & polar_outf(const at::Tensor & abs, const at::Tensor & angle, at::Tensor & out); + +} // namespace cpu +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/positive.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/positive.h new file mode 100644 index 0000000000000000000000000000000000000000..a9727581c1888aa5c86fd276f3de4d10533cffbe --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/positive.h @@ -0,0 +1,36 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::positive(Tensor(a) self) -> Tensor(a) +inline at::Tensor positive(const at::Tensor & self) { + return at::_ops::positive::call(self); +} + +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/prelu.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/prelu.h new file mode 100644 index 0000000000000000000000000000000000000000..a246fe8f7cbd882e080d33878029cf71495e3ecd --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/prelu.h @@ -0,0 +1,36 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::prelu(Tensor self, Tensor weight) -> Tensor +inline at::Tensor prelu(const at::Tensor & self, const at::Tensor & weight) { + return at::_ops::prelu::call(self, weight); +} + +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/q_per_channel_axis.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/q_per_channel_axis.h new file mode 100644 index 0000000000000000000000000000000000000000..820adab19470d0287dc8f53bca0748516caf07d3 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/q_per_channel_axis.h @@ -0,0 +1,36 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::q_per_channel_axis(Tensor self) -> int +inline int64_t q_per_channel_axis(const at::Tensor & self) { + return at::_ops::q_per_channel_axis::call(self); +} + +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/q_scale_ops.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/q_scale_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..25b82ea77704f4cad6cfd7549d140ac32ff467b0 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/q_scale_ops.h @@ -0,0 +1,34 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. 
+// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { +namespace _ops { + + +struct TORCH_API q_scale { + using schema = double (const at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::q_scale"; + static constexpr const char* overload_name = ""; + static constexpr const char* schema_str = "q_scale(Tensor self) -> float"; + static double call(const at::Tensor & self); + static double redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self); +}; + +}} // namespace at::_ops + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/q_zero_point_native.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/q_zero_point_native.h new file mode 100644 index 0000000000000000000000000000000000000000..32a7817231613b174793662cf15452d06691a7d6 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/q_zero_point_native.h @@ -0,0 +1,26 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace at { +namespace native { +TORCH_API int64_t q_zero_point_quant(const at::Tensor & self); +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/qr_ops.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/qr_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..1c833024ceb6132438c7b9e973fc6de1189038a3 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/qr_ops.h @@ -0,0 +1,45 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { +namespace _ops { + + +struct TORCH_API qr_Q { + using schema = ::std::tuple (const at::Tensor &, bool, at::Tensor &, at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::qr"; + static constexpr const char* overload_name = "Q"; + static constexpr const char* schema_str = "qr.Q(Tensor self, bool some=True, *, Tensor(a!) Q, Tensor(b!) R) -> (Tensor(a!) Q, Tensor(b!) 
R)"; + static ::std::tuple call(const at::Tensor & self, bool some, at::Tensor & Q, at::Tensor & R); + static ::std::tuple redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, bool some, at::Tensor & Q, at::Tensor & R); +}; + +struct TORCH_API qr { + using schema = ::std::tuple (const at::Tensor &, bool); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::qr"; + static constexpr const char* overload_name = ""; + static constexpr const char* schema_str = "qr(Tensor self, bool some=True) -> (Tensor Q, Tensor R)"; + static ::std::tuple call(const at::Tensor & self, bool some); + static ::std::tuple redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, bool some); +}; + +}} // namespace at::_ops + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/quantize_per_channel_compositeexplicitautograd_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/quantize_per_channel_compositeexplicitautograd_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..7d35c52e7de7b184712cfbe9050d0b6b7d4e6bd2 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/quantize_per_channel_compositeexplicitautograd_dispatch.h @@ -0,0 +1,29 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. 
+// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace compositeexplicitautograd { + +TORCH_API at::Tensor & quantize_per_channel_out(at::Tensor & out, const at::Tensor & self, const at::Tensor & scales, const at::Tensor & zero_points, int64_t axis, at::ScalarType dtype); +TORCH_API at::Tensor & quantize_per_channel_outf(const at::Tensor & self, const at::Tensor & scales, const at::Tensor & zero_points, int64_t axis, at::ScalarType dtype, at::Tensor & out); + +} // namespace compositeexplicitautograd +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/quantize_per_channel_cpu_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/quantize_per_channel_cpu_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..462c749506b0c7f75958b845e0d2857b55da34ea --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/quantize_per_channel_cpu_dispatch.h @@ -0,0 +1,28 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. 
+#include + +namespace at { + +namespace cpu { + +TORCH_API at::Tensor quantize_per_channel(const at::Tensor & self, const at::Tensor & scales, const at::Tensor & zero_points, int64_t axis, at::ScalarType dtype); + +} // namespace cpu +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/quantize_per_tensor_native.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/quantize_per_tensor_native.h new file mode 100644 index 0000000000000000000000000000000000000000..431bfc552555c8ff7e112a02205d228e2fb26449 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/quantize_per_tensor_native.h @@ -0,0 +1,31 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace at { +namespace native { +TORCH_API at::Tensor & quantize_per_tensor_out(const at::Tensor & self, double scale, int64_t zero_point, at::ScalarType dtype, at::Tensor & out); +TORCH_API at::Tensor quantize_per_tensor(const at::Tensor & self, double scale, int64_t zero_point, at::ScalarType dtype); +TORCH_API at::Tensor & quantize_per_tensor_tensor_qparams_out(const at::Tensor & self, const at::Tensor & scale, const at::Tensor & zero_point, at::ScalarType dtype, at::Tensor & out); +TORCH_API at::Tensor quantize_per_tensor_tensor_qparams(const at::Tensor & self, const at::Tensor & scale, const at::Tensor & zero_point, at::ScalarType dtype); +TORCH_API void quantize_per_tensor_tensors_out(at::TensorList tensors, const at::Tensor & scales, const at::Tensor & zero_points, at::ScalarType dtype, 
at::TensorList out); +TORCH_API ::std::vector quantize_per_tensor_list_cpu(at::TensorList tensors, const at::Tensor & scales, const at::Tensor & zero_points, at::ScalarType dtype); +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/quantize_per_tensor_ops.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/quantize_per_tensor_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..601c66d636e47a4d99423b95403dbe5014c63fa0 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/quantize_per_tensor_ops.h @@ -0,0 +1,89 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. 
+#include + +namespace at { +namespace _ops { + + +struct TORCH_API quantize_per_tensor { + using schema = at::Tensor (const at::Tensor &, double, int64_t, at::ScalarType); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::quantize_per_tensor"; + static constexpr const char* overload_name = ""; + static constexpr const char* schema_str = "quantize_per_tensor(Tensor self, float scale, int zero_point, ScalarType dtype) -> Tensor"; + static at::Tensor call(const at::Tensor & self, double scale, int64_t zero_point, at::ScalarType dtype); + static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, double scale, int64_t zero_point, at::ScalarType dtype); +}; + +struct TORCH_API quantize_per_tensor_tensor_qparams { + using schema = at::Tensor (const at::Tensor &, const at::Tensor &, const at::Tensor &, at::ScalarType); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::quantize_per_tensor"; + static constexpr const char* overload_name = "tensor_qparams"; + static constexpr const char* schema_str = "quantize_per_tensor.tensor_qparams(Tensor self, Tensor scale, Tensor zero_point, ScalarType dtype) -> Tensor"; + static at::Tensor call(const at::Tensor & self, const at::Tensor & scale, const at::Tensor & zero_point, at::ScalarType dtype); + static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & scale, const at::Tensor & zero_point, at::ScalarType dtype); +}; + +struct TORCH_API quantize_per_tensor_tensors { + using schema = ::std::vector (at::TensorList, const at::Tensor &, const at::Tensor &, at::ScalarType); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::quantize_per_tensor"; + static constexpr const char* overload_name = 
"tensors"; + static constexpr const char* schema_str = "quantize_per_tensor.tensors(Tensor[] tensors, Tensor scales, Tensor zero_points, ScalarType dtype) -> Tensor[]"; + static ::std::vector call(at::TensorList tensors, const at::Tensor & scales, const at::Tensor & zero_points, at::ScalarType dtype); + static ::std::vector redispatch(c10::DispatchKeySet dispatchKeySet, at::TensorList tensors, const at::Tensor & scales, const at::Tensor & zero_points, at::ScalarType dtype); +}; + +struct TORCH_API quantize_per_tensor_out { + using schema = at::Tensor & (const at::Tensor &, double, int64_t, at::ScalarType, at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::quantize_per_tensor"; + static constexpr const char* overload_name = "out"; + static constexpr const char* schema_str = "quantize_per_tensor.out(Tensor self, float scale, int zero_point, ScalarType dtype, *, Tensor(a!) out) -> Tensor(a!)"; + static at::Tensor & call(const at::Tensor & self, double scale, int64_t zero_point, at::ScalarType dtype, at::Tensor & out); + static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, double scale, int64_t zero_point, at::ScalarType dtype, at::Tensor & out); +}; + +struct TORCH_API quantize_per_tensor_tensor_qparams_out { + using schema = at::Tensor & (const at::Tensor &, const at::Tensor &, const at::Tensor &, at::ScalarType, at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::quantize_per_tensor"; + static constexpr const char* overload_name = "tensor_qparams_out"; + static constexpr const char* schema_str = "quantize_per_tensor.tensor_qparams_out(Tensor self, Tensor scale, Tensor zero_point, ScalarType dtype, *, Tensor(a!) 
out) -> Tensor(a!)"; + static at::Tensor & call(const at::Tensor & self, const at::Tensor & scale, const at::Tensor & zero_point, at::ScalarType dtype, at::Tensor & out); + static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & scale, const at::Tensor & zero_point, at::ScalarType dtype, at::Tensor & out); +}; + +struct TORCH_API quantize_per_tensor_tensors_out { + using schema = void (at::TensorList, const at::Tensor &, const at::Tensor &, at::ScalarType, at::TensorList); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::quantize_per_tensor"; + static constexpr const char* overload_name = "tensors_out"; + static constexpr const char* schema_str = "quantize_per_tensor.tensors_out(Tensor[] tensors, Tensor scales, Tensor zero_points, ScalarType dtype, *, Tensor(a!)[] out) -> ()"; + static void call(at::TensorList tensors, const at::Tensor & scales, const at::Tensor & zero_points, at::ScalarType dtype, at::TensorList out); + static void redispatch(c10::DispatchKeySet dispatchKeySet, at::TensorList tensors, const at::Tensor & scales, const at::Tensor & zero_points, at::ScalarType dtype, at::TensorList out); +}; + +}} // namespace at::_ops + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/quantized_gru_cell_native.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/quantized_gru_cell_native.h new file mode 100644 index 0000000000000000000000000000000000000000..dc7d7045246995a8b89ded1ee51137c0b284daf7 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/quantized_gru_cell_native.h @@ -0,0 +1,26 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace at { +namespace native { +TORCH_API at::Tensor quantized_gru_cell(const at::Tensor & input, const at::Tensor & hx, const at::Tensor & w_ih, const at::Tensor & w_hh, const at::Tensor & b_ih, const at::Tensor & b_hh, const at::Tensor & packed_ih, const at::Tensor & packed_hh, const at::Tensor & col_offsets_ih, const at::Tensor & col_offsets_hh, const at::Scalar & scale_ih, const at::Scalar & scale_hh, const at::Scalar & zero_point_ih, const at::Scalar & zero_point_hh); +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/quantized_gru_cell_ops.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/quantized_gru_cell_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..56b828db2704ed477a7fe536fe675877b98a8cd1 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/quantized_gru_cell_ops.h @@ -0,0 +1,34 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { +namespace _ops { + + +struct TORCH_API quantized_gru_cell { + using schema = at::Tensor (const at::Tensor &, const at::Tensor &, const at::Tensor &, const at::Tensor &, const at::Tensor &, const at::Tensor &, const at::Tensor &, const at::Tensor &, const at::Tensor &, const at::Tensor &, const at::Scalar &, const at::Scalar &, const at::Scalar &, const at::Scalar &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::quantized_gru_cell"; + static constexpr const char* overload_name = ""; + static constexpr const char* schema_str = "quantized_gru_cell(Tensor input, Tensor hx, Tensor w_ih, Tensor w_hh, Tensor b_ih, Tensor b_hh, Tensor packed_ih, Tensor packed_hh, Tensor col_offsets_ih, Tensor col_offsets_hh, Scalar scale_ih, Scalar scale_hh, Scalar zero_point_ih, Scalar zero_point_hh) -> Tensor"; + static at::Tensor call(const at::Tensor & input, const at::Tensor & hx, const at::Tensor & w_ih, const 
at::Tensor & w_hh, const at::Tensor & b_ih, const at::Tensor & b_hh, const at::Tensor & packed_ih, const at::Tensor & packed_hh, const at::Tensor & col_offsets_ih, const at::Tensor & col_offsets_hh, const at::Scalar & scale_ih, const at::Scalar & scale_hh, const at::Scalar & zero_point_ih, const at::Scalar & zero_point_hh); + static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & input, const at::Tensor & hx, const at::Tensor & w_ih, const at::Tensor & w_hh, const at::Tensor & b_ih, const at::Tensor & b_hh, const at::Tensor & packed_ih, const at::Tensor & packed_hh, const at::Tensor & col_offsets_ih, const at::Tensor & col_offsets_hh, const at::Scalar & scale_ih, const at::Scalar & scale_hh, const at::Scalar & zero_point_ih, const at::Scalar & zero_point_hh); +}; + +}} // namespace at::_ops + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/quantized_max_pool1d.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/quantized_max_pool1d.h new file mode 100644 index 0000000000000000000000000000000000000000..75adb13aa390d0ffb3c49bfcd0138ba89d07375d --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/quantized_max_pool1d.h @@ -0,0 +1,45 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::quantized_max_pool1d(Tensor self, int[1] kernel_size, int[1] stride=[], int[1] padding=0, int[1] dilation=1, bool ceil_mode=False) -> Tensor +inline at::Tensor quantized_max_pool1d(const 
at::Tensor & self, at::IntArrayRef kernel_size, at::IntArrayRef stride={}, at::IntArrayRef padding=0, at::IntArrayRef dilation=1, bool ceil_mode=false) { + return at::_ops::quantized_max_pool1d::call(self, kernel_size, stride, padding, dilation, ceil_mode); +} + +// aten::quantized_max_pool1d.out(Tensor self, int[1] kernel_size, int[1] stride=[], int[1] padding=0, int[1] dilation=1, bool ceil_mode=False, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & quantized_max_pool1d_out(at::Tensor & out, const at::Tensor & self, at::IntArrayRef kernel_size, at::IntArrayRef stride={}, at::IntArrayRef padding=0, at::IntArrayRef dilation=1, bool ceil_mode=false) { + return at::_ops::quantized_max_pool1d_out::call(self, kernel_size, stride, padding, dilation, ceil_mode, out); +} +// aten::quantized_max_pool1d.out(Tensor self, int[1] kernel_size, int[1] stride=[], int[1] padding=0, int[1] dilation=1, bool ceil_mode=False, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & quantized_max_pool1d_outf(const at::Tensor & self, at::IntArrayRef kernel_size, at::IntArrayRef stride, at::IntArrayRef padding, at::IntArrayRef dilation, bool ceil_mode, at::Tensor & out) { + return at::_ops::quantized_max_pool1d_out::call(self, kernel_size, stride, padding, dilation, ceil_mode, out); +} + +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/rand_compositeimplicitautograd_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/rand_compositeimplicitautograd_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..acf3e52a2d9715162468b34dc0cd202159de53fe --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/rand_compositeimplicitautograd_dispatch.h @@ -0,0 +1,31 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace compositeimplicitautograd { + +TORCH_API at::Tensor & rand_out(at::Tensor & out, at::IntArrayRef size, ::std::optional generator); +TORCH_API at::Tensor & rand_outf(at::IntArrayRef size, ::std::optional generator, at::Tensor & out); +TORCH_API at::Tensor & rand_symint_out(at::Tensor & out, c10::SymIntArrayRef size, ::std::optional generator); +TORCH_API at::Tensor & rand_symint_outf(c10::SymIntArrayRef size, ::std::optional generator, at::Tensor & out); + +} // namespace compositeimplicitautograd +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/randn_native.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/randn_native.h new file mode 100644 index 0000000000000000000000000000000000000000..9dc867cac0dfd1cccb0903f6585a9c3afc010b78 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/randn_native.h @@ -0,0 +1,33 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace at { +namespace native { +TORCH_API at::Tensor & randn_out(at::IntArrayRef size, at::Tensor & out); +TORCH_API at::Tensor randn(at::IntArrayRef size, ::std::optional dtype={}, ::std::optional layout={}, ::std::optional device={}, ::std::optional pin_memory={}); +TORCH_API at::Tensor & randn_out(at::IntArrayRef size, ::std::optional generator, at::Tensor & out); +TORCH_API at::Tensor randn(at::IntArrayRef size, ::std::optional generator, ::std::optional dtype={}, ::std::optional layout={}, ::std::optional device={}, ::std::optional pin_memory={}); +TORCH_API at::Tensor randn(at::IntArrayRef size, ::std::optional names, ::std::optional dtype={}, ::std::optional layout={}, ::std::optional device={}, ::std::optional pin_memory={}); +TORCH_API at::Tensor & randn_names_out_symint(c10::SymIntArrayRef size, ::std::optional names, at::Tensor & out); +TORCH_API at::Tensor randn(at::IntArrayRef size, ::std::optional generator, ::std::optional names, ::std::optional dtype={}, ::std::optional layout={}, ::std::optional device={}, ::std::optional pin_memory={}); +TORCH_API at::Tensor & randn_generator_with_names_out_symint(c10::SymIntArrayRef size, ::std::optional generator, ::std::optional names, at::Tensor & 
out); +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/random.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/random.h new file mode 100644 index 0000000000000000000000000000000000000000..f59cfa4ca4c4e4b2d0cd6629acb454ef05d1bca6 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/random.h @@ -0,0 +1,73 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::random.from_out(Tensor self, int from, int? to, *, Generator? generator=None, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & random_out(at::Tensor & out, const at::Tensor & self, int64_t from, ::std::optional to, ::std::optional generator=::std::nullopt) { + return at::_ops::random_from_out::call(self, from, to, generator, out); +} +// aten::random.from_out(Tensor self, int from, int? to, *, Generator? generator=None, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & random_outf(const at::Tensor & self, int64_t from, ::std::optional to, ::std::optional generator, at::Tensor & out) { + return at::_ops::random_from_out::call(self, from, to, generator, out); +} + +// aten::random.from(Tensor self, int from, int? to, *, Generator? 
generator=None) -> Tensor +inline at::Tensor random(const at::Tensor & self, int64_t from, ::std::optional to, ::std::optional generator=::std::nullopt) { + return at::_ops::random_from::call(self, from, to, generator); +} + +// aten::random.to_out(Tensor self, int to, *, Generator? generator=None, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & random_out(at::Tensor & out, const at::Tensor & self, int64_t to, ::std::optional generator=::std::nullopt) { + return at::_ops::random_to_out::call(self, to, generator, out); +} +// aten::random.to_out(Tensor self, int to, *, Generator? generator=None, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & random_outf(const at::Tensor & self, int64_t to, ::std::optional generator, at::Tensor & out) { + return at::_ops::random_to_out::call(self, to, generator, out); +} + +// aten::random.to(Tensor self, int to, *, Generator? generator=None) -> Tensor +inline at::Tensor random(const at::Tensor & self, int64_t to, ::std::optional generator=::std::nullopt) { + return at::_ops::random_to::call(self, to, generator); +} + +// aten::random.out(Tensor self, *, Generator? generator=None, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & random_out(at::Tensor & out, const at::Tensor & self, ::std::optional generator=::std::nullopt) { + return at::_ops::random_out::call(self, generator, out); +} +// aten::random.out(Tensor self, *, Generator? generator=None, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & random_outf(const at::Tensor & self, ::std::optional generator, at::Tensor & out) { + return at::_ops::random_out::call(self, generator, out); +} + +// aten::random(Tensor self, *, Generator? generator=None) -> Tensor +inline at::Tensor random(const at::Tensor & self, ::std::optional generator=::std::nullopt) { + return at::_ops::random::call(self, generator); +} + +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/random_cpu_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/random_cpu_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..4300cc3bca26b9035cb2247303e73bf23beb1356 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/random_cpu_dispatch.h @@ -0,0 +1,30 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace cpu { + +TORCH_API at::Tensor & random_(at::Tensor & self, int64_t from, ::std::optional to, ::std::optional generator=::std::nullopt); +TORCH_API at::Tensor & random_(at::Tensor & self, int64_t to, ::std::optional generator=::std::nullopt); +TORCH_API at::Tensor & random_(at::Tensor & self, ::std::optional generator=::std::nullopt); + +} // namespace cpu +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/random_native.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/random_native.h new file mode 100644 index 0000000000000000000000000000000000000000..69f22285c64ec9f2ff8edf411b9bef6cf5595bcf --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/random_native.h @@ -0,0 +1,37 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace at { +namespace native { +TORCH_API at::Tensor random(const at::Tensor & self, int64_t from, ::std::optional to, ::std::optional generator=::std::nullopt); +TORCH_API at::Tensor & random_from_out(const at::Tensor & self, int64_t from, ::std::optional to, ::std::optional generator, at::Tensor & out); +TORCH_API at::Tensor & random_(at::Tensor & self, int64_t from, ::std::optional to, ::std::optional generator=::std::nullopt); +TORCH_API at::Tensor & random_meta_(at::Tensor & self, int64_t from, ::std::optional to, ::std::optional generator=::std::nullopt); +TORCH_API at::Tensor random(const at::Tensor & self, int64_t to, ::std::optional generator=::std::nullopt); +TORCH_API at::Tensor & random_to_out(const at::Tensor & self, int64_t to, ::std::optional generator, at::Tensor & out); +TORCH_API at::Tensor & random_(at::Tensor & self, int64_t to, ::std::optional generator=::std::nullopt); +TORCH_API at::Tensor & random_meta_(at::Tensor & self, int64_t to, ::std::optional generator=::std::nullopt); +TORCH_API at::Tensor random(const at::Tensor & self, ::std::optional generator=::std::nullopt); +TORCH_API at::Tensor & random_out(const at::Tensor & self, ::std::optional generator, at::Tensor & out); 
+TORCH_API at::Tensor & random_(at::Tensor & self, ::std::optional generator=::std::nullopt); +TORCH_API at::Tensor & random_meta_(at::Tensor & self, ::std::optional generator=::std::nullopt); +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/randperm_compositeexplicitautograd_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/randperm_compositeexplicitautograd_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..dc31569e57cd31ce058473655e9b42192a76f115 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/randperm_compositeexplicitautograd_dispatch.h @@ -0,0 +1,39 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. 
+#include + +namespace at { + +namespace compositeexplicitautograd { + +TORCH_API at::Tensor randperm(int64_t n, at::TensorOptions options=at::kLong); +TORCH_API at::Tensor randperm(int64_t n, ::std::optional dtype, ::std::optional layout, ::std::optional device, ::std::optional pin_memory); +TORCH_API at::Tensor randperm_symint(c10::SymInt n, at::TensorOptions options=at::kLong); +TORCH_API at::Tensor randperm_symint(c10::SymInt n, ::std::optional dtype, ::std::optional layout, ::std::optional device, ::std::optional pin_memory); +TORCH_API at::Tensor & randperm_out(at::Tensor & out, int64_t n); +TORCH_API at::Tensor & randperm_outf(int64_t n, at::Tensor & out); +TORCH_API at::Tensor & randperm_symint_out(at::Tensor & out, c10::SymInt n); +TORCH_API at::Tensor & randperm_symint_outf(c10::SymInt n, at::Tensor & out); +TORCH_API at::Tensor randperm(int64_t n, ::std::optional generator, at::TensorOptions options=at::kLong); +TORCH_API at::Tensor randperm(int64_t n, ::std::optional generator, ::std::optional dtype, ::std::optional layout, ::std::optional device, ::std::optional pin_memory); +TORCH_API at::Tensor randperm_symint(c10::SymInt n, ::std::optional generator, at::TensorOptions options=at::kLong); +TORCH_API at::Tensor randperm_symint(c10::SymInt n, ::std::optional generator, ::std::optional dtype, ::std::optional layout, ::std::optional device, ::std::optional pin_memory); + +} // namespace compositeexplicitautograd +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/randperm_native.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/randperm_native.h new file mode 100644 index 0000000000000000000000000000000000000000..63c5039a7fabec6c312a32d44b2b6fafdb60f9b6 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/randperm_native.h @@ -0,0 +1,30 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace at { +namespace native { +TORCH_API at::Tensor randperm(int64_t n, ::std::optional dtype={}, ::std::optional layout={}, ::std::optional device={}, ::std::optional pin_memory={}); +TORCH_API at::Tensor & randperm_out(int64_t n, at::Tensor & out); +TORCH_API at::Tensor randperm(int64_t n, ::std::optional generator, ::std::optional dtype={}, ::std::optional layout={}, ::std::optional device={}, ::std::optional pin_memory={}); +TORCH_API at::Tensor & randperm_out_cpu(int64_t n, ::std::optional generator, at::Tensor & out); +TORCH_API at::Tensor & randperm_out_cuda(int64_t n, ::std::optional generator, at::Tensor & out); +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/reciprocal_meta_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/reciprocal_meta_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..ebd90b12b94dc4233c9dfbdc71a22a2bb2449411 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/reciprocal_meta_dispatch.h @@ -0,0 +1,31 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace meta { + +TORCH_API at::Tensor reciprocal(const at::Tensor & self); +TORCH_API at::Tensor & reciprocal_out(at::Tensor & out, const at::Tensor & self); +TORCH_API at::Tensor & reciprocal_outf(const at::Tensor & self, at::Tensor & out); +TORCH_API at::Tensor & reciprocal_(at::Tensor & self); + +} // namespace meta +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/refine_names_native.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/refine_names_native.h new file mode 100644 index 0000000000000000000000000000000000000000..15d13b42f65bc54988404e572389c02a0666fd51 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/refine_names_native.h @@ -0,0 +1,26 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace at { +namespace native { +TORCH_API at::Tensor refine_names(const at::Tensor & self, at::DimnameList names); +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/reflection_pad1d.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/reflection_pad1d.h new file mode 100644 index 0000000000000000000000000000000000000000..be7051f204ba129ec4ac8590c0bfdf75f7e2dae7 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/reflection_pad1d.h @@ -0,0 +1,97 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::reflection_pad1d.out(Tensor self, SymInt[2] padding, *, Tensor(a!) 
out) -> Tensor(a!) +inline at::Tensor & reflection_pad1d_out(at::Tensor & out, const at::Tensor & self, at::IntArrayRef padding) { + return at::_ops::reflection_pad1d_out::call(self, c10::fromIntArrayRefSlow(padding), out); +} +namespace symint { + template >> + at::Tensor & reflection_pad1d_out(at::Tensor & out, const at::Tensor & self, at::IntArrayRef padding) { + return at::_ops::reflection_pad1d_out::call(self, c10::fromIntArrayRefSlow(padding), out); + } +} + +// aten::reflection_pad1d.out(Tensor self, SymInt[2] padding, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & reflection_pad1d_outf(const at::Tensor & self, at::IntArrayRef padding, at::Tensor & out) { + return at::_ops::reflection_pad1d_out::call(self, c10::fromIntArrayRefSlow(padding), out); +} +namespace symint { + template >> + at::Tensor & reflection_pad1d_outf(const at::Tensor & self, at::IntArrayRef padding, at::Tensor & out) { + return at::_ops::reflection_pad1d_out::call(self, c10::fromIntArrayRefSlow(padding), out); + } +} + +// aten::reflection_pad1d.out(Tensor self, SymInt[2] padding, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & reflection_pad1d_symint_out(at::Tensor & out, const at::Tensor & self, c10::SymIntArrayRef padding) { + return at::_ops::reflection_pad1d_out::call(self, padding, out); +} +namespace symint { + template >> + at::Tensor & reflection_pad1d_out(at::Tensor & out, const at::Tensor & self, c10::SymIntArrayRef padding) { + return at::_ops::reflection_pad1d_out::call(self, padding, out); + } +} + +// aten::reflection_pad1d.out(Tensor self, SymInt[2] padding, *, Tensor(a!) out) -> Tensor(a!) 
+inline at::Tensor & reflection_pad1d_symint_outf(const at::Tensor & self, c10::SymIntArrayRef padding, at::Tensor & out) { + return at::_ops::reflection_pad1d_out::call(self, padding, out); +} +namespace symint { + template >> + at::Tensor & reflection_pad1d_outf(const at::Tensor & self, c10::SymIntArrayRef padding, at::Tensor & out) { + return at::_ops::reflection_pad1d_out::call(self, padding, out); + } +} + +// aten::reflection_pad1d(Tensor self, SymInt[2] padding) -> Tensor +inline at::Tensor reflection_pad1d(const at::Tensor & self, at::IntArrayRef padding) { + return at::_ops::reflection_pad1d::call(self, c10::fromIntArrayRefSlow(padding)); +} +namespace symint { + template >> + at::Tensor reflection_pad1d(const at::Tensor & self, at::IntArrayRef padding) { + return at::_ops::reflection_pad1d::call(self, c10::fromIntArrayRefSlow(padding)); + } +} + +// aten::reflection_pad1d(Tensor self, SymInt[2] padding) -> Tensor +inline at::Tensor reflection_pad1d_symint(const at::Tensor & self, c10::SymIntArrayRef padding) { + return at::_ops::reflection_pad1d::call(self, padding); +} +namespace symint { + template >> + at::Tensor reflection_pad1d(const at::Tensor & self, c10::SymIntArrayRef padding) { + return at::_ops::reflection_pad1d::call(self, padding); + } +} + +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/reflection_pad2d_backward.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/reflection_pad2d_backward.h new file mode 100644 index 0000000000000000000000000000000000000000..e6bbaa7aa250c67f098fa0dd8aca0162607520ee --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/reflection_pad2d_backward.h @@ -0,0 +1,97 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::reflection_pad2d_backward.grad_input(Tensor grad_output, Tensor self, SymInt[4] padding, *, Tensor(a!) grad_input) -> Tensor(a!) +inline at::Tensor & reflection_pad2d_backward_out(at::Tensor & grad_input, const at::Tensor & grad_output, const at::Tensor & self, at::IntArrayRef padding) { + return at::_ops::reflection_pad2d_backward_grad_input::call(grad_output, self, c10::fromIntArrayRefSlow(padding), grad_input); +} +namespace symint { + template >> + at::Tensor & reflection_pad2d_backward_out(at::Tensor & grad_input, const at::Tensor & grad_output, const at::Tensor & self, at::IntArrayRef padding) { + return at::_ops::reflection_pad2d_backward_grad_input::call(grad_output, self, c10::fromIntArrayRefSlow(padding), grad_input); + } +} + +// aten::reflection_pad2d_backward.grad_input(Tensor grad_output, Tensor self, SymInt[4] padding, *, Tensor(a!) grad_input) -> Tensor(a!) 
+inline at::Tensor & reflection_pad2d_backward_outf(const at::Tensor & grad_output, const at::Tensor & self, at::IntArrayRef padding, at::Tensor & grad_input) { + return at::_ops::reflection_pad2d_backward_grad_input::call(grad_output, self, c10::fromIntArrayRefSlow(padding), grad_input); +} +namespace symint { + template >> + at::Tensor & reflection_pad2d_backward_outf(const at::Tensor & grad_output, const at::Tensor & self, at::IntArrayRef padding, at::Tensor & grad_input) { + return at::_ops::reflection_pad2d_backward_grad_input::call(grad_output, self, c10::fromIntArrayRefSlow(padding), grad_input); + } +} + +// aten::reflection_pad2d_backward.grad_input(Tensor grad_output, Tensor self, SymInt[4] padding, *, Tensor(a!) grad_input) -> Tensor(a!) +inline at::Tensor & reflection_pad2d_backward_symint_out(at::Tensor & grad_input, const at::Tensor & grad_output, const at::Tensor & self, c10::SymIntArrayRef padding) { + return at::_ops::reflection_pad2d_backward_grad_input::call(grad_output, self, padding, grad_input); +} +namespace symint { + template >> + at::Tensor & reflection_pad2d_backward_out(at::Tensor & grad_input, const at::Tensor & grad_output, const at::Tensor & self, c10::SymIntArrayRef padding) { + return at::_ops::reflection_pad2d_backward_grad_input::call(grad_output, self, padding, grad_input); + } +} + +// aten::reflection_pad2d_backward.grad_input(Tensor grad_output, Tensor self, SymInt[4] padding, *, Tensor(a!) grad_input) -> Tensor(a!) 
+inline at::Tensor & reflection_pad2d_backward_symint_outf(const at::Tensor & grad_output, const at::Tensor & self, c10::SymIntArrayRef padding, at::Tensor & grad_input) { + return at::_ops::reflection_pad2d_backward_grad_input::call(grad_output, self, padding, grad_input); +} +namespace symint { + template >> + at::Tensor & reflection_pad2d_backward_outf(const at::Tensor & grad_output, const at::Tensor & self, c10::SymIntArrayRef padding, at::Tensor & grad_input) { + return at::_ops::reflection_pad2d_backward_grad_input::call(grad_output, self, padding, grad_input); + } +} + +// aten::reflection_pad2d_backward(Tensor grad_output, Tensor self, SymInt[4] padding) -> Tensor +inline at::Tensor reflection_pad2d_backward(const at::Tensor & grad_output, const at::Tensor & self, at::IntArrayRef padding) { + return at::_ops::reflection_pad2d_backward::call(grad_output, self, c10::fromIntArrayRefSlow(padding)); +} +namespace symint { + template >> + at::Tensor reflection_pad2d_backward(const at::Tensor & grad_output, const at::Tensor & self, at::IntArrayRef padding) { + return at::_ops::reflection_pad2d_backward::call(grad_output, self, c10::fromIntArrayRefSlow(padding)); + } +} + +// aten::reflection_pad2d_backward(Tensor grad_output, Tensor self, SymInt[4] padding) -> Tensor +inline at::Tensor reflection_pad2d_backward_symint(const at::Tensor & grad_output, const at::Tensor & self, c10::SymIntArrayRef padding) { + return at::_ops::reflection_pad2d_backward::call(grad_output, self, padding); +} +namespace symint { + template >> + at::Tensor reflection_pad2d_backward(const at::Tensor & grad_output, const at::Tensor & self, c10::SymIntArrayRef padding) { + return at::_ops::reflection_pad2d_backward::call(grad_output, self, padding); + } +} + +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/reflection_pad2d_backward_cuda_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/reflection_pad2d_backward_cuda_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..88de6008e364a83674c442b019d1f36093ec8d9e --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/reflection_pad2d_backward_cuda_dispatch.h @@ -0,0 +1,33 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. 
+#include + +namespace at { + +namespace cuda { + +TORCH_API at::Tensor reflection_pad2d_backward(const at::Tensor & grad_output, const at::Tensor & self, at::IntArrayRef padding); +TORCH_API at::Tensor reflection_pad2d_backward_symint(const at::Tensor & grad_output, const at::Tensor & self, c10::SymIntArrayRef padding); +TORCH_API at::Tensor & reflection_pad2d_backward_out(at::Tensor & grad_input, const at::Tensor & grad_output, const at::Tensor & self, at::IntArrayRef padding); +TORCH_API at::Tensor & reflection_pad2d_backward_outf(const at::Tensor & grad_output, const at::Tensor & self, at::IntArrayRef padding, at::Tensor & grad_input); +TORCH_API at::Tensor & reflection_pad2d_backward_symint_out(at::Tensor & grad_input, const at::Tensor & grad_output, const at::Tensor & self, c10::SymIntArrayRef padding); +TORCH_API at::Tensor & reflection_pad2d_backward_symint_outf(const at::Tensor & grad_output, const at::Tensor & self, c10::SymIntArrayRef padding, at::Tensor & grad_input); + +} // namespace cuda +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/reflection_pad3d_backward.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/reflection_pad3d_backward.h new file mode 100644 index 0000000000000000000000000000000000000000..6471d0566257f4f317d11371d399309a2240bd38 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/reflection_pad3d_backward.h @@ -0,0 +1,97 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::reflection_pad3d_backward.grad_input(Tensor grad_output, Tensor self, SymInt[6] padding, *, Tensor(a!) grad_input) -> Tensor(a!) +inline at::Tensor & reflection_pad3d_backward_out(at::Tensor & grad_input, const at::Tensor & grad_output, const at::Tensor & self, at::IntArrayRef padding) { + return at::_ops::reflection_pad3d_backward_grad_input::call(grad_output, self, c10::fromIntArrayRefSlow(padding), grad_input); +} +namespace symint { + template >> + at::Tensor & reflection_pad3d_backward_out(at::Tensor & grad_input, const at::Tensor & grad_output, const at::Tensor & self, at::IntArrayRef padding) { + return at::_ops::reflection_pad3d_backward_grad_input::call(grad_output, self, c10::fromIntArrayRefSlow(padding), grad_input); + } +} + +// aten::reflection_pad3d_backward.grad_input(Tensor grad_output, Tensor self, SymInt[6] padding, *, Tensor(a!) grad_input) -> Tensor(a!) 
+inline at::Tensor & reflection_pad3d_backward_outf(const at::Tensor & grad_output, const at::Tensor & self, at::IntArrayRef padding, at::Tensor & grad_input) { + return at::_ops::reflection_pad3d_backward_grad_input::call(grad_output, self, c10::fromIntArrayRefSlow(padding), grad_input); +} +namespace symint { + template >> + at::Tensor & reflection_pad3d_backward_outf(const at::Tensor & grad_output, const at::Tensor & self, at::IntArrayRef padding, at::Tensor & grad_input) { + return at::_ops::reflection_pad3d_backward_grad_input::call(grad_output, self, c10::fromIntArrayRefSlow(padding), grad_input); + } +} + +// aten::reflection_pad3d_backward.grad_input(Tensor grad_output, Tensor self, SymInt[6] padding, *, Tensor(a!) grad_input) -> Tensor(a!) +inline at::Tensor & reflection_pad3d_backward_symint_out(at::Tensor & grad_input, const at::Tensor & grad_output, const at::Tensor & self, c10::SymIntArrayRef padding) { + return at::_ops::reflection_pad3d_backward_grad_input::call(grad_output, self, padding, grad_input); +} +namespace symint { + template >> + at::Tensor & reflection_pad3d_backward_out(at::Tensor & grad_input, const at::Tensor & grad_output, const at::Tensor & self, c10::SymIntArrayRef padding) { + return at::_ops::reflection_pad3d_backward_grad_input::call(grad_output, self, padding, grad_input); + } +} + +// aten::reflection_pad3d_backward.grad_input(Tensor grad_output, Tensor self, SymInt[6] padding, *, Tensor(a!) grad_input) -> Tensor(a!) 
+inline at::Tensor & reflection_pad3d_backward_symint_outf(const at::Tensor & grad_output, const at::Tensor & self, c10::SymIntArrayRef padding, at::Tensor & grad_input) { + return at::_ops::reflection_pad3d_backward_grad_input::call(grad_output, self, padding, grad_input); +} +namespace symint { + template >> + at::Tensor & reflection_pad3d_backward_outf(const at::Tensor & grad_output, const at::Tensor & self, c10::SymIntArrayRef padding, at::Tensor & grad_input) { + return at::_ops::reflection_pad3d_backward_grad_input::call(grad_output, self, padding, grad_input); + } +} + +// aten::reflection_pad3d_backward(Tensor grad_output, Tensor self, SymInt[6] padding) -> Tensor +inline at::Tensor reflection_pad3d_backward(const at::Tensor & grad_output, const at::Tensor & self, at::IntArrayRef padding) { + return at::_ops::reflection_pad3d_backward::call(grad_output, self, c10::fromIntArrayRefSlow(padding)); +} +namespace symint { + template >> + at::Tensor reflection_pad3d_backward(const at::Tensor & grad_output, const at::Tensor & self, at::IntArrayRef padding) { + return at::_ops::reflection_pad3d_backward::call(grad_output, self, c10::fromIntArrayRefSlow(padding)); + } +} + +// aten::reflection_pad3d_backward(Tensor grad_output, Tensor self, SymInt[6] padding) -> Tensor +inline at::Tensor reflection_pad3d_backward_symint(const at::Tensor & grad_output, const at::Tensor & self, c10::SymIntArrayRef padding) { + return at::_ops::reflection_pad3d_backward::call(grad_output, self, padding); +} +namespace symint { + template >> + at::Tensor reflection_pad3d_backward(const at::Tensor & grad_output, const at::Tensor & self, c10::SymIntArrayRef padding) { + return at::_ops::reflection_pad3d_backward::call(grad_output, self, padding); + } +} + +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/reflection_pad3d_meta_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/reflection_pad3d_meta_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..48f8dfc7f585bbdea59badc6e04da0ddff4b4411 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/reflection_pad3d_meta_dispatch.h @@ -0,0 +1,33 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. 
+#include + +namespace at { + +namespace meta { + +TORCH_API at::Tensor reflection_pad3d(const at::Tensor & self, at::IntArrayRef padding); +TORCH_API at::Tensor reflection_pad3d_symint(const at::Tensor & self, c10::SymIntArrayRef padding); +TORCH_API at::Tensor & reflection_pad3d_out(at::Tensor & out, const at::Tensor & self, at::IntArrayRef padding); +TORCH_API at::Tensor & reflection_pad3d_outf(const at::Tensor & self, at::IntArrayRef padding, at::Tensor & out); +TORCH_API at::Tensor & reflection_pad3d_symint_out(at::Tensor & out, const at::Tensor & self, c10::SymIntArrayRef padding); +TORCH_API at::Tensor & reflection_pad3d_symint_outf(const at::Tensor & self, c10::SymIntArrayRef padding, at::Tensor & out); + +} // namespace meta +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/reflection_pad3d_native.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/reflection_pad3d_native.h new file mode 100644 index 0000000000000000000000000000000000000000..8ba2f00bd7344e848d28d557206a341a2260ec23 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/reflection_pad3d_native.h @@ -0,0 +1,31 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace at { +namespace native { +struct TORCH_API structured_reflection_pad3d_out_cpu : public at::meta::structured_reflection_pad3d { +void impl(const at::Tensor & self, at::ArrayRef padding, const at::Tensor & out); +}; +struct TORCH_API structured_reflection_pad3d_out_cuda : public 
at::meta::structured_reflection_pad3d { +void impl(const at::Tensor & self, at::ArrayRef padding, const at::Tensor & out); +}; +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/relu6_compositeimplicitautograd_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/relu6_compositeimplicitautograd_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..0b8c5e8626fae2fa358a3afc322023e535828761 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/relu6_compositeimplicitautograd_dispatch.h @@ -0,0 +1,29 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace compositeimplicitautograd { + +TORCH_API at::Tensor relu6(const at::Tensor & self); +TORCH_API at::Tensor & relu6_(at::Tensor & self); + +} // namespace compositeimplicitautograd +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/renorm_cuda_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/renorm_cuda_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..a2e323d1522d7805d2e7a0b37ef7ff88053002ae --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/renorm_cuda_dispatch.h @@ -0,0 +1,31 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace cuda { + +TORCH_API at::Tensor renorm(const at::Tensor & self, const at::Scalar & p, int64_t dim, const at::Scalar & maxnorm); +TORCH_API at::Tensor & renorm_out(at::Tensor & out, const at::Tensor & self, const at::Scalar & p, int64_t dim, const at::Scalar & maxnorm); +TORCH_API at::Tensor & renorm_outf(const at::Tensor & self, const at::Scalar & p, int64_t dim, const at::Scalar & maxnorm, at::Tensor & out); +TORCH_API at::Tensor & renorm_(at::Tensor & self, const at::Scalar & p, int64_t dim, const at::Scalar & maxnorm); + +} // namespace cuda +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/repeat_interleave_compositeexplicitautograd_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/repeat_interleave_compositeexplicitautograd_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..cadd7843f88e0c32bc84b62ce9cb504f1ac929e9 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/repeat_interleave_compositeexplicitautograd_dispatch.h @@ -0,0 +1,31 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. 
+#include + +namespace at { + +namespace compositeexplicitautograd { + +TORCH_API at::Tensor & repeat_interleave_out(at::Tensor & out, const at::Tensor & repeats, ::std::optional output_size=::std::nullopt); +TORCH_API at::Tensor & repeat_interleave_outf(const at::Tensor & repeats, ::std::optional output_size, at::Tensor & out); +TORCH_API at::Tensor & repeat_interleave_symint_out(at::Tensor & out, const at::Tensor & repeats, ::std::optional output_size=::std::nullopt); +TORCH_API at::Tensor & repeat_interleave_symint_outf(const at::Tensor & repeats, ::std::optional output_size, at::Tensor & out); + +} // namespace compositeexplicitautograd +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/replication_pad1d.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/replication_pad1d.h new file mode 100644 index 0000000000000000000000000000000000000000..476f4eb42f4b0885f739004cbedbf89b06cab8af --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/replication_pad1d.h @@ -0,0 +1,97 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::replication_pad1d.out(Tensor self, SymInt[2] padding, *, Tensor(a!) out) -> Tensor(a!) 
+inline at::Tensor & replication_pad1d_out(at::Tensor & out, const at::Tensor & self, at::IntArrayRef padding) { + return at::_ops::replication_pad1d_out::call(self, c10::fromIntArrayRefSlow(padding), out); +} +namespace symint { + template >> + at::Tensor & replication_pad1d_out(at::Tensor & out, const at::Tensor & self, at::IntArrayRef padding) { + return at::_ops::replication_pad1d_out::call(self, c10::fromIntArrayRefSlow(padding), out); + } +} + +// aten::replication_pad1d.out(Tensor self, SymInt[2] padding, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & replication_pad1d_outf(const at::Tensor & self, at::IntArrayRef padding, at::Tensor & out) { + return at::_ops::replication_pad1d_out::call(self, c10::fromIntArrayRefSlow(padding), out); +} +namespace symint { + template >> + at::Tensor & replication_pad1d_outf(const at::Tensor & self, at::IntArrayRef padding, at::Tensor & out) { + return at::_ops::replication_pad1d_out::call(self, c10::fromIntArrayRefSlow(padding), out); + } +} + +// aten::replication_pad1d.out(Tensor self, SymInt[2] padding, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & replication_pad1d_symint_out(at::Tensor & out, const at::Tensor & self, c10::SymIntArrayRef padding) { + return at::_ops::replication_pad1d_out::call(self, padding, out); +} +namespace symint { + template >> + at::Tensor & replication_pad1d_out(at::Tensor & out, const at::Tensor & self, c10::SymIntArrayRef padding) { + return at::_ops::replication_pad1d_out::call(self, padding, out); + } +} + +// aten::replication_pad1d.out(Tensor self, SymInt[2] padding, *, Tensor(a!) out) -> Tensor(a!) 
+inline at::Tensor & replication_pad1d_symint_outf(const at::Tensor & self, c10::SymIntArrayRef padding, at::Tensor & out) { + return at::_ops::replication_pad1d_out::call(self, padding, out); +} +namespace symint { + template >> + at::Tensor & replication_pad1d_outf(const at::Tensor & self, c10::SymIntArrayRef padding, at::Tensor & out) { + return at::_ops::replication_pad1d_out::call(self, padding, out); + } +} + +// aten::replication_pad1d(Tensor self, SymInt[2] padding) -> Tensor +inline at::Tensor replication_pad1d(const at::Tensor & self, at::IntArrayRef padding) { + return at::_ops::replication_pad1d::call(self, c10::fromIntArrayRefSlow(padding)); +} +namespace symint { + template >> + at::Tensor replication_pad1d(const at::Tensor & self, at::IntArrayRef padding) { + return at::_ops::replication_pad1d::call(self, c10::fromIntArrayRefSlow(padding)); + } +} + +// aten::replication_pad1d(Tensor self, SymInt[2] padding) -> Tensor +inline at::Tensor replication_pad1d_symint(const at::Tensor & self, c10::SymIntArrayRef padding) { + return at::_ops::replication_pad1d::call(self, padding); +} +namespace symint { + template >> + at::Tensor replication_pad1d(const at::Tensor & self, c10::SymIntArrayRef padding) { + return at::_ops::replication_pad1d::call(self, padding); + } +} + +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/replication_pad1d_backward_meta_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/replication_pad1d_backward_meta_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..c27c13d85d0785cb2a4ef519a252101d0aaec312 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/replication_pad1d_backward_meta_dispatch.h @@ -0,0 +1,33 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. 
+#include + +namespace at { + +namespace meta { + +TORCH_API at::Tensor replication_pad1d_backward(const at::Tensor & grad_output, const at::Tensor & self, at::IntArrayRef padding); +TORCH_API at::Tensor replication_pad1d_backward_symint(const at::Tensor & grad_output, const at::Tensor & self, c10::SymIntArrayRef padding); +TORCH_API at::Tensor & replication_pad1d_backward_out(at::Tensor & grad_input, const at::Tensor & grad_output, const at::Tensor & self, at::IntArrayRef padding); +TORCH_API at::Tensor & replication_pad1d_backward_outf(const at::Tensor & grad_output, const at::Tensor & self, at::IntArrayRef padding, at::Tensor & grad_input); +TORCH_API at::Tensor & replication_pad1d_backward_symint_out(at::Tensor & grad_input, const at::Tensor & grad_output, const at::Tensor & self, c10::SymIntArrayRef padding); +TORCH_API at::Tensor & replication_pad1d_backward_symint_outf(const at::Tensor & grad_output, const at::Tensor & self, c10::SymIntArrayRef padding, at::Tensor & grad_input); + +} // namespace meta +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/replication_pad2d_backward_native.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/replication_pad2d_backward_native.h new file mode 100644 index 0000000000000000000000000000000000000000..a755ac6b610577292fee95e07610badad5b02214 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/replication_pad2d_backward_native.h @@ -0,0 +1,29 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace at { +namespace native { +TORCH_API at::Tensor replication_pad2d_backward_cpu(const at::Tensor & grad_output, const at::Tensor & self, at::IntArrayRef padding); +TORCH_API at::Tensor & replication_pad2d_backward_out_cpu(const at::Tensor & grad_output, const at::Tensor & self, at::IntArrayRef padding, at::Tensor & grad_input); +TORCH_API at::Tensor replication_pad2d_backward_cuda(const at::Tensor & grad_output, const at::Tensor & self, at::IntArrayRef padding); +TORCH_API at::Tensor & replication_pad2d_backward_out_cuda(const at::Tensor & grad_output, const at::Tensor & self, at::IntArrayRef padding, at::Tensor & grad_input); +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/replication_pad2d_meta_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/replication_pad2d_meta_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..13582ad7eb7b0ee19c0c23cca7f88f415b622e44 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/replication_pad2d_meta_dispatch.h @@ -0,0 +1,33 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. 
+#include + +namespace at { + +namespace meta { + +TORCH_API at::Tensor replication_pad2d(const at::Tensor & self, at::IntArrayRef padding); +TORCH_API at::Tensor replication_pad2d_symint(const at::Tensor & self, c10::SymIntArrayRef padding); +TORCH_API at::Tensor & replication_pad2d_out(at::Tensor & out, const at::Tensor & self, at::IntArrayRef padding); +TORCH_API at::Tensor & replication_pad2d_outf(const at::Tensor & self, at::IntArrayRef padding, at::Tensor & out); +TORCH_API at::Tensor & replication_pad2d_symint_out(at::Tensor & out, const at::Tensor & self, c10::SymIntArrayRef padding); +TORCH_API at::Tensor & replication_pad2d_symint_outf(const at::Tensor & self, c10::SymIntArrayRef padding, at::Tensor & out); + +} // namespace meta +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/replication_pad2d_native.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/replication_pad2d_native.h new file mode 100644 index 0000000000000000000000000000000000000000..7f93dc1e2510491a751ec9f066a6d431cd08dacb --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/replication_pad2d_native.h @@ -0,0 +1,31 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace at { +namespace native { +struct TORCH_API structured_replication_pad2d_out_cpu : public at::meta::structured_replication_pad2d { +void impl(const at::Tensor & self, at::ArrayRef padding, const at::Tensor & out); +}; +struct TORCH_API structured_replication_pad2d_out_cuda : public 
at::meta::structured_replication_pad2d { +void impl(const at::Tensor & self, at::ArrayRef padding, const at::Tensor & out); +}; +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/requires_grad_ops.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/requires_grad_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..62b6ae799f9aee983f27d925cd755860b28ec6d6 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/requires_grad_ops.h @@ -0,0 +1,34 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { +namespace _ops { + + +struct TORCH_API requires_grad_ { + using schema = at::Tensor & (at::Tensor &, bool); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::requires_grad_"; + static constexpr const char* overload_name = ""; + static constexpr const char* schema_str = "requires_grad_(Tensor(a!) 
self, bool requires_grad=True) -> Tensor(a!)"; + static at::Tensor & call(at::Tensor & self, bool requires_grad); + static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, at::Tensor & self, bool requires_grad); +}; + +}} // namespace at::_ops + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/reshape_ops.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/reshape_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..cef9dcc5a22f1b54ff637d27c8aaf37e4a80d74b --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/reshape_ops.h @@ -0,0 +1,34 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. 
+#include + +namespace at { +namespace _ops { + + +struct TORCH_API reshape { + using schema = at::Tensor (const at::Tensor &, c10::SymIntArrayRef); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::reshape"; + static constexpr const char* overload_name = ""; + static constexpr const char* schema_str = "reshape(Tensor(a) self, SymInt[] shape) -> Tensor(a)"; + static at::Tensor call(const at::Tensor & self, c10::SymIntArrayRef shape); + static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, c10::SymIntArrayRef shape); +}; + +}} // namespace at::_ops + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/resize_as_sparse_native.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/resize_as_sparse_native.h new file mode 100644 index 0000000000000000000000000000000000000000..2e4a7f6a932530181d815db882ea17b7500769d8 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/resize_as_sparse_native.h @@ -0,0 +1,29 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace at { +namespace native { +TORCH_API at::Tensor resize_as_sparse(const at::Tensor & self, const at::Tensor & the_template); +TORCH_API const at::Tensor & resize_as_sparse_out(const at::Tensor & self, const at::Tensor & the_template, const at::Tensor & out); +TORCH_API const at::Tensor & resize_as_sparse_(const at::Tensor & self, const at::Tensor & the_template); +TORCH_API const 
at::Tensor & resize_as_sparse_compressed_(const at::Tensor & self, const at::Tensor & the_template); +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/resize_as_sparse_ops.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/resize_as_sparse_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..c41bac473746248e1475bc9990ec4adebeeba24a --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/resize_as_sparse_ops.h @@ -0,0 +1,56 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { +namespace _ops { + + +struct TORCH_API resize_as_sparse_ { + using schema = const at::Tensor & (const at::Tensor &, const at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::resize_as_sparse_"; + static constexpr const char* overload_name = ""; + static constexpr const char* schema_str = "resize_as_sparse_(Tensor(a!) 
self, Tensor the_template) -> Tensor(a!)"; + static const at::Tensor & call(const at::Tensor & self, const at::Tensor & the_template); + static const at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & the_template); +}; + +struct TORCH_API resize_as_sparse_out { + using schema = const at::Tensor & (const at::Tensor &, const at::Tensor &, const at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::resize_as_sparse"; + static constexpr const char* overload_name = "out"; + static constexpr const char* schema_str = "resize_as_sparse.out(Tensor self, Tensor the_template, *, Tensor(a!) out) -> Tensor(a!)"; + static const at::Tensor & call(const at::Tensor & self, const at::Tensor & the_template, const at::Tensor & out); + static const at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & the_template, const at::Tensor & out); +}; + +struct TORCH_API resize_as_sparse { + using schema = at::Tensor (const at::Tensor &, const at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::resize_as_sparse"; + static constexpr const char* overload_name = ""; + static constexpr const char* schema_str = "resize_as_sparse(Tensor self, Tensor the_template) -> Tensor"; + static at::Tensor call(const at::Tensor & self, const at::Tensor & the_template); + static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & the_template); +}; + +}} // namespace at::_ops + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/rms_norm_ops.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/rms_norm_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..74841080c607e5a7207580605355b58e1c32ac2c --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/rms_norm_ops.h @@ -0,0 +1,34 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { +namespace _ops { + + +struct TORCH_API rms_norm { + using schema = at::Tensor (const at::Tensor &, c10::SymIntArrayRef, const ::std::optional &, ::std::optional); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::rms_norm"; + static constexpr const char* overload_name = ""; + static constexpr const char* schema_str = "rms_norm(Tensor input, SymInt[] normalized_shape, Tensor? weight=None, float? eps=None) -> Tensor"; + static at::Tensor call(const at::Tensor & input, c10::SymIntArrayRef normalized_shape, const ::std::optional & weight, ::std::optional eps); + static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & input, c10::SymIntArrayRef normalized_shape, const ::std::optional & weight, ::std::optional eps); +}; + +}} // namespace at::_ops + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/round_cuda_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/round_cuda_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..8415e5b065dc0d04d364a860e6f9d0fabb3bd644 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/round_cuda_dispatch.h @@ -0,0 +1,35 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace cuda { + +TORCH_API at::Tensor round(const at::Tensor & self); +TORCH_API at::Tensor & round_out(at::Tensor & out, const at::Tensor & self); +TORCH_API at::Tensor & round_outf(const at::Tensor & self, at::Tensor & out); +TORCH_API at::Tensor & round_(at::Tensor & self); +TORCH_API at::Tensor round(const at::Tensor & self, int64_t decimals); +TORCH_API at::Tensor & round_out(at::Tensor & out, const at::Tensor & self, int64_t decimals); +TORCH_API at::Tensor & round_outf(const at::Tensor & self, int64_t decimals, at::Tensor & out); +TORCH_API at::Tensor & round_(at::Tensor & self, int64_t decimals); + +} // namespace cuda +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/row_indices_ops.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/row_indices_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..171e4bde6e657894ce7a30a4f321703f1d42ddf6 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/row_indices_ops.h @@ -0,0 +1,34 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { +namespace _ops { + + +struct TORCH_API row_indices { + using schema = at::Tensor (const at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::row_indices"; + static constexpr const char* overload_name = ""; + static constexpr const char* schema_str = "row_indices(Tensor(a) self) -> Tensor(a)"; + static at::Tensor call(const at::Tensor & self); + static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self); +}; + +}} // namespace at::_ops + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/row_stack_native.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/row_stack_native.h new file mode 100644 index 0000000000000000000000000000000000000000..9da0653ccb336d9caddf60ab49dcca970648a23b --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/row_stack_native.h @@ -0,0 +1,27 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace at { +namespace native { +TORCH_API at::Tensor row_stack(at::TensorList tensors); +TORCH_API at::Tensor & row_stack_out(at::TensorList tensors, at::Tensor & out); +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/rrelu.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/rrelu.h new file mode 100644 index 0000000000000000000000000000000000000000..f146c3c41fe05cc6730e25cc25af27c8ddca2f79 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/rrelu.h @@ -0,0 +1,41 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::rrelu(Tensor self, Scalar lower=0.125, Scalar upper=0.3333333333333333, bool training=False, Generator? generator=None) -> Tensor +inline at::Tensor rrelu(const at::Tensor & self, const at::Scalar & lower=0.125, const at::Scalar & upper=0.3333333333333333, bool training=false, ::std::optional generator=::std::nullopt) { + return at::_ops::rrelu::call(self, lower, upper, training, generator); +} + +// aten::rrelu_(Tensor(a!) self, Scalar lower=0.125, Scalar upper=0.3333333333333333, bool training=False, Generator? generator=None) -> Tensor(a!) +inline at::Tensor & rrelu_(at::Tensor & self, const at::Scalar & lower=0.125, const at::Scalar & upper=0.3333333333333333, bool training=false, ::std::optional generator=::std::nullopt) { + return at::_ops::rrelu_::call(self, lower, upper, training, generator); +} + +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/rrelu_with_noise_backward_native.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/rrelu_with_noise_backward_native.h new file mode 100644 index 0000000000000000000000000000000000000000..0044d72078eecbfa2a3b5d17079f65483cc53911 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/rrelu_with_noise_backward_native.h @@ -0,0 +1,27 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace at { +namespace native { +TORCH_API at::Tensor rrelu_with_noise_backward(const at::Tensor & grad_output, const at::Tensor & self, const at::Tensor & noise, const at::Scalar & lower, const at::Scalar & upper, bool training, bool self_is_result); +TORCH_API at::Tensor & rrelu_with_noise_backward_out(const at::Tensor & grad_output, const at::Tensor & self, const at::Tensor & noise, const at::Scalar & lower, const at::Scalar & upper, bool training, bool self_is_result, at::Tensor & out); +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/rsub_native.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/rsub_native.h new file mode 100644 index 0000000000000000000000000000000000000000..084a55157a4316b45b0af72e32895026fd0ab5c7 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/rsub_native.h @@ -0,0 +1,29 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace at { +namespace native { +TORCH_API at::Tensor & rsub_Tensor_out(const at::Tensor & self, const at::Tensor & other, const at::Scalar & alpha, at::Tensor & out); +TORCH_API at::Tensor rsub(const at::Tensor & self, const at::Tensor & other, const at::Scalar & alpha=1); +TORCH_API at::Tensor rsub(const at::Tensor & self, const at::Scalar & other, const at::Scalar & alpha=1); +TORCH_API at::Tensor & rsub_Scalar_out(const at::Tensor & self, const at::Scalar & other, const at::Scalar & alpha, at::Tensor & out); +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/scalar_tensor.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/scalar_tensor.h new file mode 100644 index 0000000000000000000000000000000000000000..fca9c132c977795534152c4d544fa30bbcc5c98d --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/scalar_tensor.h @@ -0,0 +1,49 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::scalar_tensor(Scalar s, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor +inline at::Tensor scalar_tensor(const at::Scalar & s, at::TensorOptions options={}) { + return at::_ops::scalar_tensor::call(s, c10::optTypeMetaToScalarType(options.dtype_opt()), options.layout_opt(), options.device_opt(), options.pinned_memory_opt()); +} +// aten::scalar_tensor(Scalar s, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor +inline at::Tensor scalar_tensor(const at::Scalar & s, ::std::optional dtype, ::std::optional layout, ::std::optional device, ::std::optional pin_memory) { + return at::_ops::scalar_tensor::call(s, dtype, layout, device, pin_memory); +} + +// aten::scalar_tensor.out(Scalar s, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & scalar_tensor_out(at::Tensor & out, const at::Scalar & s) { + return at::_ops::scalar_tensor_out::call(s, out); +} +// aten::scalar_tensor.out(Scalar s, *, Tensor(a!) out) -> Tensor(a!) 
+inline at::Tensor & scalar_tensor_outf(const at::Scalar & s, at::Tensor & out) { + return at::_ops::scalar_tensor_out::call(s, out); +} + +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/scatter_add_cuda_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/scatter_add_cuda_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..da125dc4de2953f415bde4577cb8bd06548b5028 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/scatter_add_cuda_dispatch.h @@ -0,0 +1,31 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. 
+#include + +namespace at { + +namespace cuda { + +TORCH_API at::Tensor scatter_add(const at::Tensor & self, int64_t dim, const at::Tensor & index, const at::Tensor & src); +TORCH_API at::Tensor & scatter_add_out(at::Tensor & out, const at::Tensor & self, int64_t dim, const at::Tensor & index, const at::Tensor & src); +TORCH_API at::Tensor & scatter_add_outf(const at::Tensor & self, int64_t dim, const at::Tensor & index, const at::Tensor & src, at::Tensor & out); +TORCH_API at::Tensor & scatter_add_(at::Tensor & self, int64_t dim, const at::Tensor & index, const at::Tensor & src); + +} // namespace cuda +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/scatter_add_meta_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/scatter_add_meta_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..53f9498c24c07fbcd3f755f772517cf69ce8f9be --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/scatter_add_meta_dispatch.h @@ -0,0 +1,31 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. 
+#include + +namespace at { + +namespace meta { + +TORCH_API at::Tensor scatter_add(const at::Tensor & self, int64_t dim, const at::Tensor & index, const at::Tensor & src); +TORCH_API at::Tensor & scatter_add_out(at::Tensor & out, const at::Tensor & self, int64_t dim, const at::Tensor & index, const at::Tensor & src); +TORCH_API at::Tensor & scatter_add_outf(const at::Tensor & self, int64_t dim, const at::Tensor & index, const at::Tensor & src, at::Tensor & out); +TORCH_API at::Tensor & scatter_add_(at::Tensor & self, int64_t dim, const at::Tensor & index, const at::Tensor & src); + +} // namespace meta +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/scatter_add_native.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/scatter_add_native.h new file mode 100644 index 0000000000000000000000000000000000000000..62ca3a45f969329d37a4a11dd5f5a691ee047776 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/scatter_add_native.h @@ -0,0 +1,29 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace at { +namespace native { +struct TORCH_API structured_scatter_add : public at::meta::structured_scatter_add { +void impl(const at::Tensor & self, int64_t dim, const at::Tensor & index, const at::Tensor & src, const at::Tensor & out); +}; +TORCH_API at::Tensor scatter_add(const at::Tensor & self, at::Dimname dim, const at::Tensor & index, const at::Tensor & src); +} // namespace native +} // namespace at + +#else +#error "This file should 
not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/scatter_add_ops.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/scatter_add_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..bd1910c0004f7d90b86b51104d7c76e881718ab9 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/scatter_add_ops.h @@ -0,0 +1,67 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. 
+#include + +namespace at { +namespace _ops { + + +struct TORCH_API scatter_add { + using schema = at::Tensor (const at::Tensor &, int64_t, const at::Tensor &, const at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::scatter_add"; + static constexpr const char* overload_name = ""; + static constexpr const char* schema_str = "scatter_add(Tensor self, int dim, Tensor index, Tensor src) -> Tensor"; + static at::Tensor call(const at::Tensor & self, int64_t dim, const at::Tensor & index, const at::Tensor & src); + static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, int64_t dim, const at::Tensor & index, const at::Tensor & src); +}; + +struct TORCH_API scatter_add_ { + using schema = at::Tensor & (at::Tensor &, int64_t, const at::Tensor &, const at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::scatter_add_"; + static constexpr const char* overload_name = ""; + static constexpr const char* schema_str = "scatter_add_(Tensor(a!) self, int dim, Tensor index, Tensor src) -> Tensor(a!)"; + static at::Tensor & call(at::Tensor & self, int64_t dim, const at::Tensor & index, const at::Tensor & src); + static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, at::Tensor & self, int64_t dim, const at::Tensor & index, const at::Tensor & src); +}; + +struct TORCH_API scatter_add_out { + using schema = at::Tensor & (const at::Tensor &, int64_t, const at::Tensor &, const at::Tensor &, at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::scatter_add"; + static constexpr const char* overload_name = "out"; + static constexpr const char* schema_str = "scatter_add.out(Tensor self, int dim, Tensor index, Tensor src, *, Tensor(a!) 
out) -> Tensor(a!)"; + static at::Tensor & call(const at::Tensor & self, int64_t dim, const at::Tensor & index, const at::Tensor & src, at::Tensor & out); + static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, int64_t dim, const at::Tensor & index, const at::Tensor & src, at::Tensor & out); +}; + +struct TORCH_API scatter_add_dimname { + using schema = at::Tensor (const at::Tensor &, at::Dimname, const at::Tensor &, const at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::scatter_add"; + static constexpr const char* overload_name = "dimname"; + static constexpr const char* schema_str = "scatter_add.dimname(Tensor self, Dimname dim, Tensor index, Tensor src) -> Tensor"; + static at::Tensor call(const at::Tensor & self, at::Dimname dim, const at::Tensor & index, const at::Tensor & src); + static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::Dimname dim, const at::Tensor & index, const at::Tensor & src); +}; + +}} // namespace at::_ops + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/scatter_cuda_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/scatter_cuda_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..b1fe098221c696101efe81e13fbaff51c0d16a03 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/scatter_cuda_dispatch.h @@ -0,0 +1,43 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. 
+#include + +namespace at { + +namespace cuda { + +TORCH_API at::Tensor scatter(const at::Tensor & self, int64_t dim, const at::Tensor & index, const at::Tensor & src); +TORCH_API at::Tensor & scatter_out(at::Tensor & out, const at::Tensor & self, int64_t dim, const at::Tensor & index, const at::Tensor & src); +TORCH_API at::Tensor & scatter_outf(const at::Tensor & self, int64_t dim, const at::Tensor & index, const at::Tensor & src, at::Tensor & out); +TORCH_API at::Tensor & scatter_(at::Tensor & self, int64_t dim, const at::Tensor & index, const at::Tensor & src); +TORCH_API at::Tensor scatter(const at::Tensor & self, int64_t dim, const at::Tensor & index, const at::Scalar & value); +TORCH_API at::Tensor & scatter_out(at::Tensor & out, const at::Tensor & self, int64_t dim, const at::Tensor & index, const at::Scalar & value); +TORCH_API at::Tensor & scatter_outf(const at::Tensor & self, int64_t dim, const at::Tensor & index, const at::Scalar & value, at::Tensor & out); +TORCH_API at::Tensor & scatter_(at::Tensor & self, int64_t dim, const at::Tensor & index, const at::Scalar & value); +TORCH_API at::Tensor scatter(const at::Tensor & self, int64_t dim, const at::Tensor & index, const at::Tensor & src, c10::string_view reduce); +TORCH_API at::Tensor & scatter_out(at::Tensor & out, const at::Tensor & self, int64_t dim, const at::Tensor & index, const at::Tensor & src, c10::string_view reduce); +TORCH_API at::Tensor & scatter_outf(const at::Tensor & self, int64_t dim, const at::Tensor & index, const at::Tensor & src, c10::string_view reduce, at::Tensor & out); +TORCH_API at::Tensor & scatter_(at::Tensor & self, int64_t dim, const at::Tensor & index, const at::Tensor & src, c10::string_view reduce); +TORCH_API at::Tensor scatter(const at::Tensor & self, int64_t dim, const at::Tensor & index, const at::Scalar & value, c10::string_view reduce); +TORCH_API at::Tensor & scatter_out(at::Tensor & out, const at::Tensor & self, int64_t dim, const at::Tensor & index, const 
at::Scalar & value, c10::string_view reduce); +TORCH_API at::Tensor & scatter_outf(const at::Tensor & self, int64_t dim, const at::Tensor & index, const at::Scalar & value, c10::string_view reduce, at::Tensor & out); +TORCH_API at::Tensor & scatter_(at::Tensor & self, int64_t dim, const at::Tensor & index, const at::Scalar & value, c10::string_view reduce); + +} // namespace cuda +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/scatter_reduce_cpu_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/scatter_reduce_cpu_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..240681f3864c96bc9acaa270e141ef1064b0fce3 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/scatter_reduce_cpu_dispatch.h @@ -0,0 +1,31 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. 
+#include + +namespace at { + +namespace cpu { + +TORCH_API at::Tensor scatter_reduce(const at::Tensor & self, int64_t dim, const at::Tensor & index, const at::Tensor & src, c10::string_view reduce, bool include_self=true); +TORCH_API at::Tensor & scatter_reduce_out(at::Tensor & out, const at::Tensor & self, int64_t dim, const at::Tensor & index, const at::Tensor & src, c10::string_view reduce, bool include_self=true); +TORCH_API at::Tensor & scatter_reduce_outf(const at::Tensor & self, int64_t dim, const at::Tensor & index, const at::Tensor & src, c10::string_view reduce, bool include_self, at::Tensor & out); +TORCH_API at::Tensor & scatter_reduce_(at::Tensor & self, int64_t dim, const at::Tensor & index, const at::Tensor & src, c10::string_view reduce, bool include_self=true); + +} // namespace cpu +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/scatter_reduce_meta_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/scatter_reduce_meta_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..be06f5829f85bc2a0446c26e7eb425e5dac41f14 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/scatter_reduce_meta_dispatch.h @@ -0,0 +1,31 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. 
+// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace meta { + +TORCH_API at::Tensor scatter_reduce(const at::Tensor & self, int64_t dim, const at::Tensor & index, const at::Tensor & src, c10::string_view reduce, bool include_self=true); +TORCH_API at::Tensor & scatter_reduce_out(at::Tensor & out, const at::Tensor & self, int64_t dim, const at::Tensor & index, const at::Tensor & src, c10::string_view reduce, bool include_self=true); +TORCH_API at::Tensor & scatter_reduce_outf(const at::Tensor & self, int64_t dim, const at::Tensor & index, const at::Tensor & src, c10::string_view reduce, bool include_self, at::Tensor & out); +TORCH_API at::Tensor & scatter_reduce_(at::Tensor & self, int64_t dim, const at::Tensor & index, const at::Tensor & src, c10::string_view reduce, bool include_self=true); + +} // namespace meta +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/select_backward_compositeexplicitautogradnonfunctional_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/select_backward_compositeexplicitautogradnonfunctional_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..8f2d0f0cc72cf825bd45d663b0500211d09e00b8 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/select_backward_compositeexplicitautogradnonfunctional_dispatch.h @@ -0,0 +1,29 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace compositeexplicitautogradnonfunctional { + +TORCH_API at::Tensor select_backward(const at::Tensor & grad_output, at::IntArrayRef input_sizes, int64_t dim, int64_t index); +TORCH_API at::Tensor select_backward_symint(const at::Tensor & grad_output, c10::SymIntArrayRef input_sizes, int64_t dim, c10::SymInt index); + +} // namespace compositeexplicitautogradnonfunctional +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/set.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/set.h new file mode 100644 index 0000000000000000000000000000000000000000..e03dd0480212a49e57577e9458547c086a42f6b8 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/set.h @@ -0,0 +1,167 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +namespace symint { + template >> + at::Tensor & set_(at::Tensor & self, at::Storage source, int64_t storage_offset, at::IntArrayRef size, at::IntArrayRef stride={}) { + return at::_ops::set__source_Storage_storage_offset::call(self, source, storage_offset, c10::fromIntArrayRefSlow(size), c10::fromIntArrayRefSlow(stride)); + } +} + +namespace symint { + template >> + at::Tensor & set_(at::Tensor & self, at::Storage source, c10::SymInt storage_offset, c10::SymIntArrayRef size, c10::SymIntArrayRef stride={}) { + return at::_ops::set__source_Storage_storage_offset::call(self, source, storage_offset, size, stride); + } +} + +namespace symint { + template >> + at::Tensor & set_(at::Tensor & self, const at::Tensor & source, int64_t storage_offset, at::IntArrayRef size, at::IntArrayRef stride={}) { + return at::_ops::set__source_Tensor_storage_offset::call(self, source, storage_offset, c10::fromIntArrayRefSlow(size), c10::fromIntArrayRefSlow(stride)); + } +} + +namespace symint { + template >> + at::Tensor & set_(at::Tensor & self, const at::Tensor & source, c10::SymInt storage_offset, c10::SymIntArrayRef size, c10::SymIntArrayRef stride={}) { + return 
at::_ops::set__source_Tensor_storage_offset::call(self, source, storage_offset, size, stride); + } +} + +// aten::set.source_Storage_out(Tensor self, Storage source, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & set_out(at::Tensor & out, const at::Tensor & self, at::Storage source) { + return at::_ops::set_source_Storage_out::call(self, source, out); +} +// aten::set.source_Storage_out(Tensor self, Storage source, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & set_outf(const at::Tensor & self, at::Storage source, at::Tensor & out) { + return at::_ops::set_source_Storage_out::call(self, source, out); +} + +// aten::set.source_Storage(Tensor self, Storage source) -> Tensor +inline at::Tensor set(const at::Tensor & self, at::Storage source) { + return at::_ops::set_source_Storage::call(self, source); +} + +// aten::set.source_Storage_storage_offset_out(Tensor self, Storage source, SymInt storage_offset, SymInt[] size, SymInt[] stride=[], *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & set_out(at::Tensor & out, const at::Tensor & self, at::Storage source, int64_t storage_offset, at::IntArrayRef size, at::IntArrayRef stride={}) { + return at::_ops::set_source_Storage_storage_offset_out::call(self, source, storage_offset, c10::fromIntArrayRefSlow(size), c10::fromIntArrayRefSlow(stride), out); +} +namespace symint { + template >> + at::Tensor & set_out(at::Tensor & out, const at::Tensor & self, at::Storage source, int64_t storage_offset, at::IntArrayRef size, at::IntArrayRef stride={}) { + return at::_ops::set_source_Storage_storage_offset_out::call(self, source, storage_offset, c10::fromIntArrayRefSlow(size), c10::fromIntArrayRefSlow(stride), out); + } +} + +// aten::set.source_Storage_storage_offset_out(Tensor self, Storage source, SymInt storage_offset, SymInt[] size, SymInt[] stride=[], *, Tensor(a!) out) -> Tensor(a!) 
+inline at::Tensor & set_outf(const at::Tensor & self, at::Storage source, int64_t storage_offset, at::IntArrayRef size, at::IntArrayRef stride, at::Tensor & out) { + return at::_ops::set_source_Storage_storage_offset_out::call(self, source, storage_offset, c10::fromIntArrayRefSlow(size), c10::fromIntArrayRefSlow(stride), out); +} +namespace symint { + template >> + at::Tensor & set_outf(const at::Tensor & self, at::Storage source, int64_t storage_offset, at::IntArrayRef size, at::IntArrayRef stride, at::Tensor & out) { + return at::_ops::set_source_Storage_storage_offset_out::call(self, source, storage_offset, c10::fromIntArrayRefSlow(size), c10::fromIntArrayRefSlow(stride), out); + } +} + +// aten::set.source_Storage_storage_offset_out(Tensor self, Storage source, SymInt storage_offset, SymInt[] size, SymInt[] stride=[], *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & set_symint_out(at::Tensor & out, const at::Tensor & self, at::Storage source, c10::SymInt storage_offset, c10::SymIntArrayRef size, c10::SymIntArrayRef stride={}) { + return at::_ops::set_source_Storage_storage_offset_out::call(self, source, storage_offset, size, stride, out); +} +namespace symint { + template >> + at::Tensor & set_out(at::Tensor & out, const at::Tensor & self, at::Storage source, c10::SymInt storage_offset, c10::SymIntArrayRef size, c10::SymIntArrayRef stride={}) { + return at::_ops::set_source_Storage_storage_offset_out::call(self, source, storage_offset, size, stride, out); + } +} + +// aten::set.source_Storage_storage_offset_out(Tensor self, Storage source, SymInt storage_offset, SymInt[] size, SymInt[] stride=[], *, Tensor(a!) out) -> Tensor(a!) 
+inline at::Tensor & set_symint_outf(const at::Tensor & self, at::Storage source, c10::SymInt storage_offset, c10::SymIntArrayRef size, c10::SymIntArrayRef stride, at::Tensor & out) { + return at::_ops::set_source_Storage_storage_offset_out::call(self, source, storage_offset, size, stride, out); +} +namespace symint { + template >> + at::Tensor & set_outf(const at::Tensor & self, at::Storage source, c10::SymInt storage_offset, c10::SymIntArrayRef size, c10::SymIntArrayRef stride, at::Tensor & out) { + return at::_ops::set_source_Storage_storage_offset_out::call(self, source, storage_offset, size, stride, out); + } +} + +// aten::set.source_Storage_storage_offset(Tensor self, Storage source, SymInt storage_offset, SymInt[] size, SymInt[] stride=[]) -> Tensor +inline at::Tensor set(const at::Tensor & self, at::Storage source, int64_t storage_offset, at::IntArrayRef size, at::IntArrayRef stride={}) { + return at::_ops::set_source_Storage_storage_offset::call(self, source, storage_offset, c10::fromIntArrayRefSlow(size), c10::fromIntArrayRefSlow(stride)); +} +namespace symint { + template >> + at::Tensor set(const at::Tensor & self, at::Storage source, int64_t storage_offset, at::IntArrayRef size, at::IntArrayRef stride={}) { + return at::_ops::set_source_Storage_storage_offset::call(self, source, storage_offset, c10::fromIntArrayRefSlow(size), c10::fromIntArrayRefSlow(stride)); + } +} + +// aten::set.source_Storage_storage_offset(Tensor self, Storage source, SymInt storage_offset, SymInt[] size, SymInt[] stride=[]) -> Tensor +inline at::Tensor set_symint(const at::Tensor & self, at::Storage source, c10::SymInt storage_offset, c10::SymIntArrayRef size, c10::SymIntArrayRef stride={}) { + return at::_ops::set_source_Storage_storage_offset::call(self, source, storage_offset, size, stride); +} +namespace symint { + template >> + at::Tensor set(const at::Tensor & self, at::Storage source, c10::SymInt storage_offset, c10::SymIntArrayRef size, c10::SymIntArrayRef stride={}) { 
+ return at::_ops::set_source_Storage_storage_offset::call(self, source, storage_offset, size, stride); + } +} + +// aten::set.source_Tensor_out(Tensor self, Tensor source, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & set_out(at::Tensor & out, const at::Tensor & self, const at::Tensor & source) { + return at::_ops::set_source_Tensor_out::call(self, source, out); +} +// aten::set.source_Tensor_out(Tensor self, Tensor source, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & set_outf(const at::Tensor & self, const at::Tensor & source, at::Tensor & out) { + return at::_ops::set_source_Tensor_out::call(self, source, out); +} + +// aten::set.source_Tensor(Tensor self, Tensor source) -> Tensor +inline at::Tensor set(const at::Tensor & self, const at::Tensor & source) { + return at::_ops::set_source_Tensor::call(self, source); +} + +// aten::set.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & set_out(at::Tensor & out, const at::Tensor & self) { + return at::_ops::set_out::call(self, out); +} +// aten::set.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & set_outf(const at::Tensor & self, at::Tensor & out) { + return at::_ops::set_out::call(self, out); +} + +// aten::set(Tensor self) -> Tensor +inline at::Tensor set(const at::Tensor & self) { + return at::_ops::set::call(self); +} + +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/set_compositeexplicitautograd_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/set_compositeexplicitautograd_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..d1ad453d163a4bfa7c2c80f01f46519850e94c36 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/set_compositeexplicitautograd_dispatch.h @@ -0,0 +1,42 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. 
+#include + +namespace at { + +namespace compositeexplicitautograd { + +TORCH_API at::Tensor set(const at::Tensor & self, at::Storage source); +TORCH_API at::Tensor & set_out(at::Tensor & out, const at::Tensor & self, at::Storage source); +TORCH_API at::Tensor & set_outf(const at::Tensor & self, at::Storage source, at::Tensor & out); +TORCH_API at::Tensor set(const at::Tensor & self, at::Storage source, int64_t storage_offset, at::IntArrayRef size, at::IntArrayRef stride={}); +TORCH_API at::Tensor set_symint(const at::Tensor & self, at::Storage source, c10::SymInt storage_offset, c10::SymIntArrayRef size, c10::SymIntArrayRef stride={}); +TORCH_API at::Tensor & set_out(at::Tensor & out, const at::Tensor & self, at::Storage source, int64_t storage_offset, at::IntArrayRef size, at::IntArrayRef stride={}); +TORCH_API at::Tensor & set_outf(const at::Tensor & self, at::Storage source, int64_t storage_offset, at::IntArrayRef size, at::IntArrayRef stride, at::Tensor & out); +TORCH_API at::Tensor & set_symint_out(at::Tensor & out, const at::Tensor & self, at::Storage source, c10::SymInt storage_offset, c10::SymIntArrayRef size, c10::SymIntArrayRef stride={}); +TORCH_API at::Tensor & set_symint_outf(const at::Tensor & self, at::Storage source, c10::SymInt storage_offset, c10::SymIntArrayRef size, c10::SymIntArrayRef stride, at::Tensor & out); +TORCH_API at::Tensor set(const at::Tensor & self, const at::Tensor & source); +TORCH_API at::Tensor & set_out(at::Tensor & out, const at::Tensor & self, const at::Tensor & source); +TORCH_API at::Tensor & set_outf(const at::Tensor & self, const at::Tensor & source, at::Tensor & out); +TORCH_API at::Tensor set(const at::Tensor & self); +TORCH_API at::Tensor & set_out(at::Tensor & out, const at::Tensor & self); +TORCH_API at::Tensor & set_outf(const at::Tensor & self, at::Tensor & out); + +} // namespace compositeexplicitautograd +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or 
TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/set_data_ops.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/set_data_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..d827388d4de650777ec8dfb7a0d03f21ed883d05 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/set_data_ops.h @@ -0,0 +1,34 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { +namespace _ops { + + +struct TORCH_API set_data { + using schema = void (at::Tensor &, const at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::set_data"; + static constexpr const char* overload_name = ""; + static constexpr const char* schema_str = "set_data(Tensor(a!) self, Tensor new_data) -> ()"; + static void call(at::Tensor & self, const at::Tensor & new_data); + static void redispatch(c10::DispatchKeySet dispatchKeySet, at::Tensor & self, const at::Tensor & new_data); +}; + +}} // namespace at::_ops + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/set_native.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/set_native.h new file mode 100644 index 0000000000000000000000000000000000000000..ecda237baf44815149aecc31a8bb4316fbf21902 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/set_native.h @@ -0,0 +1,43 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace at { +namespace native { +TORCH_API at::Tensor set(const at::Tensor & self, at::Storage source); +TORCH_API at::Tensor & set_source_Storage_out(const at::Tensor & self, at::Storage source, at::Tensor & out); +TORCH_API at::Tensor & set_(at::Tensor & self, at::Storage source); +TORCH_API at::Tensor set_symint(const at::Tensor & self, at::Storage source, c10::SymInt storage_offset, c10::SymIntArrayRef size, c10::SymIntArrayRef stride={}); +TORCH_API at::Tensor & set_source_Storage_storage_offset_out_symint(const at::Tensor & self, at::Storage source, c10::SymInt storage_offset, c10::SymIntArrayRef size, c10::SymIntArrayRef stride, at::Tensor & out); +TORCH_API at::Tensor & set_storage_cpu_(at::Tensor & self, at::Storage source, int64_t storage_offset, at::IntArrayRef size, at::IntArrayRef stride={}); +TORCH_API at::Tensor & set_storage_cuda_(at::Tensor & self, at::Storage source, int64_t storage_offset, at::IntArrayRef size, at::IntArrayRef stride={}); +TORCH_API at::Tensor & set_storage_meta__symint(at::Tensor & self, at::Storage source, c10::SymInt storage_offset, c10::SymIntArrayRef size, c10::SymIntArrayRef stride={}); +TORCH_API at::Tensor & set_storage_quantized_(at::Tensor & self, at::Storage 
source, int64_t storage_offset, at::IntArrayRef size, at::IntArrayRef stride={}); +TORCH_API at::Tensor & set__symint(at::Tensor & self, const at::Tensor & source, c10::SymInt storage_offset, c10::SymIntArrayRef size, c10::SymIntArrayRef stride={}); +TORCH_API at::Tensor set(const at::Tensor & self, const at::Tensor & source); +TORCH_API at::Tensor & set_source_Tensor_out(const at::Tensor & self, const at::Tensor & source, at::Tensor & out); +TORCH_API at::Tensor & set_tensor_(at::Tensor & self, const at::Tensor & source); +TORCH_API at::Tensor set(const at::Tensor & self); +TORCH_API at::Tensor & set_out(const at::Tensor & self, at::Tensor & out); +TORCH_API at::Tensor & set_cpu_(at::Tensor & self); +TORCH_API at::Tensor & set_cuda_(at::Tensor & self); +TORCH_API at::Tensor & set_meta_(at::Tensor & self); +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/set_ops.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/set_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..49701c440080348ec412c01b51dcd31a4b1c7744 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/set_ops.h @@ -0,0 +1,166 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. 
+#include + +namespace at { +namespace _ops { + + +struct TORCH_API set__source_Storage { + using schema = at::Tensor & (at::Tensor &, at::Storage); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::set_"; + static constexpr const char* overload_name = "source_Storage"; + static constexpr const char* schema_str = "set_.source_Storage(Tensor(a!) self, Storage source) -> Tensor(a!)"; + static at::Tensor & call(at::Tensor & self, at::Storage source); + static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, at::Tensor & self, at::Storage source); +}; + +struct TORCH_API set__source_Storage_storage_offset { + using schema = at::Tensor & (at::Tensor &, at::Storage, c10::SymInt, c10::SymIntArrayRef, c10::SymIntArrayRef); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::set_"; + static constexpr const char* overload_name = "source_Storage_storage_offset"; + static constexpr const char* schema_str = "set_.source_Storage_storage_offset(Tensor(a!) 
self, Storage source, SymInt storage_offset, SymInt[] size, SymInt[] stride=[]) -> Tensor(a!)"; + static at::Tensor & call(at::Tensor & self, at::Storage source, c10::SymInt storage_offset, c10::SymIntArrayRef size, c10::SymIntArrayRef stride); + static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, at::Tensor & self, at::Storage source, c10::SymInt storage_offset, c10::SymIntArrayRef size, c10::SymIntArrayRef stride); +}; + +struct TORCH_API set__source_Tensor_storage_offset { + using schema = at::Tensor & (at::Tensor &, const at::Tensor &, c10::SymInt, c10::SymIntArrayRef, c10::SymIntArrayRef); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::set_"; + static constexpr const char* overload_name = "source_Tensor_storage_offset"; + static constexpr const char* schema_str = "set_.source_Tensor_storage_offset(Tensor(a!) self, Tensor source, SymInt storage_offset, SymInt[] size, SymInt[] stride=[]) -> Tensor(a!)"; + static at::Tensor & call(at::Tensor & self, const at::Tensor & source, c10::SymInt storage_offset, c10::SymIntArrayRef size, c10::SymIntArrayRef stride); + static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, at::Tensor & self, const at::Tensor & source, c10::SymInt storage_offset, c10::SymIntArrayRef size, c10::SymIntArrayRef stride); +}; + +struct TORCH_API set__source_Tensor { + using schema = at::Tensor & (at::Tensor &, const at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::set_"; + static constexpr const char* overload_name = "source_Tensor"; + static constexpr const char* schema_str = "set_.source_Tensor(Tensor(a!) 
self, Tensor source) -> Tensor(a!)"; + static at::Tensor & call(at::Tensor & self, const at::Tensor & source); + static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, at::Tensor & self, const at::Tensor & source); +}; + +struct TORCH_API set_ { + using schema = at::Tensor & (at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::set_"; + static constexpr const char* overload_name = ""; + static constexpr const char* schema_str = "set_(Tensor(a!) self) -> Tensor(a!)"; + static at::Tensor & call(at::Tensor & self); + static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, at::Tensor & self); +}; + +struct TORCH_API set_source_Storage_out { + using schema = at::Tensor & (const at::Tensor &, at::Storage, at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::set"; + static constexpr const char* overload_name = "source_Storage_out"; + static constexpr const char* schema_str = "set.source_Storage_out(Tensor self, Storage source, *, Tensor(a!) 
out) -> Tensor(a!)"; + static at::Tensor & call(const at::Tensor & self, at::Storage source, at::Tensor & out); + static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::Storage source, at::Tensor & out); +}; + +struct TORCH_API set_source_Storage { + using schema = at::Tensor (const at::Tensor &, at::Storage); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::set"; + static constexpr const char* overload_name = "source_Storage"; + static constexpr const char* schema_str = "set.source_Storage(Tensor self, Storage source) -> Tensor"; + static at::Tensor call(const at::Tensor & self, at::Storage source); + static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::Storage source); +}; + +struct TORCH_API set_source_Storage_storage_offset_out { + using schema = at::Tensor & (const at::Tensor &, at::Storage, c10::SymInt, c10::SymIntArrayRef, c10::SymIntArrayRef, at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::set"; + static constexpr const char* overload_name = "source_Storage_storage_offset_out"; + static constexpr const char* schema_str = "set.source_Storage_storage_offset_out(Tensor self, Storage source, SymInt storage_offset, SymInt[] size, SymInt[] stride=[], *, Tensor(a!) 
out) -> Tensor(a!)"; + static at::Tensor & call(const at::Tensor & self, at::Storage source, c10::SymInt storage_offset, c10::SymIntArrayRef size, c10::SymIntArrayRef stride, at::Tensor & out); + static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::Storage source, c10::SymInt storage_offset, c10::SymIntArrayRef size, c10::SymIntArrayRef stride, at::Tensor & out); +}; + +struct TORCH_API set_source_Storage_storage_offset { + using schema = at::Tensor (const at::Tensor &, at::Storage, c10::SymInt, c10::SymIntArrayRef, c10::SymIntArrayRef); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::set"; + static constexpr const char* overload_name = "source_Storage_storage_offset"; + static constexpr const char* schema_str = "set.source_Storage_storage_offset(Tensor self, Storage source, SymInt storage_offset, SymInt[] size, SymInt[] stride=[]) -> Tensor"; + static at::Tensor call(const at::Tensor & self, at::Storage source, c10::SymInt storage_offset, c10::SymIntArrayRef size, c10::SymIntArrayRef stride); + static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::Storage source, c10::SymInt storage_offset, c10::SymIntArrayRef size, c10::SymIntArrayRef stride); +}; + +struct TORCH_API set_source_Tensor_out { + using schema = at::Tensor & (const at::Tensor &, const at::Tensor &, at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::set"; + static constexpr const char* overload_name = "source_Tensor_out"; + static constexpr const char* schema_str = "set.source_Tensor_out(Tensor self, Tensor source, *, Tensor(a!) 
out) -> Tensor(a!)"; + static at::Tensor & call(const at::Tensor & self, const at::Tensor & source, at::Tensor & out); + static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & source, at::Tensor & out); +}; + +struct TORCH_API set_source_Tensor { + using schema = at::Tensor (const at::Tensor &, const at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::set"; + static constexpr const char* overload_name = "source_Tensor"; + static constexpr const char* schema_str = "set.source_Tensor(Tensor self, Tensor source) -> Tensor"; + static at::Tensor call(const at::Tensor & self, const at::Tensor & source); + static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & source); +}; + +struct TORCH_API set_out { + using schema = at::Tensor & (const at::Tensor &, at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::set"; + static constexpr const char* overload_name = "out"; + static constexpr const char* schema_str = "set.out(Tensor self, *, Tensor(a!) 
out) -> Tensor(a!)"; + static at::Tensor & call(const at::Tensor & self, at::Tensor & out); + static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::Tensor & out); +}; + +struct TORCH_API set { + using schema = at::Tensor (const at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::set"; + static constexpr const char* overload_name = ""; + static constexpr const char* schema_str = "set(Tensor self) -> Tensor"; + static at::Tensor call(const at::Tensor & self); + static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self); +}; + +}} // namespace at::_ops + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/sigmoid_backward_meta_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/sigmoid_backward_meta_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..dff6caa7b9867f852d318e50dc019e2c3af2e2f5 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/sigmoid_backward_meta_dispatch.h @@ -0,0 +1,30 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. 
+// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace meta { + +TORCH_API at::Tensor sigmoid_backward(const at::Tensor & grad_output, const at::Tensor & output); +TORCH_API at::Tensor & sigmoid_backward_out(at::Tensor & grad_input, const at::Tensor & grad_output, const at::Tensor & output); +TORCH_API at::Tensor & sigmoid_backward_outf(const at::Tensor & grad_output, const at::Tensor & output, at::Tensor & grad_input); + +} // namespace meta +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/sigmoid_meta_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/sigmoid_meta_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..805a812d676f2d8f3b0d349bc9c2b11f54ca5a05 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/sigmoid_meta_dispatch.h @@ -0,0 +1,31 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. 
+#include + +namespace at { + +namespace meta { + +TORCH_API at::Tensor sigmoid(const at::Tensor & self); +TORCH_API at::Tensor & sigmoid_out(at::Tensor & out, const at::Tensor & self); +TORCH_API at::Tensor & sigmoid_outf(const at::Tensor & self, at::Tensor & out); +TORCH_API at::Tensor & sigmoid_(at::Tensor & self); + +} // namespace meta +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/silu_backward_ops.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/silu_backward_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..8ad7819c71a4b62a8f346550817e51216d0e5695 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/silu_backward_ops.h @@ -0,0 +1,45 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { +namespace _ops { + + +struct TORCH_API silu_backward_grad_input { + using schema = at::Tensor & (const at::Tensor &, const at::Tensor &, at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::silu_backward"; + static constexpr const char* overload_name = "grad_input"; + static constexpr const char* schema_str = "silu_backward.grad_input(Tensor grad_output, Tensor self, *, Tensor(a!) 
grad_input) -> Tensor(a!)"; + static at::Tensor & call(const at::Tensor & grad_output, const at::Tensor & self, at::Tensor & grad_input); + static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad_output, const at::Tensor & self, at::Tensor & grad_input); +}; + +struct TORCH_API silu_backward { + using schema = at::Tensor (const at::Tensor &, const at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::silu_backward"; + static constexpr const char* overload_name = ""; + static constexpr const char* schema_str = "silu_backward(Tensor grad_output, Tensor self) -> Tensor"; + static at::Tensor call(const at::Tensor & grad_output, const at::Tensor & self); + static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad_output, const at::Tensor & self); +}; + +}} // namespace at::_ops + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/silu_cpu_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/silu_cpu_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..005c2783c923007597c651bcc80622f19d44490b --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/silu_cpu_dispatch.h @@ -0,0 +1,31 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace cpu { + +TORCH_API at::Tensor silu(const at::Tensor & self); +TORCH_API at::Tensor & silu_out(at::Tensor & out, const at::Tensor & self); +TORCH_API at::Tensor & silu_outf(const at::Tensor & self, at::Tensor & out); +TORCH_API at::Tensor & silu_(at::Tensor & self); + +} // namespace cpu +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/sinh.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/sinh.h new file mode 100644 index 0000000000000000000000000000000000000000..fcdebf78e2de9684fe3e818120bf94956452f845 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/sinh.h @@ -0,0 +1,50 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::sinh(Tensor self) -> Tensor +inline at::Tensor sinh(const at::Tensor & self) { + return at::_ops::sinh::call(self); +} + +// aten::sinh_(Tensor(a!) self) -> Tensor(a!) +inline at::Tensor & sinh_(at::Tensor & self) { + return at::_ops::sinh_::call(self); +} + +// aten::sinh.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & sinh_out(at::Tensor & out, const at::Tensor & self) { + return at::_ops::sinh_out::call(self, out); +} +// aten::sinh.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & sinh_outf(const at::Tensor & self, at::Tensor & out) { + return at::_ops::sinh_out::call(self, out); +} + +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/size_ops.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/size_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..79827da20501da1fc2eedcebd880cd82e59bf4aa --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/size_ops.h @@ -0,0 +1,45 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { +namespace _ops { + + +struct TORCH_API size_int { + using schema = int64_t (const at::Tensor &, int64_t); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::size"; + static constexpr const char* overload_name = "int"; + static constexpr const char* schema_str = "size.int(Tensor self, int dim) -> int"; + static int64_t call(const at::Tensor & self, int64_t dim); + static int64_t redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, int64_t dim); +}; + +struct TORCH_API size_Dimname { + using schema = int64_t (const at::Tensor &, at::Dimname); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::size"; + static constexpr const char* overload_name = "Dimname"; + static constexpr const char* schema_str = "size.Dimname(Tensor self, Dimname dim) -> int"; + static int64_t call(const at::Tensor & self, at::Dimname dim); + static int64_t 
redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::Dimname dim); +}; + +}} // namespace at::_ops + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/slice.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/slice.h new file mode 100644 index 0000000000000000000000000000000000000000..f6a3da46b38aee34578f0771652c84b4dce5774e --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/slice.h @@ -0,0 +1,53 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::slice.Tensor(Tensor(a) self, int dim=0, SymInt? start=None, SymInt? end=None, SymInt step=1) -> Tensor(a) +inline at::Tensor slice(const at::Tensor & self, int64_t dim=0, ::std::optional start=::std::nullopt, ::std::optional end=::std::nullopt, int64_t step=1) { + return at::_ops::slice_Tensor::call(self, dim, start.has_value() ? ::std::make_optional(c10::SymInt(*start)) : ::std::nullopt, end.has_value() ? ::std::make_optional(c10::SymInt(*end)) : ::std::nullopt, step); +} +namespace symint { + template >> + at::Tensor slice(const at::Tensor & self, int64_t dim=0, ::std::optional start=::std::nullopt, ::std::optional end=::std::nullopt, int64_t step=1) { + return at::_ops::slice_Tensor::call(self, dim, start.has_value() ? ::std::make_optional(c10::SymInt(*start)) : ::std::nullopt, end.has_value() ? ::std::make_optional(c10::SymInt(*end)) : ::std::nullopt, step); + } +} + +// aten::slice.Tensor(Tensor(a) self, int dim=0, SymInt? 
start=None, SymInt? end=None, SymInt step=1) -> Tensor(a) +inline at::Tensor slice_symint(const at::Tensor & self, int64_t dim=0, ::std::optional start=::std::nullopt, ::std::optional end=::std::nullopt, c10::SymInt step=1) { + return at::_ops::slice_Tensor::call(self, dim, start, end, step); +} +namespace symint { + template >> + at::Tensor slice(const at::Tensor & self, int64_t dim=0, ::std::optional start=::std::nullopt, ::std::optional end=::std::nullopt, c10::SymInt step=1) { + return at::_ops::slice_Tensor::call(self, dim, start, end, step); + } +} + +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/slice_backward_native.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/slice_backward_native.h new file mode 100644 index 0000000000000000000000000000000000000000..f4f10eca089c0306ffab90fb54093808a993c8e6 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/slice_backward_native.h @@ -0,0 +1,27 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace at { +namespace native { +TORCH_API at::Tensor slice_backward(const at::Tensor & grad_output, at::IntArrayRef input_sizes, int64_t dim, int64_t start, int64_t end, int64_t step); +TORCH_API at::Tensor & slice_backward_out_symint(const at::Tensor & grad_output, c10::SymIntArrayRef input_sizes, int64_t dim, c10::SymInt start, c10::SymInt end, c10::SymInt step, at::Tensor & out); +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or 
TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/slice_inverse_ops.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/slice_inverse_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..653767db3160e7c1c23bed481151a298d84f72d7 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/slice_inverse_ops.h @@ -0,0 +1,34 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { +namespace _ops { + + +struct TORCH_API slice_inverse { + using schema = at::Tensor (const at::Tensor &, const at::Tensor &, int64_t, ::std::optional, ::std::optional, c10::SymInt); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::slice_inverse"; + static constexpr const char* overload_name = ""; + static constexpr const char* schema_str = "slice_inverse(Tensor(a) self, Tensor src, int dim=0, SymInt? start=None, SymInt? 
end=None, SymInt step=1) -> Tensor(a)"; + static at::Tensor call(const at::Tensor & self, const at::Tensor & src, int64_t dim, ::std::optional start, ::std::optional end, c10::SymInt step); + static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & src, int64_t dim, ::std::optional start, ::std::optional end, c10::SymInt step); +}; + +}} // namespace at::_ops + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/slice_scatter_native.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/slice_scatter_native.h new file mode 100644 index 0000000000000000000000000000000000000000..b186bfa3d4d7806e8108ad2285be9c9d38765595 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/slice_scatter_native.h @@ -0,0 +1,27 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace at { +namespace native { +TORCH_API at::Tensor & slice_scatter_out_symint(const at::Tensor & self, const at::Tensor & src, int64_t dim, ::std::optional start, ::std::optional end, c10::SymInt step, at::Tensor & out); +TORCH_API at::Tensor slice_scatter(const at::Tensor & self, const at::Tensor & src, int64_t dim=0, ::std::optional start=::std::nullopt, ::std::optional end=::std::nullopt, int64_t step=1); +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/slogdet_compositeimplicitautograd_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/slogdet_compositeimplicitautograd_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..d024fde78757f49c2f4c142f36da9c374fb369ed --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/slogdet_compositeimplicitautograd_dispatch.h @@ -0,0 +1,30 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace compositeimplicitautograd { + +TORCH_API ::std::tuple slogdet(const at::Tensor & self); +TORCH_API ::std::tuple slogdet_out(at::Tensor & sign, at::Tensor & logabsdet, const at::Tensor & self); +TORCH_API ::std::tuple slogdet_outf(const at::Tensor & self, at::Tensor & sign, at::Tensor & logabsdet); + +} // namespace compositeimplicitautograd +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/slow_conv3d_compositeimplicitautograd_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/slow_conv3d_compositeimplicitautograd_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..7f4e0b4ae360fbd04b38ad67cde7c655500f73ff --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/slow_conv3d_compositeimplicitautograd_dispatch.h @@ -0,0 +1,33 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. 
+#include + +namespace at { + +namespace compositeimplicitautograd { + +TORCH_API at::Tensor slow_conv3d(const at::Tensor & self, const at::Tensor & weight, at::IntArrayRef kernel_size, const ::std::optional & bias={}, at::IntArrayRef stride=1, at::IntArrayRef padding=0); +TORCH_API at::Tensor slow_conv3d_symint(const at::Tensor & self, const at::Tensor & weight, c10::SymIntArrayRef kernel_size, const ::std::optional & bias={}, c10::SymIntArrayRef stride=c10::SymInt(1), c10::SymIntArrayRef padding=c10::SymInt(0)); +TORCH_API at::Tensor & slow_conv3d_out(at::Tensor & out, const at::Tensor & self, const at::Tensor & weight, at::IntArrayRef kernel_size, const ::std::optional & bias={}, at::IntArrayRef stride=1, at::IntArrayRef padding=0); +TORCH_API at::Tensor & slow_conv3d_outf(const at::Tensor & self, const at::Tensor & weight, at::IntArrayRef kernel_size, const ::std::optional & bias, at::IntArrayRef stride, at::IntArrayRef padding, at::Tensor & out); +TORCH_API at::Tensor & slow_conv3d_symint_out(at::Tensor & out, const at::Tensor & self, const at::Tensor & weight, c10::SymIntArrayRef kernel_size, const ::std::optional & bias={}, c10::SymIntArrayRef stride=c10::SymInt(1), c10::SymIntArrayRef padding=c10::SymInt(0)); +TORCH_API at::Tensor & slow_conv3d_symint_outf(const at::Tensor & self, const at::Tensor & weight, c10::SymIntArrayRef kernel_size, const ::std::optional & bias, c10::SymIntArrayRef stride, c10::SymIntArrayRef padding, at::Tensor & out); + +} // namespace compositeimplicitautograd +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/slow_conv3d_forward_native.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/slow_conv3d_forward_native.h new file mode 100644 index 0000000000000000000000000000000000000000..d7400b77e89c7cdedc67ecce69dbaabd5fd5fac0 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/slow_conv3d_forward_native.h @@ -0,0 +1,27 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace at { +namespace native { +TORCH_API at::Tensor slow_conv3d_forward_cpu(const at::Tensor & self, const at::Tensor & weight, at::IntArrayRef kernel_size, const ::std::optional & bias, at::IntArrayRef stride, at::IntArrayRef padding); +TORCH_API at::Tensor & slow_conv3d_forward_out_cpu(const at::Tensor & self, const at::Tensor & weight, at::IntArrayRef kernel_size, const ::std::optional & bias, at::IntArrayRef stride, at::IntArrayRef padding, at::Tensor & output); +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/slow_conv_dilated3d_cpu_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/slow_conv_dilated3d_cpu_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..4d6ce6157fe4386d1641cc385245ddae8c4e75fd --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/slow_conv_dilated3d_cpu_dispatch.h @@ -0,0 +1,29 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace cpu { + +TORCH_API at::Tensor slow_conv_dilated3d(const at::Tensor & self, const at::Tensor & weight, at::IntArrayRef kernel_size, const ::std::optional & bias={}, at::IntArrayRef stride=1, at::IntArrayRef padding=0, at::IntArrayRef dilation=1); +TORCH_API at::Tensor slow_conv_dilated3d_symint(const at::Tensor & self, const at::Tensor & weight, c10::SymIntArrayRef kernel_size, const ::std::optional & bias={}, c10::SymIntArrayRef stride=c10::SymInt(1), c10::SymIntArrayRef padding=c10::SymInt(0), c10::SymIntArrayRef dilation=c10::SymInt(1)); + +} // namespace cpu +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/slow_conv_transpose2d_cpu_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/slow_conv_transpose2d_cpu_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..bcaefdef1e830257b247e58d88ca612df752eaac --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/slow_conv_transpose2d_cpu_dispatch.h @@ -0,0 +1,33 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. 
+#include + +namespace at { + +namespace cpu { + +TORCH_API at::Tensor slow_conv_transpose2d(const at::Tensor & self, const at::Tensor & weight, at::IntArrayRef kernel_size, const ::std::optional & bias={}, at::IntArrayRef stride=1, at::IntArrayRef padding=0, at::IntArrayRef output_padding=0, at::IntArrayRef dilation=1); +TORCH_API at::Tensor slow_conv_transpose2d_symint(const at::Tensor & self, const at::Tensor & weight, c10::SymIntArrayRef kernel_size, const ::std::optional & bias={}, c10::SymIntArrayRef stride=c10::SymInt(1), c10::SymIntArrayRef padding=c10::SymInt(0), c10::SymIntArrayRef output_padding=c10::SymInt(0), c10::SymIntArrayRef dilation=c10::SymInt(1)); +TORCH_API at::Tensor & slow_conv_transpose2d_out(at::Tensor & out, const at::Tensor & self, const at::Tensor & weight, at::IntArrayRef kernel_size, const ::std::optional & bias={}, at::IntArrayRef stride=1, at::IntArrayRef padding=0, at::IntArrayRef output_padding=0, at::IntArrayRef dilation=1); +TORCH_API at::Tensor & slow_conv_transpose2d_outf(const at::Tensor & self, const at::Tensor & weight, at::IntArrayRef kernel_size, const ::std::optional & bias, at::IntArrayRef stride, at::IntArrayRef padding, at::IntArrayRef output_padding, at::IntArrayRef dilation, at::Tensor & out); +TORCH_API at::Tensor & slow_conv_transpose2d_symint_out(at::Tensor & out, const at::Tensor & self, const at::Tensor & weight, c10::SymIntArrayRef kernel_size, const ::std::optional & bias={}, c10::SymIntArrayRef stride=c10::SymInt(1), c10::SymIntArrayRef padding=c10::SymInt(0), c10::SymIntArrayRef output_padding=c10::SymInt(0), c10::SymIntArrayRef dilation=c10::SymInt(1)); +TORCH_API at::Tensor & slow_conv_transpose2d_symint_outf(const at::Tensor & self, const at::Tensor & weight, c10::SymIntArrayRef kernel_size, const ::std::optional & bias, c10::SymIntArrayRef stride, c10::SymIntArrayRef padding, c10::SymIntArrayRef output_padding, c10::SymIntArrayRef dilation, at::Tensor & out); + +} // namespace cpu +} // namespace at + 
+#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/smooth_l1_loss_compositeexplicitautogradnonfunctional_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/smooth_l1_loss_compositeexplicitautogradnonfunctional_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..c7838ee18c49287188f031d69c0bc7313f1a20d2 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/smooth_l1_loss_compositeexplicitautogradnonfunctional_dispatch.h @@ -0,0 +1,28 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace compositeexplicitautogradnonfunctional { + +TORCH_API at::Tensor smooth_l1_loss(const at::Tensor & self, const at::Tensor & target, int64_t reduction=at::Reduction::Mean, double beta=1.0); + +} // namespace compositeexplicitautogradnonfunctional +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/smooth_l1_loss_meta_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/smooth_l1_loss_meta_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..ca9e947fe85c7848d88b2eec17e93386437f67f8 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/smooth_l1_loss_meta_dispatch.h @@ -0,0 +1,30 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace meta { + +TORCH_API at::Tensor smooth_l1_loss(const at::Tensor & self, const at::Tensor & target, int64_t reduction=at::Reduction::Mean, double beta=1.0); +TORCH_API at::Tensor & smooth_l1_loss_out(at::Tensor & out, const at::Tensor & self, const at::Tensor & target, int64_t reduction=at::Reduction::Mean, double beta=1.0); +TORCH_API at::Tensor & smooth_l1_loss_outf(const at::Tensor & self, const at::Tensor & target, int64_t reduction, double beta, at::Tensor & out); + +} // namespace meta +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/softmax_native.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/softmax_native.h new file mode 100644 index 0000000000000000000000000000000000000000..2763bf6f62ef0138cde3b9df2f9cc61933819efa --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/softmax_native.h @@ -0,0 +1,28 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace at { +namespace native { +TORCH_API at::Tensor softmax(const at::Tensor & self, int64_t dim, ::std::optional dtype=::std::nullopt); +TORCH_API at::Tensor & softmax_out(const at::Tensor & self, int64_t dim, ::std::optional dtype, at::Tensor & out); +TORCH_API at::Tensor softmax(const at::Tensor & self, at::Dimname dim, ::std::optional dtype=::std::nullopt); +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/softplus_backward_meta_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/softplus_backward_meta_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..1d4e0d0352f407d85617ecd5f222869e0eca2de0 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/softplus_backward_meta_dispatch.h @@ -0,0 +1,30 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace meta { + +TORCH_API at::Tensor softplus_backward(const at::Tensor & grad_output, const at::Tensor & self, const at::Scalar & beta, const at::Scalar & threshold); +TORCH_API at::Tensor & softplus_backward_out(at::Tensor & grad_input, const at::Tensor & grad_output, const at::Tensor & self, const at::Scalar & beta, const at::Scalar & threshold); +TORCH_API at::Tensor & softplus_backward_outf(const at::Tensor & grad_output, const at::Tensor & self, const at::Scalar & beta, const at::Scalar & threshold, at::Tensor & grad_input); + +} // namespace meta +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/softplus_backward_ops.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/softplus_backward_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..4af5e875e9e191a6ca174e693b27bc08e6fd0487 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/softplus_backward_ops.h @@ -0,0 +1,45 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { +namespace _ops { + + +struct TORCH_API softplus_backward_grad_input { + using schema = at::Tensor & (const at::Tensor &, const at::Tensor &, const at::Scalar &, const at::Scalar &, at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::softplus_backward"; + static constexpr const char* overload_name = "grad_input"; + static constexpr const char* schema_str = "softplus_backward.grad_input(Tensor grad_output, Tensor self, Scalar beta, Scalar threshold, *, Tensor(a!) 
grad_input) -> Tensor(a!)"; + static at::Tensor & call(const at::Tensor & grad_output, const at::Tensor & self, const at::Scalar & beta, const at::Scalar & threshold, at::Tensor & grad_input); + static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad_output, const at::Tensor & self, const at::Scalar & beta, const at::Scalar & threshold, at::Tensor & grad_input); +}; + +struct TORCH_API softplus_backward { + using schema = at::Tensor (const at::Tensor &, const at::Tensor &, const at::Scalar &, const at::Scalar &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::softplus_backward"; + static constexpr const char* overload_name = ""; + static constexpr const char* schema_str = "softplus_backward(Tensor grad_output, Tensor self, Scalar beta, Scalar threshold) -> Tensor"; + static at::Tensor call(const at::Tensor & grad_output, const at::Tensor & self, const at::Scalar & beta, const at::Scalar & threshold); + static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad_output, const at::Tensor & self, const at::Scalar & beta, const at::Scalar & threshold); +}; + +}} // namespace at::_ops + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/softplus_ops.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/softplus_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..4a6b9a1aa406912e9cf2576145f40d59ba81008b --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/softplus_ops.h @@ -0,0 +1,45 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { +namespace _ops { + + +struct TORCH_API softplus_out { + using schema = at::Tensor & (const at::Tensor &, const at::Scalar &, const at::Scalar &, at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::softplus"; + static constexpr const char* overload_name = "out"; + static constexpr const char* schema_str = "softplus.out(Tensor self, Scalar beta=1, Scalar threshold=20, *, Tensor(a!) 
out) -> Tensor(a!)"; + static at::Tensor & call(const at::Tensor & self, const at::Scalar & beta, const at::Scalar & threshold, at::Tensor & out); + static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Scalar & beta, const at::Scalar & threshold, at::Tensor & out); +}; + +struct TORCH_API softplus { + using schema = at::Tensor (const at::Tensor &, const at::Scalar &, const at::Scalar &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::softplus"; + static constexpr const char* overload_name = ""; + static constexpr const char* schema_str = "softplus(Tensor self, Scalar beta=1, Scalar threshold=20) -> Tensor"; + static at::Tensor call(const at::Tensor & self, const at::Scalar & beta, const at::Scalar & threshold); + static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Scalar & beta, const at::Scalar & threshold); +}; + +}} // namespace at::_ops + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/softshrink_backward_compositeexplicitautogradnonfunctional_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/softshrink_backward_compositeexplicitautogradnonfunctional_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..fe3e190705e94baac05c35e4128463e78c5b53dd --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/softshrink_backward_compositeexplicitautogradnonfunctional_dispatch.h @@ -0,0 +1,28 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace compositeexplicitautogradnonfunctional { + +TORCH_API at::Tensor softshrink_backward(const at::Tensor & grad_output, const at::Tensor & self, const at::Scalar & lambd); + +} // namespace compositeexplicitautogradnonfunctional +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/softshrink_backward_native.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/softshrink_backward_native.h new file mode 100644 index 0000000000000000000000000000000000000000..4e2b15b1d74ed41c8eb159a60b96f6d41fbfa611 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/softshrink_backward_native.h @@ -0,0 +1,28 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace at { +namespace native { +struct TORCH_API structured_softshrink_backward_out : public at::meta::structured_softshrink_backward { +void impl(const at::Tensor & grad_output, const at::Tensor & self, const at::Scalar & lambd, const at::Tensor & grad_input); +}; +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/softshrink_meta_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/softshrink_meta_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..3c768a0ff295b4d9d9add075d7fb86302eb97119 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/softshrink_meta_dispatch.h @@ -0,0 +1,30 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace meta { + +TORCH_API at::Tensor softshrink(const at::Tensor & self, const at::Scalar & lambd=0.5); +TORCH_API at::Tensor & softshrink_out(at::Tensor & out, const at::Tensor & self, const at::Scalar & lambd=0.5); +TORCH_API at::Tensor & softshrink_outf(const at::Tensor & self, const at::Scalar & lambd, at::Tensor & out); + +} // namespace meta +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/sort_ops.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/sort_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..7db065e2087343b78e6aab64ed8a25667df997b7 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/sort_ops.h @@ -0,0 +1,111 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { +namespace _ops { + + +struct TORCH_API sort_values { + using schema = ::std::tuple (const at::Tensor &, int64_t, bool, at::Tensor &, at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::sort"; + static constexpr const char* overload_name = "values"; + static constexpr const char* schema_str = "sort.values(Tensor self, int dim=-1, bool descending=False, *, Tensor(a!) values, Tensor(b!) indices) -> (Tensor(a!) values, Tensor(b!) 
indices)"; + static ::std::tuple call(const at::Tensor & self, int64_t dim, bool descending, at::Tensor & values, at::Tensor & indices); + static ::std::tuple redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, int64_t dim, bool descending, at::Tensor & values, at::Tensor & indices); +}; + +struct TORCH_API sort_values_stable { + using schema = ::std::tuple (const at::Tensor &, ::std::optional, int64_t, bool, at::Tensor &, at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::sort"; + static constexpr const char* overload_name = "values_stable"; + static constexpr const char* schema_str = "sort.values_stable(Tensor self, *, bool? stable, int dim=-1, bool descending=False, Tensor(a!) values, Tensor(b!) indices) -> (Tensor(a!) values, Tensor(b!) indices)"; + static ::std::tuple call(const at::Tensor & self, ::std::optional stable, int64_t dim, bool descending, at::Tensor & values, at::Tensor & indices); + static ::std::tuple redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, ::std::optional stable, int64_t dim, bool descending, at::Tensor & values, at::Tensor & indices); +}; + +struct TORCH_API sort { + using schema = ::std::tuple (const at::Tensor &, int64_t, bool); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::sort"; + static constexpr const char* overload_name = ""; + static constexpr const char* schema_str = "sort(Tensor self, int dim=-1, bool descending=False) -> (Tensor values, Tensor indices)"; + static ::std::tuple call(const at::Tensor & self, int64_t dim, bool descending); + static ::std::tuple redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, int64_t dim, bool descending); +}; + +struct TORCH_API sort_stable { + using schema = ::std::tuple (const at::Tensor &, ::std::optional, int64_t, bool); + using 
ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::sort"; + static constexpr const char* overload_name = "stable"; + static constexpr const char* schema_str = "sort.stable(Tensor self, *, bool? stable, int dim=-1, bool descending=False) -> (Tensor values, Tensor indices)"; + static ::std::tuple call(const at::Tensor & self, ::std::optional stable, int64_t dim, bool descending); + static ::std::tuple redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, ::std::optional stable, int64_t dim, bool descending); +}; + +struct TORCH_API sort_dimname_values { + using schema = ::std::tuple (const at::Tensor &, at::Dimname, bool, at::Tensor &, at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::sort"; + static constexpr const char* overload_name = "dimname_values"; + static constexpr const char* schema_str = "sort.dimname_values(Tensor self, Dimname dim, bool descending=False, *, Tensor(a!) values, Tensor(b!) indices) -> (Tensor(a!) values, Tensor(b!) indices)"; + static ::std::tuple call(const at::Tensor & self, at::Dimname dim, bool descending, at::Tensor & values, at::Tensor & indices); + static ::std::tuple redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::Dimname dim, bool descending, at::Tensor & values, at::Tensor & indices); +}; + +struct TORCH_API sort_dimname_values_stable { + using schema = ::std::tuple (const at::Tensor &, ::std::optional, at::Dimname, bool, at::Tensor &, at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::sort"; + static constexpr const char* overload_name = "dimname_values_stable"; + static constexpr const char* schema_str = "sort.dimname_values_stable(Tensor self, *, bool? 
stable, Dimname dim, bool descending=False, Tensor(a!) values, Tensor(b!) indices) -> (Tensor(a!) values, Tensor(b!) indices)"; + static ::std::tuple call(const at::Tensor & self, ::std::optional stable, at::Dimname dim, bool descending, at::Tensor & values, at::Tensor & indices); + static ::std::tuple redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, ::std::optional stable, at::Dimname dim, bool descending, at::Tensor & values, at::Tensor & indices); +}; + +struct TORCH_API sort_dimname { + using schema = ::std::tuple (const at::Tensor &, at::Dimname, bool); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::sort"; + static constexpr const char* overload_name = "dimname"; + static constexpr const char* schema_str = "sort.dimname(Tensor self, Dimname dim, bool descending=False) -> (Tensor values, Tensor indices)"; + static ::std::tuple call(const at::Tensor & self, at::Dimname dim, bool descending); + static ::std::tuple redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::Dimname dim, bool descending); +}; + +struct TORCH_API sort_dimname_stable { + using schema = ::std::tuple (const at::Tensor &, ::std::optional, at::Dimname, bool); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::sort"; + static constexpr const char* overload_name = "dimname_stable"; + static constexpr const char* schema_str = "sort.dimname_stable(Tensor self, *, bool? 
stable, Dimname dim, bool descending=False) -> (Tensor values, Tensor indices)"; + static ::std::tuple call(const at::Tensor & self, ::std::optional stable, at::Dimname dim, bool descending); + static ::std::tuple redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, ::std::optional stable, at::Dimname dim, bool descending); +}; + +}} // namespace at::_ops + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/sparse_bsc_tensor_native.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/sparse_bsc_tensor_native.h new file mode 100644 index 0000000000000000000000000000000000000000..85b9d715b3330f41b9a267d8be83002834940750 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/sparse_bsc_tensor_native.h @@ -0,0 +1,27 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace at { +namespace native { +TORCH_API at::Tensor sparse_bsc_tensor(const at::Tensor & ccol_indices, const at::Tensor & row_indices, const at::Tensor & values, at::IntArrayRef size, ::std::optional dtype={}, ::std::optional layout={}, ::std::optional device={}, ::std::optional pin_memory={}); +TORCH_API at::Tensor sparse_bsc_tensor(const at::Tensor & ccol_indices, const at::Tensor & row_indices, const at::Tensor & values, ::std::optional dtype={}, ::std::optional layout={}, ::std::optional device={}, ::std::optional pin_memory={}); +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is 
defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/sparse_bsr_tensor_native.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/sparse_bsr_tensor_native.h new file mode 100644 index 0000000000000000000000000000000000000000..37913ff300d00cf8bd67588f2cddab5a7c6ed250 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/sparse_bsr_tensor_native.h @@ -0,0 +1,27 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace at { +namespace native { +TORCH_API at::Tensor sparse_bsr_tensor(const at::Tensor & crow_indices, const at::Tensor & col_indices, const at::Tensor & values, at::IntArrayRef size, ::std::optional dtype={}, ::std::optional layout={}, ::std::optional device={}, ::std::optional pin_memory={}); +TORCH_API at::Tensor sparse_bsr_tensor(const at::Tensor & crow_indices, const at::Tensor & col_indices, const at::Tensor & values, ::std::optional dtype={}, ::std::optional layout={}, ::std::optional device={}, ::std::optional pin_memory={}); +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/sparse_bsr_tensor_ops.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/sparse_bsr_tensor_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..e9c02910cd79a5376ebf536b06a7f7215e5b132d --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/sparse_bsr_tensor_ops.h @@ -0,0 +1,45 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { +namespace _ops { + + +struct TORCH_API sparse_bsr_tensor_crow_col_value_size { + using schema = at::Tensor (const at::Tensor &, const at::Tensor &, const at::Tensor &, at::IntArrayRef, ::std::optional, ::std::optional, ::std::optional, ::std::optional); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::sparse_bsr_tensor"; + static constexpr const char* overload_name = "crow_col_value_size"; + static constexpr const char* schema_str = "sparse_bsr_tensor.crow_col_value_size(Tensor crow_indices, Tensor col_indices, Tensor values, int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? 
pin_memory=False) -> Tensor"; + static at::Tensor call(const at::Tensor & crow_indices, const at::Tensor & col_indices, const at::Tensor & values, at::IntArrayRef size, ::std::optional dtype, ::std::optional layout, ::std::optional device, ::std::optional pin_memory); + static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & crow_indices, const at::Tensor & col_indices, const at::Tensor & values, at::IntArrayRef size, ::std::optional dtype, ::std::optional layout, ::std::optional device, ::std::optional pin_memory); +}; + +struct TORCH_API sparse_bsr_tensor_crow_col_value { + using schema = at::Tensor (const at::Tensor &, const at::Tensor &, const at::Tensor &, ::std::optional, ::std::optional, ::std::optional, ::std::optional); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::sparse_bsr_tensor"; + static constexpr const char* overload_name = "crow_col_value"; + static constexpr const char* schema_str = "sparse_bsr_tensor.crow_col_value(Tensor crow_indices, Tensor col_indices, Tensor values, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=False) -> Tensor"; + static at::Tensor call(const at::Tensor & crow_indices, const at::Tensor & col_indices, const at::Tensor & values, ::std::optional dtype, ::std::optional layout, ::std::optional device, ::std::optional pin_memory); + static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & crow_indices, const at::Tensor & col_indices, const at::Tensor & values, ::std::optional dtype, ::std::optional layout, ::std::optional device, ::std::optional pin_memory); +}; + +}} // namespace at::_ops + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/sparse_coo_tensor.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/sparse_coo_tensor.h new file mode 100644 index 0000000000000000000000000000000000000000..b00fab9f947d27b951279cd39c424e9cf1f452ba --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/sparse_coo_tensor.h @@ -0,0 +1,67 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::sparse_coo_tensor.size(int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=False) -> Tensor +inline at::Tensor sparse_coo_tensor(at::IntArrayRef size, at::TensorOptions options) { + return at::_ops::sparse_coo_tensor_size::call(size, c10::optTypeMetaToScalarType(options.dtype_opt()), options.layout_opt(), options.device_opt(), options.pinned_memory_opt()); +} +// aten::sparse_coo_tensor.size(int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=False) -> Tensor +inline at::Tensor sparse_coo_tensor(at::IntArrayRef size, ::std::optional dtype, ::std::optional layout, ::std::optional device, ::std::optional pin_memory) { + return at::_ops::sparse_coo_tensor_size::call(size, dtype, layout, device, pin_memory); +} + +// aten::sparse_coo_tensor.indices(Tensor indices, Tensor values, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, bool? 
is_coalesced=None) -> Tensor +inline at::Tensor sparse_coo_tensor(const at::Tensor & indices, const at::Tensor & values, at::TensorOptions options={}, ::std::optional is_coalesced=::std::nullopt) { + return at::_ops::sparse_coo_tensor_indices::call(indices, values, c10::optTypeMetaToScalarType(options.dtype_opt()), options.layout_opt(), options.device_opt(), options.pinned_memory_opt(), is_coalesced); +} +// aten::sparse_coo_tensor.indices(Tensor indices, Tensor values, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, bool? is_coalesced=None) -> Tensor +inline at::Tensor sparse_coo_tensor(const at::Tensor & indices, const at::Tensor & values, ::std::optional dtype, ::std::optional layout, ::std::optional device, ::std::optional pin_memory, ::std::optional is_coalesced) { + return at::_ops::sparse_coo_tensor_indices::call(indices, values, dtype, layout, device, pin_memory, is_coalesced); +} + +// aten::sparse_coo_tensor.indices_size(Tensor indices, Tensor values, int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, bool? is_coalesced=None) -> Tensor +inline at::Tensor sparse_coo_tensor(const at::Tensor & indices, const at::Tensor & values, at::IntArrayRef size, at::TensorOptions options={}, ::std::optional is_coalesced=::std::nullopt) { + return at::_ops::sparse_coo_tensor_indices_size::call(indices, values, size, c10::optTypeMetaToScalarType(options.dtype_opt()), options.layout_opt(), options.device_opt(), options.pinned_memory_opt(), is_coalesced); +} +// aten::sparse_coo_tensor.indices_size(Tensor indices, Tensor values, int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, bool? 
is_coalesced=None) -> Tensor +inline at::Tensor sparse_coo_tensor(const at::Tensor & indices, const at::Tensor & values, at::IntArrayRef size, ::std::optional dtype, ::std::optional layout, ::std::optional device, ::std::optional pin_memory, ::std::optional is_coalesced) { + return at::_ops::sparse_coo_tensor_indices_size::call(indices, values, size, dtype, layout, device, pin_memory, is_coalesced); +} + +// aten::sparse_coo_tensor.size_out(int[] size, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & sparse_coo_tensor_out(at::Tensor & out, at::IntArrayRef size) { + return at::_ops::sparse_coo_tensor_size_out::call(size, out); +} +// aten::sparse_coo_tensor.size_out(int[] size, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & sparse_coo_tensor_outf(at::IntArrayRef size, at::Tensor & out) { + return at::_ops::sparse_coo_tensor_size_out::call(size, out); +} + +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/sparse_csr_tensor.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/sparse_csr_tensor.h new file mode 100644 index 0000000000000000000000000000000000000000..227ce92d02cee7d825c6e174834f0adcc56c93ec --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/sparse_csr_tensor.h @@ -0,0 +1,49 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::sparse_csr_tensor.crow_col_value_size(Tensor crow_indices, Tensor col_indices, Tensor values, int[] size, *, ScalarType? dtype=None, Layout? 
layout=None, Device? device=None, bool? pin_memory=False) -> Tensor +inline at::Tensor sparse_csr_tensor(const at::Tensor & crow_indices, const at::Tensor & col_indices, const at::Tensor & values, at::IntArrayRef size, at::TensorOptions options) { + return at::_ops::sparse_csr_tensor_crow_col_value_size::call(crow_indices, col_indices, values, size, c10::optTypeMetaToScalarType(options.dtype_opt()), options.layout_opt(), options.device_opt(), options.pinned_memory_opt()); +} +// aten::sparse_csr_tensor.crow_col_value_size(Tensor crow_indices, Tensor col_indices, Tensor values, int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=False) -> Tensor +inline at::Tensor sparse_csr_tensor(const at::Tensor & crow_indices, const at::Tensor & col_indices, const at::Tensor & values, at::IntArrayRef size, ::std::optional dtype, ::std::optional layout, ::std::optional device, ::std::optional pin_memory) { + return at::_ops::sparse_csr_tensor_crow_col_value_size::call(crow_indices, col_indices, values, size, dtype, layout, device, pin_memory); +} + +// aten::sparse_csr_tensor.crow_col_value(Tensor crow_indices, Tensor col_indices, Tensor values, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=False) -> Tensor +inline at::Tensor sparse_csr_tensor(const at::Tensor & crow_indices, const at::Tensor & col_indices, const at::Tensor & values, at::TensorOptions options) { + return at::_ops::sparse_csr_tensor_crow_col_value::call(crow_indices, col_indices, values, c10::optTypeMetaToScalarType(options.dtype_opt()), options.layout_opt(), options.device_opt(), options.pinned_memory_opt()); +} +// aten::sparse_csr_tensor.crow_col_value(Tensor crow_indices, Tensor col_indices, Tensor values, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? 
pin_memory=False) -> Tensor +inline at::Tensor sparse_csr_tensor(const at::Tensor & crow_indices, const at::Tensor & col_indices, const at::Tensor & values, ::std::optional dtype, ::std::optional layout, ::std::optional device, ::std::optional pin_memory) { + return at::_ops::sparse_csr_tensor_crow_col_value::call(crow_indices, col_indices, values, dtype, layout, device, pin_memory); +} + +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/sparse_resize_compositeexplicitautograd_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/sparse_resize_compositeexplicitautograd_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..3e6767d9353ac856669de7431f3bc2ec992185ed --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/sparse_resize_compositeexplicitautograd_dispatch.h @@ -0,0 +1,30 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. 
+#include + +namespace at { + +namespace compositeexplicitautograd { + +TORCH_API at::Tensor sparse_resize(const at::Tensor & self, at::IntArrayRef size, int64_t sparse_dim, int64_t dense_dim); +TORCH_API const at::Tensor & sparse_resize_out(const at::Tensor & out, const at::Tensor & self, at::IntArrayRef size, int64_t sparse_dim, int64_t dense_dim); +TORCH_API const at::Tensor & sparse_resize_outf(const at::Tensor & self, at::IntArrayRef size, int64_t sparse_dim, int64_t dense_dim, const at::Tensor & out); + +} // namespace compositeexplicitautograd +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/special_bessel_j0_compositeexplicitautogradnonfunctional_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/special_bessel_j0_compositeexplicitautogradnonfunctional_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..0e7abf778e8160d51a2ac44b3af212a867330f63 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/special_bessel_j0_compositeexplicitautogradnonfunctional_dispatch.h @@ -0,0 +1,28 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. 
+#include + +namespace at { + +namespace compositeexplicitautogradnonfunctional { + +TORCH_API at::Tensor special_bessel_j0(const at::Tensor & self); + +} // namespace compositeexplicitautogradnonfunctional +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/special_bessel_j0_ops.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/special_bessel_j0_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..bd227e18fd8411a52977bc99c323e008043abe76 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/special_bessel_j0_ops.h @@ -0,0 +1,45 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. 
+#include + +namespace at { +namespace _ops { + + +struct TORCH_API special_bessel_j0 { + using schema = at::Tensor (const at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::special_bessel_j0"; + static constexpr const char* overload_name = ""; + static constexpr const char* schema_str = "special_bessel_j0(Tensor self) -> Tensor"; + static at::Tensor call(const at::Tensor & self); + static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self); +}; + +struct TORCH_API special_bessel_j0_out { + using schema = at::Tensor & (const at::Tensor &, at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::special_bessel_j0"; + static constexpr const char* overload_name = "out"; + static constexpr const char* schema_str = "special_bessel_j0.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)"; + static at::Tensor & call(const at::Tensor & self, at::Tensor & out); + static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::Tensor & out); +}; + +}} // namespace at::_ops + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/special_bessel_y0.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/special_bessel_y0.h new file mode 100644 index 0000000000000000000000000000000000000000..0909eb086515d185e7c012ea944cf5749e8b5bdb --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/special_bessel_y0.h @@ -0,0 +1,45 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::special_bessel_y0(Tensor self) -> Tensor +inline at::Tensor special_bessel_y0(const at::Tensor & self) { + return at::_ops::special_bessel_y0::call(self); +} + +// aten::special_bessel_y0.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & special_bessel_y0_out(at::Tensor & out, const at::Tensor & self) { + return at::_ops::special_bessel_y0_out::call(self, out); +} +// aten::special_bessel_y0.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & special_bessel_y0_outf(const at::Tensor & self, at::Tensor & out) { + return at::_ops::special_bessel_y0_out::call(self, out); +} + +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/special_bessel_y0_ops.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/special_bessel_y0_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..e142b2693cadd9efe0e2e982b0e6d881a60f8095 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/special_bessel_y0_ops.h @@ -0,0 +1,45 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { +namespace _ops { + + +struct TORCH_API special_bessel_y0 { + using schema = at::Tensor (const at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::special_bessel_y0"; + static constexpr const char* overload_name = ""; + static constexpr const char* schema_str = "special_bessel_y0(Tensor self) -> Tensor"; + static at::Tensor call(const at::Tensor & self); + static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self); +}; + +struct TORCH_API special_bessel_y0_out { + using schema = at::Tensor & (const at::Tensor &, at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::special_bessel_y0"; + static constexpr const char* overload_name = "out"; + static constexpr const char* schema_str = "special_bessel_y0.out(Tensor self, *, Tensor(a!) 
out) -> Tensor(a!)"; + static at::Tensor & call(const at::Tensor & self, at::Tensor & out); + static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::Tensor & out); +}; + +}} // namespace at::_ops + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/special_bessel_y1_cuda_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/special_bessel_y1_cuda_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..03cbf711476beb57ce2615b06c2a29495da1375f --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/special_bessel_y1_cuda_dispatch.h @@ -0,0 +1,30 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace cuda { + +TORCH_API at::Tensor special_bessel_y1(const at::Tensor & self); +TORCH_API at::Tensor & special_bessel_y1_out(at::Tensor & out, const at::Tensor & self); +TORCH_API at::Tensor & special_bessel_y1_outf(const at::Tensor & self, at::Tensor & out); + +} // namespace cuda +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/special_bessel_y1_native.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/special_bessel_y1_native.h new file mode 100644 index 0000000000000000000000000000000000000000..6ae1bd2ff006f91e5a11bc83191a8edb5e336bf6 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/special_bessel_y1_native.h @@ -0,0 +1,28 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace at { +namespace native { +struct TORCH_API structured_special_bessel_y1_out : public at::meta::structured_special_bessel_y1 { +void impl(const at::Tensor & self, const at::Tensor & out); +}; +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/special_chebyshev_polynomial_t_cpu_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/special_chebyshev_polynomial_t_cpu_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..d1141d3a8abedcf802d91b7217aca10fa5cc194c --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/special_chebyshev_polynomial_t_cpu_dispatch.h @@ -0,0 +1,30 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace cpu { + +TORCH_API at::Tensor special_chebyshev_polynomial_t(const at::Tensor & x, const at::Tensor & n); +TORCH_API at::Tensor & special_chebyshev_polynomial_t_out(at::Tensor & out, const at::Tensor & x, const at::Tensor & n); +TORCH_API at::Tensor & special_chebyshev_polynomial_t_outf(const at::Tensor & x, const at::Tensor & n, at::Tensor & out); + +} // namespace cpu +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/special_chebyshev_polynomial_u_compositeexplicitautograd_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/special_chebyshev_polynomial_u_compositeexplicitautograd_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..7306437121594c70e2fde4899234967af1e33d0d --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/special_chebyshev_polynomial_u_compositeexplicitautograd_dispatch.h @@ -0,0 +1,33 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. 
+#include + +namespace at { + +namespace compositeexplicitautograd { + +TORCH_API at::Tensor special_chebyshev_polynomial_u(const at::Scalar & x, const at::Tensor & n); +TORCH_API at::Tensor & special_chebyshev_polynomial_u_out(at::Tensor & out, const at::Scalar & x, const at::Tensor & n); +TORCH_API at::Tensor & special_chebyshev_polynomial_u_outf(const at::Scalar & x, const at::Tensor & n, at::Tensor & out); +TORCH_API at::Tensor special_chebyshev_polynomial_u(const at::Tensor & x, const at::Scalar & n); +TORCH_API at::Tensor & special_chebyshev_polynomial_u_out(at::Tensor & out, const at::Tensor & x, const at::Scalar & n); +TORCH_API at::Tensor & special_chebyshev_polynomial_u_outf(const at::Tensor & x, const at::Scalar & n, at::Tensor & out); + +} // namespace compositeexplicitautograd +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/special_chebyshev_polynomial_v_compositeexplicitautograd_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/special_chebyshev_polynomial_v_compositeexplicitautograd_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..07990221652a8dd7d04aeeaeccbb0b0d5fc686a8 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/special_chebyshev_polynomial_v_compositeexplicitautograd_dispatch.h @@ -0,0 +1,33 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in 
the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace compositeexplicitautograd { + +TORCH_API at::Tensor special_chebyshev_polynomial_v(const at::Scalar & x, const at::Tensor & n); +TORCH_API at::Tensor & special_chebyshev_polynomial_v_out(at::Tensor & out, const at::Scalar & x, const at::Tensor & n); +TORCH_API at::Tensor & special_chebyshev_polynomial_v_outf(const at::Scalar & x, const at::Tensor & n, at::Tensor & out); +TORCH_API at::Tensor special_chebyshev_polynomial_v(const at::Tensor & x, const at::Scalar & n); +TORCH_API at::Tensor & special_chebyshev_polynomial_v_out(at::Tensor & out, const at::Tensor & x, const at::Scalar & n); +TORCH_API at::Tensor & special_chebyshev_polynomial_v_outf(const at::Tensor & x, const at::Scalar & n, at::Tensor & out); + +} // namespace compositeexplicitautograd +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/special_chebyshev_polynomial_v_compositeexplicitautogradnonfunctional_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/special_chebyshev_polynomial_v_compositeexplicitautogradnonfunctional_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..03a41b18f3ecfb3282a2a20fff83479809caf253 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/special_chebyshev_polynomial_v_compositeexplicitautogradnonfunctional_dispatch.h @@ -0,0 +1,28 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace compositeexplicitautogradnonfunctional { + +TORCH_API at::Tensor special_chebyshev_polynomial_v(const at::Tensor & x, const at::Tensor & n); + +} // namespace compositeexplicitautogradnonfunctional +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/special_entr_cpu_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/special_entr_cpu_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..49107dc0f7e4bc4c03704f50baef352680f56a7f --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/special_entr_cpu_dispatch.h @@ -0,0 +1,30 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace cpu { + +TORCH_API at::Tensor special_entr(const at::Tensor & self); +TORCH_API at::Tensor & special_entr_out(at::Tensor & out, const at::Tensor & self); +TORCH_API at::Tensor & special_entr_outf(const at::Tensor & self, at::Tensor & out); + +} // namespace cpu +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/special_entr_meta_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/special_entr_meta_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..11fbb6d5b906d6e71551872129606f2de51f20d0 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/special_entr_meta_dispatch.h @@ -0,0 +1,30 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace meta { + +TORCH_API at::Tensor special_entr(const at::Tensor & self); +TORCH_API at::Tensor & special_entr_out(at::Tensor & out, const at::Tensor & self); +TORCH_API at::Tensor & special_entr_outf(const at::Tensor & self, at::Tensor & out); + +} // namespace meta +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/special_gammaincc_compositeimplicitautograd_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/special_gammaincc_compositeimplicitautograd_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..3c6c679e157750e614b39377672f2f0081a961c1 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/special_gammaincc_compositeimplicitautograd_dispatch.h @@ -0,0 +1,30 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace compositeimplicitautograd { + +TORCH_API at::Tensor special_gammaincc(const at::Tensor & self, const at::Tensor & other); +TORCH_API at::Tensor & special_gammaincc_out(at::Tensor & out, const at::Tensor & self, const at::Tensor & other); +TORCH_API at::Tensor & special_gammaincc_outf(const at::Tensor & self, const at::Tensor & other, at::Tensor & out); + +} // namespace compositeimplicitautograd +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/special_i1_meta.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/special_i1_meta.h new file mode 100644 index 0000000000000000000000000000000000000000..e2346be8ec2dd2e682db2742eeec97aa3fd14a7f --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/special_i1_meta.h @@ -0,0 +1,32 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeMetaFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace at { +namespace meta { + +struct TORCH_API structured_special_i1 : public TensorIteratorBase { + + + void meta(const at::Tensor & self); +}; + +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/special_laguerre_polynomial_l_native.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/special_laguerre_polynomial_l_native.h new file mode 100644 index 0000000000000000000000000000000000000000..4f9228f15d5d19c9757d662e2dc9951510db0253 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/special_laguerre_polynomial_l_native.h @@ -0,0 +1,32 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace at { +namespace native { +struct TORCH_API structured_special_laguerre_polynomial_l_out : public at::meta::structured_special_laguerre_polynomial_l { +void impl(const at::Tensor & x, const at::Tensor & n, const at::Tensor & out); +}; +TORCH_API at::Tensor special_laguerre_polynomial_l(const at::Scalar & x, const at::Tensor & n); +TORCH_API at::Tensor & special_laguerre_polynomial_l_out(const at::Scalar & x, const at::Tensor & n, at::Tensor & out); +TORCH_API at::Tensor special_laguerre_polynomial_l(const at::Tensor & x, const at::Scalar & n); +TORCH_API at::Tensor & special_laguerre_polynomial_l_out(const at::Tensor & x, const at::Scalar & n, at::Tensor & out); +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/special_laguerre_polynomial_l_ops.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/special_laguerre_polynomial_l_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..0f05c765a28b7089369ac5312cc6795269e2d15d --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/special_laguerre_polynomial_l_ops.h @@ -0,0 +1,89 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { +namespace _ops { + + +struct TORCH_API special_laguerre_polynomial_l { + using schema = at::Tensor (const at::Tensor &, const at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::special_laguerre_polynomial_l"; + static constexpr const char* overload_name = ""; + static constexpr const char* schema_str = "special_laguerre_polynomial_l(Tensor x, Tensor n) -> Tensor"; + static at::Tensor call(const at::Tensor & x, const at::Tensor & n); + static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & x, const at::Tensor & n); +}; + +struct TORCH_API special_laguerre_polynomial_l_x_scalar { + using schema = at::Tensor (const at::Scalar &, const at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::special_laguerre_polynomial_l"; + static 
constexpr const char* overload_name = "x_scalar"; + static constexpr const char* schema_str = "special_laguerre_polynomial_l.x_scalar(Scalar x, Tensor n) -> Tensor"; + static at::Tensor call(const at::Scalar & x, const at::Tensor & n); + static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Scalar & x, const at::Tensor & n); +}; + +struct TORCH_API special_laguerre_polynomial_l_n_scalar { + using schema = at::Tensor (const at::Tensor &, const at::Scalar &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::special_laguerre_polynomial_l"; + static constexpr const char* overload_name = "n_scalar"; + static constexpr const char* schema_str = "special_laguerre_polynomial_l.n_scalar(Tensor x, Scalar n) -> Tensor"; + static at::Tensor call(const at::Tensor & x, const at::Scalar & n); + static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & x, const at::Scalar & n); +}; + +struct TORCH_API special_laguerre_polynomial_l_out { + using schema = at::Tensor & (const at::Tensor &, const at::Tensor &, at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::special_laguerre_polynomial_l"; + static constexpr const char* overload_name = "out"; + static constexpr const char* schema_str = "special_laguerre_polynomial_l.out(Tensor x, Tensor n, *, Tensor(a!) 
out) -> Tensor(a!)"; + static at::Tensor & call(const at::Tensor & x, const at::Tensor & n, at::Tensor & out); + static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & x, const at::Tensor & n, at::Tensor & out); +}; + +struct TORCH_API special_laguerre_polynomial_l_x_scalar_out { + using schema = at::Tensor & (const at::Scalar &, const at::Tensor &, at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::special_laguerre_polynomial_l"; + static constexpr const char* overload_name = "x_scalar_out"; + static constexpr const char* schema_str = "special_laguerre_polynomial_l.x_scalar_out(Scalar x, Tensor n, *, Tensor(a!) out) -> Tensor(a!)"; + static at::Tensor & call(const at::Scalar & x, const at::Tensor & n, at::Tensor & out); + static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, const at::Scalar & x, const at::Tensor & n, at::Tensor & out); +}; + +struct TORCH_API special_laguerre_polynomial_l_n_scalar_out { + using schema = at::Tensor & (const at::Tensor &, const at::Scalar &, at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::special_laguerre_polynomial_l"; + static constexpr const char* overload_name = "n_scalar_out"; + static constexpr const char* schema_str = "special_laguerre_polynomial_l.n_scalar_out(Tensor x, Scalar n, *, Tensor(a!) out) -> Tensor(a!)"; + static at::Tensor & call(const at::Tensor & x, const at::Scalar & n, at::Tensor & out); + static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & x, const at::Scalar & n, at::Tensor & out); +}; + +}} // namespace at::_ops + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/special_legendre_polynomial_p_meta_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/special_legendre_polynomial_p_meta_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..7e46187f5d93dc54699a29e0fe94d99eea7397cc --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/special_legendre_polynomial_p_meta_dispatch.h @@ -0,0 +1,30 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace meta { + +TORCH_API at::Tensor special_legendre_polynomial_p(const at::Tensor & x, const at::Tensor & n); +TORCH_API at::Tensor & special_legendre_polynomial_p_out(at::Tensor & out, const at::Tensor & x, const at::Tensor & n); +TORCH_API at::Tensor & special_legendre_polynomial_p_outf(const at::Tensor & x, const at::Tensor & n, at::Tensor & out); + +} // namespace meta +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/special_log1p_compositeimplicitautograd_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/special_log1p_compositeimplicitautograd_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..85ddea8829377fbc6c9516e52b232b25eee2a65f --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/special_log1p_compositeimplicitautograd_dispatch.h @@ -0,0 +1,30 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace compositeimplicitautograd { + +TORCH_API at::Tensor special_log1p(const at::Tensor & self); +TORCH_API at::Tensor & special_log1p_out(at::Tensor & out, const at::Tensor & self); +TORCH_API at::Tensor & special_log1p_outf(const at::Tensor & self, at::Tensor & out); + +} // namespace compositeimplicitautograd +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/special_log_ndtr_native.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/special_log_ndtr_native.h new file mode 100644 index 0000000000000000000000000000000000000000..0e20bab4ca47441d1c7b1c0ad0fe5d8afe42d625 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/special_log_ndtr_native.h @@ -0,0 +1,28 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace at { +namespace native { +struct TORCH_API structured_special_log_ndtr_out : public at::meta::structured_special_log_ndtr { +void impl(const at::Tensor & self, const at::Tensor & out); +}; +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/special_logit_compositeimplicitautograd_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/special_logit_compositeimplicitautograd_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..7f25a0b3d76c4011d8dcf04f47dceb3918671785 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/special_logit_compositeimplicitautograd_dispatch.h @@ -0,0 +1,30 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace compositeimplicitautograd { + +TORCH_API at::Tensor special_logit(const at::Tensor & self, ::std::optional eps=::std::nullopt); +TORCH_API at::Tensor & special_logit_out(at::Tensor & out, const at::Tensor & self, ::std::optional eps=::std::nullopt); +TORCH_API at::Tensor & special_logit_outf(const at::Tensor & self, ::std::optional eps, at::Tensor & out); + +} // namespace compositeimplicitautograd +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/special_logsumexp_compositeimplicitautograd_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/special_logsumexp_compositeimplicitautograd_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..3396e7cadafb44949fe36b8230434d06566f0c0f --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/special_logsumexp_compositeimplicitautograd_dispatch.h @@ -0,0 +1,30 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace compositeimplicitautograd { + +TORCH_API at::Tensor special_logsumexp(const at::Tensor & self, at::IntArrayRef dim, bool keepdim=false); +TORCH_API at::Tensor & special_logsumexp_out(at::Tensor & out, const at::Tensor & self, at::IntArrayRef dim, bool keepdim=false); +TORCH_API at::Tensor & special_logsumexp_outf(const at::Tensor & self, at::IntArrayRef dim, bool keepdim, at::Tensor & out); + +} // namespace compositeimplicitautograd +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/special_modified_bessel_i1_meta_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/special_modified_bessel_i1_meta_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..d78ac151c431cbe2d950494f28b652c1a3a4ed71 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/special_modified_bessel_i1_meta_dispatch.h @@ -0,0 +1,30 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace meta { + +TORCH_API at::Tensor special_modified_bessel_i1(const at::Tensor & self); +TORCH_API at::Tensor & special_modified_bessel_i1_out(at::Tensor & out, const at::Tensor & self); +TORCH_API at::Tensor & special_modified_bessel_i1_outf(const at::Tensor & self, at::Tensor & out); + +} // namespace meta +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/special_modified_bessel_k1_meta.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/special_modified_bessel_k1_meta.h new file mode 100644 index 0000000000000000000000000000000000000000..3589a9525843b05f3f1ba7824314d22e376c8a74 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/special_modified_bessel_k1_meta.h @@ -0,0 +1,32 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeMetaFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace at { +namespace meta { + +struct TORCH_API structured_special_modified_bessel_k1 : public TensorIteratorBase { + + + void meta(const at::Tensor & self); +}; + +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/special_multigammaln_compositeimplicitautograd_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/special_multigammaln_compositeimplicitautograd_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..f5d8465b1e53f2ba2cc91358a52a774c76c57c69 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/special_multigammaln_compositeimplicitautograd_dispatch.h @@ -0,0 +1,30 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace compositeimplicitautograd { + +TORCH_API at::Tensor special_multigammaln(const at::Tensor & self, int64_t p); +TORCH_API at::Tensor & special_multigammaln_out(at::Tensor & out, const at::Tensor & self, int64_t p); +TORCH_API at::Tensor & special_multigammaln_outf(const at::Tensor & self, int64_t p, at::Tensor & out); + +} // namespace compositeimplicitautograd +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/special_polygamma.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/special_polygamma.h new file mode 100644 index 0000000000000000000000000000000000000000..a3fe18894736d9c27fdbff10e855da6fed286ce7 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/special_polygamma.h @@ -0,0 +1,45 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::special_polygamma(int n, Tensor self) -> Tensor +inline at::Tensor special_polygamma(int64_t n, const at::Tensor & self) { + return at::_ops::special_polygamma::call(n, self); +} + +// aten::special_polygamma.out(int n, Tensor self, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & special_polygamma_out(at::Tensor & out, int64_t n, const at::Tensor & self) { + return at::_ops::special_polygamma_out::call(n, self, out); +} +// aten::special_polygamma.out(int n, Tensor self, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & special_polygamma_outf(int64_t n, const at::Tensor & self, at::Tensor & out) { + return at::_ops::special_polygamma_out::call(n, self, out); +} + +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/special_polygamma_compositeimplicitautograd_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/special_polygamma_compositeimplicitautograd_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..553246cf44eb0d2f998d6b5d7e3ca97b03bf5bed --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/special_polygamma_compositeimplicitautograd_dispatch.h @@ -0,0 +1,30 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace compositeimplicitautograd { + +TORCH_API at::Tensor special_polygamma(int64_t n, const at::Tensor & self); +TORCH_API at::Tensor & special_polygamma_out(at::Tensor & out, int64_t n, const at::Tensor & self); +TORCH_API at::Tensor & special_polygamma_outf(int64_t n, const at::Tensor & self, at::Tensor & out); + +} // namespace compositeimplicitautograd +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/special_psi.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/special_psi.h new file mode 100644 index 0000000000000000000000000000000000000000..6816ac1db3f471f0e726848df575f70a3587110c --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/special_psi.h @@ -0,0 +1,45 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::special_psi(Tensor self) -> Tensor +inline at::Tensor special_psi(const at::Tensor & self) { + return at::_ops::special_psi::call(self); +} + +// aten::special_psi.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & special_psi_out(at::Tensor & out, const at::Tensor & self) { + return at::_ops::special_psi_out::call(self, out); +} +// aten::special_psi.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & special_psi_outf(const at::Tensor & self, at::Tensor & out) { + return at::_ops::special_psi_out::call(self, out); +} + +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/special_psi_ops.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/special_psi_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..05fa3669ca001feb84c0ef3341f7a866bb29e114 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/special_psi_ops.h @@ -0,0 +1,45 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { +namespace _ops { + + +struct TORCH_API special_psi { + using schema = at::Tensor (const at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::special_psi"; + static constexpr const char* overload_name = ""; + static constexpr const char* schema_str = "special_psi(Tensor self) -> Tensor"; + static at::Tensor call(const at::Tensor & self); + static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self); +}; + +struct TORCH_API special_psi_out { + using schema = at::Tensor & (const at::Tensor &, at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::special_psi"; + static constexpr const char* overload_name = "out"; + static constexpr const char* schema_str = "special_psi.out(Tensor self, *, Tensor(a!) 
out) -> Tensor(a!)"; + static at::Tensor & call(const at::Tensor & self, at::Tensor & out); + static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::Tensor & out); +}; + +}} // namespace at::_ops + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/special_scaled_modified_bessel_k1_meta_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/special_scaled_modified_bessel_k1_meta_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..cac399dc90e290186498a52c230bf8e0737ecaca --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/special_scaled_modified_bessel_k1_meta_dispatch.h @@ -0,0 +1,30 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. 
+#include + +namespace at { + +namespace meta { + +TORCH_API at::Tensor special_scaled_modified_bessel_k1(const at::Tensor & x); +TORCH_API at::Tensor & special_scaled_modified_bessel_k1_out(at::Tensor & out, const at::Tensor & x); +TORCH_API at::Tensor & special_scaled_modified_bessel_k1_outf(const at::Tensor & x, at::Tensor & out); + +} // namespace meta +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/special_shifted_chebyshev_polynomial_t.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/special_shifted_chebyshev_polynomial_t.h new file mode 100644 index 0000000000000000000000000000000000000000..df138b2ccf38040229fa65f580b371a16e97e4f2 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/special_shifted_chebyshev_polynomial_t.h @@ -0,0 +1,73 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::special_shifted_chebyshev_polynomial_t(Tensor x, Tensor n) -> Tensor +inline at::Tensor special_shifted_chebyshev_polynomial_t(const at::Tensor & x, const at::Tensor & n) { + return at::_ops::special_shifted_chebyshev_polynomial_t::call(x, n); +} + +// aten::special_shifted_chebyshev_polynomial_t.x_scalar(Scalar x, Tensor n) -> Tensor +inline at::Tensor special_shifted_chebyshev_polynomial_t(const at::Scalar & x, const at::Tensor & n) { + return at::_ops::special_shifted_chebyshev_polynomial_t_x_scalar::call(x, n); +} + +// 
aten::special_shifted_chebyshev_polynomial_t.n_scalar(Tensor x, Scalar n) -> Tensor +inline at::Tensor special_shifted_chebyshev_polynomial_t(const at::Tensor & x, const at::Scalar & n) { + return at::_ops::special_shifted_chebyshev_polynomial_t_n_scalar::call(x, n); +} + +// aten::special_shifted_chebyshev_polynomial_t.out(Tensor x, Tensor n, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & special_shifted_chebyshev_polynomial_t_out(at::Tensor & out, const at::Tensor & x, const at::Tensor & n) { + return at::_ops::special_shifted_chebyshev_polynomial_t_out::call(x, n, out); +} +// aten::special_shifted_chebyshev_polynomial_t.out(Tensor x, Tensor n, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & special_shifted_chebyshev_polynomial_t_outf(const at::Tensor & x, const at::Tensor & n, at::Tensor & out) { + return at::_ops::special_shifted_chebyshev_polynomial_t_out::call(x, n, out); +} + +// aten::special_shifted_chebyshev_polynomial_t.x_scalar_out(Scalar x, Tensor n, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & special_shifted_chebyshev_polynomial_t_out(at::Tensor & out, const at::Scalar & x, const at::Tensor & n) { + return at::_ops::special_shifted_chebyshev_polynomial_t_x_scalar_out::call(x, n, out); +} +// aten::special_shifted_chebyshev_polynomial_t.x_scalar_out(Scalar x, Tensor n, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & special_shifted_chebyshev_polynomial_t_outf(const at::Scalar & x, const at::Tensor & n, at::Tensor & out) { + return at::_ops::special_shifted_chebyshev_polynomial_t_x_scalar_out::call(x, n, out); +} + +// aten::special_shifted_chebyshev_polynomial_t.n_scalar_out(Tensor x, Scalar n, *, Tensor(a!) out) -> Tensor(a!) 
+inline at::Tensor & special_shifted_chebyshev_polynomial_t_out(at::Tensor & out, const at::Tensor & x, const at::Scalar & n) { + return at::_ops::special_shifted_chebyshev_polynomial_t_n_scalar_out::call(x, n, out); +} +// aten::special_shifted_chebyshev_polynomial_t.n_scalar_out(Tensor x, Scalar n, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & special_shifted_chebyshev_polynomial_t_outf(const at::Tensor & x, const at::Scalar & n, at::Tensor & out) { + return at::_ops::special_shifted_chebyshev_polynomial_t_n_scalar_out::call(x, n, out); +} + +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/special_shifted_chebyshev_polynomial_u_ops.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/special_shifted_chebyshev_polynomial_u_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..3074b5f1bdaacbfa7b75cac5c4c1aa812af6945e --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/special_shifted_chebyshev_polynomial_u_ops.h @@ -0,0 +1,89 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. 
+#include + +namespace at { +namespace _ops { + + +struct TORCH_API special_shifted_chebyshev_polynomial_u { + using schema = at::Tensor (const at::Tensor &, const at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::special_shifted_chebyshev_polynomial_u"; + static constexpr const char* overload_name = ""; + static constexpr const char* schema_str = "special_shifted_chebyshev_polynomial_u(Tensor x, Tensor n) -> Tensor"; + static at::Tensor call(const at::Tensor & x, const at::Tensor & n); + static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & x, const at::Tensor & n); +}; + +struct TORCH_API special_shifted_chebyshev_polynomial_u_x_scalar { + using schema = at::Tensor (const at::Scalar &, const at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::special_shifted_chebyshev_polynomial_u"; + static constexpr const char* overload_name = "x_scalar"; + static constexpr const char* schema_str = "special_shifted_chebyshev_polynomial_u.x_scalar(Scalar x, Tensor n) -> Tensor"; + static at::Tensor call(const at::Scalar & x, const at::Tensor & n); + static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Scalar & x, const at::Tensor & n); +}; + +struct TORCH_API special_shifted_chebyshev_polynomial_u_n_scalar { + using schema = at::Tensor (const at::Tensor &, const at::Scalar &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::special_shifted_chebyshev_polynomial_u"; + static constexpr const char* overload_name = "n_scalar"; + static constexpr const char* schema_str = "special_shifted_chebyshev_polynomial_u.n_scalar(Tensor x, Scalar n) -> Tensor"; + static at::Tensor call(const at::Tensor & x, const at::Scalar & n); + static at::Tensor 
redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & x, const at::Scalar & n); +}; + +struct TORCH_API special_shifted_chebyshev_polynomial_u_out { + using schema = at::Tensor & (const at::Tensor &, const at::Tensor &, at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::special_shifted_chebyshev_polynomial_u"; + static constexpr const char* overload_name = "out"; + static constexpr const char* schema_str = "special_shifted_chebyshev_polynomial_u.out(Tensor x, Tensor n, *, Tensor(a!) out) -> Tensor(a!)"; + static at::Tensor & call(const at::Tensor & x, const at::Tensor & n, at::Tensor & out); + static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & x, const at::Tensor & n, at::Tensor & out); +}; + +struct TORCH_API special_shifted_chebyshev_polynomial_u_x_scalar_out { + using schema = at::Tensor & (const at::Scalar &, const at::Tensor &, at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::special_shifted_chebyshev_polynomial_u"; + static constexpr const char* overload_name = "x_scalar_out"; + static constexpr const char* schema_str = "special_shifted_chebyshev_polynomial_u.x_scalar_out(Scalar x, Tensor n, *, Tensor(a!) 
out) -> Tensor(a!)"; + static at::Tensor & call(const at::Scalar & x, const at::Tensor & n, at::Tensor & out); + static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, const at::Scalar & x, const at::Tensor & n, at::Tensor & out); +}; + +struct TORCH_API special_shifted_chebyshev_polynomial_u_n_scalar_out { + using schema = at::Tensor & (const at::Tensor &, const at::Scalar &, at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::special_shifted_chebyshev_polynomial_u"; + static constexpr const char* overload_name = "n_scalar_out"; + static constexpr const char* schema_str = "special_shifted_chebyshev_polynomial_u.n_scalar_out(Tensor x, Scalar n, *, Tensor(a!) out) -> Tensor(a!)"; + static at::Tensor & call(const at::Tensor & x, const at::Scalar & n, at::Tensor & out); + static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & x, const at::Scalar & n, at::Tensor & out); +}; + +}} // namespace at::_ops + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/special_shifted_chebyshev_polynomial_v.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/special_shifted_chebyshev_polynomial_v.h new file mode 100644 index 0000000000000000000000000000000000000000..68f82a4a0416ad38d166a368378fe7a68e3415ca --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/special_shifted_chebyshev_polynomial_v.h @@ -0,0 +1,73 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::special_shifted_chebyshev_polynomial_v(Tensor x, Tensor n) -> Tensor +inline at::Tensor special_shifted_chebyshev_polynomial_v(const at::Tensor & x, const at::Tensor & n) { + return at::_ops::special_shifted_chebyshev_polynomial_v::call(x, n); +} + +// aten::special_shifted_chebyshev_polynomial_v.x_scalar(Scalar x, Tensor n) -> Tensor +inline at::Tensor special_shifted_chebyshev_polynomial_v(const at::Scalar & x, const at::Tensor & n) { + return at::_ops::special_shifted_chebyshev_polynomial_v_x_scalar::call(x, n); +} + +// aten::special_shifted_chebyshev_polynomial_v.n_scalar(Tensor x, Scalar n) -> Tensor +inline at::Tensor special_shifted_chebyshev_polynomial_v(const at::Tensor & x, const at::Scalar & n) { + return at::_ops::special_shifted_chebyshev_polynomial_v_n_scalar::call(x, n); +} + +// aten::special_shifted_chebyshev_polynomial_v.out(Tensor x, Tensor n, *, Tensor(a!) out) -> Tensor(a!) 
+inline at::Tensor & special_shifted_chebyshev_polynomial_v_out(at::Tensor & out, const at::Tensor & x, const at::Tensor & n) { + return at::_ops::special_shifted_chebyshev_polynomial_v_out::call(x, n, out); +} +// aten::special_shifted_chebyshev_polynomial_v.out(Tensor x, Tensor n, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & special_shifted_chebyshev_polynomial_v_outf(const at::Tensor & x, const at::Tensor & n, at::Tensor & out) { + return at::_ops::special_shifted_chebyshev_polynomial_v_out::call(x, n, out); +} + +// aten::special_shifted_chebyshev_polynomial_v.x_scalar_out(Scalar x, Tensor n, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & special_shifted_chebyshev_polynomial_v_out(at::Tensor & out, const at::Scalar & x, const at::Tensor & n) { + return at::_ops::special_shifted_chebyshev_polynomial_v_x_scalar_out::call(x, n, out); +} +// aten::special_shifted_chebyshev_polynomial_v.x_scalar_out(Scalar x, Tensor n, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & special_shifted_chebyshev_polynomial_v_outf(const at::Scalar & x, const at::Tensor & n, at::Tensor & out) { + return at::_ops::special_shifted_chebyshev_polynomial_v_x_scalar_out::call(x, n, out); +} + +// aten::special_shifted_chebyshev_polynomial_v.n_scalar_out(Tensor x, Scalar n, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & special_shifted_chebyshev_polynomial_v_out(at::Tensor & out, const at::Tensor & x, const at::Scalar & n) { + return at::_ops::special_shifted_chebyshev_polynomial_v_n_scalar_out::call(x, n, out); +} +// aten::special_shifted_chebyshev_polynomial_v.n_scalar_out(Tensor x, Scalar n, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & special_shifted_chebyshev_polynomial_v_outf(const at::Tensor & x, const at::Scalar & n, at::Tensor & out) { + return at::_ops::special_shifted_chebyshev_polynomial_v_n_scalar_out::call(x, n, out); +} + +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/special_shifted_chebyshev_polynomial_v_compositeexplicitautogradnonfunctional_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/special_shifted_chebyshev_polynomial_v_compositeexplicitautogradnonfunctional_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..b98194b018c969158c80675e8ed57a2e29659a3a --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/special_shifted_chebyshev_polynomial_v_compositeexplicitautogradnonfunctional_dispatch.h @@ -0,0 +1,28 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace compositeexplicitautogradnonfunctional { + +TORCH_API at::Tensor special_shifted_chebyshev_polynomial_v(const at::Tensor & x, const at::Tensor & n); + +} // namespace compositeexplicitautogradnonfunctional +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/special_xlog1py_compositeexplicitautograd_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/special_xlog1py_compositeexplicitautograd_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..73fec9a3ef2dd36d1b169a5ffa46107832b35fba --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/special_xlog1py_compositeexplicitautograd_dispatch.h @@ -0,0 +1,33 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. 
+#include + +namespace at { + +namespace compositeexplicitautograd { + +TORCH_API at::Tensor special_xlog1py(const at::Scalar & self, const at::Tensor & other); +TORCH_API at::Tensor & special_xlog1py_out(at::Tensor & out, const at::Scalar & self, const at::Tensor & other); +TORCH_API at::Tensor & special_xlog1py_outf(const at::Scalar & self, const at::Tensor & other, at::Tensor & out); +TORCH_API at::Tensor special_xlog1py(const at::Tensor & self, const at::Scalar & other); +TORCH_API at::Tensor & special_xlog1py_out(at::Tensor & out, const at::Tensor & self, const at::Scalar & other); +TORCH_API at::Tensor & special_xlog1py_outf(const at::Tensor & self, const at::Scalar & other, at::Tensor & out); + +} // namespace compositeexplicitautograd +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/special_xlog1py_cuda_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/special_xlog1py_cuda_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..85adf4c7eecf18f68748b2090b6b1f8da70fb200 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/special_xlog1py_cuda_dispatch.h @@ -0,0 +1,30 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. 
+// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace cuda { + +TORCH_API at::Tensor special_xlog1py(const at::Tensor & self, const at::Tensor & other); +TORCH_API at::Tensor & special_xlog1py_out(at::Tensor & out, const at::Tensor & self, const at::Tensor & other); +TORCH_API at::Tensor & special_xlog1py_outf(const at::Tensor & self, const at::Tensor & other, at::Tensor & out); + +} // namespace cuda +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/special_xlog1py_meta.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/special_xlog1py_meta.h new file mode 100644 index 0000000000000000000000000000000000000000..ee990de61e862c9297ccfb8979c9790c6c28deb3 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/special_xlog1py_meta.h @@ -0,0 +1,32 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeMetaFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace at { +namespace meta { + +struct TORCH_API structured_special_xlog1py : public TensorIteratorBase { + + + void meta(const at::Tensor & self, const at::Tensor & other); +}; + +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/special_xlog1py_meta_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/special_xlog1py_meta_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..5ed9a26eb4258fec5217e08b17dd15f5b2c679f8 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/special_xlog1py_meta_dispatch.h @@ -0,0 +1,30 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace meta { + +TORCH_API at::Tensor special_xlog1py(const at::Tensor & self, const at::Tensor & other); +TORCH_API at::Tensor & special_xlog1py_out(at::Tensor & out, const at::Tensor & self, const at::Tensor & other); +TORCH_API at::Tensor & special_xlog1py_outf(const at::Tensor & self, const at::Tensor & other, at::Tensor & out); + +} // namespace meta +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/special_xlog1py_ops.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/special_xlog1py_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..566e816177a3e938c4dccc17c092bf57cb52f199 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/special_xlog1py_ops.h @@ -0,0 +1,89 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { +namespace _ops { + + +struct TORCH_API special_xlog1py { + using schema = at::Tensor (const at::Tensor &, const at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::special_xlog1py"; + static constexpr const char* overload_name = ""; + static constexpr const char* schema_str = "special_xlog1py(Tensor self, Tensor other) -> Tensor"; + static at::Tensor call(const at::Tensor & self, const at::Tensor & other); + static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & other); +}; + +struct TORCH_API special_xlog1py_self_scalar { + using schema = at::Tensor (const at::Scalar &, const at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::special_xlog1py"; + static constexpr const char* overload_name = "self_scalar"; + static constexpr const char* 
schema_str = "special_xlog1py.self_scalar(Scalar self, Tensor other) -> Tensor"; + static at::Tensor call(const at::Scalar & self, const at::Tensor & other); + static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Scalar & self, const at::Tensor & other); +}; + +struct TORCH_API special_xlog1py_other_scalar { + using schema = at::Tensor (const at::Tensor &, const at::Scalar &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::special_xlog1py"; + static constexpr const char* overload_name = "other_scalar"; + static constexpr const char* schema_str = "special_xlog1py.other_scalar(Tensor self, Scalar other) -> Tensor"; + static at::Tensor call(const at::Tensor & self, const at::Scalar & other); + static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Scalar & other); +}; + +struct TORCH_API special_xlog1py_out { + using schema = at::Tensor & (const at::Tensor &, const at::Tensor &, at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::special_xlog1py"; + static constexpr const char* overload_name = "out"; + static constexpr const char* schema_str = "special_xlog1py.out(Tensor self, Tensor other, *, Tensor(a!) 
out) -> Tensor(a!)"; + static at::Tensor & call(const at::Tensor & self, const at::Tensor & other, at::Tensor & out); + static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & other, at::Tensor & out); +}; + +struct TORCH_API special_xlog1py_self_scalar_out { + using schema = at::Tensor & (const at::Scalar &, const at::Tensor &, at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::special_xlog1py"; + static constexpr const char* overload_name = "self_scalar_out"; + static constexpr const char* schema_str = "special_xlog1py.self_scalar_out(Scalar self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)"; + static at::Tensor & call(const at::Scalar & self, const at::Tensor & other, at::Tensor & out); + static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, const at::Scalar & self, const at::Tensor & other, at::Tensor & out); +}; + +struct TORCH_API special_xlog1py_other_scalar_out { + using schema = at::Tensor & (const at::Tensor &, const at::Scalar &, at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::special_xlog1py"; + static constexpr const char* overload_name = "other_scalar_out"; + static constexpr const char* schema_str = "special_xlog1py.other_scalar_out(Tensor self, Scalar other, *, Tensor(a!) out) -> Tensor(a!)"; + static at::Tensor & call(const at::Tensor & self, const at::Scalar & other, at::Tensor & out); + static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Scalar & other, at::Tensor & out); +}; + +}} // namespace at::_ops + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/special_zeta_meta.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/special_zeta_meta.h new file mode 100644 index 0000000000000000000000000000000000000000..637255292131f1c728e495a3bbbee6250562c1bd --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/special_zeta_meta.h @@ -0,0 +1,32 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeMetaFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace at { +namespace meta { + +struct TORCH_API structured_special_zeta : public TensorIteratorBase { + + + void meta(const at::Tensor & self, const at::Tensor & other); +}; + +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/split.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/split.h new file mode 100644 index 0000000000000000000000000000000000000000..35a3e1545056ec3f35eee4a8e0be1c3877bacf71 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/split.h @@ -0,0 +1,75 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::split.Tensor(Tensor(a -> *) self, SymInt split_size, int dim=0) -> Tensor(a)[] +inline ::std::vector split(const at::Tensor & self, int64_t split_size, int64_t dim=0) { + return at::_ops::split_Tensor::call(self, split_size, dim); +} +namespace symint { + template >> + ::std::vector split(const at::Tensor & self, int64_t split_size, int64_t dim=0) { + return at::_ops::split_Tensor::call(self, split_size, dim); + } +} + +// aten::split.Tensor(Tensor(a -> *) self, SymInt split_size, int dim=0) -> Tensor(a)[] +inline ::std::vector split_symint(const at::Tensor & self, c10::SymInt split_size, int64_t dim=0) { + return at::_ops::split_Tensor::call(self, split_size, dim); +} +namespace symint { + template >> + ::std::vector split(const at::Tensor & self, c10::SymInt split_size, int64_t dim=0) { + return at::_ops::split_Tensor::call(self, split_size, dim); + } +} + +// aten::split.sizes(Tensor(a -> *) self, SymInt[] split_size, int dim=0) -> Tensor(a)[] +inline ::std::vector split(const at::Tensor & self, at::IntArrayRef split_size, int64_t dim=0) { + return at::_ops::split_sizes::call(self, c10::fromIntArrayRefSlow(split_size), dim); +} +namespace symint { + template 
>> + ::std::vector split(const at::Tensor & self, at::IntArrayRef split_size, int64_t dim=0) { + return at::_ops::split_sizes::call(self, c10::fromIntArrayRefSlow(split_size), dim); + } +} + +// aten::split.sizes(Tensor(a -> *) self, SymInt[] split_size, int dim=0) -> Tensor(a)[] +inline ::std::vector split_symint(const at::Tensor & self, c10::SymIntArrayRef split_size, int64_t dim=0) { + return at::_ops::split_sizes::call(self, split_size, dim); +} +namespace symint { + template >> + ::std::vector split(const at::Tensor & self, c10::SymIntArrayRef split_size, int64_t dim=0) { + return at::_ops::split_sizes::call(self, split_size, dim); + } +} + +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/split_copy_ops.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/split_copy_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..55c053c44dd3248d6e6988cc10f44d602c1bfa0e --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/split_copy_ops.h @@ -0,0 +1,45 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. 
+#include + +namespace at { +namespace _ops { + + +struct TORCH_API split_copy_Tensor { + using schema = ::std::vector (const at::Tensor &, c10::SymInt, int64_t); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::split_copy"; + static constexpr const char* overload_name = "Tensor"; + static constexpr const char* schema_str = "split_copy.Tensor(Tensor self, SymInt split_size, int dim=0) -> Tensor[]"; + static ::std::vector call(const at::Tensor & self, c10::SymInt split_size, int64_t dim); + static ::std::vector redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, c10::SymInt split_size, int64_t dim); +}; + +struct TORCH_API split_copy_Tensor_out { + using schema = void (const at::Tensor &, c10::SymInt, int64_t, at::TensorList); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::split_copy"; + static constexpr const char* overload_name = "Tensor_out"; + static constexpr const char* schema_str = "split_copy.Tensor_out(Tensor self, SymInt split_size, int dim=0, *, Tensor(a!)[] out) -> ()"; + static void call(const at::Tensor & self, c10::SymInt split_size, int64_t dim, at::TensorList out); + static void redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, c10::SymInt split_size, int64_t dim, at::TensorList out); +}; + +}} // namespace at::_ops + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/sqrt_cpu_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/sqrt_cpu_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..733748593ffdf019a150d244b7b09ab1f2ed813d --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/sqrt_cpu_dispatch.h @@ -0,0 +1,31 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace cpu { + +TORCH_API at::Tensor sqrt(const at::Tensor & self); +TORCH_API at::Tensor & sqrt_out(at::Tensor & out, const at::Tensor & self); +TORCH_API at::Tensor & sqrt_outf(const at::Tensor & self, at::Tensor & out); +TORCH_API at::Tensor & sqrt_(at::Tensor & self); + +} // namespace cpu +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/stack_compositeexplicitautograd_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/stack_compositeexplicitautograd_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..81c70f6e671966607fc1d1a3538458b2cf76c856 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/stack_compositeexplicitautograd_dispatch.h @@ -0,0 +1,30 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace compositeexplicitautograd { + +TORCH_API at::Tensor stack(at::TensorList tensors, int64_t dim=0); +TORCH_API at::Tensor & stack_out(at::Tensor & out, at::TensorList tensors, int64_t dim=0); +TORCH_API at::Tensor & stack_outf(at::TensorList tensors, int64_t dim, at::Tensor & out); + +} // namespace compositeexplicitautograd +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/std_mean_cpu_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/std_mean_cpu_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..23d0e36e0dbb1a4b388ccff3db1e8317d60d1027 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/std_mean_cpu_dispatch.h @@ -0,0 +1,28 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace cpu { + +TORCH_API ::std::tuple std_mean(const at::Tensor & self, at::OptionalIntArrayRef dim=::std::nullopt, const ::std::optional & correction=::std::nullopt, bool keepdim=false); + +} // namespace cpu +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/stride_ops.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/stride_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..09f6647b2061ca50b0b45bf18dc08f7b009d00c0 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/stride_ops.h @@ -0,0 +1,45 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { +namespace _ops { + + +struct TORCH_API stride_int { + using schema = int64_t (const at::Tensor &, int64_t); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::stride"; + static constexpr const char* overload_name = "int"; + static constexpr const char* schema_str = "stride.int(Tensor self, int dim) -> int"; + static int64_t call(const at::Tensor & self, int64_t dim); + static int64_t redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, int64_t dim); +}; + +struct TORCH_API stride_Dimname { + using schema = int64_t (const at::Tensor &, at::Dimname); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::stride"; + static constexpr const char* overload_name = "Dimname"; + static constexpr const char* schema_str = "stride.Dimname(Tensor self, Dimname dim) -> int"; + static int64_t call(const at::Tensor & self, at::Dimname dim); + static 
int64_t redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::Dimname dim); +}; + +}} // namespace at::_ops + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/subtract_compositeimplicitautograd_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/subtract_compositeimplicitautograd_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..53cdd0cfd4690bd431c7d27be698dd61a1589fb5 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/subtract_compositeimplicitautograd_dispatch.h @@ -0,0 +1,33 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. 
+#include + +namespace at { + +namespace compositeimplicitautograd { + +TORCH_API at::Tensor subtract(const at::Tensor & self, const at::Tensor & other, const at::Scalar & alpha=1); +TORCH_API at::Tensor & subtract_out(at::Tensor & out, const at::Tensor & self, const at::Tensor & other, const at::Scalar & alpha=1); +TORCH_API at::Tensor & subtract_outf(const at::Tensor & self, const at::Tensor & other, const at::Scalar & alpha, at::Tensor & out); +TORCH_API at::Tensor & subtract_(at::Tensor & self, const at::Tensor & other, const at::Scalar & alpha=1); +TORCH_API at::Tensor subtract(const at::Tensor & self, const at::Scalar & other, const at::Scalar & alpha=1); +TORCH_API at::Tensor & subtract_(at::Tensor & self, const at::Scalar & other, const at::Scalar & alpha=1); + +} // namespace compositeimplicitautograd +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/subtract_ops.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/subtract_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..ef3728b2d358e8865a1787372f7d23d26d91b27d --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/subtract_ops.h @@ -0,0 +1,78 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. 
+#include + +namespace at { +namespace _ops { + + +struct TORCH_API subtract_out { + using schema = at::Tensor & (const at::Tensor &, const at::Tensor &, const at::Scalar &, at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::subtract"; + static constexpr const char* overload_name = "out"; + static constexpr const char* schema_str = "subtract.out(Tensor self, Tensor other, *, Scalar alpha=1, Tensor(a!) out) -> Tensor(a!)"; + static at::Tensor & call(const at::Tensor & self, const at::Tensor & other, const at::Scalar & alpha, at::Tensor & out); + static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & other, const at::Scalar & alpha, at::Tensor & out); +}; + +struct TORCH_API subtract_Tensor { + using schema = at::Tensor (const at::Tensor &, const at::Tensor &, const at::Scalar &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::subtract"; + static constexpr const char* overload_name = "Tensor"; + static constexpr const char* schema_str = "subtract.Tensor(Tensor self, Tensor other, *, Scalar alpha=1) -> Tensor"; + static at::Tensor call(const at::Tensor & self, const at::Tensor & other, const at::Scalar & alpha); + static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & other, const at::Scalar & alpha); +}; + +struct TORCH_API subtract__Tensor { + using schema = at::Tensor & (at::Tensor &, const at::Tensor &, const at::Scalar &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::subtract_"; + static constexpr const char* overload_name = "Tensor"; + static constexpr const char* schema_str = "subtract_.Tensor(Tensor(a!) 
self, Tensor other, *, Scalar alpha=1) -> Tensor(a!)"; + static at::Tensor & call(at::Tensor & self, const at::Tensor & other, const at::Scalar & alpha); + static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, at::Tensor & self, const at::Tensor & other, const at::Scalar & alpha); +}; + +struct TORCH_API subtract_Scalar { + using schema = at::Tensor (const at::Tensor &, const at::Scalar &, const at::Scalar &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::subtract"; + static constexpr const char* overload_name = "Scalar"; + static constexpr const char* schema_str = "subtract.Scalar(Tensor self, Scalar other, Scalar alpha=1) -> Tensor"; + static at::Tensor call(const at::Tensor & self, const at::Scalar & other, const at::Scalar & alpha); + static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Scalar & other, const at::Scalar & alpha); +}; + +struct TORCH_API subtract__Scalar { + using schema = at::Tensor & (at::Tensor &, const at::Scalar &, const at::Scalar &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::subtract_"; + static constexpr const char* overload_name = "Scalar"; + static constexpr const char* schema_str = "subtract_.Scalar(Tensor(a!) self, Scalar other, Scalar alpha=1) -> Tensor(a!)"; + static at::Tensor & call(at::Tensor & self, const at::Scalar & other, const at::Scalar & alpha); + static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, at::Tensor & self, const at::Scalar & other, const at::Scalar & alpha); +}; + +}} // namespace at::_ops + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/sum_compositeimplicitautograd_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/sum_compositeimplicitautograd_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..cf130d7232d1ebd0b43f9f9a00ad0d87b1ecc52e --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/sum_compositeimplicitautograd_dispatch.h @@ -0,0 +1,30 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace compositeimplicitautograd { + +TORCH_API at::Tensor sum(const at::Tensor & self, at::DimnameList dim, bool keepdim=false, ::std::optional dtype=::std::nullopt); +TORCH_API at::Tensor & sum_out(at::Tensor & out, const at::Tensor & self, at::DimnameList dim, bool keepdim=false, ::std::optional dtype=::std::nullopt); +TORCH_API at::Tensor & sum_outf(const at::Tensor & self, at::DimnameList dim, bool keepdim, ::std::optional dtype, at::Tensor & out); + +} // namespace compositeimplicitautograd +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/swapaxes_compositeimplicitautograd_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/swapaxes_compositeimplicitautograd_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..eddc58e24bd97529ef1d792c6856e30f1444514e --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/swapaxes_compositeimplicitautograd_dispatch.h @@ -0,0 +1,29 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace compositeimplicitautograd { + +TORCH_API at::Tensor swapaxes(const at::Tensor & self, int64_t axis0, int64_t axis1); +TORCH_API at::Tensor & swapaxes_(at::Tensor & self, int64_t axis0, int64_t axis1); + +} // namespace compositeimplicitautograd +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/sym_constrain_range_native.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/sym_constrain_range_native.h new file mode 100644 index 0000000000000000000000000000000000000000..d1faa0034f3f4f32a9824dd16af45963727e5ac0 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/sym_constrain_range_native.h @@ -0,0 +1,26 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace at { +namespace native { +TORCH_API void sym_constrain_range(const at::Scalar & size, ::std::optional min=::std::nullopt, ::std::optional max=::std::nullopt); +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/sym_is_contiguous.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/sym_is_contiguous.h new file mode 100644 index 0000000000000000000000000000000000000000..d486f30f91ce497deda98ae5d2894d8c45bbc8fe --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/sym_is_contiguous.h @@ -0,0 +1,36 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::sym_is_contiguous(Tensor self, MemoryFormat memory_format=contiguous_format) -> SymBool +inline c10::SymBool __dispatch_sym_is_contiguous(const at::Tensor & self, at::MemoryFormat memory_format=c10::MemoryFormat::Contiguous) { + return at::_ops::sym_is_contiguous::call(self, memory_format); +} + +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/t_copy_native.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/t_copy_native.h new file mode 100644 index 0000000000000000000000000000000000000000..ff1e05e2ee15c52fef9487389663b301c6579d37 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/t_copy_native.h @@ -0,0 +1,27 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace at { +namespace native { +TORCH_API at::Tensor & t_copy_out(const at::Tensor & self, at::Tensor & out); +TORCH_API at::Tensor t_copy(const at::Tensor & self); +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/take_along_dim_native.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/take_along_dim_native.h new file mode 100644 index 0000000000000000000000000000000000000000..de2eef0f5e82e32e73d62a6aa516afeaa0cbe667 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/take_along_dim_native.h @@ -0,0 +1,27 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace at { +namespace native { +TORCH_API at::Tensor take_along_dim(const at::Tensor & self, const at::Tensor & indices, ::std::optional dim=::std::nullopt); +TORCH_API at::Tensor & take_along_dim_out(const at::Tensor & self, const at::Tensor & indices, ::std::optional dim, at::Tensor & out); +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/take_cpu_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/take_cpu_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..d9826e392d1d2bf15f0f5961d0f80704a6d54768 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/take_cpu_dispatch.h @@ -0,0 +1,30 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace cpu { + +TORCH_API at::Tensor take(const at::Tensor & self, const at::Tensor & index); +TORCH_API at::Tensor & take_out(at::Tensor & out, const at::Tensor & self, const at::Tensor & index); +TORCH_API at::Tensor & take_outf(const at::Tensor & self, const at::Tensor & index, at::Tensor & out); + +} // namespace cpu +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/take_ops.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/take_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..5cb77898a4f0272b7abba27c0ae4fab9431e5f26 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/take_ops.h @@ -0,0 +1,45 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { +namespace _ops { + + +struct TORCH_API take_out { + using schema = at::Tensor & (const at::Tensor &, const at::Tensor &, at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::take"; + static constexpr const char* overload_name = "out"; + static constexpr const char* schema_str = "take.out(Tensor self, Tensor index, *, Tensor(a!) 
out) -> Tensor(a!)"; + static at::Tensor & call(const at::Tensor & self, const at::Tensor & index, at::Tensor & out); + static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & index, at::Tensor & out); +}; + +struct TORCH_API take { + using schema = at::Tensor (const at::Tensor &, const at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::take"; + static constexpr const char* overload_name = ""; + static constexpr const char* schema_str = "take(Tensor self, Tensor index) -> Tensor"; + static at::Tensor call(const at::Tensor & self, const at::Tensor & index); + static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & index); +}; + +}} // namespace at::_ops + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/tan_native.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/tan_native.h new file mode 100644 index 0000000000000000000000000000000000000000..18ca1873c08ae2f596b60ab1d2c6ad363def4016 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/tan_native.h @@ -0,0 +1,34 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace at { +namespace native { +struct TORCH_API structured_tan_out : public at::meta::structured_tan { +void impl(const at::Tensor & self, const at::Tensor & out); +}; +TORCH_API at::Tensor tan_sparse(const at::Tensor & self); 
+TORCH_API at::Tensor & tan_sparse_out(const at::Tensor & self, at::Tensor & out); +TORCH_API at::Tensor & tan_sparse_(at::Tensor & self); +TORCH_API at::Tensor tan_sparse_csr(const at::Tensor & self); +TORCH_API at::Tensor & tan_sparse_csr_out(const at::Tensor & self, at::Tensor & out); +TORCH_API at::Tensor & tan_sparse_csr_(at::Tensor & self); +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/tanh_backward_cpu_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/tanh_backward_cpu_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..7c225ae07c03c3942b8856ddc02567924f1c3238 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/tanh_backward_cpu_dispatch.h @@ -0,0 +1,30 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. 
+#include + +namespace at { + +namespace cpu { + +TORCH_API at::Tensor tanh_backward(const at::Tensor & grad_output, const at::Tensor & output); +TORCH_API at::Tensor & tanh_backward_out(at::Tensor & grad_input, const at::Tensor & grad_output, const at::Tensor & output); +TORCH_API at::Tensor & tanh_backward_outf(const at::Tensor & grad_output, const at::Tensor & output, at::Tensor & grad_input); + +} // namespace cpu +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/tanh_backward_meta.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/tanh_backward_meta.h new file mode 100644 index 0000000000000000000000000000000000000000..8113d3b72a96681decd6e61ad5681c6739629c5e --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/tanh_backward_meta.h @@ -0,0 +1,32 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeMetaFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace at { +namespace meta { + +struct TORCH_API structured_tanh_backward : public TensorIteratorBase { + + + void meta(const at::Tensor & grad_output, const at::Tensor & output); +}; + +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/threshold_backward_meta.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/threshold_backward_meta.h new file mode 100644 index 0000000000000000000000000000000000000000..8850f544a7aa467758f690a523bdf1d6e8df21b3 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/threshold_backward_meta.h @@ -0,0 +1,32 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeMetaFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace at { +namespace meta { + +struct TORCH_API structured_threshold_backward : public TensorIteratorBase { + + + void meta(const at::Tensor & grad_output, const at::Tensor & self, const at::Scalar & threshold); +}; + +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/threshold_backward_ops.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/threshold_backward_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..51258c7b71c2c26dbd210f41bc2b41c846a2c529 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/threshold_backward_ops.h @@ -0,0 +1,45 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. 
+// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { +namespace _ops { + + +struct TORCH_API threshold_backward_grad_input { + using schema = at::Tensor & (const at::Tensor &, const at::Tensor &, const at::Scalar &, at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::threshold_backward"; + static constexpr const char* overload_name = "grad_input"; + static constexpr const char* schema_str = "threshold_backward.grad_input(Tensor grad_output, Tensor self, Scalar threshold, *, Tensor(a!) grad_input) -> Tensor(a!)"; + static at::Tensor & call(const at::Tensor & grad_output, const at::Tensor & self, const at::Scalar & threshold, at::Tensor & grad_input); + static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad_output, const at::Tensor & self, const at::Scalar & threshold, at::Tensor & grad_input); +}; + +struct TORCH_API threshold_backward { + using schema = at::Tensor (const at::Tensor &, const at::Tensor &, const at::Scalar &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::threshold_backward"; + static constexpr const char* overload_name = ""; + static constexpr const char* schema_str = "threshold_backward(Tensor grad_output, Tensor self, Scalar threshold) -> Tensor"; + static at::Tensor call(const at::Tensor & grad_output, const at::Tensor & self, const at::Scalar & threshold); + static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad_output, const at::Tensor & self, const at::Scalar & threshold); +}; + +}} // namespace at::_ops + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/threshold_compositeexplicitautogradnonfunctional_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/threshold_compositeexplicitautogradnonfunctional_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..399a3df569824909413b5f958170fd7961131b05 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/threshold_compositeexplicitautogradnonfunctional_dispatch.h @@ -0,0 +1,29 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace compositeexplicitautogradnonfunctional { + +TORCH_API at::Tensor threshold(const at::Tensor & self, const at::Scalar & threshold, const at::Scalar & value); +TORCH_API at::Tensor & threshold_(at::Tensor & self, const at::Scalar & threshold, const at::Scalar & value); + +} // namespace compositeexplicitautogradnonfunctional +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/threshold_native.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/threshold_native.h new file mode 100644 index 0000000000000000000000000000000000000000..f1eccd3f56176414225e9f112d1dfb14f96ab9f2 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/threshold_native.h @@ -0,0 +1,29 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace at { +namespace native { +struct TORCH_API structured_threshold_out : public at::meta::structured_threshold { +void impl(const at::Tensor & self, const at::Scalar & threshold, const at::Scalar & value, const at::Tensor & out); +}; +TORCH_API at::Tensor threshold_quantized_cpu(const at::Tensor & self, const at::Scalar & threshold, const at::Scalar & value); +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/to_dense_ops.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/to_dense_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..7cea797e5bc47cf60201c984cb8dc19c8db42e2b --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/to_dense_ops.h @@ -0,0 +1,34 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { +namespace _ops { + + +struct TORCH_API to_dense { + using schema = at::Tensor (const at::Tensor &, ::std::optional, ::std::optional); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::to_dense"; + static constexpr const char* overload_name = ""; + static constexpr const char* schema_str = "to_dense(Tensor self, ScalarType? dtype=None, *, bool? masked_grad=None) -> Tensor"; + static at::Tensor call(const at::Tensor & self, ::std::optional dtype, ::std::optional masked_grad); + static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, ::std::optional dtype, ::std::optional masked_grad); +}; + +}} // namespace at::_ops + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/to_mkldnn_backward_compositeimplicitautograd_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/to_mkldnn_backward_compositeimplicitautograd_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..fb8881aa3474cb6a2bdb9f6333a3a299713c0791 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/to_mkldnn_backward_compositeimplicitautograd_dispatch.h @@ -0,0 +1,28 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace compositeimplicitautograd { /* Declaration only; per the NB above, the implementation is in RegisterDispatchKey.cpp. */ + +TORCH_API at::Tensor to_mkldnn_backward(const at::Tensor & grad, const at::Tensor & input); + +} // namespace compositeimplicitautograd +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/to_mkldnn_native.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/to_mkldnn_native.h new file mode 100644 index 0000000000000000000000000000000000000000..ecf8dee17fba884039aee1ccd0fe921584b12969 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/to_mkldnn_native.h @@ -0,0 +1,27 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace at { +namespace native { +TORCH_API at::Tensor & to_mkldnn_out(const at::Tensor & self, ::std::optional dtype, at::Tensor & out); +TORCH_API at::Tensor dense_to_mkldnn(const at::Tensor & self, ::std::optional dtype=::std::nullopt); +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/to_mkldnn_ops.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/to_mkldnn_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..4fae366ca72e21162bf7caa4d4633256e7473466 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/to_mkldnn_ops.h @@ -0,0 +1,45 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. 
+// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { +namespace _ops { + + +struct TORCH_API to_mkldnn { + using schema = at::Tensor (const at::Tensor &, ::std::optional); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::to_mkldnn"; + static constexpr const char* overload_name = ""; + static constexpr const char* schema_str = "to_mkldnn(Tensor self, ScalarType? dtype=None) -> Tensor"; + static at::Tensor call(const at::Tensor & self, ::std::optional dtype); + static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, ::std::optional dtype); +}; + +struct TORCH_API to_mkldnn_out { + using schema = at::Tensor & (const at::Tensor &, ::std::optional, at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::to_mkldnn"; + static constexpr const char* overload_name = "out"; + static constexpr const char* schema_str = "to_mkldnn.out(Tensor self, ScalarType? dtype=None, *, Tensor(a!) out) -> Tensor(a!)"; + static at::Tensor & call(const at::Tensor & self, ::std::optional dtype, at::Tensor & out); + static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, ::std::optional dtype, at::Tensor & out); +}; + +}} // namespace at::_ops + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/to_sparse_bsr_ops.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/to_sparse_bsr_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..1e6effd9048bcdada1f677d0e40733fce74db9e3 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/to_sparse_bsr_ops.h @@ -0,0 +1,34 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { +namespace _ops { + + +struct TORCH_API to_sparse_bsr { + using schema = at::Tensor (const at::Tensor &, at::IntArrayRef, ::std::optional); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::to_sparse_bsr"; + static constexpr const char* overload_name = ""; + static constexpr const char* schema_str = "to_sparse_bsr(Tensor self, int[2] blocksize, int? dense_dim=None) -> Tensor"; + static at::Tensor call(const at::Tensor & self, at::IntArrayRef blocksize, ::std::optional dense_dim); + static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::IntArrayRef blocksize, ::std::optional dense_dim); +}; + +}} // namespace at::_ops + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/topk_meta.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/topk_meta.h new file mode 100644 index 0000000000000000000000000000000000000000..af2d0b1a114cc94ea8644cad5d58b3325f302cdc --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/topk_meta.h @@ -0,0 +1,32 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeMetaFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace at { +namespace meta { + +struct TORCH_API structured_topk : public at::impl::MetaBase { /* Derives from at::impl::MetaBase and declares meta() with topk's arguments; no definition is visible in this header. */ + + + void meta(const at::Tensor & self, int64_t k, int64_t dim, bool largest, bool sorted); +}; + +} // namespace meta +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/transpose_compositeexplicitautograd_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/transpose_compositeexplicitautograd_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..b5b424a12bdfbd2d3c9f1ad52dca50aa0f25fe4e --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/transpose_compositeexplicitautograd_dispatch.h @@ -0,0 +1,29 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace compositeexplicitautograd { /* Declarations only (implementation in RegisterDispatchKey.cpp per the NB above): transpose returns by value from a const self; transpose_ takes a mutable self and returns at::Tensor &. */ + +TORCH_API at::Tensor transpose(const at::Tensor & self, int64_t dim0, int64_t dim1); +TORCH_API at::Tensor & transpose_(at::Tensor & self, int64_t dim0, int64_t dim1); + +} // namespace compositeexplicitautograd +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/transpose_copy_native.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/transpose_copy_native.h new file mode 100644 index 0000000000000000000000000000000000000000..7b9265ab5011a28cdf985790ce05d03acb5b05e7 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/transpose_copy_native.h @@ -0,0 +1,27 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace at { +namespace native { /* Declarations only: transpose_copy_int_out takes `out` last and returns at::Tensor &; transpose_copy_int returns by value. */ +TORCH_API at::Tensor & transpose_copy_int_out(const at::Tensor & self, int64_t dim0, int64_t dim1, at::Tensor & out); +TORCH_API at::Tensor transpose_copy_int(const at::Tensor & self, int64_t dim0, int64_t dim1); +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/tril_indices_cpu_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/tril_indices_cpu_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..700d2a825053107e5ca938c14a01af9ee35a0626 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/tril_indices_cpu_dispatch.h @@ -0,0 +1,29 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace cpu { + +TORCH_API at::Tensor tril_indices(int64_t row, int64_t col, int64_t offset=0, at::TensorOptions options=at::kLong); +TORCH_API at::Tensor tril_indices(int64_t row, int64_t col, int64_t offset, ::std::optional dtype, ::std::optional layout, ::std::optional device, ::std::optional pin_memory); + +} // namespace cpu +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/tril_meta_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/tril_meta_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..2ab2ea42973907e9ade1c88a00517446dd088a21 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/tril_meta_dispatch.h @@ -0,0 +1,35 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. 
+#include + +namespace at { + +namespace meta { /* tril entry points declared for the meta namespace: functional, out/outf (out first vs. out last), trailing-underscore (mutable self), and SymInt-diagonal variants of each. */ + +TORCH_API at::Tensor tril(const at::Tensor & self, int64_t diagonal=0); +TORCH_API at::Tensor tril_symint(const at::Tensor & self, c10::SymInt diagonal=0); +TORCH_API at::Tensor & tril_out(at::Tensor & out, const at::Tensor & self, int64_t diagonal=0); +TORCH_API at::Tensor & tril_outf(const at::Tensor & self, int64_t diagonal, at::Tensor & out); +TORCH_API at::Tensor & tril_symint_out(at::Tensor & out, const at::Tensor & self, c10::SymInt diagonal=0); +TORCH_API at::Tensor & tril_symint_outf(const at::Tensor & self, c10::SymInt diagonal, at::Tensor & out); +TORCH_API at::Tensor & tril_(at::Tensor & self, int64_t diagonal=0); +TORCH_API at::Tensor & tril__symint(at::Tensor & self, c10::SymInt diagonal=0); + +} // namespace meta +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/triu_cpu_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/triu_cpu_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..cc80c28740c4c453f529af484efec605a7b9235b --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/triu_cpu_dispatch.h @@ -0,0 +1,35 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace cpu { /* triu entry points declared for the cpu namespace: functional, out/outf (out first vs. out last), trailing-underscore (mutable self), and SymInt-diagonal variants of each. */ + +TORCH_API at::Tensor triu(const at::Tensor & self, int64_t diagonal=0); +TORCH_API at::Tensor triu_symint(const at::Tensor & self, c10::SymInt diagonal=0); +TORCH_API at::Tensor & triu_out(at::Tensor & out, const at::Tensor & self, int64_t diagonal=0); +TORCH_API at::Tensor & triu_outf(const at::Tensor & self, int64_t diagonal, at::Tensor & out); +TORCH_API at::Tensor & triu_symint_out(at::Tensor & out, const at::Tensor & self, c10::SymInt diagonal=0); +TORCH_API at::Tensor & triu_symint_outf(const at::Tensor & self, c10::SymInt diagonal, at::Tensor & out); +TORCH_API at::Tensor & triu_(at::Tensor & self, int64_t diagonal=0); +TORCH_API at::Tensor & triu__symint(at::Tensor & self, c10::SymInt diagonal=0); + +} // namespace cpu +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/triu_indices.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/triu_indices.h new file mode 100644 index 0000000000000000000000000000000000000000..5e05596a04796a6495d5fcb5f2e6bce50b356823 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/triu_indices.h @@ -0,0 +1,49 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::triu_indices(int row, int col, int offset=0, *, ScalarType? dtype=long, Layout? layout=None, Device? device=None, bool?
pin_memory=None) -> Tensor +inline at::Tensor triu_indices(int64_t row, int64_t col, int64_t offset=0, at::TensorOptions options=at::kLong) { + return at::_ops::triu_indices::call(row, col, offset, c10::optTypeMetaToScalarType(options.dtype_opt()), options.layout_opt(), options.device_opt(), options.pinned_memory_opt()); +} +// aten::triu_indices(int row, int col, int offset=0, *, ScalarType? dtype=long, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor +inline at::Tensor triu_indices(int64_t row, int64_t col, int64_t offset, ::std::optional dtype, ::std::optional layout, ::std::optional device, ::std::optional pin_memory) { + return at::_ops::triu_indices::call(row, col, offset, dtype, layout, device, pin_memory); +} + +// aten::triu_indices.out(int row, int col, int offset=0, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & triu_indices_out(at::Tensor & out, int64_t row, int64_t col, int64_t offset=0) { + return at::_ops::triu_indices_out::call(row, col, offset, out); +} +// aten::triu_indices.out(int row, int col, int offset=0, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & triu_indices_outf(int64_t row, int64_t col, int64_t offset, at::Tensor & out) { + return at::_ops::triu_indices_out::call(row, col, offset, out); +} + +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/triu_indices_cpu_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/triu_indices_cpu_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..f031051728e567736a6e5df1d6b5aaa95ac884d1 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/triu_indices_cpu_dispatch.h @@ -0,0 +1,29 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace cpu { + +TORCH_API at::Tensor triu_indices(int64_t row, int64_t col, int64_t offset=0, at::TensorOptions options=at::kLong); +TORCH_API at::Tensor triu_indices(int64_t row, int64_t col, int64_t offset, ::std::optional dtype, ::std::optional layout, ::std::optional device, ::std::optional pin_memory); + +} // namespace cpu +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/triu_indices_cuda_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/triu_indices_cuda_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..a66a83d419a7dcb1f855615520bb381368a07ab5 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/triu_indices_cuda_dispatch.h @@ -0,0 +1,29 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace cuda { + +TORCH_API at::Tensor triu_indices(int64_t row, int64_t col, int64_t offset=0, at::TensorOptions options=at::kLong); +TORCH_API at::Tensor triu_indices(int64_t row, int64_t col, int64_t offset, ::std::optional dtype, ::std::optional layout, ::std::optional device, ::std::optional pin_memory); + +} // namespace cuda +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/triu_meta_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/triu_meta_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..d7bba260799380cec394ff7681c92474382d912b --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/triu_meta_dispatch.h @@ -0,0 +1,35 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. 
+#include + +namespace at { + +namespace meta { /* triu entry points declared for the meta namespace: functional, out/outf (out first vs. out last), trailing-underscore (mutable self), and SymInt-diagonal variants of each. */ + +TORCH_API at::Tensor triu(const at::Tensor & self, int64_t diagonal=0); +TORCH_API at::Tensor triu_symint(const at::Tensor & self, c10::SymInt diagonal=0); +TORCH_API at::Tensor & triu_out(at::Tensor & out, const at::Tensor & self, int64_t diagonal=0); +TORCH_API at::Tensor & triu_outf(const at::Tensor & self, int64_t diagonal, at::Tensor & out); +TORCH_API at::Tensor & triu_symint_out(at::Tensor & out, const at::Tensor & self, c10::SymInt diagonal=0); +TORCH_API at::Tensor & triu_symint_outf(const at::Tensor & self, c10::SymInt diagonal, at::Tensor & out); +TORCH_API at::Tensor & triu_(at::Tensor & self, int64_t diagonal=0); +TORCH_API at::Tensor & triu__symint(at::Tensor & self, c10::SymInt diagonal=0); + +} // namespace meta +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/unbind_copy_compositeexplicitautograd_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/unbind_copy_compositeexplicitautograd_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..779fcb95ff255e3b457a9dc092a1f7468dcc0294 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/unbind_copy_compositeexplicitautograd_dispatch.h @@ -0,0 +1,29 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace compositeexplicitautograd { + +TORCH_API void unbind_copy_out(at::TensorList out, const at::Tensor & self, int64_t dim=0); +TORCH_API void unbind_copy_outf(const at::Tensor & self, int64_t dim, at::TensorList out); + +} // namespace compositeexplicitautograd +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/unflatten.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/unflatten.h new file mode 100644 index 0000000000000000000000000000000000000000..bc67eaf6d3710e3c6ae510da8c6211b84a8ae55a --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/unflatten.h @@ -0,0 +1,75 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::unflatten.int(Tensor(a) self, int dim, SymInt[] sizes) -> Tensor(a) +inline at::Tensor unflatten(const at::Tensor & self, int64_t dim, at::IntArrayRef sizes) { + return at::_ops::unflatten_int::call(self, dim, c10::fromIntArrayRefSlow(sizes)); +} +namespace symint { + template >> + at::Tensor unflatten(const at::Tensor & self, int64_t dim, at::IntArrayRef sizes) { + return at::_ops::unflatten_int::call(self, dim, c10::fromIntArrayRefSlow(sizes)); + } +} + +// aten::unflatten.int(Tensor(a) self, int dim, SymInt[] sizes) -> Tensor(a) +inline at::Tensor 
unflatten_symint(const at::Tensor & self, int64_t dim, c10::SymIntArrayRef sizes) { + return at::_ops::unflatten_int::call(self, dim, sizes); +} +namespace symint { + template >> + at::Tensor unflatten(const at::Tensor & self, int64_t dim, c10::SymIntArrayRef sizes) { + return at::_ops::unflatten_int::call(self, dim, sizes); + } +} + +// aten::unflatten.Dimname(Tensor(a) self, Dimname dim, SymInt[] sizes, Dimname[] names) -> Tensor(a) +inline at::Tensor unflatten(const at::Tensor & self, at::Dimname dim, at::IntArrayRef sizes, at::DimnameList names) { + return at::_ops::unflatten_Dimname::call(self, dim, c10::fromIntArrayRefSlow(sizes), names); +} +namespace symint { + template >> + at::Tensor unflatten(const at::Tensor & self, at::Dimname dim, at::IntArrayRef sizes, at::DimnameList names) { + return at::_ops::unflatten_Dimname::call(self, dim, c10::fromIntArrayRefSlow(sizes), names); + } +} + +// aten::unflatten.Dimname(Tensor(a) self, Dimname dim, SymInt[] sizes, Dimname[] names) -> Tensor(a) +inline at::Tensor unflatten_symint(const at::Tensor & self, at::Dimname dim, c10::SymIntArrayRef sizes, at::DimnameList names) { + return at::_ops::unflatten_Dimname::call(self, dim, sizes, names); +} +namespace symint { + template >> + at::Tensor unflatten(const at::Tensor & self, at::Dimname dim, c10::SymIntArrayRef sizes, at::DimnameList names) { + return at::_ops::unflatten_Dimname::call(self, dim, sizes, names); + } +} + +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/unflatten_dense_tensors_ops.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/unflatten_dense_tensors_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..339572b2e78c99a3d1660f99949e06a802c3bd6c --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/unflatten_dense_tensors_ops.h @@ -0,0 +1,34 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { +namespace _ops { + + +struct TORCH_API unflatten_dense_tensors { + using schema = ::std::vector (const at::Tensor &, at::TensorList); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::unflatten_dense_tensors"; + static constexpr const char* overload_name = ""; + static constexpr const char* schema_str = "unflatten_dense_tensors(Tensor flat, Tensor[] tensors) -> Tensor[]"; + static ::std::vector call(const at::Tensor & flat, at::TensorList tensors); + static ::std::vector redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & flat, at::TensorList tensors); +}; + +}} // namespace at::_ops + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/unfold_copy.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/unfold_copy.h new file mode 100644 index 0000000000000000000000000000000000000000..1fc657d3cb32f3b0076ce218647537adc76c0f18 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/unfold_copy.h @@ -0,0 +1,45 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::unfold_copy(Tensor self, int dimension, int size, int step) -> Tensor +inline at::Tensor unfold_copy(const at::Tensor & self, int64_t dimension, int64_t size, int64_t step) { /* thin inline wrapper: forwards its arguments unchanged to at::_ops::unfold_copy::call */ + return at::_ops::unfold_copy::call(self, dimension, size, step); +} + +// aten::unfold_copy.out(Tensor self, int dimension, int size, int step, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & unfold_copy_out(at::Tensor & out, const at::Tensor & self, int64_t dimension, int64_t size, int64_t step) { /* out-first argument order; forwards to the same op call as unfold_copy_outf below */ + return at::_ops::unfold_copy_out::call(self, dimension, size, step, out); +} +// aten::unfold_copy.out(Tensor self, int dimension, int size, int step, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & unfold_copy_outf(const at::Tensor & self, int64_t dimension, int64_t size, int64_t step, at::Tensor & out) { + return at::_ops::unfold_copy_out::call(self, dimension, size, step, out); +} + +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/unfold_copy_compositeexplicitautogradnonfunctional_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/unfold_copy_compositeexplicitautogradnonfunctional_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..cd90a697cf0a8f3c2db224a526bbfa6bae6dab02 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/unfold_copy_compositeexplicitautogradnonfunctional_dispatch.h @@ -0,0 +1,28 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace compositeexplicitautogradnonfunctional { + +TORCH_API at::Tensor unfold_copy(const at::Tensor & self, int64_t dimension, int64_t size, int64_t step); + +} // namespace compositeexplicitautogradnonfunctional +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/unfold_copy_ops.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/unfold_copy_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..d3c4d987b46af7af7676140a2478734bcf946269 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/unfold_copy_ops.h @@ -0,0 +1,45 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { +namespace _ops { + + +struct TORCH_API unfold_copy { + using schema = at::Tensor (const at::Tensor &, int64_t, int64_t, int64_t); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::unfold_copy"; + static constexpr const char* overload_name = ""; + static constexpr const char* schema_str = "unfold_copy(Tensor self, int dimension, int size, int step) -> Tensor"; + static at::Tensor call(const at::Tensor & self, int64_t dimension, int64_t size, int64_t step); + static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, int64_t dimension, int64_t size, int64_t step); +}; + +struct TORCH_API unfold_copy_out { + using schema = at::Tensor & (const at::Tensor &, int64_t, int64_t, int64_t, at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::unfold_copy"; + static constexpr const char* overload_name = 
"out"; + static constexpr const char* schema_str = "unfold_copy.out(Tensor self, int dimension, int size, int step, *, Tensor(a!) out) -> Tensor(a!)"; + static at::Tensor & call(const at::Tensor & self, int64_t dimension, int64_t size, int64_t step, at::Tensor & out); + static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, int64_t dimension, int64_t size, int64_t step, at::Tensor & out); +}; + +}} // namespace at::_ops + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/unfold_native.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/unfold_native.h new file mode 100644 index 0000000000000000000000000000000000000000..d7173cea212c585ef3c2065739a5479a4ab037dd --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/unfold_native.h @@ -0,0 +1,26 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace at { +namespace native { +TORCH_API at::Tensor unfold(const at::Tensor & self, int64_t dimension, int64_t size, int64_t step); +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/uniform_meta_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/uniform_meta_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..77982fb489f9cde882b4f40a719a6cd39164f9c4 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/uniform_meta_dispatch.h @@ -0,0 +1,28 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace meta { + +TORCH_API at::Tensor & uniform_(at::Tensor & self, double from=0, double to=1, ::std::optional generator=::std::nullopt); + +} // namespace meta +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/upsample_bicubic2d_backward_meta.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/upsample_bicubic2d_backward_meta.h new file mode 100644 index 0000000000000000000000000000000000000000..cffa012cdb346f2f1b4cc2a4f80e482905e21303 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/upsample_bicubic2d_backward_meta.h @@ -0,0 +1,32 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeMetaFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace at { +namespace meta { + +struct TORCH_API structured_upsample_bicubic2d_backward : public at::impl::MetaBase { + + + void meta(const at::Tensor & grad_output, at::ArrayRef output_size, at::ArrayRef input_size, bool align_corners, ::std::optional scales_h, ::std::optional scales_w); +}; + +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/upsample_bilinear2d_backward_cuda_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/upsample_bilinear2d_backward_cuda_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..131ced8a9a8ab26d6dbee953432b04f08eb93782 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/upsample_bilinear2d_backward_cuda_dispatch.h @@ -0,0 +1,33 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. 
+#include + +namespace at { + +namespace cuda { + +TORCH_API at::Tensor upsample_bilinear2d_backward(const at::Tensor & grad_output, at::IntArrayRef output_size, at::IntArrayRef input_size, bool align_corners, ::std::optional scales_h=::std::nullopt, ::std::optional scales_w=::std::nullopt); +TORCH_API at::Tensor upsample_bilinear2d_backward_symint(const at::Tensor & grad_output, c10::SymIntArrayRef output_size, c10::SymIntArrayRef input_size, bool align_corners, ::std::optional scales_h=::std::nullopt, ::std::optional scales_w=::std::nullopt); +TORCH_API at::Tensor & upsample_bilinear2d_backward_out(at::Tensor & grad_input, const at::Tensor & grad_output, at::IntArrayRef output_size, at::IntArrayRef input_size, bool align_corners, ::std::optional scales_h=::std::nullopt, ::std::optional scales_w=::std::nullopt); +TORCH_API at::Tensor & upsample_bilinear2d_backward_outf(const at::Tensor & grad_output, at::IntArrayRef output_size, at::IntArrayRef input_size, bool align_corners, ::std::optional scales_h, ::std::optional scales_w, at::Tensor & grad_input); +TORCH_API at::Tensor & upsample_bilinear2d_backward_symint_out(at::Tensor & grad_input, const at::Tensor & grad_output, c10::SymIntArrayRef output_size, c10::SymIntArrayRef input_size, bool align_corners, ::std::optional scales_h=::std::nullopt, ::std::optional scales_w=::std::nullopt); +TORCH_API at::Tensor & upsample_bilinear2d_backward_symint_outf(const at::Tensor & grad_output, c10::SymIntArrayRef output_size, c10::SymIntArrayRef input_size, bool align_corners, ::std::optional scales_h, ::std::optional scales_w, at::Tensor & grad_input); + +} // namespace cuda +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/upsample_bilinear2d_cpu_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/upsample_bilinear2d_cpu_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..c270c71d8bf4b094667812f687ac35881ac21efc --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/upsample_bilinear2d_cpu_dispatch.h @@ -0,0 +1,33 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. 
+#include + +namespace at { + +namespace cpu { + +TORCH_API at::Tensor upsample_bilinear2d(const at::Tensor & self, at::IntArrayRef output_size, bool align_corners, ::std::optional scales_h=::std::nullopt, ::std::optional scales_w=::std::nullopt); +TORCH_API at::Tensor upsample_bilinear2d_symint(const at::Tensor & self, c10::SymIntArrayRef output_size, bool align_corners, ::std::optional scales_h=::std::nullopt, ::std::optional scales_w=::std::nullopt); +TORCH_API at::Tensor & upsample_bilinear2d_out(at::Tensor & out, const at::Tensor & self, at::IntArrayRef output_size, bool align_corners, ::std::optional scales_h=::std::nullopt, ::std::optional scales_w=::std::nullopt); +TORCH_API at::Tensor & upsample_bilinear2d_outf(const at::Tensor & self, at::IntArrayRef output_size, bool align_corners, ::std::optional scales_h, ::std::optional scales_w, at::Tensor & out); +TORCH_API at::Tensor & upsample_bilinear2d_symint_out(at::Tensor & out, const at::Tensor & self, c10::SymIntArrayRef output_size, bool align_corners, ::std::optional scales_h=::std::nullopt, ::std::optional scales_w=::std::nullopt); +TORCH_API at::Tensor & upsample_bilinear2d_symint_outf(const at::Tensor & self, c10::SymIntArrayRef output_size, bool align_corners, ::std::optional scales_h, ::std::optional scales_w, at::Tensor & out); + +} // namespace cpu +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/upsample_linear1d_backward_ops.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/upsample_linear1d_backward_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..cda2cbde8c53a8f3703931c788acc5762e70b82c --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/upsample_linear1d_backward_ops.h @@ -0,0 +1,45 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { +namespace _ops { + + +struct TORCH_API upsample_linear1d_backward_grad_input { + using schema = at::Tensor & (const at::Tensor &, c10::SymIntArrayRef, c10::SymIntArrayRef, bool, ::std::optional, at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::upsample_linear1d_backward"; + static constexpr const char* overload_name = "grad_input"; + static constexpr const char* schema_str = "upsample_linear1d_backward.grad_input(Tensor grad_output, SymInt[1] output_size, SymInt[3] input_size, bool align_corners, float? scales=None, *, Tensor(a!) 
grad_input) -> Tensor(a!)"; + static at::Tensor & call(const at::Tensor & grad_output, c10::SymIntArrayRef output_size, c10::SymIntArrayRef input_size, bool align_corners, ::std::optional scales, at::Tensor & grad_input); + static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad_output, c10::SymIntArrayRef output_size, c10::SymIntArrayRef input_size, bool align_corners, ::std::optional scales, at::Tensor & grad_input); +}; + +struct TORCH_API upsample_linear1d_backward { + using schema = at::Tensor (const at::Tensor &, c10::SymIntArrayRef, c10::SymIntArrayRef, bool, ::std::optional); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::upsample_linear1d_backward"; + static constexpr const char* overload_name = ""; + static constexpr const char* schema_str = "upsample_linear1d_backward(Tensor grad_output, SymInt[1] output_size, SymInt[3] input_size, bool align_corners, float? scales=None) -> Tensor"; + static at::Tensor call(const at::Tensor & grad_output, c10::SymIntArrayRef output_size, c10::SymIntArrayRef input_size, bool align_corners, ::std::optional scales); + static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad_output, c10::SymIntArrayRef output_size, c10::SymIntArrayRef input_size, bool align_corners, ::std::optional scales); +}; + +}} // namespace at::_ops + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/upsample_linear1d_cpu_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/upsample_linear1d_cpu_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..6b44b64280b4695959ef444183859830cc10fdc7 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/upsample_linear1d_cpu_dispatch.h @@ -0,0 +1,33 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. 
+#include + +namespace at { + +namespace cpu { + +TORCH_API at::Tensor upsample_linear1d(const at::Tensor & self, at::IntArrayRef output_size, bool align_corners, ::std::optional scales=::std::nullopt); +TORCH_API at::Tensor upsample_linear1d_symint(const at::Tensor & self, c10::SymIntArrayRef output_size, bool align_corners, ::std::optional scales=::std::nullopt); +TORCH_API at::Tensor & upsample_linear1d_out(at::Tensor & out, const at::Tensor & self, at::IntArrayRef output_size, bool align_corners, ::std::optional scales=::std::nullopt); +TORCH_API at::Tensor & upsample_linear1d_outf(const at::Tensor & self, at::IntArrayRef output_size, bool align_corners, ::std::optional scales, at::Tensor & out); +TORCH_API at::Tensor & upsample_linear1d_symint_out(at::Tensor & out, const at::Tensor & self, c10::SymIntArrayRef output_size, bool align_corners, ::std::optional scales=::std::nullopt); +TORCH_API at::Tensor & upsample_linear1d_symint_outf(const at::Tensor & self, c10::SymIntArrayRef output_size, bool align_corners, ::std::optional scales, at::Tensor & out); + +} // namespace cpu +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/upsample_nearest2d_cuda_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/upsample_nearest2d_cuda_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..a3751c24239df36554b5562e559817eaac564e56 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/upsample_nearest2d_cuda_dispatch.h @@ -0,0 +1,33 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. 
+#include + +namespace at { + +namespace cuda { + +TORCH_API at::Tensor upsample_nearest2d(const at::Tensor & self, at::IntArrayRef output_size, ::std::optional scales_h=::std::nullopt, ::std::optional scales_w=::std::nullopt); +TORCH_API at::Tensor upsample_nearest2d_symint(const at::Tensor & self, c10::SymIntArrayRef output_size, ::std::optional scales_h=::std::nullopt, ::std::optional scales_w=::std::nullopt); +TORCH_API at::Tensor & upsample_nearest2d_out(at::Tensor & out, const at::Tensor & self, at::IntArrayRef output_size, ::std::optional scales_h=::std::nullopt, ::std::optional scales_w=::std::nullopt); +TORCH_API at::Tensor & upsample_nearest2d_outf(const at::Tensor & self, at::IntArrayRef output_size, ::std::optional scales_h, ::std::optional scales_w, at::Tensor & out); +TORCH_API at::Tensor & upsample_nearest2d_symint_out(at::Tensor & out, const at::Tensor & self, c10::SymIntArrayRef output_size, ::std::optional scales_h=::std::nullopt, ::std::optional scales_w=::std::nullopt); +TORCH_API at::Tensor & upsample_nearest2d_symint_outf(const at::Tensor & self, c10::SymIntArrayRef output_size, ::std::optional scales_h, ::std::optional scales_w, at::Tensor & out); + +} // namespace cuda +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/upsample_nearest2d_ops.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/upsample_nearest2d_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..64c08a20386dd154ff42449e59c6a7a407b4d33e --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/upsample_nearest2d_ops.h @@ -0,0 +1,67 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { +namespace _ops { + + +struct TORCH_API upsample_nearest2d_vec { + using schema = at::Tensor (const at::Tensor &, at::OptionalSymIntArrayRef, ::std::optional>); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::upsample_nearest2d"; + static constexpr const char* overload_name = "vec"; + static constexpr const char* schema_str = "upsample_nearest2d.vec(Tensor input, SymInt[]? output_size, float[]? 
scale_factors) -> Tensor"; + static at::Tensor call(const at::Tensor & input, at::OptionalSymIntArrayRef output_size, ::std::optional> scale_factors); + static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & input, at::OptionalSymIntArrayRef output_size, ::std::optional> scale_factors); +}; + +struct TORCH_API upsample_nearest2d_out { + using schema = at::Tensor & (const at::Tensor &, c10::SymIntArrayRef, ::std::optional, ::std::optional, at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::upsample_nearest2d"; + static constexpr const char* overload_name = "out"; + static constexpr const char* schema_str = "upsample_nearest2d.out(Tensor self, SymInt[2] output_size, float? scales_h=None, float? scales_w=None, *, Tensor(a!) out) -> Tensor(a!)"; + static at::Tensor & call(const at::Tensor & self, c10::SymIntArrayRef output_size, ::std::optional scales_h, ::std::optional scales_w, at::Tensor & out); + static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, c10::SymIntArrayRef output_size, ::std::optional scales_h, ::std::optional scales_w, at::Tensor & out); +}; + +struct TORCH_API upsample_nearest2d { + using schema = at::Tensor (const at::Tensor &, c10::SymIntArrayRef, ::std::optional, ::std::optional); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::upsample_nearest2d"; + static constexpr const char* overload_name = ""; + static constexpr const char* schema_str = "upsample_nearest2d(Tensor self, SymInt[2] output_size, float? scales_h=None, float? 
scales_w=None) -> Tensor"; + static at::Tensor call(const at::Tensor & self, c10::SymIntArrayRef output_size, ::std::optional scales_h, ::std::optional scales_w); + static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, c10::SymIntArrayRef output_size, ::std::optional scales_h, ::std::optional scales_w); +}; + +struct TORCH_API upsample_nearest2d_vec_out { + using schema = at::Tensor & (const at::Tensor &, at::OptionalSymIntArrayRef, ::std::optional>, at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + static constexpr const char* name = "aten::upsample_nearest2d"; + static constexpr const char* overload_name = "vec_out"; + static constexpr const char* schema_str = "upsample_nearest2d.vec_out(Tensor input, SymInt[]? output_size, float[]? scale_factors, *, Tensor(a!) out) -> Tensor(a!)"; + static at::Tensor & call(const at::Tensor & input, at::OptionalSymIntArrayRef output_size, ::std::optional> scale_factors, at::Tensor & out); + static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & input, at::OptionalSymIntArrayRef output_size, ::std::optional> scale_factors, at::Tensor & out); +}; + +}} // namespace at::_ops + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/upsample_nearest3d_meta.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/upsample_nearest3d_meta.h new file mode 100644 index 0000000000000000000000000000000000000000..ba56efd75b79e00d71c900afd57e0f2b37548b38 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/upsample_nearest3d_meta.h @@ -0,0 +1,32 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeMetaFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace at { +namespace meta { + +struct TORCH_API structured_upsample_nearest3d : public at::impl::MetaBase { + + + void meta(const at::Tensor & self, at::ArrayRef output_size, ::std::optional scales_d, ::std::optional scales_h, ::std::optional scales_w); +}; + +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/vander_compositeimplicitautograd_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/vander_compositeimplicitautograd_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..c02ca370e0cf59b5ee18194de4e8f09cee87f502 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/vander_compositeimplicitautograd_dispatch.h @@ -0,0 +1,28 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace compositeimplicitautograd { + +TORCH_API at::Tensor vander(const at::Tensor & x, ::std::optional N=::std::nullopt, bool increasing=false); + +} // namespace compositeimplicitautograd +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/var_mean_cpu_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/var_mean_cpu_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..5fc5e6edaf5df0639b080986371cfabee8ee62fd --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/var_mean_cpu_dispatch.h @@ -0,0 +1,28 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace cpu { + +TORCH_API ::std::tuple var_mean(const at::Tensor & self, at::OptionalIntArrayRef dim=::std::nullopt, const ::std::optional & correction=::std::nullopt, bool keepdim=false); + +} // namespace cpu +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/view_as_complex_meta_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/view_as_complex_meta_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..e339556c570e1dc2e5dffa89b011ad77f0ba49e1 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/view_as_complex_meta_dispatch.h @@ -0,0 +1,28 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace meta { + +TORCH_API at::Tensor view_as_complex(const at::Tensor & self); + +} // namespace meta +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/view_as_native.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/view_as_native.h new file mode 100644 index 0000000000000000000000000000000000000000..1e6034883e3b059b201aeb2f1831c98518b0fe59 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/view_as_native.h @@ -0,0 +1,26 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace at { +namespace native { +TORCH_API at::Tensor view_as(const at::Tensor & self, const at::Tensor & other); +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/view_copy_native.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/view_copy_native.h new file mode 100644 index 0000000000000000000000000000000000000000..f83ee680abe75ff7fc72c64ed81dc12f77f9342a --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/view_copy_native.h @@ -0,0 +1,29 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace at { +namespace native { +TORCH_API at::Tensor & view_copy_out_symint(const at::Tensor & self, c10::SymIntArrayRef size, at::Tensor & out); +TORCH_API at::Tensor view_copy_symint(const at::Tensor & self, c10::SymIntArrayRef size); +TORCH_API at::Tensor & view_copy_dtype_out(const at::Tensor & self, at::ScalarType dtype, at::Tensor & out); +TORCH_API at::Tensor view_copy_dtype(const at::Tensor & self, at::ScalarType dtype); +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/view_meta_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/view_meta_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..cad667e2f0f57bd702a77c63632a7e67190ec6a0 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/view_meta_dispatch.h @@ -0,0 +1,29 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace meta { + +TORCH_API at::Tensor view(const at::Tensor & self, at::IntArrayRef size); +TORCH_API at::Tensor view_symint(const at::Tensor & self, c10::SymIntArrayRef size); + +} // namespace meta +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/where_cuda_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/where_cuda_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..2cb48cb4c7c866d667250a19f8fbfe0f7235101c --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/where_cuda_dispatch.h @@ -0,0 +1,30 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace cuda { + +TORCH_API at::Tensor where(const at::Tensor & condition, const at::Tensor & self, const at::Tensor & other); +TORCH_API at::Tensor & where_out(at::Tensor & out, const at::Tensor & condition, const at::Tensor & self, const at::Tensor & other); +TORCH_API at::Tensor & where_outf(const at::Tensor & condition, const at::Tensor & self, const at::Tensor & other, at::Tensor & out); + +} // namespace cuda +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/xlogy_meta_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/xlogy_meta_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..735be7dfd0441a52e2c5a78b67c241748666baca --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/xlogy_meta_dispatch.h @@ -0,0 +1,31 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace meta { + +TORCH_API at::Tensor xlogy(const at::Tensor & self, const at::Tensor & other); +TORCH_API at::Tensor & xlogy_out(at::Tensor & out, const at::Tensor & self, const at::Tensor & other); +TORCH_API at::Tensor & xlogy_outf(const at::Tensor & self, const at::Tensor & other, at::Tensor & out); +TORCH_API at::Tensor & xlogy_(at::Tensor & self, const at::Tensor & other); + +} // namespace meta +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/xor.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/xor.h new file mode 100644 index 0000000000000000000000000000000000000000..80dc5b3ba7e2906b77b74ecd03cd46db6dc894cb --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/xor.h @@ -0,0 +1,41 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::__xor__.Scalar(Tensor self, Scalar other) -> Tensor +inline at::Tensor __xor__(const at::Tensor & self, const at::Scalar & other) { + return at::_ops::__xor___Scalar::call(self, other); +} + +// aten::__xor__.Tensor(Tensor self, Tensor other) -> Tensor +inline at::Tensor __xor__(const at::Tensor & self, const at::Tensor & other) { + return at::_ops::__xor___Tensor::call(self, other); +} + +} + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/zero_native.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/zero_native.h new file mode 100644 index 0000000000000000000000000000000000000000..388a6963d0ed77e0632b546e17fba69057ec4491 --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/zero_native.h @@ -0,0 +1,33 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace at { +namespace native { +TORCH_API at::Tensor zero(const at::Tensor & self); +TORCH_API at::Tensor & zero_out(const at::Tensor & self, at::Tensor & out); +TORCH_API at::Tensor & zero_(at::Tensor & self); +TORCH_API at::Tensor & zero_meta_(at::Tensor & self); +TORCH_API at::Tensor & zero_nested_(at::Tensor & self); +TORCH_API at::Tensor & zero_sparse_(at::Tensor & self); +TORCH_API at::Tensor & zero_sparse_csr_(at::Tensor & self); +TORCH_API at::Tensor & mkldnn_zero_(at::Tensor & self); +} // namespace native +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." 
+#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/zeros_like_compositeimplicitautogradnestedtensor_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/zeros_like_compositeimplicitautogradnestedtensor_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..a19709ee6e3fd4421f8914892c48832efc6c665a --- /dev/null +++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/zeros_like_compositeimplicitautogradnestedtensor_dispatch.h @@ -0,0 +1,29 @@ +#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION) +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace compositeimplicitautogradnestedtensor { + +TORCH_API at::Tensor zeros_like(const at::Tensor & self, at::TensorOptions options={}, ::std::optional memory_format=::std::nullopt); +TORCH_API at::Tensor zeros_like(const at::Tensor & self, ::std::optional dtype, ::std::optional layout, ::std::optional device, ::std::optional pin_memory, ::std::optional memory_format); + +} // namespace compositeimplicitautogradnestedtensor +} // namespace at + +#else +#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined." +#endif // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)