koichi12 commited on
Commit
2fabb86
·
verified ·
1 Parent(s): f3e6968

Add files using upload-large-folder tool

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .venv/lib/python3.11/site-packages/torch/include/ATen/ArrayRef.h +2 -0
  2. .venv/lib/python3.11/site-packages/torch/include/ATen/DimVector.h +2 -0
  3. .venv/lib/python3.11/site-packages/torch/include/ATen/Dimname.h +1 -0
  4. .venv/lib/python3.11/site-packages/torch/include/ATen/Formatting.h +1 -0
  5. .venv/lib/python3.11/site-packages/torch/include/ATen/MetaFunctions_inl.h +325 -0
  6. .venv/lib/python3.11/site-packages/torch/include/ATen/PTThreadPool.h +17 -0
  7. .venv/lib/python3.11/site-packages/torch/include/ATen/SequenceNumber.h +13 -0
  8. .venv/lib/python3.11/site-packages/torch/include/ATen/ThreadLocalPythonObjects.h +21 -0
  9. .venv/lib/python3.11/site-packages/torch/include/ATen/ThreadLocalState.h +120 -0
  10. .venv/lib/python3.11/site-packages/torch/include/ATen/Version.h +18 -0
  11. .venv/lib/python3.11/site-packages/torch/include/ATen/cuda/AsmUtils.cuh +149 -0
  12. .venv/lib/python3.11/site-packages/torch/include/ATen/cuda/CUDAContextLight.h +99 -0
  13. .venv/lib/python3.11/site-packages/torch/include/ATen/cuda/CUDADataType.h +105 -0
  14. .venv/lib/python3.11/site-packages/torch/include/ATen/cuda/CUDADevice.h +23 -0
  15. .venv/lib/python3.11/site-packages/torch/include/ATen/cuda/CUDAUtils.h +20 -0
  16. .venv/lib/python3.11/site-packages/torch/include/ATen/cuda/NumericLimits.cuh +121 -0
  17. .venv/lib/python3.11/site-packages/torch/include/ATen/cuda/PeerToPeerAccess.h +11 -0
  18. .venv/lib/python3.11/site-packages/torch/include/ATen/cuda/Sleep.h +13 -0
  19. .venv/lib/python3.11/site-packages/torch/include/ATen/cuda/cub_definitions.cuh +53 -0
  20. .venv/lib/python3.11/site-packages/torch/include/ATen/cuda/detail/CUDAHooks.h +58 -0
  21. .venv/lib/python3.11/site-packages/torch/include/ATen/cuda/detail/DeviceThreadHandles.h +151 -0
  22. .venv/lib/python3.11/site-packages/torch/include/ATen/cuda/detail/IntegerDivider.cuh +124 -0
  23. .venv/lib/python3.11/site-packages/torch/include/ATen/cuda/detail/KernelUtils.h +37 -0
  24. .venv/lib/python3.11/site-packages/torch/include/ATen/cuda/detail/LazyNVRTC.h +11 -0
  25. .venv/lib/python3.11/site-packages/torch/include/ATen/cuda/detail/PhiloxCudaStateRaw.cuh +43 -0
  26. .venv/lib/python3.11/site-packages/torch/include/ATen/cuda/detail/TensorInfo.cuh +116 -0
  27. .venv/lib/python3.11/site-packages/torch/include/ATen/cuda/detail/UnpackRaw.cuh +28 -0
  28. .venv/lib/python3.11/site-packages/torch/include/ATen/cuda/llvm_jit_strings.h +14 -0
  29. .venv/lib/python3.11/site-packages/torch/include/ATen/cuda/tunable/GemmCommon.h +397 -0
  30. .venv/lib/python3.11/site-packages/torch/include/ATen/cuda/tunable/GemmHipblaslt.h +611 -0
  31. .venv/lib/python3.11/site-packages/torch/include/ATen/cuda/tunable/GemmRocblas.h +275 -0
  32. .venv/lib/python3.11/site-packages/torch/include/ATen/cuda/tunable/StreamTimer.h +34 -0
  33. .venv/lib/python3.11/site-packages/torch/include/ATen/cuda/tunable/Tunable.h +246 -0
  34. .venv/lib/python3.11/site-packages/torch/include/ATen/cuda/tunable/TunableGemm.h +307 -0
  35. .venv/lib/python3.11/site-packages/torch/include/ATen/cuda/tunable/TunableOp.h +286 -0
  36. .venv/lib/python3.11/site-packages/torch/include/ATen/native/Activation.h +98 -0
  37. .venv/lib/python3.11/site-packages/torch/include/ATen/native/AdaptivePooling.h +49 -0
  38. .venv/lib/python3.11/site-packages/torch/include/ATen/native/BatchLinearAlgebra.h +321 -0
  39. .venv/lib/python3.11/site-packages/torch/include/ATen/native/BinaryOps.h +119 -0
  40. .venv/lib/python3.11/site-packages/torch/include/ATen/native/ComplexHelper.h +97 -0
  41. .venv/lib/python3.11/site-packages/torch/include/ATen/native/Distance.h +20 -0
  42. .venv/lib/python3.11/site-packages/torch/include/ATen/native/Fill.h +21 -0
  43. .venv/lib/python3.11/site-packages/torch/include/ATen/native/FractionalMaxPooling.h +80 -0
  44. .venv/lib/python3.11/site-packages/torch/include/ATen/native/FunctionOfAMatrixUtils.h +20 -0
  45. .venv/lib/python3.11/site-packages/torch/include/ATen/native/GridSampler.h +298 -0
  46. .venv/lib/python3.11/site-packages/torch/include/ATen/native/LossMulti.h +69 -0
  47. .venv/lib/python3.11/site-packages/torch/include/ATen/native/Math.h +0 -0
  48. .venv/lib/python3.11/site-packages/torch/include/ATen/native/MathBitFallThroughLists.h +71 -0
  49. .venv/lib/python3.11/site-packages/torch/include/ATen/native/NonEmptyUtils.h +27 -0
  50. .venv/lib/python3.11/site-packages/torch/include/ATen/native/Normalization.h +19 -0
.venv/lib/python3.11/site-packages/torch/include/ATen/ArrayRef.h ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ #pragma once
2
+ #include <c10/util/ArrayRef.h>
.venv/lib/python3.11/site-packages/torch/include/ATen/DimVector.h ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ #pragma once
2
+ #include <ATen/core/DimVector.h>
.venv/lib/python3.11/site-packages/torch/include/ATen/Dimname.h ADDED
@@ -0,0 +1 @@
 
 
1
+ #include <ATen/core/Dimname.h>
.venv/lib/python3.11/site-packages/torch/include/ATen/Formatting.h ADDED
@@ -0,0 +1 @@
 
 
1
+ #include <ATen/core/Formatting.h>
.venv/lib/python3.11/site-packages/torch/include/ATen/MetaFunctions_inl.h ADDED
@@ -0,0 +1,325 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #pragma once
2
+ // @generated by torchgen/gen.py from DispatchKeyFunctions_inl.h
3
+
4
+ // NB: The implementing C++ file is RegisterDispatchKey.cpp
5
+
6
+ // The only #includes we need are for custom classes that have defaults in the C++ API
7
+ #include <c10/core/MemoryFormat.h>
8
+ #include <c10/core/Scalar.h>
9
+ #include <ATen/core/Reduction.h>
10
+
11
+ #if defined(AT_PER_OPERATOR_HEADERS) && defined(TORCH_ASSERT_ONLY_METHOD_OPERATORS)
12
+ #error This change adds a dependency on all pytorch operators, meaning the \
13
+ file will need to be re-compiled every time an operator is changed or added. \
14
+ Consider including a specific operator from \
15
+ <ATen/ops/{my_operator}_meta_dispatch.h>. \
16
+ See NOTE [TORCH_ASSERT_ONLY_METHOD_OPERATORS].
17
+ #endif
18
+
19
+ #include <ATen/ops/_add_relu_meta_dispatch.h>
20
+ #include <ATen/ops/_addmm_activation_meta_dispatch.h>
21
+ #include <ATen/ops/_amp_update_scale_meta_dispatch.h>
22
+ #include <ATen/ops/_coalesced_meta_dispatch.h>
23
+ #include <ATen/ops/_convert_indices_from_coo_to_csr_meta_dispatch.h>
24
+ #include <ATen/ops/_convert_indices_from_csr_to_coo_meta_dispatch.h>
25
+ #include <ATen/ops/_ctc_loss_meta_dispatch.h>
26
+ #include <ATen/ops/_efficientzerotensor_meta_dispatch.h>
27
+ #include <ATen/ops/_fill_mem_eff_dropout_mask_meta_dispatch.h>
28
+ #include <ATen/ops/_fused_sdp_choice_meta_dispatch.h>
29
+ #include <ATen/ops/_index_put_impl_meta_dispatch.h>
30
+ #include <ATen/ops/_linalg_det_meta_dispatch.h>
31
+ #include <ATen/ops/_linalg_eigh_meta_dispatch.h>
32
+ #include <ATen/ops/_linalg_slogdet_meta_dispatch.h>
33
+ #include <ATen/ops/_linalg_solve_ex_meta_dispatch.h>
34
+ #include <ATen/ops/_linalg_svd_meta_dispatch.h>
35
+ #include <ATen/ops/_log_softmax_meta_dispatch.h>
36
+ #include <ATen/ops/_log_softmax_backward_data_meta_dispatch.h>
37
+ #include <ATen/ops/_mkldnn_transpose_meta_dispatch.h>
38
+ #include <ATen/ops/_reshape_alias_meta_dispatch.h>
39
+ #include <ATen/ops/_resize_output_meta_dispatch.h>
40
+ #include <ATen/ops/_softmax_meta_dispatch.h>
41
+ #include <ATen/ops/_softmax_backward_data_meta_dispatch.h>
42
+ #include <ATen/ops/_sparse_coo_tensor_with_dims_meta_dispatch.h>
43
+ #include <ATen/ops/_sparse_coo_tensor_with_dims_and_tensors_meta_dispatch.h>
44
+ #include <ATen/ops/_upsample_bicubic2d_aa_meta_dispatch.h>
45
+ #include <ATen/ops/_upsample_bicubic2d_aa_backward_meta_dispatch.h>
46
+ #include <ATen/ops/_upsample_bilinear2d_aa_meta_dispatch.h>
47
+ #include <ATen/ops/_upsample_bilinear2d_aa_backward_meta_dispatch.h>
48
+ #include <ATen/ops/_upsample_nearest_exact1d_meta_dispatch.h>
49
+ #include <ATen/ops/_upsample_nearest_exact1d_backward_meta_dispatch.h>
50
+ #include <ATen/ops/_upsample_nearest_exact2d_meta_dispatch.h>
51
+ #include <ATen/ops/_upsample_nearest_exact2d_backward_meta_dispatch.h>
52
+ #include <ATen/ops/_upsample_nearest_exact3d_meta_dispatch.h>
53
+ #include <ATen/ops/_upsample_nearest_exact3d_backward_meta_dispatch.h>
54
+ #include <ATen/ops/acos_meta_dispatch.h>
55
+ #include <ATen/ops/acosh_meta_dispatch.h>
56
+ #include <ATen/ops/adaptive_max_pool2d_meta_dispatch.h>
57
+ #include <ATen/ops/adaptive_max_pool2d_backward_meta_dispatch.h>
58
+ #include <ATen/ops/adaptive_max_pool3d_meta_dispatch.h>
59
+ #include <ATen/ops/adaptive_max_pool3d_backward_meta_dispatch.h>
60
+ #include <ATen/ops/add_meta_dispatch.h>
61
+ #include <ATen/ops/addbmm_meta_dispatch.h>
62
+ #include <ATen/ops/addcdiv_meta_dispatch.h>
63
+ #include <ATen/ops/addcmul_meta_dispatch.h>
64
+ #include <ATen/ops/addmm_meta_dispatch.h>
65
+ #include <ATen/ops/addmv_meta_dispatch.h>
66
+ #include <ATen/ops/all_meta_dispatch.h>
67
+ #include <ATen/ops/amax_meta_dispatch.h>
68
+ #include <ATen/ops/amin_meta_dispatch.h>
69
+ #include <ATen/ops/aminmax_meta_dispatch.h>
70
+ #include <ATen/ops/any_meta_dispatch.h>
71
+ #include <ATen/ops/arange_meta_dispatch.h>
72
+ #include <ATen/ops/argmax_meta_dispatch.h>
73
+ #include <ATen/ops/argmin_meta_dispatch.h>
74
+ #include <ATen/ops/as_strided_meta_dispatch.h>
75
+ #include <ATen/ops/asin_meta_dispatch.h>
76
+ #include <ATen/ops/asinh_meta_dispatch.h>
77
+ #include <ATen/ops/atan_meta_dispatch.h>
78
+ #include <ATen/ops/atan2_meta_dispatch.h>
79
+ #include <ATen/ops/atanh_meta_dispatch.h>
80
+ #include <ATen/ops/avg_pool2d_meta_dispatch.h>
81
+ #include <ATen/ops/avg_pool2d_backward_meta_dispatch.h>
82
+ #include <ATen/ops/avg_pool3d_meta_dispatch.h>
83
+ #include <ATen/ops/avg_pool3d_backward_meta_dispatch.h>
84
+ #include <ATen/ops/baddbmm_meta_dispatch.h>
85
+ #include <ATen/ops/bernoulli_meta_dispatch.h>
86
+ #include <ATen/ops/bitwise_and_meta_dispatch.h>
87
+ #include <ATen/ops/bitwise_left_shift_meta_dispatch.h>
88
+ #include <ATen/ops/bitwise_not_meta_dispatch.h>
89
+ #include <ATen/ops/bitwise_or_meta_dispatch.h>
90
+ #include <ATen/ops/bitwise_right_shift_meta_dispatch.h>
91
+ #include <ATen/ops/bitwise_xor_meta_dispatch.h>
92
+ #include <ATen/ops/bmm_meta_dispatch.h>
93
+ #include <ATen/ops/cat_meta_dispatch.h>
94
+ #include <ATen/ops/cauchy_meta_dispatch.h>
95
+ #include <ATen/ops/ceil_meta_dispatch.h>
96
+ #include <ATen/ops/clamp_meta_dispatch.h>
97
+ #include <ATen/ops/clamp_max_meta_dispatch.h>
98
+ #include <ATen/ops/clamp_min_meta_dispatch.h>
99
+ #include <ATen/ops/copy_meta_dispatch.h>
100
+ #include <ATen/ops/copy_sparse_to_sparse_meta_dispatch.h>
101
+ #include <ATen/ops/copysign_meta_dispatch.h>
102
+ #include <ATen/ops/cos_meta_dispatch.h>
103
+ #include <ATen/ops/cosh_meta_dispatch.h>
104
+ #include <ATen/ops/cumprod_meta_dispatch.h>
105
+ #include <ATen/ops/cumsum_meta_dispatch.h>
106
+ #include <ATen/ops/digamma_meta_dispatch.h>
107
+ #include <ATen/ops/div_meta_dispatch.h>
108
+ #include <ATen/ops/elu_meta_dispatch.h>
109
+ #include <ATen/ops/elu_backward_meta_dispatch.h>
110
+ #include <ATen/ops/embedding_renorm_meta_dispatch.h>
111
+ #include <ATen/ops/empty_meta_dispatch.h>
112
+ #include <ATen/ops/empty_strided_meta_dispatch.h>
113
+ #include <ATen/ops/eq_meta_dispatch.h>
114
+ #include <ATen/ops/erf_meta_dispatch.h>
115
+ #include <ATen/ops/erfc_meta_dispatch.h>
116
+ #include <ATen/ops/erfinv_meta_dispatch.h>
117
+ #include <ATen/ops/exp_meta_dispatch.h>
118
+ #include <ATen/ops/exp2_meta_dispatch.h>
119
+ #include <ATen/ops/expm1_meta_dispatch.h>
120
+ #include <ATen/ops/exponential_meta_dispatch.h>
121
+ #include <ATen/ops/eye_meta_dispatch.h>
122
+ #include <ATen/ops/fill_meta_dispatch.h>
123
+ #include <ATen/ops/floor_meta_dispatch.h>
124
+ #include <ATen/ops/floor_divide_meta_dispatch.h>
125
+ #include <ATen/ops/fmax_meta_dispatch.h>
126
+ #include <ATen/ops/fmin_meta_dispatch.h>
127
+ #include <ATen/ops/fmod_meta_dispatch.h>
128
+ #include <ATen/ops/frac_meta_dispatch.h>
129
+ #include <ATen/ops/fractional_max_pool2d_meta_dispatch.h>
130
+ #include <ATen/ops/fractional_max_pool2d_backward_meta_dispatch.h>
131
+ #include <ATen/ops/fractional_max_pool3d_meta_dispatch.h>
132
+ #include <ATen/ops/gather_meta_dispatch.h>
133
+ #include <ATen/ops/gcd_meta_dispatch.h>
134
+ #include <ATen/ops/ge_meta_dispatch.h>
135
+ #include <ATen/ops/gelu_meta_dispatch.h>
136
+ #include <ATen/ops/gelu_backward_meta_dispatch.h>
137
+ #include <ATen/ops/geometric_meta_dispatch.h>
138
+ #include <ATen/ops/glu_meta_dispatch.h>
139
+ #include <ATen/ops/gt_meta_dispatch.h>
140
+ #include <ATen/ops/hardshrink_meta_dispatch.h>
141
+ #include <ATen/ops/hardshrink_backward_meta_dispatch.h>
142
+ #include <ATen/ops/hardsigmoid_meta_dispatch.h>
143
+ #include <ATen/ops/hardsigmoid_backward_meta_dispatch.h>
144
+ #include <ATen/ops/hardswish_meta_dispatch.h>
145
+ #include <ATen/ops/hardtanh_meta_dispatch.h>
146
+ #include <ATen/ops/heaviside_meta_dispatch.h>
147
+ #include <ATen/ops/hypot_meta_dispatch.h>
148
+ #include <ATen/ops/i0_meta_dispatch.h>
149
+ #include <ATen/ops/igamma_meta_dispatch.h>
150
+ #include <ATen/ops/igammac_meta_dispatch.h>
151
+ #include <ATen/ops/index_meta_dispatch.h>
152
+ #include <ATen/ops/index_add_meta_dispatch.h>
153
+ #include <ATen/ops/index_copy_meta_dispatch.h>
154
+ #include <ATen/ops/index_fill_meta_dispatch.h>
155
+ #include <ATen/ops/index_reduce_meta_dispatch.h>
156
+ #include <ATen/ops/isin_meta_dispatch.h>
157
+ #include <ATen/ops/isneginf_meta_dispatch.h>
158
+ #include <ATen/ops/isposinf_meta_dispatch.h>
159
+ #include <ATen/ops/lcm_meta_dispatch.h>
160
+ #include <ATen/ops/le_meta_dispatch.h>
161
+ #include <ATen/ops/leaky_relu_meta_dispatch.h>
162
+ #include <ATen/ops/leaky_relu_backward_meta_dispatch.h>
163
+ #include <ATen/ops/lerp_meta_dispatch.h>
164
+ #include <ATen/ops/lgamma_meta_dispatch.h>
165
+ #include <ATen/ops/linalg_cholesky_ex_meta_dispatch.h>
166
+ #include <ATen/ops/linalg_cross_meta_dispatch.h>
167
+ #include <ATen/ops/linalg_inv_ex_meta_dispatch.h>
168
+ #include <ATen/ops/linalg_ldl_factor_ex_meta_dispatch.h>
169
+ #include <ATen/ops/linalg_ldl_solve_meta_dispatch.h>
170
+ #include <ATen/ops/linalg_lu_meta_dispatch.h>
171
+ #include <ATen/ops/linalg_lu_factor_ex_meta_dispatch.h>
172
+ #include <ATen/ops/linalg_lu_solve_meta_dispatch.h>
173
+ #include <ATen/ops/linalg_qr_meta_dispatch.h>
174
+ #include <ATen/ops/linalg_vector_norm_meta_dispatch.h>
175
+ #include <ATen/ops/linspace_meta_dispatch.h>
176
+ #include <ATen/ops/log_meta_dispatch.h>
177
+ #include <ATen/ops/log10_meta_dispatch.h>
178
+ #include <ATen/ops/log1p_meta_dispatch.h>
179
+ #include <ATen/ops/log2_meta_dispatch.h>
180
+ #include <ATen/ops/log_normal_meta_dispatch.h>
181
+ #include <ATen/ops/logaddexp_meta_dispatch.h>
182
+ #include <ATen/ops/logaddexp2_meta_dispatch.h>
183
+ #include <ATen/ops/logit_meta_dispatch.h>
184
+ #include <ATen/ops/logit_backward_meta_dispatch.h>
185
+ #include <ATen/ops/logspace_meta_dispatch.h>
186
+ #include <ATen/ops/lshift_meta_dispatch.h>
187
+ #include <ATen/ops/lt_meta_dispatch.h>
188
+ #include <ATen/ops/lu_unpack_meta_dispatch.h>
189
+ #include <ATen/ops/masked_fill_meta_dispatch.h>
190
+ #include <ATen/ops/masked_scatter_meta_dispatch.h>
191
+ #include <ATen/ops/max_meta_dispatch.h>
192
+ #include <ATen/ops/max_pool2d_with_indices_meta_dispatch.h>
193
+ #include <ATen/ops/max_pool2d_with_indices_backward_meta_dispatch.h>
194
+ #include <ATen/ops/maximum_meta_dispatch.h>
195
+ #include <ATen/ops/mean_meta_dispatch.h>
196
+ #include <ATen/ops/min_meta_dispatch.h>
197
+ #include <ATen/ops/minimum_meta_dispatch.h>
198
+ #include <ATen/ops/mish_meta_dispatch.h>
199
+ #include <ATen/ops/mm_meta_dispatch.h>
200
+ #include <ATen/ops/mse_loss_meta_dispatch.h>
201
+ #include <ATen/ops/mul_meta_dispatch.h>
202
+ #include <ATen/ops/ne_meta_dispatch.h>
203
+ #include <ATen/ops/neg_meta_dispatch.h>
204
+ #include <ATen/ops/nextafter_meta_dispatch.h>
205
+ #include <ATen/ops/nll_loss_backward_meta_dispatch.h>
206
+ #include <ATen/ops/nll_loss_forward_meta_dispatch.h>
207
+ #include <ATen/ops/norm_meta_dispatch.h>
208
+ #include <ATen/ops/normal_meta_dispatch.h>
209
+ #include <ATen/ops/polygamma_meta_dispatch.h>
210
+ #include <ATen/ops/pow_meta_dispatch.h>
211
+ #include <ATen/ops/prod_meta_dispatch.h>
212
+ #include <ATen/ops/put_meta_dispatch.h>
213
+ #include <ATen/ops/random_meta_dispatch.h>
214
+ #include <ATen/ops/range_meta_dispatch.h>
215
+ #include <ATen/ops/reciprocal_meta_dispatch.h>
216
+ #include <ATen/ops/reflection_pad1d_meta_dispatch.h>
217
+ #include <ATen/ops/reflection_pad1d_backward_meta_dispatch.h>
218
+ #include <ATen/ops/reflection_pad3d_meta_dispatch.h>
219
+ #include <ATen/ops/reflection_pad3d_backward_meta_dispatch.h>
220
+ #include <ATen/ops/relu_meta_dispatch.h>
221
+ #include <ATen/ops/remainder_meta_dispatch.h>
222
+ #include <ATen/ops/renorm_meta_dispatch.h>
223
+ #include <ATen/ops/replication_pad1d_meta_dispatch.h>
224
+ #include <ATen/ops/replication_pad1d_backward_meta_dispatch.h>
225
+ #include <ATen/ops/replication_pad2d_meta_dispatch.h>
226
+ #include <ATen/ops/replication_pad3d_meta_dispatch.h>
227
+ #include <ATen/ops/resize_meta_dispatch.h>
228
+ #include <ATen/ops/resize_as_sparse_meta_dispatch.h>
229
+ #include <ATen/ops/round_meta_dispatch.h>
230
+ #include <ATen/ops/rrelu_with_noise_meta_dispatch.h>
231
+ #include <ATen/ops/rshift_meta_dispatch.h>
232
+ #include <ATen/ops/rsqrt_meta_dispatch.h>
233
+ #include <ATen/ops/scatter_meta_dispatch.h>
234
+ #include <ATen/ops/scatter_add_meta_dispatch.h>
235
+ #include <ATen/ops/scatter_reduce_meta_dispatch.h>
236
+ #include <ATen/ops/set_meta_dispatch.h>
237
+ #include <ATen/ops/sgn_meta_dispatch.h>
238
+ #include <ATen/ops/sigmoid_meta_dispatch.h>
239
+ #include <ATen/ops/sigmoid_backward_meta_dispatch.h>
240
+ #include <ATen/ops/sign_meta_dispatch.h>
241
+ #include <ATen/ops/signbit_meta_dispatch.h>
242
+ #include <ATen/ops/silu_meta_dispatch.h>
243
+ #include <ATen/ops/silu_backward_meta_dispatch.h>
244
+ #include <ATen/ops/sin_meta_dispatch.h>
245
+ #include <ATen/ops/sinc_meta_dispatch.h>
246
+ #include <ATen/ops/sinh_meta_dispatch.h>
247
+ #include <ATen/ops/slow_conv_transpose2d_meta_dispatch.h>
248
+ #include <ATen/ops/smooth_l1_loss_meta_dispatch.h>
249
+ #include <ATen/ops/softplus_meta_dispatch.h>
250
+ #include <ATen/ops/softplus_backward_meta_dispatch.h>
251
+ #include <ATen/ops/softshrink_meta_dispatch.h>
252
+ #include <ATen/ops/softshrink_backward_meta_dispatch.h>
253
+ #include <ATen/ops/sort_meta_dispatch.h>
254
+ #include <ATen/ops/sparse_resize_meta_dispatch.h>
255
+ #include <ATen/ops/sparse_resize_and_clear_meta_dispatch.h>
256
+ #include <ATen/ops/special_airy_ai_meta_dispatch.h>
257
+ #include <ATen/ops/special_bessel_j0_meta_dispatch.h>
258
+ #include <ATen/ops/special_bessel_j1_meta_dispatch.h>
259
+ #include <ATen/ops/special_bessel_y0_meta_dispatch.h>
260
+ #include <ATen/ops/special_bessel_y1_meta_dispatch.h>
261
+ #include <ATen/ops/special_chebyshev_polynomial_t_meta_dispatch.h>
262
+ #include <ATen/ops/special_chebyshev_polynomial_u_meta_dispatch.h>
263
+ #include <ATen/ops/special_chebyshev_polynomial_v_meta_dispatch.h>
264
+ #include <ATen/ops/special_chebyshev_polynomial_w_meta_dispatch.h>
265
+ #include <ATen/ops/special_entr_meta_dispatch.h>
266
+ #include <ATen/ops/special_erfcx_meta_dispatch.h>
267
+ #include <ATen/ops/special_hermite_polynomial_h_meta_dispatch.h>
268
+ #include <ATen/ops/special_hermite_polynomial_he_meta_dispatch.h>
269
+ #include <ATen/ops/special_i0e_meta_dispatch.h>
270
+ #include <ATen/ops/special_i1_meta_dispatch.h>
271
+ #include <ATen/ops/special_i1e_meta_dispatch.h>
272
+ #include <ATen/ops/special_laguerre_polynomial_l_meta_dispatch.h>
273
+ #include <ATen/ops/special_legendre_polynomial_p_meta_dispatch.h>
274
+ #include <ATen/ops/special_log_ndtr_meta_dispatch.h>
275
+ #include <ATen/ops/special_modified_bessel_i0_meta_dispatch.h>
276
+ #include <ATen/ops/special_modified_bessel_i1_meta_dispatch.h>
277
+ #include <ATen/ops/special_modified_bessel_k0_meta_dispatch.h>
278
+ #include <ATen/ops/special_modified_bessel_k1_meta_dispatch.h>
279
+ #include <ATen/ops/special_ndtri_meta_dispatch.h>
280
+ #include <ATen/ops/special_scaled_modified_bessel_k0_meta_dispatch.h>
281
+ #include <ATen/ops/special_scaled_modified_bessel_k1_meta_dispatch.h>
282
+ #include <ATen/ops/special_shifted_chebyshev_polynomial_t_meta_dispatch.h>
283
+ #include <ATen/ops/special_shifted_chebyshev_polynomial_u_meta_dispatch.h>
284
+ #include <ATen/ops/special_shifted_chebyshev_polynomial_v_meta_dispatch.h>
285
+ #include <ATen/ops/special_shifted_chebyshev_polynomial_w_meta_dispatch.h>
286
+ #include <ATen/ops/special_spherical_bessel_j0_meta_dispatch.h>
287
+ #include <ATen/ops/special_xlog1py_meta_dispatch.h>
288
+ #include <ATen/ops/special_zeta_meta_dispatch.h>
289
+ #include <ATen/ops/sqrt_meta_dispatch.h>
290
+ #include <ATen/ops/sub_meta_dispatch.h>
291
+ #include <ATen/ops/sum_meta_dispatch.h>
292
+ #include <ATen/ops/tan_meta_dispatch.h>
293
+ #include <ATen/ops/tanh_meta_dispatch.h>
294
+ #include <ATen/ops/tanh_backward_meta_dispatch.h>
295
+ #include <ATen/ops/threshold_meta_dispatch.h>
296
+ #include <ATen/ops/threshold_backward_meta_dispatch.h>
297
+ #include <ATen/ops/topk_meta_dispatch.h>
298
+ #include <ATen/ops/triangular_solve_meta_dispatch.h>
299
+ #include <ATen/ops/tril_meta_dispatch.h>
300
+ #include <ATen/ops/triu_meta_dispatch.h>
301
+ #include <ATen/ops/trunc_meta_dispatch.h>
302
+ #include <ATen/ops/unfold_meta_dispatch.h>
303
+ #include <ATen/ops/uniform_meta_dispatch.h>
304
+ #include <ATen/ops/upsample_bicubic2d_meta_dispatch.h>
305
+ #include <ATen/ops/upsample_bicubic2d_backward_meta_dispatch.h>
306
+ #include <ATen/ops/upsample_bilinear2d_meta_dispatch.h>
307
+ #include <ATen/ops/upsample_bilinear2d_backward_meta_dispatch.h>
308
+ #include <ATen/ops/upsample_linear1d_meta_dispatch.h>
309
+ #include <ATen/ops/upsample_linear1d_backward_meta_dispatch.h>
310
+ #include <ATen/ops/upsample_nearest1d_meta_dispatch.h>
311
+ #include <ATen/ops/upsample_nearest1d_backward_meta_dispatch.h>
312
+ #include <ATen/ops/upsample_nearest2d_meta_dispatch.h>
313
+ #include <ATen/ops/upsample_nearest2d_backward_meta_dispatch.h>
314
+ #include <ATen/ops/upsample_nearest3d_meta_dispatch.h>
315
+ #include <ATen/ops/upsample_nearest3d_backward_meta_dispatch.h>
316
+ #include <ATen/ops/upsample_trilinear3d_meta_dispatch.h>
317
+ #include <ATen/ops/upsample_trilinear3d_backward_meta_dispatch.h>
318
+ #include <ATen/ops/view_meta_dispatch.h>
319
+ #include <ATen/ops/view_as_complex_meta_dispatch.h>
320
+ #include <ATen/ops/view_as_real_meta_dispatch.h>
321
+ #include <ATen/ops/xlogy_meta_dispatch.h>
322
+ #include <ATen/ops/zero_meta_dispatch.h>
323
+
324
+
325
+
.venv/lib/python3.11/site-packages/torch/include/ATen/PTThreadPool.h ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #pragma once
2
+
3
+ #include <ATen/Parallel.h>
4
+ #include <c10/core/thread_pool.h>
5
+
6
+ namespace at {
7
+
8
+ class TORCH_API PTThreadPool : public c10::ThreadPool {
9
+ public:
10
+ explicit PTThreadPool(int pool_size, int numa_node_id = -1)
11
+ : c10::ThreadPool(pool_size, numa_node_id, []() {
12
+ c10::setThreadName("PTThreadPool");
13
+ at::init_num_threads();
14
+ }) {}
15
+ };
16
+
17
+ } // namespace at
.venv/lib/python3.11/site-packages/torch/include/ATen/SequenceNumber.h ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #pragma once
2
+
3
+ #include <c10/macros/Export.h>
4
+ #include <cstdint>
5
+
6
+ // A simple thread local enumeration, used to link forward and backward pass
7
+ // ops and is used by autograd and observers framework
8
+ namespace at::sequence_number {
9
+
10
+ TORCH_API uint64_t peek();
11
+ TORCH_API uint64_t get_and_increment();
12
+
13
+ } // namespace at::sequence_number
.venv/lib/python3.11/site-packages/torch/include/ATen/ThreadLocalPythonObjects.h ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #pragma once
2
+
3
+ #include <c10/core/SafePyObject.h>
4
+ #include <c10/macros/Macros.h>
5
+ #include <unordered_map>
6
+
7
+ namespace at::impl {
8
+
9
+ struct TORCH_API ThreadLocalPythonObjects {
10
+ static void set(const std::string& key, std::shared_ptr<SafePyObject> value);
11
+ static const std::shared_ptr<SafePyObject>& get(const std::string& key);
12
+ static bool contains(const std::string& key);
13
+
14
+ static const ThreadLocalPythonObjects& get_state();
15
+ static void set_state(ThreadLocalPythonObjects state);
16
+
17
+ private:
18
+ std::unordered_map<std::string, std::shared_ptr<c10::SafePyObject>> obj_dict_;
19
+ };
20
+
21
+ } // namespace at::impl
.venv/lib/python3.11/site-packages/torch/include/ATen/ThreadLocalState.h ADDED
@@ -0,0 +1,120 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #pragma once
2
+
3
+ #include <c10/core/InferenceMode.h>
4
+ #include <c10/core/impl/LocalDispatchKeySet.h>
5
+ #include <c10/util/Exception.h>
6
+ #include <c10/util/ThreadLocalDebugInfo.h>
7
+
8
+ #include <ATen/FuncTorchTLS.h>
9
+ #include <ATen/PythonTorchFunctionTLS.h>
10
+ #include <ATen/SavedTensorHooks.h>
11
+ #include <ATen/ThreadLocalPythonObjects.h>
12
+ #include <ATen/record_function.h>
13
+ #include <c10/core/impl/PythonDispatcherTLS.h>
14
+ #include <c10/core/impl/TorchDispatchModeTLS.h>
15
+
16
+ namespace at {
17
+
18
+ // Thread local state contains values that are preserved across
19
+ // thread boundaries (e.g. at::launch/JIT fork, autograd).
20
+ // Note at::parallel_for doesn't preserve TLS across thread boundaries.
21
+ class TORCH_API ThreadLocalState {
22
+ public:
23
+ // Saves the thread local variables' values and
24
+ // returns them as a ThreadLocalState
25
+ ThreadLocalState();
26
+
27
+ // set_grad_mode - force the value of the grad mode TLS in
28
+ // the current state object. This is used for example in the
29
+ // autograd engine.
30
+ void set_grad_mode(bool enabled);
31
+
32
+ // set_multithreading_enabled - force the value of the multithreadinmaximum
33
+ // threads TLS in
34
+ // the current state object. This is used for example in the
35
+ // autograd engine.
36
+ void set_multithreading_enabled(bool enabled);
37
+
38
+ // Sets thread local variables in the current thread,
39
+ // according to the thread boundary specified
40
+ static void setThreadLocalState(const ThreadLocalState& state);
41
+
42
+ private:
43
+ c10::impl::LocalDispatchKeySet dispatch_key_;
44
+
45
+ // ThreadLocalDebugInfo does not change after being created
46
+ // with DebugInfoGuard
47
+ std::shared_ptr<c10::ThreadLocalDebugInfo> debug_info_;
48
+
49
+ // RecordFunction TLS
50
+ RecordFunctionTLS rf_tls_;
51
+
52
+ // TLS for out-of-tree functorch
53
+ // See NOTE [functorch TLS in pytorch/pytorch] for why this needs to be a
54
+ // pointer (spoiler alert: it's due to the indirection)
55
+ // This needs to be a shared_ptr instead of a unique_ptr because
56
+ // ThreadLocalState is copy-able and does indeed get copied. Maybe we can
57
+ // consider adding an explicit copy constructor for ThreadLocalState in the
58
+ // future but I didn't want to add one just for this.
59
+ std::shared_ptr<const functorch::FuncTorchTLSBase> functorch_tls_;
60
+
61
+ // TLS for AutogradModes
62
+ AutogradState autograd_tls_;
63
+
64
+ // TLS for enable_torch_dispatch_mode
65
+ c10::impl::TorchDispatchModeTLS torch_dispatch_mode_state_;
66
+
67
+ // TLS for enable_python_dispatcher
68
+ c10::impl::PyInterpreter* python_dispatcher_state_;
69
+
70
+ // TLS for __torch_function__ (mode and disable_torch_function)
71
+ at::impl::PythonTorchFunctionTLS python_torch_function_state_;
72
+
73
+ // TLS for saved tensors default hooks
74
+ at::impl::SavedTensorDefaultHooksTLS saved_tensors_default_hooks_state_;
75
+
76
+ bool functionalization_reapply_views_state_;
77
+
78
+ // TLS for arbitrary python objects that is registered via hooks
79
+ at::impl::ThreadLocalPythonObjects saved_objects_;
80
+
81
+ #if !defined(CAFFE2_IS_XPLAT_BUILD) && !defined(C10_MOBILE) && \
82
+ !defined(BUILD_LITE_INTERPRETER)
83
+ // TLS for autocast dtypes
84
+ std::array<at::ScalarType, at::COMPILE_TIME_MAX_DEVICE_TYPES>
85
+ autocast_dtypes_;
86
+ #endif
87
+
88
+ friend class ThreadLocalStateGuard;
89
+ };
90
+
91
+ // Guard to set and reset the thread local state
92
+ class TORCH_API ThreadLocalStateGuard {
93
+ public:
94
+ explicit ThreadLocalStateGuard(const ThreadLocalState& state)
95
+ : prev_state_(ThreadLocalState()) {
96
+ // set the given state across the thread boundary
97
+ ThreadLocalState::setThreadLocalState(state);
98
+ }
99
+
100
+ ~ThreadLocalStateGuard() {
101
+ // restore previously set variables
102
+ ThreadLocalState::setThreadLocalState(prev_state_);
103
+ }
104
+
105
+ private:
106
+ // NOLINTNEXTLINE(cppcoreguidelines-avoid-const-or-ref-data-members)
107
+ const ThreadLocalState prev_state_;
108
+ };
109
+
110
+ template <typename T>
111
+ auto wrapPropagateTLSState(T callback) {
112
+ return [tls_state = ThreadLocalState(),
113
+ callback = std::move(callback)](auto&&... args) {
114
+ ThreadLocalStateGuard g(tls_state);
115
+ // Propagate value returned by callback().
116
+ return callback(std::forward<decltype(args)>(args)...);
117
+ };
118
+ }
119
+
120
+ } // namespace at
.venv/lib/python3.11/site-packages/torch/include/ATen/Version.h ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #include <ATen/Context.h>
2
+
3
+ namespace at {
4
+
5
+ /// Returns a detailed string describing the configuration PyTorch.
6
+ TORCH_API std::string show_config();
7
+
8
+ TORCH_API std::string get_mkl_version();
9
+
10
+ TORCH_API std::string get_mkldnn_version();
11
+
12
+ TORCH_API std::string get_openmp_version();
13
+
14
+ TORCH_API std::string get_cxx_flags();
15
+
16
+ TORCH_API std::string get_cpu_capability();
17
+
18
+ } // namespace at
.venv/lib/python3.11/site-packages/torch/include/ATen/cuda/AsmUtils.cuh ADDED
@@ -0,0 +1,149 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #pragma once
2
+ #include <cstdint>
3
+
4
+ // Collection of direct PTX functions
5
+
6
+ namespace at::cuda {
7
+
8
// Bitfield extract/insert helpers backed by PTX bfe/bfi on device, with
// a plain-C++ fallback on the host. The primary template is empty so
// unsupported types fail to compile at the point of use.
template <typename T>
struct Bitfield {};

template <>
struct Bitfield<unsigned int> {
  // Returns the `len`-bit field of `val` starting at bit `pos`.
  // NOTE(review): the host fallback computes `1u << len`, which is
  // undefined for len >= 32; callers presumably keep len < 32 — confirm.
  static __device__ __host__ __forceinline__
  unsigned int getBitfield(unsigned int val, int pos, int len) {
#if !defined(__CUDA_ARCH__)
    // Host fallback mirroring bfe.u32: pos/len are taken modulo 256.
    pos &= 0xff;
    len &= 0xff;

    unsigned int m = (1u << len) - 1u;
    return (val >> pos) & m;
#else
    unsigned int ret;
    // PTX bitfield extract.
    asm("bfe.u32 %0, %1, %2, %3;" : "=r"(ret) : "r"(val), "r"(pos), "r"(len));
    return ret;
#endif
  }

  // Returns `val` with its `len`-bit field at `pos` replaced by the low
  // `len` bits of `toInsert`.
  static __device__ __host__ __forceinline__
  unsigned int setBitfield(unsigned int val, unsigned int toInsert, int pos, int len) {
#if !defined(__CUDA_ARCH__)
    pos &= 0xff;
    len &= 0xff;

    unsigned int m = (1u << len) - 1u;
    toInsert &= m;
    toInsert <<= pos;
    m <<= pos;

    return (val & ~m) | toInsert;
#else
    unsigned int ret;
    // PTX bitfield insert.
    asm("bfi.b32 %0, %1, %2, %3, %4;" :
        "=r"(ret) : "r"(toInsert), "r"(val), "r"(pos), "r"(len));
    return ret;
#endif
  }
};
48
+
49
+ template <>
50
+ struct Bitfield<uint64_t> {
51
+ static __device__ __host__ __forceinline__
52
+ uint64_t getBitfield(uint64_t val, int pos, int len) {
53
+ #if !defined(__CUDA_ARCH__)
54
+ pos &= 0xff;
55
+ len &= 0xff;
56
+
57
+ uint64_t m = (1u << len) - 1u;
58
+ return (val >> pos) & m;
59
+ #else
60
+ uint64_t ret;
61
+ asm("bfe.u64 %0, %1, %2, %3;" : "=l"(ret) : "l"(val), "r"(pos), "r"(len));
62
+ return ret;
63
+ #endif
64
+ }
65
+
66
+ static __device__ __host__ __forceinline__
67
+ uint64_t setBitfield(uint64_t val, uint64_t toInsert, int pos, int len) {
68
+ #if !defined(__CUDA_ARCH__)
69
+ pos &= 0xff;
70
+ len &= 0xff;
71
+
72
+ uint64_t m = (1u << len) - 1u;
73
+ toInsert &= m;
74
+ toInsert <<= pos;
75
+ m <<= pos;
76
+
77
+ return (val & ~m) | toInsert;
78
+ #else
79
+ uint64_t ret;
80
+ asm("bfi.b64 %0, %1, %2, %3, %4;" :
81
+ "=l"(ret) : "l"(toInsert), "l"(val), "r"(pos), "r"(len));
82
+ return ret;
83
+ #endif
84
+ }
85
+ };
86
+
87
// Index of the calling thread within its warp (CUDA) or wavefront (ROCm).
__device__ __forceinline__ int getLaneId() {
#if defined(USE_ROCM)
  return __lane_id();
#else
  int laneId;
  // Read the %laneid special register.
  asm("mov.s32 %0, %%laneid;" : "=r"(laneId) );
  return laneId;
#endif
}
96
+
97
// Lane-mask helpers. On CUDA these read the 32-bit %lanemask_* special
// registers; on ROCm the masks are computed from the lane id and are
// 64 bits wide (the ROCm variants return 64-bit values).
// NOTE(review): the ROCm paths use UINT64_MAX (<cstdint>) and CHAR_BIT
// (<climits>); only <cstdint> is included by this header, so CHAR_BIT
// presumably arrives transitively — confirm.
#if defined(USE_ROCM)
// Mask of lanes with id strictly less than the calling lane's.
__device__ __forceinline__ unsigned long long int getLaneMaskLt() {
  const std::uint64_t m = (1ull << getLaneId()) - 1ull;
  return m;
}
#else
__device__ __forceinline__ unsigned getLaneMaskLt() {
  unsigned mask;
  asm("mov.u32 %0, %%lanemask_lt;" : "=r"(mask));
  return mask;
}
#endif

#if defined (USE_ROCM)
// Mask of lanes with id less than or equal to the calling lane's.
__device__ __forceinline__ unsigned long long int getLaneMaskLe() {
  std::uint64_t m = UINT64_MAX >> (sizeof(std::uint64_t) * CHAR_BIT - (getLaneId() + 1));
  return m;
}
#else
__device__ __forceinline__ unsigned getLaneMaskLe() {
  unsigned mask;
  asm("mov.u32 %0, %%lanemask_le;" : "=r"(mask));
  return mask;
}
#endif

#if defined(USE_ROCM)
// Mask of lanes with id strictly greater than the calling lane's.
__device__ __forceinline__ unsigned long long int getLaneMaskGt() {
  // The "le" mask always contains the calling lane's bit, so m is
  // nonzero; the ternary appears to be a defensive guard.
  const std::uint64_t m = getLaneMaskLe();
  return m ? ~m : m;
}
#else
__device__ __forceinline__ unsigned getLaneMaskGt() {
  unsigned mask;
  asm("mov.u32 %0, %%lanemask_gt;" : "=r"(mask));
  return mask;
}
#endif

#if defined(USE_ROCM)
// Mask of lanes with id greater than or equal to the calling lane's:
// the complement of the strictly-less-than mask.
__device__ __forceinline__ unsigned long long int getLaneMaskGe() {
  const std::uint64_t m = getLaneMaskLt();
  return ~m;
}
#else
__device__ __forceinline__ unsigned getLaneMaskGe() {
  unsigned mask;
  asm("mov.u32 %0, %%lanemask_ge;" : "=r"(mask));
  return mask;
}
#endif
149
+ } // namespace at::cuda
.venv/lib/python3.11/site-packages/torch/include/ATen/cuda/CUDAContextLight.h ADDED
@@ -0,0 +1,99 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #pragma once
2
+ // Light-weight version of CUDAContext.h with fewer transitive includes
3
+
4
+ #include <cstdint>
5
+
6
+ #include <cuda_runtime_api.h>
7
+ #include <cusparse.h>
8
+ #include <cublas_v2.h>
9
+
10
+ // cublasLT was introduced in CUDA 10.1 but we enable only for 11.1 that also
11
+ // added bf16 support
12
+ #include <cublasLt.h>
13
+
14
+ #ifdef CUDART_VERSION
15
+ #include <cusolverDn.h>
16
+ #endif
17
+
18
+ #if defined(USE_CUDSS)
19
+ #include <cudss.h>
20
+ #endif
21
+
22
+ #if defined(USE_ROCM)
23
+ #include <hipsolver/hipsolver.h>
24
+ #endif
25
+
26
+ #include <c10/core/Allocator.h>
27
+ #include <c10/cuda/CUDAFunctions.h>
28
+
29
+ namespace c10 {
30
+ struct Allocator;
31
+ }
32
+
33
+ namespace at::cuda {
34
+
35
+ /*
36
+ A common CUDA interface for ATen.
37
+
38
+ This interface is distinct from CUDAHooks, which defines an interface that links
39
+ to both CPU-only and CUDA builds. That interface is intended for runtime
40
+ dispatch and should be used from files that are included in both CPU-only and
41
+ CUDA builds.
42
+
43
+ CUDAContext, on the other hand, should be preferred by files only included in
44
+ CUDA builds. It is intended to expose CUDA functionality in a consistent
45
+ manner.
46
+
47
+ This means there is some overlap between the CUDAContext and CUDAHooks, but
48
+ the choice of which to use is simple: use CUDAContext when in a CUDA-only file,
49
+ use CUDAHooks otherwise.
50
+
51
+ Note that CUDAContext simply defines an interface with no associated class.
52
+ It is expected that the modules whose functions compose this interface will
53
+ manage their own state. There is only a single CUDA context/state.
54
+ */
55
+
56
/**
 * DEPRECATED: use device_count() instead
 */
inline int64_t getNumGPUs() {
  // Thin forwarding wrapper kept for backward compatibility.
  return c10::cuda::device_count();
}

/**
 * CUDA is available if we compiled with CUDA, and there are one or more
 * devices. If we compiled with CUDA but there is a driver problem, etc.,
 * this function will report CUDA is not available (rather than raise an error.)
 */
inline bool is_available() {
  return c10::cuda::device_count() > 0;
}
71
+
72
+ TORCH_CUDA_CPP_API cudaDeviceProp* getCurrentDeviceProperties();
73
+
74
+ TORCH_CUDA_CPP_API int warp_size();
75
+
76
+ TORCH_CUDA_CPP_API cudaDeviceProp* getDeviceProperties(c10::DeviceIndex device);
77
+
78
+ TORCH_CUDA_CPP_API bool canDeviceAccessPeer(
79
+ c10::DeviceIndex device,
80
+ c10::DeviceIndex peer_device);
81
+
82
+ TORCH_CUDA_CPP_API c10::Allocator* getCUDADeviceAllocator();
83
+
84
+ /* Handles */
85
+ TORCH_CUDA_CPP_API cusparseHandle_t getCurrentCUDASparseHandle();
86
+ TORCH_CUDA_CPP_API cublasHandle_t getCurrentCUDABlasHandle();
87
+ TORCH_CUDA_CPP_API cublasLtHandle_t getCurrentCUDABlasLtHandle();
88
+
89
+ TORCH_CUDA_CPP_API void clearCublasWorkspaces();
90
+
91
+ #if defined(CUDART_VERSION) || defined(USE_ROCM)
92
+ TORCH_CUDA_CPP_API cusolverDnHandle_t getCurrentCUDASolverDnHandle();
93
+ #endif
94
+
95
+ #if defined(USE_CUDSS)
96
+ TORCH_CUDA_CPP_API cudssHandle_t getCurrentCudssHandle();
97
+ #endif
98
+
99
+ } // namespace at::cuda
.venv/lib/python3.11/site-packages/torch/include/ATen/cuda/CUDADataType.h ADDED
@@ -0,0 +1,105 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #pragma once
2
+
3
+ #include <c10/core/ScalarType.h>
4
+
5
+ #include <cuda.h>
6
+ #include <library_types.h>
7
+
8
+ namespace at::cuda {
9
+
10
// Compile-time mapping from C++ scalar types to the cudaDataType
// enumeration consumed by cuBLAS/cuSPARSE/etc. The primary template's
// static_assert uses a dependent-false expression so the error fires
// only when an unsupported type is actually instantiated.
template <typename scalar_t>
cudaDataType getCudaDataType() {
  static_assert(false && sizeof(scalar_t), "Cannot convert type to cudaDataType.");
  return {};
}

// Floating-point and complex types.
template<> inline cudaDataType getCudaDataType<at::Half>() {
  return CUDA_R_16F;
}
template<> inline cudaDataType getCudaDataType<float>() {
  return CUDA_R_32F;
}
template<> inline cudaDataType getCudaDataType<double>() {
  return CUDA_R_64F;
}
template<> inline cudaDataType getCudaDataType<c10::complex<c10::Half>>() {
  return CUDA_C_16F;
}
template<> inline cudaDataType getCudaDataType<c10::complex<float>>() {
  return CUDA_C_32F;
}
template<> inline cudaDataType getCudaDataType<c10::complex<double>>() {
  return CUDA_C_64F;
}

// Integral types.
template<> inline cudaDataType getCudaDataType<uint8_t>() {
  return CUDA_R_8U;
}
template<> inline cudaDataType getCudaDataType<int8_t>() {
  return CUDA_R_8I;
}
template<> inline cudaDataType getCudaDataType<int>() {
  return CUDA_R_32I;
}

template<> inline cudaDataType getCudaDataType<int16_t>() {
  return CUDA_R_16I;
}
template<> inline cudaDataType getCudaDataType<int64_t>() {
  return CUDA_R_64I;
}
template<> inline cudaDataType getCudaDataType<at::BFloat16>() {
  return CUDA_R_16BF;
}
54
+
55
// Runtime counterpart of getCudaDataType<>: maps an ATen ScalarType to
// the corresponding cudaDataType. Asserts (TORCH_INTERNAL_ASSERT) on
// types with no CUDA equivalent.
inline cudaDataType ScalarTypeToCudaDataType(const c10::ScalarType& scalar_type) {
  switch (scalar_type) {
    case c10::ScalarType::Byte:
      return CUDA_R_8U;
    case c10::ScalarType::Char:
      return CUDA_R_8I;
    case c10::ScalarType::Int:
      return CUDA_R_32I;
    case c10::ScalarType::Half:
      return CUDA_R_16F;
    case c10::ScalarType::Float:
      return CUDA_R_32F;
    case c10::ScalarType::Double:
      return CUDA_R_64F;
    case c10::ScalarType::ComplexHalf:
      return CUDA_C_16F;
    case c10::ScalarType::ComplexFloat:
      return CUDA_C_32F;
    case c10::ScalarType::ComplexDouble:
      return CUDA_C_64F;
    case c10::ScalarType::Short:
      return CUDA_R_16I;
    case c10::ScalarType::Long:
      return CUDA_R_64I;
    case c10::ScalarType::BFloat16:
      return CUDA_R_16BF;
#if defined(CUDA_VERSION) && CUDA_VERSION >= 11080
    // FP8 enumerators exist only from CUDA 11.8 onward.
    case c10::ScalarType::Float8_e4m3fn:
      return CUDA_R_8F_E4M3;
    case c10::ScalarType::Float8_e5m2:
      return CUDA_R_8F_E5M2;
#endif
#if defined(USE_ROCM)
#if defined(HIP_NEW_TYPE_ENUMS)
    case c10::ScalarType::Float8_e4m3fnuz:
      return HIP_R_8F_E4M3_FNUZ;
    case c10::ScalarType::Float8_e5m2fnuz:
      return HIP_R_8F_E5M2_FNUZ;
#else
    // NOTE(review): 1000/1001 presumably match the hipDataType values
    // that HIP_NEW_TYPE_ENUMS later names — confirm against hip headers.
    case c10::ScalarType::Float8_e4m3fnuz:
      return static_cast<hipDataType>(1000);
    case c10::ScalarType::Float8_e5m2fnuz:
      return static_cast<hipDataType>(1001);
#endif
#endif
    default:
      TORCH_INTERNAL_ASSERT(false, "Cannot convert ScalarType ", scalar_type, " to cudaDataType.")
  }
}
+
105
+ } // namespace at::cuda
.venv/lib/python3.11/site-packages/torch/include/ATen/cuda/CUDADevice.h ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #pragma once
2
+
3
+ #include <ATen/cuda/Exceptions.h>
4
+
5
+ #include <cuda.h>
6
+ #include <cuda_runtime.h>
7
+
8
+ namespace at::cuda {
9
+
10
// Maps a pointer to the CUDA Device that owns its allocation.
// Throws via AT_CUDA_CHECK if cudaPointerGetAttributes fails.
inline Device getDeviceFromPtr(void* ptr) {
  cudaPointerAttributes attr{};

  AT_CUDA_CHECK(cudaPointerGetAttributes(&attr, ptr));

#if !defined(USE_ROCM)
  // cudaMemoryTypeUnregistered means plain host memory: it has no owning
  // device, so attr.device below would be meaningless.
  // NOTE(review): this check is skipped on ROCm — presumably for HIP
  // runtime compatibility; confirm.
  TORCH_CHECK(attr.type != cudaMemoryTypeUnregistered,
    "The specified pointer resides on host memory and is not registered with any CUDA device.");
#endif

  return {c10::DeviceType::CUDA, static_cast<DeviceIndex>(attr.device)};
}
22
+
23
+ } // namespace at::cuda
.venv/lib/python3.11/site-packages/torch/include/ATen/cuda/CUDAUtils.h ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #pragma once
2
+
3
+ #include <ATen/cuda/CUDAContext.h>
4
+
5
+ namespace at::cuda {
6
+
7
+ // Check if every tensor in a list of tensors matches the current
8
+ // device.
9
+ inline bool check_device(ArrayRef<Tensor> ts) {
10
+ if (ts.empty()) {
11
+ return true;
12
+ }
13
+ Device curDevice = Device(kCUDA, current_device());
14
+ for (const Tensor& t : ts) {
15
+ if (t.device() != curDevice) return false;
16
+ }
17
+ return true;
18
+ }
19
+
20
+ } // namespace at::cuda
.venv/lib/python3.11/site-packages/torch/include/ATen/cuda/NumericLimits.cuh ADDED
@@ -0,0 +1,121 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #pragma once
2
+
3
+ #include <cuda.h>
4
+ #include <limits.h>
5
+ #include <math.h>
6
+ #include <float.h>
7
+
8
+ // NumericLimits.cuh is a holder for numeric limits definitions of commonly used
9
+ // types. This header is very specific to ROCm HIP and may be removed in the future.
10
+ // This header is derived from the legacy THCNumerics.cuh.
11
+
12
+ // The lower_bound and upper_bound constants are same as lowest and max for
13
+ // integral types, but are -inf and +inf for floating point types. They are
14
+ // useful in implementing min, max, etc.
15
+
16
+ namespace at {
17
+
18
// Primary template: intentionally empty so that using an unspecialized
// type is a compile error.
template <typename T>
struct numeric_limits {
};

// WARNING: the following at::numeric_limits definitions are there only to support
// HIP compilation for the moment. Use std::numeric_limits if you are not
// compiling for ROCm.
// from @colesbury: "The functions on numeric_limits aren't marked with
// __device__ which is why they don't work with ROCm. CUDA allows them
// because they're constexpr."

namespace {
// ROCm doesn't like INFINITY too.
// NOTE(review): an anonymous namespace in a header gives every
// translation unit its own `inf`; for a constexpr constant that is
// benign, but it is why linters may flag this header.
constexpr double inf = INFINITY;
}
33
+
34
// Integral specializations: lower_bound/upper_bound coincide with
// lowest/max (only floating-point types use +/-inf instead).
template <>
struct numeric_limits<bool> {
  static inline __host__ __device__ bool lowest() { return false; }
  static inline __host__ __device__ bool max() { return true; }
  static inline __host__ __device__ bool lower_bound() { return false; }
  static inline __host__ __device__ bool upper_bound() { return true; }
};

template <>
struct numeric_limits<uint8_t> {
  static inline __host__ __device__ uint8_t lowest() { return 0; }
  static inline __host__ __device__ uint8_t max() { return UINT8_MAX; }
  static inline __host__ __device__ uint8_t lower_bound() { return 0; }
  static inline __host__ __device__ uint8_t upper_bound() { return UINT8_MAX; }
};

template <>
struct numeric_limits<int8_t> {
  static inline __host__ __device__ int8_t lowest() { return INT8_MIN; }
  static inline __host__ __device__ int8_t max() { return INT8_MAX; }
  static inline __host__ __device__ int8_t lower_bound() { return INT8_MIN; }
  static inline __host__ __device__ int8_t upper_bound() { return INT8_MAX; }
};

template <>
struct numeric_limits<int16_t> {
  static inline __host__ __device__ int16_t lowest() { return INT16_MIN; }
  static inline __host__ __device__ int16_t max() { return INT16_MAX; }
  static inline __host__ __device__ int16_t lower_bound() { return INT16_MIN; }
  static inline __host__ __device__ int16_t upper_bound() { return INT16_MAX; }
};

template <>
struct numeric_limits<int32_t> {
  static inline __host__ __device__ int32_t lowest() { return INT32_MIN; }
  static inline __host__ __device__ int32_t max() { return INT32_MAX; }
  static inline __host__ __device__ int32_t lower_bound() { return INT32_MIN; }
  static inline __host__ __device__ int32_t upper_bound() { return INT32_MAX; }
};

template <>
struct numeric_limits<int64_t> {
#ifdef _MSC_VER
  // MSVC spells the 64-bit limits _I64_MIN/_I64_MAX.
  static inline __host__ __device__ int64_t lowest() { return _I64_MIN; }
  static inline __host__ __device__ int64_t max() { return _I64_MAX; }
  static inline __host__ __device__ int64_t lower_bound() { return _I64_MIN; }
  static inline __host__ __device__ int64_t upper_bound() { return _I64_MAX; }
#else
  static inline __host__ __device__ int64_t lowest() { return INT64_MIN; }
  static inline __host__ __device__ int64_t max() { return INT64_MAX; }
  static inline __host__ __device__ int64_t lower_bound() { return INT64_MIN; }
  static inline __host__ __device__ int64_t upper_bound() { return INT64_MAX; }
#endif
};
88
+
89
// Floating-point specializations. lowest/max are the extreme *finite*
// values; lower_bound/upper_bound are -inf/+inf (useful for min/max
// reductions). Half/BFloat16 are built from raw bit patterns.
template <>
struct numeric_limits<at::Half> {
  // 0xFBFF/0x7BFF = -/+65504, the extreme finite fp16 values.
  static inline __host__ __device__ at::Half lowest() { return at::Half(0xFBFF, at::Half::from_bits()); }
  static inline __host__ __device__ at::Half max() { return at::Half(0x7BFF, at::Half::from_bits()); }
  // 0xFC00/0x7C00 = -/+infinity in fp16.
  static inline __host__ __device__ at::Half lower_bound() { return at::Half(0xFC00, at::Half::from_bits()); }
  static inline __host__ __device__ at::Half upper_bound() { return at::Half(0x7C00, at::Half::from_bits()); }
};

template <>
struct numeric_limits<at::BFloat16> {
  // 0xFF7F/0x7F7F = extreme finite bfloat16 values (~ +/-3.39e38).
  static inline __host__ __device__ at::BFloat16 lowest() { return at::BFloat16(0xFF7F, at::BFloat16::from_bits()); }
  static inline __host__ __device__ at::BFloat16 max() { return at::BFloat16(0x7F7F, at::BFloat16::from_bits()); }
  // 0xFF80/0x7F80 = -/+infinity in bfloat16.
  static inline __host__ __device__ at::BFloat16 lower_bound() { return at::BFloat16(0xFF80, at::BFloat16::from_bits()); }
  static inline __host__ __device__ at::BFloat16 upper_bound() { return at::BFloat16(0x7F80, at::BFloat16::from_bits()); }
};

template <>
struct numeric_limits<float> {
  static inline __host__ __device__ float lowest() { return -FLT_MAX; }
  static inline __host__ __device__ float max() { return FLT_MAX; }
  static inline __host__ __device__ float lower_bound() { return -static_cast<float>(inf); }
  static inline __host__ __device__ float upper_bound() { return static_cast<float>(inf); }
};

template <>
struct numeric_limits<double> {
  static inline __host__ __device__ double lowest() { return -DBL_MAX; }
  static inline __host__ __device__ double max() { return DBL_MAX; }
  static inline __host__ __device__ double lower_bound() { return -inf; }
  static inline __host__ __device__ double upper_bound() { return inf; }
};
120
+
121
+ } // namespace at
.venv/lib/python3.11/site-packages/torch/include/ATen/cuda/PeerToPeerAccess.h ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #include <c10/macros/Macros.h>
2
+ #include <cstdint>
3
+
4
+ namespace at::cuda {
5
+ namespace detail {
6
+ void init_p2p_access_cache(int64_t num_devices);
7
+ }
8
+
9
+ TORCH_CUDA_CPP_API bool get_p2p_access(int source_dev, int dest_dev);
10
+
11
+ } // namespace at::cuda
.venv/lib/python3.11/site-packages/torch/include/ATen/cuda/Sleep.h ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #pragma once
2
+ #include <c10/macros/Export.h>
3
+ #include <cstdint>
4
+
5
+ namespace at::cuda {
6
+
7
+ // enqueues a kernel that spins for the specified number of cycles
8
+ TORCH_CUDA_CU_API void sleep(int64_t cycles);
9
+
10
+ // flushes instruction cache for ROCm; no-op for CUDA
11
+ TORCH_CUDA_CU_API void flush_icache();
12
+
13
+ } // namespace at::cuda
.venv/lib/python3.11/site-packages/torch/include/ATen/cuda/cub_definitions.cuh ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #pragma once
2
+
3
+ #if !defined(USE_ROCM)
4
+ #include <cuda.h> // for CUDA_VERSION
5
+ #endif
6
+
7
+ #if !defined(USE_ROCM)
8
+ #include <cub/version.cuh>
9
+ #else
10
+ #define CUB_VERSION 0
11
+ #endif
12
+
13
+ // cub sort support for __nv_bfloat16 is added to cub 1.13 in:
14
+ // https://github.com/NVIDIA/cub/pull/306
15
+ #if CUB_VERSION >= 101300
16
+ #define CUB_SUPPORTS_NV_BFLOAT16() true
17
+ #else
18
+ #define CUB_SUPPORTS_NV_BFLOAT16() false
19
+ #endif
20
+
21
+ // cub support for CUB_WRAPPED_NAMESPACE is added to cub 1.13.1 in:
22
+ // https://github.com/NVIDIA/cub/pull/326
23
+ // CUB_WRAPPED_NAMESPACE is defined globally in cmake/Dependencies.cmake
24
+ // starting from CUDA 11.5
25
+ #if defined(CUB_WRAPPED_NAMESPACE) || defined(THRUST_CUB_WRAPPED_NAMESPACE)
26
+ #define USE_GLOBAL_CUB_WRAPPED_NAMESPACE() true
27
+ #else
28
+ #define USE_GLOBAL_CUB_WRAPPED_NAMESPACE() false
29
+ #endif
30
+
31
+ // cub support for UniqueByKey is added to cub 1.16 in:
32
+ // https://github.com/NVIDIA/cub/pull/405
33
+ #if CUB_VERSION >= 101600
34
+ #define CUB_SUPPORTS_UNIQUE_BY_KEY() true
35
+ #else
36
+ #define CUB_SUPPORTS_UNIQUE_BY_KEY() false
37
+ #endif
38
+
39
+ // cub support for scan by key is added to cub 1.15
40
+ // in https://github.com/NVIDIA/cub/pull/376
41
+ #if CUB_VERSION >= 101500
42
+ #define CUB_SUPPORTS_SCAN_BY_KEY() 1
43
+ #else
44
+ #define CUB_SUPPORTS_SCAN_BY_KEY() 0
45
+ #endif
46
+
47
+ // cub support for cub::FutureValue is added to cub 1.15 in:
48
+ // https://github.com/NVIDIA/cub/pull/305
49
+ #if CUB_VERSION >= 101500
50
+ #define CUB_SUPPORTS_FUTURE_VALUE() true
51
+ #else
52
+ #define CUB_SUPPORTS_FUTURE_VALUE() false
53
+ #endif
.venv/lib/python3.11/site-packages/torch/include/ATen/cuda/detail/CUDAHooks.h ADDED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #pragma once
2
+
3
+ #include <ATen/detail/CUDAHooksInterface.h>
4
+
5
+ #include <ATen/Generator.h>
6
+ #include <optional>
7
+
8
+ // TODO: No need to have this whole header, we can just put it all in
9
+ // the cpp file
10
+
11
+ namespace at::cuda::detail {
12
+
13
+ // Set the callback to initialize Magma, which is set by
14
+ // torch_cuda_cu. This indirection is required so magma_init is called
15
+ // in the same library where Magma will be used.
16
+ TORCH_CUDA_CPP_API void set_magma_init_fn(void (*magma_init_fn)());
17
+
18
+
19
// The real implementation of CUDAHooksInterface. All members are
// declared-only here; definitions live in the corresponding .cpp.
struct CUDAHooks : public at::CUDAHooksInterface {
  CUDAHooks(at::CUDAHooksArgs) {}
  void initCUDA() const override;
  Device getDeviceFromPtr(void* data) const override;
  bool isPinnedPtr(const void* data) const override;
  const Generator& getDefaultCUDAGenerator(DeviceIndex device_index = -1) const override;
  // Capability / build-configuration queries.
  bool hasCUDA() const override;
  bool hasMAGMA() const override;
  bool hasCuDNN() const override;
  bool hasCuSOLVER() const override;
  bool hasCuBLASLt() const override;
  bool hasROCM() const override;
  const at::cuda::NVRTC& nvrtc() const override;
  DeviceIndex current_device() const override;
  bool hasPrimaryContext(DeviceIndex device_index) const override;
  Allocator* getCUDADeviceAllocator() const override;
  Allocator* getPinnedMemoryAllocator() const override;
  bool compiledWithCuDNN() const override;
  bool compiledWithMIOpen() const override;
  bool supportsDilatedConvolutionWithCuDNN() const override;
  bool supportsDepthwiseConvolutionWithCuDNN() const override;
  bool supportsBFloat16ConvolutionWithCuDNNv8() const override;
  bool hasCUDART() const override;
  long versionCUDART() const override;
  long versionCuDNN() const override;
  std::string showConfig() const override;
  double batchnormMinEpsilonCuDNN() const override;
  // cuFFT plan-cache management (per-device).
  int64_t cuFFTGetPlanCacheMaxSize(DeviceIndex device_index) const override;
  void cuFFTSetPlanCacheMaxSize(DeviceIndex device_index, int64_t max_size) const override;
  int64_t cuFFTGetPlanCacheSize(DeviceIndex device_index) const override;
  void cuFFTClearPlanCache(DeviceIndex device_index) const override;
  int getNumGPUs() const override;
#ifdef USE_ROCM
  bool isGPUArch(DeviceIndex device_index, const std::vector<std::string>& archs) const override;
#endif
  void deviceSynchronize(DeviceIndex device_index) const override;
};
57
+
58
+ } // at::cuda::detail
.venv/lib/python3.11/site-packages/torch/include/ATen/cuda/detail/DeviceThreadHandles.h ADDED
@@ -0,0 +1,151 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // Some stateful GPU libraries, such as cuDNN, cuBLAS, use handles to store states.
2
+ // These handles are tied to device, and these libraries requires/recommends not to
3
+ // share handles across host threads.
4
+ //
5
+ // These libraries recommend using one handle per host thread. We may not want to do
6
+ // this because threads are relatively light-weight, but creating and destroying
7
+ // handles is expensive (destroying the handle causes synchronizations). DataParallel,
8
+ // for example, creates new threads for each forward pass.
9
+ //
10
+ // This file implements a handle pool mechanism. The handle pool returns handles on
11
+ // demand as threads request them. If all existing handles in the pool are in use,
12
+ // it creates a new one. As threads terminate, they release handles back into the pool.
13
+ // In this way, the handle pool never creates more handles than the high-water mark of
14
+ // active threads, so it's efficient with DataParallel.
15
+
16
+ #pragma once
17
+
18
+ #include <unordered_map>
19
+ #include <vector>
20
+ #include <utility>
21
+ #include <mutex>
22
+ #include <memory>
23
+
24
+ #include <c10/util/Exception.h>
25
+
26
+ namespace at::cuda { namespace {
27
+
28
// Pool of per-device library handles (cuDNN/cuBLAS/...), handed out one
// per (thread, device) via PoolWindow and recycled when threads exit.
template <typename Handle_t, void Create(Handle_t *), void Destroy(Handle_t)>
struct DeviceThreadHandlePool : public std::enable_shared_from_this<DeviceThreadHandlePool<Handle_t, Create, Destroy>> {

    // RAII owner of a single library handle.
    struct Handle {
        Handle_t handle;
        Handle(bool create = false) : handle(nullptr)
        {
            if(create) Create(&handle);
        }
        // std::vector.emplace() and push_back() may route through temporaries and call
        // copy/move constructors along the way. If this is the case, we don't want
        // the destructors of temporaries to call cudnnDestroy on the handle.
        // We can achieve safety (for the narrow case of stashing within std::vectors)
        // by making Handle moveable but not copyable, and transferring handle ownership
        // to the latest constructed object. This is not a substitute for full-blown
        // reference counting, but reference counting may be overkill here.
        // Another alternative is to wrap the saved Handles in unique_ptrs, i.e.,
        // unordered_map<int, vector<unique_ptr<Handle>>> created_handles;
        Handle(const Handle& rhs) = delete;
        // Following https://stackoverflow.com/questions/3279543/what-is-the-copy-and-swap-idiom
        Handle(Handle&& rhs) noexcept : Handle() { std::swap(handle, rhs.handle); }
        // operator= takes argument by value
        Handle& operator=(Handle rhs) { std::swap(handle, rhs.handle); return *this; }
        ~Handle() {
            if(handle) Destroy(handle);
        }
    };

    // Guards created_handles and available_handles below.
    std::mutex mutex;

    // Handles are lazily created as different threads request them,
    // but are never destroyed until the end of the process.
    // The maximum number of handles this process will create for each device is equal
    // to the high-water mark of the number of concurrently active threads that request
    // handles for that device.
    // When threads terminate, they release their handles back into the pool for reuse.
    // Otherwise, new handles would be created every time new threads were spawned,
    // resulting in poor performance for Python modules that repeatedly or frequently
    // spawned new sets of threads (like DataParallel, which creates a new set of threads
    // for each forward pass).
    //
    // To prevent potential deadlocks, we explicitly choose not to cap the number
    // of handles that are created per device.
    // Example of danger: If we cap the max handles at 4, and 5 threads are sharing a device,
    // only 4 can make forward progress at any time. The other 4 will not release their
    // handles until they exit, so the fifth cannot make progress until then. This is
    // not a problem...UNLESS all 5 threads attempt some sort of synchronization at an
    // intermediate point (ie, before any of them have exited). We have no way to anticipate
    // or enforce that user threads will not attempt such intermediate synchronization.
    // The only way to ensure safety is to avoid imposing a cap on the number of handles.
    std::unordered_map<int, std::vector<Handle>> created_handles;
    std::unordered_map<int, std::vector<Handle_t>> available_handles;

    // PoolWindow lazily creates and caches the handles that a particular thread is using,
    // so in the common case handle access doesn't incur either handle creation or a mutex lock.
    class PoolWindow
    {
    public:
    PoolWindow(std::shared_ptr<DeviceThreadHandlePool> parent): weak_parent(std::move(parent)) {}
    ~PoolWindow(){ release(); }

    Handle_t reserve(int device)
    {
        // If this thread already has a handle for this device, return it.
        // (Single map lookup instead of the previous find() + operator[].)
        if(auto it = my_handles.find(device); it != my_handles.end())
            return it->second;

        // otherwise, either grab a handle from the pool if one is available,
        // or if not, create a new one.
        auto parent = weak_parent.lock();
        TORCH_CHECK(parent, "Cannot create handle during program termination");
        std::lock_guard<std::mutex> guard(parent->mutex);

        if(!parent->available_handles[device].empty())
        {
            my_handles[device] = parent->available_handles[device].back();
            parent->available_handles[device].pop_back();
        }
        else
        {
            // In local testing, I do observe that emplace_back sometimes routes through temporaries
            // that incur move-constructor and destructor calls. See comments in Handle above.
            parent->created_handles[device].emplace_back(true /*create*/);
            my_handles[device] = parent->created_handles[device].back().handle;
        }

        return my_handles[device];
    }

    private:
    // Stores the per-device handles currently owned by this thread
    std::unordered_map<int, Handle_t> my_handles;

    std::weak_ptr<DeviceThreadHandlePool> weak_parent;

    // Called by the destructor. Releases this thread's handles back into the pool.
    void release() {
        if(!my_handles.empty()) {
            auto parent = weak_parent.lock();
            if (!parent) {
                // If this thread exits after atexit handlers have completed, the
                // cuda context itself may be invalid, so we must leak the handles.
                return;
            }

            std::lock_guard<std::mutex> guard(parent->mutex);
            // const-ref iteration: avoids copying each (device, handle) pair.
            for(const auto& d_h : my_handles)
                parent->available_handles[d_h.first].push_back(d_h.second);
        }
    }
    };

    // Warning:
    // If you want to change this function, be aware that this function will be called
    // by multiple threads and there is no mutex guarding the call of this function, so
    // make sure your implementation is thread-safe.
    PoolWindow *newPoolWindow() {
        // The returned pointer will be owned by a thread local variable
        // so that different threads does not share the same PoolWindow.
        return new PoolWindow(this->shared_from_this());
    }
};
150
+
151
+ }} // namespace at::cuda::detail::<anonymous>
.venv/lib/python3.11/site-packages/torch/include/ATen/cuda/detail/IntegerDivider.cuh ADDED
@@ -0,0 +1,124 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #pragma once
2
+
3
+ #include <assert.h>
4
+ #if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__)
5
+ #include <cuda_runtime.h>
6
+ #endif
7
+
8
+ namespace at::cuda::detail {
9
+
10
+ // A utility class to implement integer division by multiplication, given a fixed
11
+ // divisor.
12
+ //
13
+ // WARNING: The fast divider algorithm is only implemented for unsigned int;
14
+ // otherwise we default to plain integer division. For unsigned int,
15
+ // we further assume that the dividend is at most INT32_MAX. Thus,
16
+ // IntDivider must NOT be used for general integer division.
17
+ //
18
+ // This reduced range is enough for our purpose, and it allows us to
19
+ // slightly simplify the computation.
20
+ //
21
+ // (NOTE: Below, "2^k" denotes exponentiation, i.e., 1<<k.)
22
+ //
23
+ // For any N-bit unsigned integer d (> 0), we can find a "magic number" m (2^N
24
+ // <= m < 2^(N+1)) and shift s such that:
25
+ //
26
+ // \floor(n / d) = \floor((m * n) / 2^(N+s)).
27
+ //
28
+ // Given such m and s, the integer division can be then implemented as:
29
+ //
30
+ // let m' = m - 2^N // 0 <= m' < 2^N
31
+ //
32
+ // fast_integer_division(n):
33
+ // // Multiply two N-bit unsigned integers: the result is a 2N-bit unsigned
34
+ // // integer. Then take the higher N bits.
35
+ // t = (m' * n) >> N
36
+ //
37
+ // // Here we use the fact that n is less than 2^(N-1): otherwise the value
38
+ // // of (t + n) may not fit in an N-bit integer.
39
+ // return (t + n) >> s
40
+ //
41
+ // Finding such a magic number is surprisingly easy:
42
+ //
43
+ // s = \ceil(\log_2 d)
44
+ // m' = \floor(2^N * (2^s - d) / d) + 1 // Need 2N-bit integer arithmetic.
45
+ //
46
+ // See also:
47
+ // - Division by Invariant Integers Using Multiplication,
48
+ // Torbjörn Granlund and Peter L. Montgomery, 1994.
49
+ //
50
+ // - http://www.hackersdelight.org/magic.htm
51
+ //
52
+ // - http://ridiculousfish.com/blog/posts/labor-of-division-episode-i.html
53
+
54
+ // Result of div/mod operation stored together.
55
+ template <typename Value>
56
+ struct DivMod {
57
+ Value div, mod;
58
+
59
+ C10_HOST_DEVICE DivMod(Value div, Value mod) : div(div), mod(mod) { }
60
+ };
61
+
62
+ // Base case: we only have an implementation for uint32_t for now. For
63
+ // everything else, we use plain division.
64
+ template <typename Value>
65
+ struct IntDivider {
66
+ IntDivider() = default;
67
+ IntDivider(Value d) : divisor(d) { }
68
+
69
+ C10_HOST_DEVICE inline Value div(Value n) const { return n / divisor; }
70
+ C10_HOST_DEVICE inline Value mod(Value n) const { return n % divisor; }
71
+ C10_HOST_DEVICE inline DivMod<Value> divmod(Value n) const {
72
+ return DivMod<Value>(n / divisor, n % divisor);
73
+ }
74
+
75
+ Value divisor;
76
+ };
77
+
78
+ // Implement fast integer division.
79
+ template <>
80
+ struct IntDivider<unsigned int> {
81
+ static_assert(sizeof(unsigned int) == 4, "Assumes 32-bit unsigned int.");
82
+
83
+ IntDivider() = default;
84
+
85
+ IntDivider(unsigned int d) : divisor(d) {
86
+ assert(divisor >= 1 && divisor <= INT32_MAX);
87
+
88
+ // TODO: gcc/clang has __builtin_clz() but it's not portable.
89
+ for (shift = 0; shift < 32; shift++) if ((1U << shift) >= divisor) break;
90
+
91
+ uint64_t one = 1;
92
+ uint64_t magic = ((one << 32) * ((one << shift) - divisor)) / divisor + 1;
93
+ m1 = magic;
94
+ assert(m1 > 0 && m1 == magic); // m1 must fit in 32 bits.
95
+ }
96
+
97
+ C10_HOST_DEVICE inline unsigned int div(unsigned int n) const {
98
+ #if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__)
99
+ // 't' is the higher 32-bits of unsigned 32-bit multiplication of 'n' and
100
+ // 'm1'.
101
+ unsigned int t = __umulhi(n, m1);
102
+ return (t + n) >> shift;
103
+ #else
104
+ // Using uint64_t so that the addition does not overflow.
105
+ uint64_t t = ((uint64_t) n * m1) >> 32;
106
+ return (t + n) >> shift;
107
+ #endif
108
+ }
109
+
110
+ C10_HOST_DEVICE inline unsigned int mod(unsigned int n) const {
111
+ return n - div(n) * divisor;
112
+ }
113
+
114
+ C10_HOST_DEVICE inline DivMod<unsigned int> divmod(unsigned int n) const {
115
+ unsigned int q = div(n);
116
+ return DivMod<unsigned int>(q, n - q * divisor);
117
+ }
118
+
119
+ unsigned int divisor; // d above.
120
+ unsigned int m1; // Magic number: m' above.
121
+ unsigned int shift; // Shift amounts.
122
+ };
123
+
124
+ } // namespace at::cuda::detail
.venv/lib/python3.11/site-packages/torch/include/ATen/cuda/detail/KernelUtils.h ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #pragma once
2
+
3
+ #include <limits>
4
+ #include <c10/util/Exception.h>
5
+
6
+ namespace at::cuda::detail {
7
+
8
+ // CUDA: grid stride looping
9
+ //
10
+ // int64_t _i_n_d_e_x specifically prevents overflow in the loop increment.
11
+ // If input.numel() < INT_MAX, _i_n_d_e_x < INT_MAX, except after the final
12
+ // iteration of the loop where _i_n_d_e_x += blockDim.x * gridDim.x can be
13
+ // greater than INT_MAX. But in that case _i_n_d_e_x >= n, so there are no
14
+ // further iterations and the overflowed value in i=_i_n_d_e_x is not used.
15
+ #define CUDA_KERNEL_LOOP_TYPE(i, n, index_type) \
16
+ int64_t _i_n_d_e_x = blockIdx.x * blockDim.x + threadIdx.x; \
17
+ for (index_type i=_i_n_d_e_x; _i_n_d_e_x < (n); _i_n_d_e_x+=blockDim.x * gridDim.x, i=_i_n_d_e_x)
18
+
19
+ #define CUDA_KERNEL_LOOP(i, n) CUDA_KERNEL_LOOP_TYPE(i, n, int)
20
+
21
+
22
+ // Use 1024 threads per block, which requires cuda sm_2x or above
23
+ constexpr int CUDA_NUM_THREADS = 1024;
24
+
25
+ // CUDA: number of blocks for threads.
26
+ inline int GET_BLOCKS(const int64_t N, const int64_t max_threads_per_block=CUDA_NUM_THREADS) {
27
+ TORCH_INTERNAL_ASSERT(N > 0, "CUDA kernel launch blocks must be positive, but got N=", N);
28
+ constexpr int64_t max_int = std::numeric_limits<int>::max();
29
+
30
+ // Round up division for positive number that cannot cause integer overflow
31
+ auto block_num = (N - 1) / max_threads_per_block + 1;
32
+ TORCH_INTERNAL_ASSERT(block_num <= max_int, "Can't schedule too many blocks on CUDA device");
33
+
34
+ return static_cast<int>(block_num);
35
+ }
36
+
37
+ } // namespace at::cuda::detail
.venv/lib/python3.11/site-packages/torch/include/ATen/cuda/detail/LazyNVRTC.h ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #pragma once
2
+ #include <ATen/detail/CUDAHooksInterface.h>
3
+ namespace at::cuda {
4
+ // Forward-declares at::cuda::NVRTC
5
+ struct NVRTC;
6
+
7
+ namespace detail {
8
+ extern NVRTC lazyNVRTC;
9
+ } // namespace detail
10
+
11
+ } // namespace at::cuda
.venv/lib/python3.11/site-packages/torch/include/ATen/cuda/detail/PhiloxCudaStateRaw.cuh ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // No "#pragma once" because this is a raw definition that can be copied by jit codegen.
2
+ // Eager mode clients should not include this file directly, instead,
3
+ // they should #include <ATen/cuda/PhiloxCudaState.h>, which has a #pragma once.
4
+
5
+ // Stores RNG state values. Passed as a kernel argument.
6
+ // See Note [CUDA Graph-safe RNG states].
7
+ //
8
+ // The raw definition lives in its own file so jit codegen can easily copy it.
9
+ namespace at {
10
+
11
// Stores RNG state values. Passed by value as a kernel argument, so the
// member layout below is part of the de-facto ABI shared with jit codegen —
// do not reorder members. See Note [CUDA Graph-safe RNG states].
struct PhiloxCudaState {
  PhiloxCudaState() = default;

  // Eager path (no graph capture underway): seed and offset are immediate
  // host-side values.
  PhiloxCudaState(uint64_t seed,
                  uint64_t offset) {
    seed_.val = seed;
    offset_.val = offset;
  }

  // Capture path (graph capture underway): seed and offset live in device
  // memory and are dereferenced inside the kernel at replay time.
  PhiloxCudaState(int64_t* seed,
                  int64_t* offset_extragraph,
                  uint32_t offset_intragraph) {
    seed_.ptr = seed;
    offset_.ptr = offset_extragraph;
    offset_intragraph_ = offset_intragraph;
    captured_ = true;
  }

  // Public members, directly accessible by at::cuda::philox::unpack.
  // If we made them private with getters/setters, the getters/setters
  // would have to be __device__, and we can't declare __device__ in ATen.
  union Payload {
    uint64_t val;   // immediate value (eager path)
    int64_t* ptr;   // device pointer (capture path)
  };

  Payload seed_{};
  Payload offset_{};
  uint32_t offset_intragraph_ = 0;  // extra per-launch offset under capture
  bool captured_ = false;           // which Payload member is active
};
42
+
43
+ } // namespace at
.venv/lib/python3.11/site-packages/torch/include/ATen/cuda/detail/TensorInfo.cuh ADDED
@@ -0,0 +1,116 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #pragma once
2
+
3
+ #include <ATen/CollapseDims.h>
4
+
5
+ namespace at::cuda::detail {
6
+
7
+ #define MAX_TENSORINFO_DIMS 25
8
+
9
+ // CUDA kernel argument that defines tensor layout
10
+ template <typename T, typename IndexType>
11
+ struct TensorInfo {
12
+ TensorInfo();
13
+ TensorInfo(T* p,
14
+ int dim,
15
+ IndexType sz[MAX_TENSORINFO_DIMS],
16
+ IndexType st[MAX_TENSORINFO_DIMS]);
17
+
18
+ // Set the size of the given dimension to 1, as if it were a
19
+ // reduction dim (allows you to calculate offsets of the reduction
20
+ // slice)
21
+ void reduceDim(int dim);
22
+
23
+ // See note on [collapse dims].
24
+ int collapseDims(const int excludeDim = -1);
25
+
26
+ // Contiguous tensors of more than one dimension are collapsed down
27
+ // to one tensor
28
+ __host__ __device__ inline bool isContiguous() const {
29
+ return (dims == 1 && strides[0] == 1);
30
+ }
31
+
32
+ T* data;
33
+ IndexType sizes[MAX_TENSORINFO_DIMS];
34
+ IndexType strides[MAX_TENSORINFO_DIMS];
35
+ int dims;
36
+ };
37
+
38
+ template <typename T, typename IndexType>
39
+ TensorInfo<T, IndexType>::TensorInfo() {
40
+ data = nullptr;
41
+ dims = 0;
42
+ }
43
+
44
+ template <typename T, typename IndexType>
45
+ TensorInfo<T, IndexType>::TensorInfo(T* p,
46
+ int dim,
47
+ IndexType sz[MAX_TENSORINFO_DIMS],
48
+ IndexType st[MAX_TENSORINFO_DIMS]) {
49
+ data = p;
50
+ dims = dim;
51
+ TORCH_CHECK(dims < MAX_TENSORINFO_DIMS, "CUDA Tensors cannot have more than 25 dimensions");
52
+
53
+ for (int i = 0; i < dim; ++i) {
54
+ sizes[i] = sz[i];
55
+ strides[i] = st[i];
56
+ }
57
+ }
58
+
59
+ template <typename T, typename IndexType>
60
+ void
61
+ TensorInfo<T, IndexType>::reduceDim(int dim) {
62
+ TORCH_CHECK(dim < dims && dim >= 0, "expected dim between 0 and dims - 1");
63
+ sizes[dim] = 1;
64
+ }
65
+
66
+ template <typename T, typename IndexType>
67
+ int
68
+ TensorInfo<T, IndexType>::collapseDims(const int excludeDim) {
69
+ auto result = at::collapse_dims(sizes, strides, dims, excludeDim);
70
+ dims = std::get<1>(result);
71
+ return std::get<0>(result);
72
+ }
73
+
74
+ // Translate a linear index for the apply to a T* offset;
75
+ // specialized on `Dims` to reduce nvcc compilation time
76
+ template <typename T, typename IndexType, int Dims>
77
+ struct IndexToOffset {
78
+ static __host__ __device__ IndexType get(
79
+ IndexType linearId,
80
+ const TensorInfo<T, IndexType>& info) {
81
+
82
+ IndexType offset = 0;
83
+
84
+ // Uses static dims
85
+ for (int i = Dims - 1; i > 0; --i) {
86
+ IndexType curDimIndex = linearId % info.sizes[i];
87
+ IndexType curDimOffset = curDimIndex * info.strides[i];
88
+ offset += curDimOffset;
89
+ linearId /= info.sizes[i];
90
+ }
91
+
92
+ return offset + linearId * info.strides[0];
93
+ }
94
+ };
95
+
96
+ // Uses dynamic (runtime) instead of static (compiletime) dims
97
+ template <typename T, typename IndexType>
98
+ struct IndexToOffset<T, IndexType, -1> {
99
+ static inline __host__ __device__ IndexType get(
100
+ IndexType linearId,
101
+ const TensorInfo<T, IndexType>& info) {
102
+
103
+ IndexType offset = 0;
104
+
105
+ for (int i = info.dims - 1; i > 0; --i) {
106
+ IndexType curDimIndex = linearId % info.sizes[i];
107
+ IndexType curDimOffset = curDimIndex * info.strides[i];
108
+ offset += curDimOffset;
109
+ linearId /= info.sizes[i];
110
+ }
111
+
112
+ return offset + linearId * info.strides[0];
113
+ }
114
+ };
115
+
116
+ } // namespace at::cuda::detail
.venv/lib/python3.11/site-packages/torch/include/ATen/cuda/detail/UnpackRaw.cuh ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // No "#pragma once" because this is a raw definition that can be copied by jit codegen.
2
+ // Eager mode clients should not include this file directly, instead,
3
+ // they should #include <ATen/cuda/PhiloxUtils.cuh>, which has a #pragma once.
4
+
5
+ namespace at::cuda::philox {
6
+
7
+ // In-kernel call to retrieve philox seed and offset from a PhiloxCudaState instance whether
8
+ // that instance was created with graph capture underway or not.
9
+ // See Note [CUDA Graph-safe RNG states].
10
+ //
11
+ // We can't write a __device__ function in CUDAGeneratorImpl.h, because it's in ATen.
12
+ // Also, whatever call unpacks PhiloxCudaState in consumer kernels must be inlineable.
13
+ // Easiest thing that comes to mind is, define a __device__ unpack helper here, in ATen/cuda.
14
+ //
15
+ // The raw definition lives in its own file so jit codegen can easily copy it.
16
+ __host__ __device__ __forceinline__ std::tuple<uint64_t, uint64_t>
17
+ unpack(at::PhiloxCudaState arg) {
18
+ if (arg.captured_) {
19
+ // static_cast avoids "warning: invalid narrowing conversion from "long" to "unsigned long".
20
+ // *(arg.offset_.ptr) is a broadcast load of a single int64_t to the entire kernel.
21
+ // For most threads' reads it will hit in cache, so it shouldn't hurt performance.
22
+ return std::make_tuple(static_cast<uint64_t>(*arg.seed_.ptr), static_cast<uint64_t>(*(arg.offset_.ptr) + arg.offset_intragraph_));
23
+ } else {
24
+ return std::make_tuple(arg.seed_.val, arg.offset_.val);
25
+ }
26
+ }
27
+
28
+ } // namespace at::cuda::philox
.venv/lib/python3.11/site-packages/torch/include/ATen/cuda/llvm_jit_strings.h ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #pragma once
2
+
3
+ #include <string>
4
+ #include <c10/macros/Export.h>
5
+
6
+ namespace at::cuda {
7
+
8
+ TORCH_CUDA_CPP_API const std::string &get_traits_string();
9
+ TORCH_CUDA_CPP_API const std::string &get_cmath_string();
10
+ TORCH_CUDA_CPP_API const std::string &get_complex_body_string();
11
+ TORCH_CUDA_CPP_API const std::string &get_complex_half_body_string();
12
+ TORCH_CUDA_CPP_API const std::string &get_complex_math_string();
13
+
14
+ } // namespace at::cuda
.venv/lib/python3.11/site-packages/torch/include/ATen/cuda/tunable/GemmCommon.h ADDED
@@ -0,0 +1,397 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // Original TunableOp is from onnxruntime.
2
+ // https://github.com/microsoft/onnxruntime/blob/main/onnxruntime/core/framework/tunable.h
3
+ // https://github.com/microsoft/onnxruntime/tree/main/onnxruntime/core/providers/rocm/tunable
4
+ // Copyright (c) Microsoft Corporation.
5
+ // Licensed under the MIT license.
6
+ //
7
+ // Adapting TunableOp into PyTorch
8
+ // Copyright (c) Advanced Micro Devices, Inc.
9
+ //
10
+ #pragma once
11
+
12
+ #include <string>
13
+
14
+ #include <ATen/cuda/tunable/TunableOp.h>
15
+ #include <ATen/cuda/Exceptions.h>
16
+ #include <c10/util/StringUtil.h>
17
+
18
+ #ifndef AT_PER_OPERATOR_HEADERS
19
+ #include <ATen/Functions.h>
20
+ #include <ATen/NativeFunctions.h>
21
+ #else
22
+ #include <ATen/ops/allclose.h>
23
+ #include <ATen/ops/from_blob.h>
24
+ #endif
25
+
26
+ namespace at::cuda::tunable {
27
+
28
+ enum class BlasOp {
29
+ N = 0,
30
+ T = 1
31
+ };
32
+
33
+ inline std::string BlasOpToString(BlasOp op) {
34
+ switch (op) {
35
+ case BlasOp::N:
36
+ return "N";
37
+ case BlasOp::T:
38
+ return "T";
39
+ }
40
+ TORCH_CHECK(false, "unrecognized BlasOp");
41
+ return "N";
42
+ }
43
+
44
+ namespace detail {
45
+
46
+ static bool NumericalCheck(ScalarType dtype, void* c, void* other_c, int64_t size) {
47
+ auto options = at::TensorOptions().dtype(dtype).device(at::kCUDA);
48
+ // comparison done as 1D tensor
49
+ at::Tensor ref = at::from_blob(c, {size}, options);
50
+ at::Tensor oth = at::from_blob(other_c, {size}, options);
51
+ at::Tensor ref_float = ref.to(at::kFloat);
52
+ at::Tensor oth_float = oth.to(at::kFloat);
53
+ std::vector<double> atols{1e-1, 1e-2, 1e-3, 1e-4, 1e-5};
54
+ std::vector<double> rtols{1e-1, 1e-2, 1e-3, 1e-4, 1e-5};
55
+ double last_succeed_atol = 1;
56
+ double last_succeed_rtol = 1;
57
+ for (auto& atol : atols) {
58
+ for (auto& rtol : rtols) {
59
+ if (at::allclose(ref_float, oth_float, rtol, atol)) {
60
+ last_succeed_atol = atol;
61
+ last_succeed_rtol = rtol;
62
+ }
63
+ }
64
+ }
65
+ if (last_succeed_atol == 1) {
66
+ return false;
67
+ }
68
+ else {
69
+ TUNABLE_LOG3("├──verify numerics: atol=", last_succeed_atol, ", rtol=", last_succeed_rtol);
70
+ }
71
+
72
+ return true;
73
+ }
74
+
75
+ }
76
+
77
+ template <typename T>
78
+ struct GemmParams : OpParams {
79
+ GemmParams() {
80
+ duplicate_inputs_ = false;
81
+ }
82
+
83
+ std::string Signature() const override {
84
+ return c10::str(transa, transb, "_", m, "_", n, "_", k);
85
+ }
86
+
87
+ size_t GetSizeA() const {
88
+ return sizeof(T) * lda * ((transa == 'n' || transa == 'N') ? k : m);
89
+ }
90
+
91
+ size_t GetSizeB() const {
92
+ return sizeof(T) * ldb * ((transb == 'n' || transb == 'N') ? n : k);
93
+ }
94
+
95
+ size_t GetSizeC() const {
96
+ return sizeof(T) * ldc * n;
97
+ }
98
+
99
+ size_t GetSize(bool duplicate_inputs) const {
100
+ size_t size = GetSizeC();
101
+ if (duplicate_inputs) {
102
+ size += GetSizeA();
103
+ size += GetSizeB();
104
+ }
105
+ return size;
106
+ }
107
+
108
+ GemmParams* DeepCopy(bool duplicate_inputs) const {
109
+ GemmParams* copy = new GemmParams;
110
+ *copy = *this;
111
+ c10::DeviceIndex device = 0;
112
+ AT_CUDA_CHECK(c10::cuda::GetDevice(&device));
113
+ size_t c_size = GetSizeC();
114
+ copy->c = static_cast<T*>(c10::cuda::CUDACachingAllocator::raw_alloc(c_size));
115
+ AT_CUDA_CHECK(c10::cuda::CUDACachingAllocator::memcpyAsync(
116
+ copy->c, device, c, device, c_size, getCurrentCUDAStream(device), true));
117
+ if (duplicate_inputs) {
118
+ size_t a_size = GetSizeA();
119
+ size_t b_size = GetSizeB();
120
+ copy->a = static_cast<const T*>(c10::cuda::CUDACachingAllocator::raw_alloc(a_size));
121
+ copy->b = static_cast<const T*>(c10::cuda::CUDACachingAllocator::raw_alloc(b_size));
122
+ copy->duplicate_inputs_ = true;
123
+ }
124
+ return copy;
125
+ }
126
+
127
+ // only call on object returned by DeepCopy
128
+ void Delete() {
129
+ c10::cuda::CUDACachingAllocator::raw_delete(c);
130
+ if (duplicate_inputs_) {
131
+ c10::cuda::CUDACachingAllocator::raw_delete(const_cast<T*>(a));
132
+ c10::cuda::CUDACachingAllocator::raw_delete(const_cast<T*>(b));
133
+ }
134
+ }
135
+
136
+ TuningStatus NumericalCheck(GemmParams<T> *other) {
137
+ auto c_dtype = c10::CppTypeToScalarType<T>::value;
138
+ return detail::NumericalCheck(c_dtype, c, other->c, ldc*n) ? OK : FAIL;
139
+ }
140
+
141
+ char transa;
142
+ char transb;
143
+ int64_t m;
144
+ int64_t n;
145
+ int64_t k;
146
+ at::opmath_type<T> alpha;
147
+ const T* a;
148
+ int64_t lda;
149
+ const T* b;
150
+ int64_t ldb;
151
+ at::opmath_type<T> beta;
152
+ T* c;
153
+ int64_t ldc;
154
+ private:
155
+ bool duplicate_inputs_;
156
+ };
157
+
158
+ template <typename T>
159
+ struct GemmAndBiasParams : OpParams {
160
+ std::string Signature() const override {
161
+ return c10::str(transa, transb, "_", m, "_", n, "_", k);
162
+ }
163
+
164
+ size_t GetSize(bool duplicate_inputs) const {
165
+ size_t size = sizeof(T) * ldc * n;
166
+ if (duplicate_inputs) {
167
+ size += sizeof(T) * lda * ((transa == 'n' || transa == 'N') ? k : m);
168
+ size += sizeof(T) * ldb * ((transb == 'n' || transb == 'N') ? n : k);
169
+ }
170
+ return size;
171
+ }
172
+
173
+ GemmAndBiasParams* DeepCopy(bool duplicate_inputs) const {
174
+ GemmAndBiasParams* copy = new GemmAndBiasParams;
175
+ *copy = *this;
176
+ c10::DeviceIndex device = 0;
177
+ AT_CUDA_CHECK(c10::cuda::GetDevice(&device));
178
+ size_t c_size = ldc * n * sizeof(T);
179
+ copy->c = static_cast<T*>(c10::cuda::CUDACachingAllocator::raw_alloc(c_size));
180
+ AT_CUDA_CHECK(c10::cuda::CUDACachingAllocator::memcpyAsync(
181
+ copy->c, device, c, device, c_size, getCurrentCUDAStream(device), true));
182
+ if (duplicate_inputs) {
183
+ size_t a_size = sizeof(T) * lda * ((transa == 'n' || transa == 'N') ? k : m);
184
+ size_t b_size = sizeof(T) * ldb * ((transb == 'n' || transb == 'N') ? n : k);
185
+ copy->a = static_cast<const T*>(c10::cuda::CUDACachingAllocator::raw_alloc(a_size));
186
+ copy->b = static_cast<const T*>(c10::cuda::CUDACachingAllocator::raw_alloc(b_size));
187
+ copy->duplicate_inputs_ = true;
188
+ }
189
+ return copy;
190
+ }
191
+
192
+ // only call on object returned by DeepCopy
193
+ void Delete() {
194
+ c10::cuda::CUDACachingAllocator::raw_delete(c);
195
+ if (duplicate_inputs_) {
196
+ c10::cuda::CUDACachingAllocator::raw_delete(const_cast<T*>(a));
197
+ c10::cuda::CUDACachingAllocator::raw_delete(const_cast<T*>(b));
198
+ }
199
+ }
200
+
201
+ TuningStatus NumericalCheck(GemmAndBiasParams<T> *other) {
202
+ auto c_dtype = c10::CppTypeToScalarType<T>::value;
203
+ return detail::NumericalCheck(c_dtype, c, other->c, ldc*n) ? OK : FAIL;
204
+ }
205
+
206
+ char transa;
207
+ char transb;
208
+ int64_t m;
209
+ int64_t n;
210
+ int64_t k;
211
+ at::opmath_type<T> alpha;
212
+ const T* a;
213
+ int64_t lda;
214
+ const T* b;
215
+ int64_t ldb;
216
+ T* c;
217
+ int64_t ldc;
218
+ const T* bias;
219
+ at::cuda::blas::GEMMAndBiasActivationEpilogue activation;
220
+ private:
221
+ bool duplicate_inputs_;
222
+ };
223
+
224
+ template <typename T>
225
+ struct GemmStridedBatchedParams : OpParams {
226
+ GemmStridedBatchedParams() {
227
+ duplicate_inputs_ = false;
228
+ }
229
+
230
+ std::string Signature() const override {
231
+ return c10::str(transa, transb, "_", m, "_", n, "_", k, "_B_", batch);
232
+ }
233
+
234
+ size_t GetSizeA() const {
235
+ return sizeof(T) * std::min(lda, stride_a) * ((transa == 'n' || transa == 'N') ? k : m) * batch;
236
+ }
237
+
238
+ size_t GetSizeB() const {
239
+ return sizeof(T) * std::min(ldb, stride_b) * ((transb == 'n' || transb == 'N') ? n : k) * batch;
240
+ }
241
+
242
+ size_t GetSizeC() const {
243
+ return sizeof(T) * std::min(ldc, stride_c) * n * batch;
244
+ }
245
+
246
+ size_t GetSize(bool duplicate_inputs) const {
247
+ size_t size = GetSizeC();
248
+ if (duplicate_inputs) {
249
+ size += GetSizeA();
250
+ size += GetSizeB();
251
+ }
252
+ return size;
253
+ }
254
+
255
+ GemmStridedBatchedParams* DeepCopy(bool duplicate_inputs) const {
256
+ GemmStridedBatchedParams* copy = new GemmStridedBatchedParams;
257
+ *copy = *this;
258
+ c10::DeviceIndex device = 0;
259
+ AT_CUDA_CHECK(c10::cuda::GetDevice(&device));
260
+ size_t c_size = GetSizeC();
261
+ copy->c = static_cast<T*>(c10::cuda::CUDACachingAllocator::raw_alloc(c_size));
262
+ AT_CUDA_CHECK(c10::cuda::CUDACachingAllocator::memcpyAsync(
263
+ copy->c, device, c, device, c_size, getCurrentCUDAStream(device), true));
264
+ if (duplicate_inputs) {
265
+ size_t a_size = GetSizeA();
266
+ size_t b_size = GetSizeB();
267
+ copy->a = static_cast<const T*>(c10::cuda::CUDACachingAllocator::raw_alloc(a_size));
268
+ copy->b = static_cast<const T*>(c10::cuda::CUDACachingAllocator::raw_alloc(b_size));
269
+ copy->duplicate_inputs_ = true;
270
+ }
271
+ return copy;
272
+ }
273
+
274
+ // only call on object returned by DeepCopy
275
+ void Delete() {
276
+ c10::cuda::CUDACachingAllocator::raw_delete(c);
277
+ if (duplicate_inputs_) {
278
+ c10::cuda::CUDACachingAllocator::raw_delete(const_cast<T*>(a));
279
+ c10::cuda::CUDACachingAllocator::raw_delete(const_cast<T*>(b));
280
+ }
281
+ }
282
+
283
+ TuningStatus NumericalCheck(GemmStridedBatchedParams<T> *other) {
284
+ auto c_dtype = c10::CppTypeToScalarType<T>::value;
285
+ return detail::NumericalCheck(c_dtype, c, other->c, batch*stride_c) ? OK : FAIL;
286
+ }
287
+
288
+ char transa;
289
+ char transb;
290
+ int64_t m;
291
+ int64_t n;
292
+ int64_t k;
293
+ at::opmath_type<T> alpha;
294
+ const T* a;
295
+ int64_t lda;
296
+ int64_t stride_a;
297
+ const T* b;
298
+ int64_t ldb;
299
+ int64_t stride_b;
300
+ at::opmath_type<T> beta;
301
+ T* c;
302
+ int64_t ldc;
303
+ int64_t stride_c;
304
+ int64_t batch;
305
+ private:
306
+ bool duplicate_inputs_;
307
+ };
308
+
309
+ template <typename T>
310
+ struct ScaledGemmParams : OpParams {
311
+ ScaledGemmParams() {
312
+ duplicate_inputs_ = false;
313
+ }
314
+
315
+ std::string Signature() const override {
316
+ return c10::str(transa, transb, "_", m, "_", n, "_", k);
317
+ }
318
+
319
+ size_t GetSizeA() const {
320
+ return sizeof(T) * lda * ((transa == 'n' || transa == 'N') ? k : m);
321
+ }
322
+
323
+ size_t GetSizeB() const {
324
+ return sizeof(T) * ldb * ((transb == 'n' || transb == 'N') ? n : k);
325
+ }
326
+
327
+ size_t GetSizeC() const {
328
+ return sizeof(T) * ldc * n;
329
+ }
330
+
331
+ size_t GetSize(bool duplicate_inputs) const {
332
+ size_t size = GetSizeC();
333
+ if (duplicate_inputs) {
334
+ size += GetSizeA();
335
+ size += GetSizeB();
336
+ }
337
+ return size;
338
+ }
339
+
340
+ ScaledGemmParams* DeepCopy(bool duplicate_inputs) const {
341
+ ScaledGemmParams* copy = new ScaledGemmParams;
342
+ *copy = *this;
343
+ c10::DeviceIndex device = 0;
344
+ AT_CUDA_CHECK(c10::cuda::GetDevice(&device));
345
+ size_t c_size = GetSizeC();
346
+ copy->c = c10::cuda::CUDACachingAllocator::raw_alloc(c_size);
347
+ AT_CUDA_CHECK(c10::cuda::CUDACachingAllocator::memcpyAsync(
348
+ copy->c, device, c, device, c_size, getCurrentCUDAStream(device), true));
349
+ if (duplicate_inputs) {
350
+ size_t a_size = GetSizeA();
351
+ size_t b_size = GetSizeB();
352
+ copy->a = c10::cuda::CUDACachingAllocator::raw_alloc(a_size);
353
+ copy->b = c10::cuda::CUDACachingAllocator::raw_alloc(b_size);
354
+ copy->duplicate_inputs_ = true;
355
+ }
356
+ return copy;
357
+ }
358
+
359
+ // only call on object returned by DeepCopy
360
+ void Delete() {
361
+ c10::cuda::CUDACachingAllocator::raw_delete(c);
362
+ if (duplicate_inputs_) {
363
+ c10::cuda::CUDACachingAllocator::raw_delete(const_cast<void*>(a));
364
+ c10::cuda::CUDACachingAllocator::raw_delete(const_cast<void*>(b));
365
+ }
366
+ }
367
+
368
+ TuningStatus NumericalCheck(ScaledGemmParams<T> *other) {
369
+ return detail::NumericalCheck(c_dtype, c, other->c, ldc*n) ? OK : FAIL;
370
+ }
371
+
372
+ char transa;
373
+ char transb;
374
+ int64_t m;
375
+ int64_t n;
376
+ int64_t k;
377
+ const void* a;
378
+ const void* a_scale_ptr;
379
+ int64_t lda;
380
+ ScalarType a_dtype;
381
+ const void* b;
382
+ const void* b_scale_ptr;
383
+ int64_t ldb;
384
+ ScalarType b_dtype;
385
+ const void* bias_ptr;
386
+ ScalarType bias_dtype;
387
+ void* c;
388
+ const void* c_scale_ptr;
389
+ int64_t ldc;
390
+ ScalarType c_dtype;
391
+ void* amax_ptr;
392
+ bool use_fast_accum;
393
+ private:
394
+ bool duplicate_inputs_;
395
+ };
396
+
397
+ } // namespace at::cuda::tunable
.venv/lib/python3.11/site-packages/torch/include/ATen/cuda/tunable/GemmHipblaslt.h ADDED
@@ -0,0 +1,611 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // Copyright (c) Microsoft Corporation. All rights reserved.
2
+ // Licensed under the MIT License.
3
+
4
+ #pragma once
5
+
6
+ #include <ATen/cuda/CUDAContext.h>
7
+ #include <ATen/cuda/CUDADataType.h>
8
+ #include <ATen/cuda/tunable/TunableOp.h>
9
+ #include <ATen/cuda/tunable/GemmCommon.h>
10
+ #include <c10/cuda/CUDACachingAllocator.h>
11
+ #include <c10/util/StringUtil.h>
12
+
13
+ #include <hipblaslt/hipblaslt.h>
14
+ #include <hipblaslt/hipblaslt-ext.hpp>
15
+
16
+ #define TORCH_HIPBLASLT_CHECK(EXPR) \
17
+ do { \
18
+ hipblasStatus_t __err = EXPR; \
19
+ TORCH_CHECK(__err == HIPBLAS_STATUS_SUCCESS, \
20
+ "hipblaslt error: ", \
21
+ hipblasStatusToString(__err), \
22
+ " when calling `" #EXPR "`"); \
23
+ } while (0)
24
+
25
+ namespace at::cuda::tunable {
26
+
27
+ template <typename T>
28
+ constexpr hipblasDatatype_t HipDataTypeFor();
29
+
30
+ template <>
31
+ constexpr hipblasDatatype_t HipDataTypeFor<float>() {
32
+ return HIP_R_32F;
33
+ }
34
+
35
+ template <>
36
+ constexpr hipblasDatatype_t HipDataTypeFor<Half>() {
37
+ return HIP_R_16F;
38
+ }
39
+
40
+ template <>
41
+ constexpr hipblasDatatype_t HipDataTypeFor<BFloat16>() {
42
+ return HIP_R_16BF;
43
+ }
44
+
45
+ template <>
46
+ constexpr hipblasDatatype_t HipDataTypeFor<double>() {
47
+ return HIP_R_64F;
48
+ }
49
+
50
+ template <>
51
+ constexpr hipblasDatatype_t HipDataTypeFor<c10::Float8_e4m3fnuz>() {
52
+ return HIP_R_8F_E4M3_FNUZ;
53
+ }
54
+
55
+ template <>
56
+ constexpr hipblasDatatype_t HipDataTypeFor<c10::Float8_e5m2fnuz>() {
57
+ return HIP_R_8F_E5M2_FNUZ;
58
+ }
59
+
60
+ template <typename T>
61
+ int GetBatchFromParams(const GemmParams<T>* params) {
62
+ return 1;
63
+ }
64
+
65
+ template <typename T>
66
+ int GetBatchFromParams(const GemmAndBiasParams<T>* params) {
67
+ return 1;
68
+ }
69
+
70
+ template <typename T>
71
+ int GetBatchFromParams(const GemmStridedBatchedParams<T>* params) {
72
+ return params->batch;
73
+ }
74
+
75
+ template <typename T>
76
+ int GetBatchFromParams(const ScaledGemmParams<T>* params) {
77
+ return 1;
78
+ }
79
+
80
+ template <typename T>
81
+ int GetStrideAFromParams(const GemmParams<T>* params) {
82
+ return 1;
83
+ }
84
+
85
+ template <typename T>
86
+ int GetStrideAFromParams(const GemmAndBiasParams<T>* params) {
87
+ return 1;
88
+ }
89
+
90
+ template <typename T>
91
+ int GetStrideAFromParams(const GemmStridedBatchedParams<T>* params) {
92
+ return params->stride_a;
93
+ }
94
+
95
+ template <typename T>
96
+ int GetStrideAFromParams(const ScaledGemmParams<T>* params) {
97
+ return 1;
98
+ }
99
+
100
+ template <typename T>
101
+ int GetStrideBFromParams(const GemmParams<T>* params) {
102
+ return 1;
103
+ }
104
+
105
+ template <typename T>
106
+ int GetStrideBFromParams(const GemmAndBiasParams<T>* params) {
107
+ return 1;
108
+ }
109
+
110
+ template <typename T>
111
+ int GetStrideBFromParams(const GemmStridedBatchedParams<T>* params) {
112
+ return params->stride_b;
113
+ }
114
+
115
+ template <typename T>
116
+ int GetStrideBFromParams(const ScaledGemmParams<T>* params) {
117
+ return 1;
118
+ }
119
+
120
+ template <typename T>
121
+ int GetStrideCFromParams(const GemmParams<T>* params) {
122
+ return 1;
123
+ }
124
+
125
+ template <typename T>
126
+ int GetStrideCFromParams(const GemmAndBiasParams<T>* params) {
127
+ return 1;
128
+ }
129
+
130
+ template <typename T>
131
+ int GetStrideCFromParams(const GemmStridedBatchedParams<T>* params) {
132
+ return params->stride_c;
133
+ }
134
+
135
+ template <typename T>
136
+ int GetStrideCFromParams(const ScaledGemmParams<T>* params) {
137
+ return 1;
138
+ }
139
+
140
+ template <typename T>
141
+ float GetAlphaFromParams(const GemmParams<T>* params) {
142
+ return params->alpha;
143
+ }
144
+
145
+ template <typename T>
146
+ float GetAlphaFromParams(const GemmAndBiasParams<T>* params) {
147
+ return params->alpha;
148
+ }
149
+
150
+ template <typename T>
151
+ float GetAlphaFromParams(const GemmStridedBatchedParams<T>* params) {
152
+ return params->alpha;
153
+ }
154
+
155
+ template <typename T>
156
+ float GetAlphaFromParams(const ScaledGemmParams<T>* params) {
157
+ return 1.0;
158
+ }
159
+
160
+ template <typename T>
161
+ float GetBetaFromParams(const GemmParams<T>* params) {
162
+ return params->beta;
163
+ }
164
+
165
+ template <typename T>
166
+ float GetBetaFromParams(const GemmAndBiasParams<T>* params) {
167
+ return 0.0;
168
+ }
169
+
170
+ template <typename T>
171
+ float GetBetaFromParams(const GemmStridedBatchedParams<T>* params) {
172
+ return params->beta;
173
+ }
174
+
175
+ template <typename T>
176
+ float GetBetaFromParams(const ScaledGemmParams<T>* params) {
177
+ return 0.0;
178
+ }
179
+
180
+ template <typename T>
181
+ const void* GetAScalePointerFromParams(const GemmParams<T>* params) {
182
+ return nullptr;
183
+ }
184
+
185
+ template <typename T>
186
+ const void* GetAScalePointerFromParams(const GemmAndBiasParams<T>* params) {
187
+ return nullptr;
188
+ }
189
+
190
+ template <typename T>
191
+ const void* GetAScalePointerFromParams(const GemmStridedBatchedParams<T>* params) {
192
+ return nullptr;
193
+ }
194
+
195
+ template <typename T>
196
+ const void* GetAScalePointerFromParams(const ScaledGemmParams<T>* params) {
197
+ return params->a_scale_ptr;
198
+ }
199
+
200
+ template <typename T>
201
+ const void* GetBScalePointerFromParams(const GemmParams<T>* params) {
202
+ return nullptr;
203
+ }
204
+
205
+ template <typename T>
206
+ const void* GetBScalePointerFromParams(const GemmAndBiasParams<T>* params) {
207
+ return nullptr;
208
+ }
209
+
210
+ template <typename T>
211
+ const void* GetBScalePointerFromParams(const GemmStridedBatchedParams<T>* params) {
212
+ return nullptr;
213
+ }
214
+
215
+ template <typename T>
216
+ const void* GetBScalePointerFromParams(const ScaledGemmParams<T>* params) {
217
+ return params->b_scale_ptr;
218
+ }
219
+
220
+ template <typename T>
221
+ const void* GetDScalePointerFromParams(const GemmParams<T>* params) {
222
+ return nullptr;
223
+ }
224
+
225
+ template <typename T>
226
+ const void* GetDScalePointerFromParams(const GemmAndBiasParams<T>* params) {
227
+ return nullptr;
228
+ }
229
+
230
+ template <typename T>
231
+ const void* GetDScalePointerFromParams(const GemmStridedBatchedParams<T>* params) {
232
+ return nullptr;
233
+ }
234
+
235
+ template <typename T>
236
+ const void* GetDScalePointerFromParams(const ScaledGemmParams<T>* params) {
237
+ return params->c_scale_ptr;
238
+ }
239
+
240
+ template <typename T>
241
+ const void* GetBiasPointerFromParams(const GemmParams<T>* params) {
242
+ return nullptr;
243
+ }
244
+
245
+ template <typename T>
246
+ const void* GetBiasPointerFromParams(const GemmAndBiasParams<T>* params) {
247
+ return params->bias;
248
+ }
249
+
250
+ template <typename T>
251
+ const void* GetBiasPointerFromParams(const GemmStridedBatchedParams<T>* params) {
252
+ return nullptr;
253
+ }
254
+
255
+ template <typename T>
256
+ const void* GetBiasPointerFromParams(const ScaledGemmParams<T>* params) {
257
+ return params->bias_ptr;
258
+ }
259
+
260
+ template <typename T>
261
+ hipDataType GetBiasTypeFromParams(const GemmParams<T>* params) {
262
+ return HIP_R_32F;
263
+ }
264
+
265
+ template <typename T>
266
+ hipDataType GetBiasTypeFromParams(const GemmAndBiasParams<T>* params) {
267
+ return HipDataTypeFor<T>();
268
+ }
269
+
270
+ template <typename T>
271
+ hipDataType GetBiasTypeFromParams(const GemmStridedBatchedParams<T>* params) {
272
+ return HIP_R_32F;
273
+ }
274
+
275
+ template <typename T>
276
+ hipDataType GetBiasTypeFromParams(const ScaledGemmParams<T>* params) {
277
+ return at::cuda::ScalarTypeToCudaDataType(params->bias_dtype);
278
+ }
279
+
280
+ template <typename T>
281
+ at::cuda::blas::GEMMAndBiasActivationEpilogue GetActivationFromParams(const GemmParams<T>* params) {
282
+ return at::cuda::blas::GEMMAndBiasActivationEpilogue::None;
283
+ }
284
+
285
+ template <typename T>
286
+ at::cuda::blas::GEMMAndBiasActivationEpilogue GetActivationFromParams(const GemmAndBiasParams<T>* params) {
287
+ return params->activation;
288
+ }
289
+
290
+ template <typename T>
291
+ at::cuda::blas::GEMMAndBiasActivationEpilogue GetActivationFromParams(const GemmStridedBatchedParams<T>* params) {
292
+ return at::cuda::blas::GEMMAndBiasActivationEpilogue::None;
293
+ }
294
+
295
+ template <typename T>
296
+ at::cuda::blas::GEMMAndBiasActivationEpilogue GetActivationFromParams(const ScaledGemmParams<T>* params) {
297
+ return at::cuda::blas::GEMMAndBiasActivationEpilogue::None;
298
+ }
299
+
300
+ static hipblasOperation_t _hipblasOpFromChar(char op) {
301
+ switch (op) {
302
+ case 'n':
303
+ case 'N':
304
+ return HIPBLAS_OP_N;
305
+ case 't':
306
+ case 'T':
307
+ return HIPBLAS_OP_T;
308
+ case 'c':
309
+ case 'C':
310
+ return HIPBLAS_OP_C;
311
+ }
312
+ AT_ERROR(
313
+ "_hipblasOpFromChar input should be 't', 'n' or 'c' but got `", op, "`");
314
+ }
315
+
316
+ static char _charFromhipblasOp(hipblasOperation_t op) {
317
+ switch (op) {
318
+ case HIPBLAS_OP_N:
319
+ return 'N';
320
+ case HIPBLAS_OP_T:
321
+ return 'T';
322
+ case HIPBLAS_OP_C:
323
+ return 'C';
324
+ }
325
+ AT_ERROR(
326
+ "_charFromhipblasOp input should be HIPBLAS_OP_N/T/C but got `", op, "`");
327
+ }
328
+
329
+ static hipblasOperation_t MapLayoutToHipBlasLt(BlasOp layout) {
330
+ if (layout == BlasOp::N) {
331
+ return HIPBLAS_OP_N;
332
+ }
333
+ return HIPBLAS_OP_T;
334
+ }
335
+
336
+ static size_t GetHipblasltWorkspaceSize() {
337
+ static const char * env = getenv("HIPBLASLT_WORKSPACE_SIZE");
338
+ // 256MB is max workspace size allowed for hipblaslt
339
+ // hipblaslt-bench uses 32MB
340
+ // recommendation from hipblaslt author was 76MB
341
+ size_t workspace_size = 32*1024; // going with 32MB
342
+ if (env) {
343
+ try {
344
+ workspace_size = std::stoi(env);
345
+ } catch(std::invalid_argument const& e) {
346
+ TORCH_WARN("invalid HIPBLASLT_WORKSPACE_SIZE,",
347
+ " using default workspace size of ", workspace_size, " KiB.");
348
+ } catch(std::out_of_range const& e) {
349
+ TORCH_WARN("HIPBLASLT_WORKSPACE_SIZE out of range,",
350
+ " using default workspace size of ", workspace_size, " KiB.");
351
+ }
352
+ }
353
+ return workspace_size * 1024;
354
+ }
355
+
356
+ template <typename T, cublasStatus_t (*destructor)(T*)>
357
+ struct HipBlasLtDeleter {
358
+ void operator()(T* x) {
359
+ if (x != nullptr) {
360
+ TORCH_CUDABLAS_CHECK(destructor(x));
361
+ }
362
+ }
363
+ };
364
+
365
+ template <typename T, hipblasStatus_t (*destructor)(T*)>
366
+ class HipBlasLtDescriptor {
367
+ public:
368
+ T* descriptor() const {
369
+ return descriptor_.get();
370
+ }
371
+ T* descriptor() {
372
+ return descriptor_.get();
373
+ }
374
+
375
+ protected:
376
+ std::unique_ptr<T, HipBlasLtDeleter<T, destructor>> descriptor_;
377
+ };
378
+
379
+ class HipBlasLtMatmulDescriptor : public HipBlasLtDescriptor<
380
+ hipblasLtMatmulDescOpaque_t,
381
+ &hipblasLtMatmulDescDestroy> {
382
+ public:
383
+ HipBlasLtMatmulDescriptor(
384
+ hipblasComputeType_t compute_type,
385
+ hipDataType scale_type) {
386
+ hipblasLtMatmulDesc_t raw_descriptor = nullptr;
387
+ TORCH_HIPBLASLT_CHECK(
388
+ hipblasLtMatmulDescCreate(&raw_descriptor, compute_type, scale_type));
389
+ descriptor_.reset(raw_descriptor);
390
+ }
391
+ template <typename T>
392
+ inline void setAttribute(hipblasLtMatmulDescAttributes_t attr, const T value) {
393
+ TORCH_HIPBLASLT_CHECK(::hipblasLtMatmulDescSetAttribute(descriptor(), attr, &value, sizeof(T)));
394
+ }
395
+ };
396
+
397
+ template <typename AT, typename BT, typename CT, BlasOp ALayout, BlasOp BLayout, typename ParamsT>
398
+ class HipblasltGemmOp : public Callable<ParamsT> {
399
+ public:
400
+ HipblasltGemmOp(hipblasLtMatmulAlgo_t algo) : algo_{algo} {}
401
+
402
+ TuningStatus Call(const ParamsT* params) override {
403
+ hipblasOperation_t transa_outer = MapLayoutToHipBlasLt(ALayout);
404
+ hipblasOperation_t transb_outer = MapLayoutToHipBlasLt(BLayout);
405
+ auto a_datatype = HipDataTypeFor<AT>();
406
+ auto b_datatype = HipDataTypeFor<BT>();
407
+ auto in_out_datatype = HipDataTypeFor<CT>();
408
+ auto opa = _hipblasOpFromChar(params->transa);
409
+ auto opb = _hipblasOpFromChar(params->transb);
410
+
411
+ TORCH_CHECK(transa_outer == opa && transb_outer == opb, "trans mismatch, shouldn't happen");
412
+
413
+ float alpha = GetAlphaFromParams<CT>(params);
414
+ float beta = GetBetaFromParams<CT>(params);
415
+
416
+ hipblasLtMatrixLayout_t mat_a, mat_b, mat_c;
417
+ if (opa == HIPBLAS_OP_N) {
418
+ TORCH_HIPBLASLT_CHECK(hipblasLtMatrixLayoutCreate(&mat_a, a_datatype, params->m, params->k, params->lda));
419
+ }
420
+ else {
421
+ TORCH_HIPBLASLT_CHECK(hipblasLtMatrixLayoutCreate(&mat_a, a_datatype, params->k, params->m, params->lda));
422
+ }
423
+ if (opb == HIPBLAS_OP_N) {
424
+ TORCH_HIPBLASLT_CHECK(hipblasLtMatrixLayoutCreate(&mat_b, b_datatype, params->k, params->n, params->ldb));
425
+ }
426
+ else {
427
+ TORCH_HIPBLASLT_CHECK(hipblasLtMatrixLayoutCreate(&mat_b, b_datatype, params->n, params->k, params->ldb));
428
+ }
429
+ TORCH_HIPBLASLT_CHECK(hipblasLtMatrixLayoutCreate(&mat_c, in_out_datatype, params->m, params->n, params->ldc));
430
+
431
+ // specific to batched gemmm
432
+ int batch = GetBatchFromParams<CT>(params);
433
+ if (batch > 1) {
434
+ int64_t stride_a = GetStrideAFromParams<CT>(params);
435
+ int64_t stride_b = GetStrideBFromParams<CT>(params);
436
+ int64_t stride_c = GetStrideCFromParams<CT>(params);
437
+ TORCH_HIPBLASLT_CHECK(hipblasLtMatrixLayoutSetAttribute(
438
+ mat_a, HIPBLASLT_MATRIX_LAYOUT_BATCH_COUNT, &batch, sizeof(batch)));
439
+ TORCH_HIPBLASLT_CHECK(hipblasLtMatrixLayoutSetAttribute(
440
+ mat_a, HIPBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, &stride_a, sizeof(stride_a)));
441
+ TORCH_HIPBLASLT_CHECK(hipblasLtMatrixLayoutSetAttribute(
442
+ mat_b, HIPBLASLT_MATRIX_LAYOUT_BATCH_COUNT, &batch, sizeof(batch)));
443
+ TORCH_HIPBLASLT_CHECK(hipblasLtMatrixLayoutSetAttribute(
444
+ mat_b, HIPBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, &stride_b, sizeof(stride_b)));
445
+ TORCH_HIPBLASLT_CHECK(hipblasLtMatrixLayoutSetAttribute(
446
+ mat_c, HIPBLASLT_MATRIX_LAYOUT_BATCH_COUNT, &batch, sizeof(batch)));
447
+ TORCH_HIPBLASLT_CHECK(hipblasLtMatrixLayoutSetAttribute(
448
+ mat_c, HIPBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, &stride_c, sizeof(stride_c)));
449
+ }
450
+
451
+ HipBlasLtMatmulDescriptor matmul(HIPBLAS_COMPUTE_32F, HIP_R_32F);
452
+ matmul.setAttribute(HIPBLASLT_MATMUL_DESC_TRANSA, opa);
453
+ matmul.setAttribute(HIPBLASLT_MATMUL_DESC_TRANSB, opb);
454
+
455
+ // specific to scaled gemm
456
+ const void* mat1_scale_ptr = GetAScalePointerFromParams<CT>(params);
457
+ const void* mat2_scale_ptr = GetBScalePointerFromParams<CT>(params);
458
+ const void* result_scale_ptr = GetDScalePointerFromParams<CT>(params);
459
+ if (mat1_scale_ptr && mat2_scale_ptr) {
460
+ matmul.setAttribute(HIPBLASLT_MATMUL_DESC_A_SCALE_POINTER, mat1_scale_ptr);
461
+ matmul.setAttribute(HIPBLASLT_MATMUL_DESC_B_SCALE_POINTER, mat2_scale_ptr);
462
+ }
463
+ if (result_scale_ptr) {
464
+ matmul.setAttribute(HIPBLASLT_MATMUL_DESC_D_SCALE_POINTER, result_scale_ptr);
465
+ }
466
+
467
+ const void* bias_ptr = GetBiasPointerFromParams<CT>(params);
468
+ auto bias_datatype = GetBiasTypeFromParams<CT>(params);
469
+ if (bias_ptr) {
470
+ matmul.setAttribute(HIPBLASLT_MATMUL_DESC_BIAS_POINTER, bias_ptr);
471
+ matmul.setAttribute(HIPBLASLT_MATMUL_DESC_BIAS_DATA_TYPE, bias_datatype);
472
+ auto activation = GetActivationFromParams<CT>(params);
473
+ if (activation == at::cuda::blas::GEMMAndBiasActivationEpilogue::RELU) {
474
+ matmul.setAttribute(HIPBLASLT_MATMUL_DESC_EPILOGUE, HIPBLASLT_EPILOGUE_RELU_BIAS);
475
+ }
476
+ else if (activation == at::cuda::blas::GEMMAndBiasActivationEpilogue::GELU) {
477
+ matmul.setAttribute(HIPBLASLT_MATMUL_DESC_EPILOGUE, HIPBLASLT_EPILOGUE_GELU_BIAS);
478
+ }
479
+ else {
480
+ matmul.setAttribute(HIPBLASLT_MATMUL_DESC_EPILOGUE, HIPBLASLT_EPILOGUE_BIAS);
481
+ }
482
+ }
483
+
484
+ size_t workspace_size = GetHipblasltWorkspaceSize();
485
+
486
+ auto op_handle = at::cuda::getCurrentCUDABlasLtHandle();
487
+
488
+ size_t ret_workspace_size = 0;
489
+ auto status = hipblaslt_ext::matmulIsAlgoSupported(op_handle,
490
+ matmul.descriptor(),
491
+ &alpha,
492
+ mat_a,
493
+ mat_b,
494
+ &beta,
495
+ mat_c,
496
+ mat_c,
497
+ algo_,
498
+ ret_workspace_size);
499
+
500
+ if (status == HIPBLAS_STATUS_SUCCESS) {
501
+ if (ret_workspace_size >= workspace_size) {
502
+ return FAIL;
503
+ }
504
+ }
505
+ else {
506
+ return FAIL;
507
+ }
508
+
509
+ void* workspace_buffer = nullptr;
510
+ if (workspace_size > 0) {
511
+ workspace_buffer = c10::cuda::CUDACachingAllocator::raw_alloc(workspace_size);
512
+ }
513
+
514
+ TORCH_HIPBLASLT_CHECK(hipblasLtMatmul(op_handle,
515
+ matmul.descriptor(),
516
+ &alpha,
517
+ params->a,
518
+ mat_a,
519
+ params->b,
520
+ mat_b,
521
+ &beta,
522
+ params->c,
523
+ mat_c,
524
+ params->c,
525
+ mat_c,
526
+ &algo_,
527
+ workspace_buffer,
528
+ workspace_size,
529
+ at::cuda::getCurrentCUDAStream()));
530
+
531
+ //TORCH_HIPBLASLT_CHECK(hipblasLtMatmulDescDestroy(matmul));
532
+ TORCH_HIPBLASLT_CHECK(hipblasLtMatrixLayoutDestroy(mat_a));
533
+ TORCH_HIPBLASLT_CHECK(hipblasLtMatrixLayoutDestroy(mat_b));
534
+ TORCH_HIPBLASLT_CHECK(hipblasLtMatrixLayoutDestroy(mat_c));
535
+ if (workspace_size > 0) {
536
+ c10::cuda::CUDACachingAllocator::raw_delete(workspace_buffer);
537
+ }
538
+ return OK;
539
+ }
540
+
541
+ private:
542
+ hipblasLtMatmulAlgo_t algo_;
543
+ };
544
+
545
+ template <typename AT, typename BT, typename CT, BlasOp ALayout, BlasOp BLayout, typename ParamsT>
546
+ auto GetHipBlasLtTypeStringAndOps() {
547
+ hipblasOperation_t transa_outer = MapLayoutToHipBlasLt(ALayout);
548
+ hipblasOperation_t transb_outer = MapLayoutToHipBlasLt(BLayout);
549
+ auto a_datatype = HipDataTypeFor<AT>();
550
+ auto b_datatype = HipDataTypeFor<BT>();
551
+ auto in_out_datatype = HipDataTypeFor<CT>();
552
+ std::vector<hipblasLtMatmulHeuristicResult_t> heuristic_result;
553
+
554
+ hipblasLtHandle_t handle;
555
+ TORCH_HIPBLASLT_CHECK(hipblasLtCreate(&handle));
556
+ TORCH_HIPBLASLT_CHECK(hipblaslt_ext::getAllAlgos(handle,
557
+ hipblaslt_ext::GemmType::HIPBLASLT_GEMM,
558
+ transa_outer,
559
+ transb_outer,
560
+ a_datatype,
561
+ b_datatype,
562
+ in_out_datatype,
563
+ in_out_datatype,
564
+ HIPBLAS_COMPUTE_32F,
565
+ heuristic_result));
566
+ TORCH_HIPBLASLT_CHECK(hipblasLtDestroy(handle));
567
+
568
+ // Sort heuristic_result by algo index to make sure the order of returned algos is deterministic.
569
+ std::sort(heuristic_result.begin(),
570
+ heuristic_result.end(),
571
+ [](hipblasLtMatmulHeuristicResult_t& a, hipblasLtMatmulHeuristicResult_t& b) {
572
+ return hipblaslt_ext::getIndexFromAlgo(a.algo) < hipblaslt_ext::getIndexFromAlgo(b.algo);
573
+ });
574
+
575
+ int returned_algo_count = heuristic_result.size();
576
+ std::vector<std::pair<std::string, std::unique_ptr<Callable<ParamsT>>>> ret;
577
+ for (int i = 0; i < returned_algo_count; i++) {
578
+ auto algo = heuristic_result[i].algo;
579
+ int algo_index = hipblaslt_ext::getIndexFromAlgo(algo);
580
+ auto callable = std::make_unique<HipblasltGemmOp<AT, BT, CT, ALayout, BLayout, ParamsT>>(algo);
581
+ std::string type_string = c10::str(
582
+ "Gemm_Hipblaslt_", _charFromhipblasOp(transa_outer), _charFromhipblasOp(transb_outer), "_", algo_index);
583
+ ret.emplace_back(type_string, std::move(callable));
584
+ }
585
+
586
+ return ret;
587
+ }
588
+
589
+ template <typename T, BlasOp ALayout, BlasOp BLayout>
590
+ auto GetHipBlasLtGemmTypeStringAndOps() {
591
+ return GetHipBlasLtTypeStringAndOps<T, T, T, ALayout, BLayout, GemmParams<T>>();
592
+ }
593
+
594
+ template <typename T, BlasOp ALayout, BlasOp BLayout>
595
+ auto GetHipBlasLtGemmAndBiasTypeStringAndOps() {
596
+ return GetHipBlasLtTypeStringAndOps<T, T, T, ALayout, BLayout, GemmAndBiasParams<T>>();
597
+ }
598
+
599
+ template <typename T, BlasOp ALayout, BlasOp BLayout>
600
+ auto GetHipBlasLtGemmStridedBatchedTypeStringAndOps() {
601
+ return GetHipBlasLtTypeStringAndOps<T, T, T, ALayout, BLayout, GemmStridedBatchedParams<T>>();
602
+ }
603
+
604
+ template <typename AT, typename BT, typename CT, BlasOp ALayout, BlasOp BLayout>
605
+ auto GetHipBlasLtScaledGemmTypeStringAndOps() {
606
+ return GetHipBlasLtTypeStringAndOps<AT, BT, CT, ALayout, BLayout, ScaledGemmParams<CT>>();
607
+ }
608
+
609
+ #undef TORCH_HIPBLASLT_CHECK
610
+
611
+ } // namespace at::cuda::tunable
.venv/lib/python3.11/site-packages/torch/include/ATen/cuda/tunable/GemmRocblas.h ADDED
@@ -0,0 +1,275 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // Copyright (c) Microsoft Corporation. All rights reserved.
2
+ // Licensed under the MIT License.
3
+
4
+ #pragma once
5
+
6
+ #include <ATen/cuda/CUDAContext.h>
7
+ #include <ATen/cuda/tunable/TunableOp.h>
8
+ #include <ATen/cuda/tunable/GemmCommon.h>
9
+ #include <c10/util/StringUtil.h>
10
+
11
+ #define ROCBLAS_BETA_FEATURES_API
12
+ #include <rocblas/rocblas.h>
13
+
14
+ #define TORCH_ROCBLAS_CHECK(EXPR) \
15
+ do { \
16
+ rocblas_status __err = EXPR; \
17
+ TORCH_CHECK(__err == rocblas_status_success, \
18
+ "rocblas error: ", \
19
+ rocblas_status_to_string(__err), \
20
+ " when calling `" #EXPR "`"); \
21
+ } while (0)
22
+
23
+ namespace at::cuda::tunable {
24
+
25
+ template <typename T>
26
+ constexpr rocblas_datatype RocBlasDataTypeFor();
27
+
28
+ template <>
29
+ constexpr rocblas_datatype RocBlasDataTypeFor<float>() {
30
+ return rocblas_datatype_f32_r;
31
+ }
32
+
33
+ template <>
34
+ constexpr rocblas_datatype RocBlasDataTypeFor<double>() {
35
+ return rocblas_datatype_f64_r;
36
+ }
37
+
38
+ template <>
39
+ constexpr rocblas_datatype RocBlasDataTypeFor<Half>() {
40
+ return rocblas_datatype_f16_r;
41
+ }
42
+
43
+ template <>
44
+ constexpr rocblas_datatype RocBlasDataTypeFor<BFloat16>() {
45
+ return rocblas_datatype_bf16_r;
46
+ }
47
+
48
+ template <>
49
+ constexpr rocblas_datatype RocBlasDataTypeFor<c10::complex<float>>() {
50
+ return rocblas_datatype_f32_c;
51
+ }
52
+
53
+ template <>
54
+ constexpr rocblas_datatype RocBlasDataTypeFor<c10::complex<double>>() {
55
+ return rocblas_datatype_f64_c;
56
+ }
57
+
58
+ template <typename T>
59
+ constexpr rocblas_datatype RocBlasComputeTypeFor();
60
+
61
+ template <>
62
+ constexpr rocblas_datatype RocBlasComputeTypeFor<float>() {
63
+ return rocblas_datatype_f32_r;
64
+ }
65
+
66
+ template <>
67
+ constexpr rocblas_datatype RocBlasComputeTypeFor<double>() {
68
+ return rocblas_datatype_f64_r;
69
+ }
70
+
71
+ template <>
72
+ constexpr rocblas_datatype RocBlasComputeTypeFor<Half>() {
73
+ // Note that we're returning the _compute_ type for a given datatype.
74
+ // As of 12/2022, using compute type FP16 for 16-bit floats was much
75
+ // slower than using compute type FP32. So we use FP32 compute even for
76
+ // FP16 datatypes. This is how GEMM is implemented even in the function
77
+ // rocblasGemmHelper (see fpgeneric.h)
78
+ return rocblas_datatype_f32_r;
79
+ }
80
+
81
+ template <>
82
+ constexpr rocblas_datatype RocBlasComputeTypeFor<BFloat16>() {
83
+ // Note that we're returning the _compute_ type for a given datatype.
84
+ // As of 12/2022, using compute type FP16 for 16-bit floats was much
85
+ // slower than using compute type FP32. So we use FP32 compute even for
86
+ // BF16 datatypes. This is how GEMM is implemented even in the function
87
+ // rocblasGemmHelper (see fpgeneric.h)
88
+ return rocblas_datatype_f32_r;
89
+ }
90
+
91
+ template <>
92
+ constexpr rocblas_datatype RocBlasComputeTypeFor<c10::complex<float>>() {
93
+ return rocblas_datatype_f32_c;
94
+ }
95
+
96
+ template <>
97
+ constexpr rocblas_datatype RocBlasComputeTypeFor<c10::complex<double>>() {
98
+ return rocblas_datatype_f64_c;
99
+ }
100
+
101
+ template <typename T>
102
+ auto DoCastForHalfOrBfloat16(const T fp) {
103
+ return fp;
104
+ }
105
+
106
+ template <>
107
+ inline auto DoCastForHalfOrBfloat16<Half>(const Half fp) {
108
+ // alpha and beta should be the same as compute_type, in Half case it is float.
109
+ float h = fp;
110
+ return h;
111
+ }
112
+
113
+ template <>
114
+ inline auto DoCastForHalfOrBfloat16<BFloat16>(const BFloat16 fp) {
115
+ // alpha and beta should be the same as compute_type, in bfloat16 case it is float.
116
+ float h = fp;
117
+ return h;
118
+ }
119
+
120
+ static rocblas_operation _rocblasOpFromChar(char op) {
121
+ switch (op) {
122
+ case 'n':
123
+ case 'N':
124
+ return rocblas_operation_none;
125
+ case 't':
126
+ case 'T':
127
+ return rocblas_operation_transpose;
128
+ case 'c':
129
+ case 'C':
130
+ return rocblas_operation_conjugate_transpose;
131
+ }
132
+ AT_ERROR(
133
+ "_rocblasOpFromChar input should be 't', 'n' or 'c' but got `", op, "`");
134
+ }
135
+
136
+ template <typename T>
137
+ class RocblasGemmOp : public Callable<GemmParams<T>> {
138
+ public:
139
+ RocblasGemmOp(int solution) : solution_{solution} {}
140
+
141
+ TuningStatus Call(const GemmParams<T>* params) override {
142
+ auto input_output_type = RocBlasDataTypeFor<T>();
143
+ auto compute_type = RocBlasComputeTypeFor<T>();
144
+ auto h_a = DoCastForHalfOrBfloat16(params->alpha);
145
+ auto h_b = DoCastForHalfOrBfloat16(params->beta);
146
+ auto status = rocblas_gemm_ex(
147
+ (rocblas_handle)at::cuda::getCurrentCUDABlasHandle(),
148
+ _rocblasOpFromChar(params->transa),
149
+ _rocblasOpFromChar(params->transb),
150
+ params->m, params->n, params->k,
151
+ &h_a,
152
+ params->a, input_output_type, params->lda,
153
+ params->b, input_output_type, params->ldb,
154
+ &h_b,
155
+ params->c, input_output_type, params->ldc,
156
+ params->c, input_output_type, params->ldc,
157
+ compute_type,
158
+ rocblas_gemm_algo_solution_index,
159
+ solution_,
160
+ rocblas_gemm_flags_none);
161
+ if (status != rocblas_status_success) {
162
+ return FAIL;
163
+ }
164
+ return OK;
165
+ }
166
+
167
+ private:
168
+ int solution_;
169
+ };
170
+
171
+ template <typename T>
172
+ auto GetRocBlasGemmTypeStringAndOps() {
173
+ rocblas_handle handle = (rocblas_handle)at::cuda::getCurrentCUDABlasHandle();
174
+ int solution_size;
175
+ auto input_output_type = RocBlasDataTypeFor<T>();
176
+ auto compute_type = RocBlasComputeTypeFor<T>();
177
+ // Get the number of available solutions
178
+ TORCH_ROCBLAS_CHECK(rocblas_gemm_ex_get_solutions_by_type(handle,
179
+ input_output_type,
180
+ input_output_type,
181
+ compute_type,
182
+ rocblas_gemm_flags_none,
183
+ nullptr,
184
+ &solution_size));
185
+ std::vector<int> solutions(solution_size);
186
+ // Get the list of available solutions
187
+ TORCH_ROCBLAS_CHECK(rocblas_gemm_ex_get_solutions_by_type(handle,
188
+ input_output_type,
189
+ input_output_type,
190
+ compute_type,
191
+ rocblas_gemm_flags_none,
192
+ solutions.data(),
193
+ &solution_size));
194
+ // Sort the solutions in ascending order to make the solution vector deterministic across runs
195
+ std::sort(solutions.begin(), solutions.end());
196
+
197
+ std::vector<std::pair<std::string, std::unique_ptr<Callable<GemmParams<T>>>>> ret;
198
+ for (size_t i = 0; i < solutions.size(); ++i) {
199
+ auto callable = std::make_unique<RocblasGemmOp<T>>(solutions[i]);
200
+ ret.emplace_back(std::make_pair(c10::str("Gemm_Rocblas_", solutions[i]), std::move(callable)));
201
+ }
202
+ return ret;
203
+ }
204
+
205
+ template <typename T>
206
+ class RocblasGemmStridedBatchedOp : public Callable<GemmStridedBatchedParams<T>> {
207
+ public:
208
+ RocblasGemmStridedBatchedOp(int solution) : solution_{solution} {}
209
+
210
+ TuningStatus Call(const GemmStridedBatchedParams<T>* params) override {
211
+ auto input_output_type = RocBlasDataTypeFor<T>();
212
+ auto compute_type = RocBlasComputeTypeFor<T>();
213
+ auto h_a = DoCastForHalfOrBfloat16(params->alpha);
214
+ auto h_b = DoCastForHalfOrBfloat16(params->beta);
215
+ auto status = rocblas_gemm_strided_batched_ex(
216
+ (rocblas_handle)at::cuda::getCurrentCUDABlasHandle(),
217
+ _rocblasOpFromChar(params->transa),
218
+ _rocblasOpFromChar(params->transb),
219
+ params->m, params->n, params->k,
220
+ &h_a,
221
+ params->a, input_output_type, params->lda, params->stride_a,
222
+ params->b, input_output_type, params->ldb, params->stride_b,
223
+ &h_b,
224
+ params->c, input_output_type, params->ldc, params->stride_c,
225
+ params->c, input_output_type, params->ldc, params->stride_c,
226
+ params->batch,
227
+ compute_type,
228
+ rocblas_gemm_algo_solution_index,
229
+ solution_,
230
+ rocblas_gemm_flags_none);
231
+ if (status != rocblas_status_success) {
232
+ return FAIL;
233
+ }
234
+ return OK;
235
+ }
236
+
237
+ private:
238
+ int solution_;
239
+ };
240
+
241
+ template <typename T>
242
+ auto GetRocBlasGemmStridedBatchedTypeStringAndOps() {
243
+ rocblas_handle handle = (rocblas_handle)at::cuda::getCurrentCUDABlasHandle();
244
+ int solution_size;
245
+ auto input_output_type = RocBlasDataTypeFor<T>();
246
+ auto compute_type = RocBlasComputeTypeFor<T>();
247
+ // Get the number of available solutions
248
+ TORCH_ROCBLAS_CHECK(rocblas_gemm_ex_get_solutions_by_type(handle,
249
+ input_output_type,
250
+ input_output_type,
251
+ compute_type,
252
+ rocblas_gemm_flags_none,
253
+ nullptr,
254
+ &solution_size));
255
+ std::vector<int> solutions(solution_size);
256
+ // Get the list of available solutions
257
+ TORCH_ROCBLAS_CHECK(rocblas_gemm_ex_get_solutions_by_type(handle,
258
+ input_output_type,
259
+ input_output_type,
260
+ compute_type,
261
+ rocblas_gemm_flags_none,
262
+ solutions.data(),
263
+ &solution_size));
264
+ // Sort the solutions in ascending order to make the solution vector deterministic across runs
265
+ std::sort(solutions.begin(), solutions.end());
266
+
267
+ std::vector<std::pair<std::string, std::unique_ptr<Callable<GemmStridedBatchedParams<T>>>>> ret;
268
+ for (size_t i = 0; i < solutions.size(); ++i) {
269
+ auto callable = std::make_unique<RocblasGemmStridedBatchedOp<T>>(solutions[i]);
270
+ ret.emplace_back(std::make_pair(c10::str("Gemm_Rocblas_", solutions[i]), std::move(callable)));
271
+ }
272
+ return ret;
273
+ }
274
+
275
+ } // namespace at::cuda::tunable
.venv/lib/python3.11/site-packages/torch/include/ATen/cuda/tunable/StreamTimer.h ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // Original TunableOp is from onnxruntime.
2
+ // https://github.com/microsoft/onnxruntime/blob/main/onnxruntime/core/framework/tunable.h
3
+ // https://github.com/microsoft/onnxruntime/tree/main/onnxruntime/core/providers/rocm/tunable
4
+ // Copyright (c) Microsoft Corporation.
5
+ // Licensed under the MIT license.
6
+ //
7
+ // Adapting TunableOp into PyTorch
8
+ // Copyright (c) Advanced Micro Devices, Inc.
9
+ //
10
+ #pragma once
11
+
12
+ #include <cuda_runtime.h>
13
+
14
+ #include <ATen/cuda/tunable/Tunable.h>
15
+
16
+ namespace at::cuda::tunable {
17
+
18
+ class StreamTimer : public ITimer {
19
+ public:
20
+ StreamTimer();
21
+ virtual ~StreamTimer() override;
22
+
23
+ void Start() override;
24
+
25
+ void End() override;
26
+
27
+ float Duration() override;
28
+
29
+ private:
30
+ cudaEvent_t start_;
31
+ cudaEvent_t end_;
32
+ };
33
+
34
+ } // namespace at::cuda::tunable
.venv/lib/python3.11/site-packages/torch/include/ATen/cuda/tunable/Tunable.h ADDED
@@ -0,0 +1,246 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // Original TunableOp is from onnxruntime.
2
+ // https://github.com/microsoft/onnxruntime/blob/main/onnxruntime/core/framework/tunable.h
3
+ // https://github.com/microsoft/onnxruntime/tree/main/onnxruntime/core/providers/rocm/tunable
4
+ // Copyright (c) Microsoft Corporation.
5
+ // Licensed under the MIT license.
6
+ //
7
+ // Adapting TunableOp into PyTorch
8
+ // Copyright (c) Advanced Micro Devices, Inc.
9
+ //
10
+ #pragma once
11
+
12
+ #include <c10/util/CallOnce.h>
13
+
14
+ #include <fstream>
15
+ #include <functional>
16
+ #include <iostream>
17
+ #include <memory>
18
+ #include <mutex>
19
+ #include <string>
20
+ #include <type_traits>
21
+ #include <unordered_map>
22
+ #include <utility>
23
+ #include <vector>
24
+
25
+ namespace at::cuda::tunable {
26
+
27
+ namespace detail {
28
+
29
+ struct MaybeDelete {
30
+ bool owns_pointer;
31
+ void operator()(std::ostream* os) const { if (owns_pointer) delete os; }
32
+ };
33
+
34
+ using OstreamPtr = std::unique_ptr<std::ostream, MaybeDelete>;
35
+
36
+ static OstreamPtr get_stream(std::string filename) {
37
+ if (filename.compare("out") == 0) {
38
+ return OstreamPtr { &std::cout, MaybeDelete {false} };
39
+ }
40
+ else if (filename.compare("err") == 0) {
41
+ return OstreamPtr { &std::cerr, MaybeDelete {false} };
42
+ }
43
+ else {
44
+ return OstreamPtr { new std::ofstream {filename.c_str()}, MaybeDelete {true} };
45
+ }
46
+ }
47
+
48
+ }
49
+
50
+ static void TunableLog(int level, const std::string& msg) {
51
+ static const char *env_file = getenv("PYTORCH_TUNABLEOP_VERBOSE_FILENAME");
52
+ static const char *env_verbose = getenv("PYTORCH_TUNABLEOP_VERBOSE");
53
+ static int level_user = env_verbose ? atoi(env_verbose) : 0;
54
+ static auto streamptr = detail::get_stream(env_file ? env_file : "err");
55
+ if (level_user >= level) {
56
+ (*streamptr) << msg <<std::endl;
57
+ }
58
+ }
59
+ #define TUNABLE_LOGV(LEVEL, ...) TunableLog(LEVEL, c10::str(__VA_ARGS__))
60
+ #define TUNABLE_LOG1(...) TUNABLE_LOGV(1, __VA_ARGS__)
61
+ #define TUNABLE_LOG2(...) TUNABLE_LOGV(2, __VA_ARGS__)
62
+ #define TUNABLE_LOG3(...) TUNABLE_LOGV(3, __VA_ARGS__)
63
+
64
+ enum TORCH_CUDA_CPP_API TuningStatus {
65
+ OK = 0,
66
+ FAIL = 1,
67
+ UNSUPPORTED = 2,
68
+ };
69
+
70
+ // Mapping from params signature to kernel id
71
+ class TORCH_CUDA_CPP_API ResultEntry {
72
+ public:
73
+ explicit ResultEntry(const std::string& key, double time) : key_(key), time_(time) {}
74
+ bool operator==(const ResultEntry& other) { return key_ == other.key_; }
75
+ bool operator!=(const ResultEntry& other) { return key_ != other.key_; }
76
+ operator std::string () { return key_; }
77
+ std::string GetKey() const { return key_; }
78
+ double GetTime() const { return time_; }
79
+ friend std::ostream& operator<<(std::ostream& stream, const ResultEntry& entry);
80
+ static ResultEntry Null() { return ResultEntry("Null", 0.0); }
81
+ static ResultEntry Default() { return ResultEntry("Default", 0.0); }
82
+
83
+ private:
84
+ std::string key_;
85
+ double time_;
86
+ };
87
+
88
// Per-op map: params signature -> best ResultEntry.
typedef std::unordered_map<std::string, ResultEntry> KernelMap;
// Map: op signature -> its KernelMap.
typedef std::unordered_map<std::string, KernelMap> ResultsMap;

// Serializable bundle of tuning results plus the environment they came from.
struct TORCH_CUDA_CPP_API TuningResults {
  // Validates if these results are compatible with the libraries
  // (key -> recorded value, checked by TuningResultsValidator).
  std::unordered_map<std::string, std::string> validators;

  // Mapping from Callable signature to Callable's tuning result
  ResultsMap results;
};
98
+
99
// Store of tuning results keyed by (op signature, params signature).
// Holds a mutex member, which suggests the public operations synchronize
// on it -- implementations are not visible in this header.
class TORCH_CUDA_CPP_API TuningResultsManager {
  public:
    TuningResultsManager() = default;
    ~TuningResultsManager() = default;

    // Returns the KernelMap for an op signature (by value).
    KernelMap Lookup(const std::string& op_signature);

    // Returns the recorded entry for (op, params); presumably
    // ResultEntry::Null() when absent -- defined elsewhere.
    ResultEntry Lookup(const std::string& op_signature, const std::string& params_signature);

    // Insertion helper operating on an already-located KernelMap.
    inline void AddImpl(const std::string& op_signature,
        const std::string& params_signature,
        ResultEntry best,
        KernelMap& kernel_map);

    // Records the best entry for (op, params).
    void Add(const std::string& op_signature,
        const std::string& params_signature,
        ResultEntry best);

    // Removes the entry for (op, params), if any.
    void Delete(const std::string& op_signature, const std::string& params_signature);

    // Merge helper writing into `results` (out parameter).
    inline void DisjointMergeImpl(
        const std::string& op_signature,
        const KernelMap& kernel_map,
        /*out*/ ResultsMap& results);

    // Loads a full results map (merge semantics defined in the implementation).
    void Load(const ResultsMap& results_to_load);

    // Snapshot of all stored results.
    ResultsMap Dump();

    // Merges one op's kernel map into the stored results.
    void DisjointMerge(const std::string& op_signature, const KernelMap& kernel_map);

    // Number of stored entries.
    size_t GetSize();

  private:
    std::mutex lock_;    // presumably guards results_
    ResultsMap results_;
};
136
+
137
// Checks that loaded tuning results are compatible with the current build,
// via named (get, validate) callback pairs.
class TORCH_CUDA_CPP_API TuningResultsValidator {
  public:
    // Produces the current value for a validator key.
    using GetFunc = std::function<std::string()>;
    // Validates a stored value against the current environment.
    using ValidateFunc = std::function<TuningStatus(const std::string&)>;
    using GetValidateFuncs = std::unordered_map<std::string, std::pair<GetFunc, ValidateFunc>>;

    TuningResultsValidator();
    ~TuningResultsValidator() = default;

    // Current value of every registered validator key.
    std::unordered_map<std::string, std::string> GetAllValidators() const;
    // Runs the registered validators against `to_validate`.
    TuningStatus ValidateAll(const std::unordered_map<std::string, std::string>& to_validate) const;
    // Registers a (get, validate) pair under `key`.
    void RegisterValidator(const std::string& key, const GetFunc& gf, const ValidateFunc& vf);

  protected:
    std::string GetPyTorchVersion() const;
    TuningStatus ValidatePyTorchVersion(const std::string& value) const;

  public:
    // Keys that are always expected to be present when validating.
    static constexpr const std::array mandatory_keys{"PT_VERSION"};

  private:
    GetValidateFuncs validators_;
};
160
+
161
// Process-wide configuration and state for TunableOp; obtain the singleton
// via getTuningContext(). Non-copyable and non-movable.
class TORCH_CUDA_CPP_API TuningContext {
  public:
    TuningContext();
    ~TuningContext();
    TuningContext(TuningContext &) = delete;
    TuningContext(TuningContext &&) = delete;
    TuningContext &operator=(TuningContext &) = delete;
    TuningContext &operator=(TuningContext &&) = delete;

    // Master switch: when disabled, TunableOp::operator() uses the default
    // implementation without any lookup.
    void EnableTunableOp(bool value);
    bool IsTunableOpEnabled() const;

    // When enabled, a missing result triggers on-line tuning; otherwise only
    // previously recorded results are used.
    void EnableTuning(bool value);
    bool IsTuningEnabled() const;

    // When enabled, candidates are numerically compared against the default
    // implementation during tuning (see TunableOp::FindFastest).
    void EnableNumericsCheck(bool value);
    bool IsNumericsCheckEnabled() const;

    // Per-candidate tuning budget, milliseconds.
    void SetMaxTuningDurationMs(int max_duration_ms);
    int GetMaxTuningDurationMs() const;

    // Per-candidate tuning iteration cap.
    void SetMaxTuningIterations(int max_iter);
    int GetMaxTuningIterations() const;

    // Per-candidate warmup budget, milliseconds.
    void SetMaxWarmupDurationMs(int max_duration_ms);
    int GetMaxWarmupDurationMs() const;

    // Per-candidate warmup iteration cap.
    void SetMaxWarmupIterations(int max_iter);
    int GetMaxWarmupIterations() const;

    // When enabled, the instruction cache is flushed between timed calls
    // (see at::cuda::flush_icache usage in TunableOp).
    void EnableICacheFlush(bool value);
    bool IsICacheFlushEnabled() const;

    // Size of the rotating buffer of parameter copies used while tuning;
    // 0 disables rotation. Units appear to be bytes (FindFastest divides by
    // a param byte size) -- confirm against the implementation.
    void SetRotatingBufferSize(int size);
    int GetRotatingBufferSize() const;

    TuningResultsManager& GetTuningResultsManager();

    TuningResultsValidator& GetTuningResultsValidator();

    // Snapshot: validators plus recorded results.
    TuningResults GetTuningResults();

    // Validates and loads previously serialized results.
    TuningStatus LoadTuningResults(const TuningResults& tr);

    // File used by ReadFile/WriteFile; insert_device_ordinal presumably
    // embeds the device index into the name -- defined elsewhere.
    void SetFilename(const std::string& filename, bool insert_device_ordinal=false);
    std::string GetFilename() const;

    // Whether results are flushed to the file when the context is destroyed.
    void WriteFileOnExit(bool value);

    // Read/write results from/to `filename`; empty string means the
    // configured filename.
    bool ReadFile(const std::string& filename={});
    bool WriteFile(const std::string& filename={});

  private:
    bool enable_;
    bool tuning_enable_;
    bool manager_initialized_;
    bool write_file_on_exit_;
    bool numerics_check_enable_;
    int max_tuning_duration_ms_;
    int max_tuning_iterations_;
    int max_warmup_duration_ms_;
    int max_warmup_iterations_;
    bool icache_flush_;
    int rotating_buffer_size_;
    mutable TuningResultsManager manager_;  // lazily initialized via manager_init_once_
    mutable c10::once_flag manager_init_once_;
    TuningResultsValidator validator_;
    std::string filename_;
    size_t results_count_from_input_file_;
};
231
+
232
+ TORCH_CUDA_CPP_API TuningContext* getTuningContext();
233
+
234
// Abstract timer interface used to measure candidate kernels
// (see the TimerT template parameter of TunableOp).
class ITimer {
  public:
    ITimer() = default;
    virtual ~ITimer() = default;

    virtual void Start() = 0;
    virtual void End() = 0;

    /// Computes the elapsed time in milliseconds between Start() and End()
    virtual float Duration() = 0;
};
245
+
246
+ } // namespace at::cuda::tunable
.venv/lib/python3.11/site-packages/torch/include/ATen/cuda/tunable/TunableGemm.h ADDED
@@ -0,0 +1,307 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // Original TunableOp is from onnxruntime.
2
+ // https://github.com/microsoft/onnxruntime/blob/main/onnxruntime/core/framework/tunable.h
3
+ // https://github.com/microsoft/onnxruntime/tree/main/onnxruntime/core/providers/rocm/tunable
4
+ // Copyright (c) Microsoft Corporation.
5
+ // Licensed under the MIT license.
6
+ //
7
+ // Adapting TunableOp into PyTorch
8
+ // Copyright (c) Advanced Micro Devices, Inc.
9
+ //
10
+ #pragma once
11
+
12
+ #include <ATen/cuda/tunable/GemmCommon.h>
13
+ #ifdef USE_ROCM
14
+ #include <ATen/cuda/tunable/GemmHipblaslt.h>
15
+ #include <ATen/cuda/tunable/GemmRocblas.h>
16
+ #endif
17
+ #include <ATen/cuda/tunable/StreamTimer.h>
18
+ #include <ATen/cuda/tunable/TunableOp.h>
19
+ #include <c10/cuda/CUDACachingAllocator.h>
20
+ #include <c10/util/Float8_e4m3fn.h>
21
+ #include <c10/util/Float8_e4m3fnuz.h>
22
+ #include <c10/util/Float8_e5m2.h>
23
+ #include <c10/util/Float8_e5m2fnuz.h>
24
+ #include <c10/util/StringUtil.h>
25
+
26
+ namespace at::cuda::tunable {
27
+
28
// Baseline GEMM candidate: forwards directly to at::cuda::blas::gemm_internal.
// Registered under the name "Default" by GemmTunableOp.
template <typename T>
class DefaultGemmOp : public Callable<GemmParams<T>> {
  public:
    TuningStatus Call(const GemmParams<T>* params) override {
      at::cuda::blas::gemm_internal<T>(
          params->transa, params->transb,
          params->m, params->n, params->k,
          params->alpha,
          params->a, params->lda,
          params->b, params->ldb,
          params->beta,
          params->c, params->ldc);
      return OK;
    }
};
43
+
44
// Maps a BLAS-style transpose flag character to a bool:
// 't'/'T' -> true, any other character -> false.
static bool _transposeBoolFromChar(char op) {
  switch (op) {
    case 't':
    case 'T':
      return true;
    default:
      return false;
  }
}
47
+
48
// Baseline GEMM+bias(+activation) candidate: forwards to
// at::cuda::blas::gemm_and_bias, translating the char transpose flags to
// the bool form that API takes.
template <typename T>
class DefaultGemmAndBiasOp : public Callable<GemmAndBiasParams<T>> {
  public:
    TuningStatus Call(const GemmAndBiasParams<T>* params) override {
      at::cuda::blas::gemm_and_bias<T>(
          _transposeBoolFromChar(params->transa),
          _transposeBoolFromChar(params->transb),
          params->m, params->n, params->k,
          params->alpha,
          params->a, params->lda,
          params->b, params->ldb,
          params->bias,
          params->c, params->ldc,
          params->activation);
      return OK;
    }
};
65
+
66
// Baseline strided-batched GEMM candidate: forwards to
// at::cuda::blas::bgemm_internal.
template <typename T>
class DefaultGemmStridedBatchedOp : public Callable<GemmStridedBatchedParams<T>> {
  public:
    TuningStatus Call(const GemmStridedBatchedParams<T>* params) override {
      at::cuda::blas::bgemm_internal<T>(
          params->transa, params->transb,
          params->m, params->n, params->k,
          params->alpha,
          params->a, params->lda, params->stride_a,
          params->b, params->ldb, params->stride_b,
          params->beta,
          params->c, params->ldc, params->stride_c,
          params->batch);
      return OK;
    }
};
82
+
83
// Baseline scaled (FP8-style) GEMM candidate: forwards to
// at::cuda::blas::scaled_gemm, passing through per-operand scale pointers,
// dtypes, optional bias, amax output pointer, and the fast-accum flag.
template <typename T>
class DefaultScaledGemmOp : public Callable<ScaledGemmParams<T>> {
  public:
    TuningStatus Call(const ScaledGemmParams<T>* params) override {
      at::cuda::blas::scaled_gemm(
          params->transa,
          params->transb,
          params->m,
          params->n,
          params->k,
          params->a,
          params->a_scale_ptr,
          params->lda,
          params->a_dtype,
          params->b,
          params->b_scale_ptr,
          params->ldb,
          params->b_dtype,
          params->bias_ptr,
          params->bias_dtype,
          params->c,
          params->c_scale_ptr,
          params->ldc,
          params->c_dtype,
          params->amax_ptr,
          params->use_fast_accum);
      return OK;
    }
};
112
+
113
// True when the scalar compares equal to zero. Specialized for the reduced-
// precision and complex types below; callers are not visible in this header.
template <typename T>
inline bool IsZero(T v) {
  return v == 0.0f;
}

// BFloat16: compares the raw bit field, so only +0.0 matches
// (-0.0 has a different bit pattern) -- verify this is intended at call sites.
template <>
inline bool IsZero(BFloat16 v) {
  return v.x == 0;
}

// Half: compared via conversion to float, so both +0.0 and -0.0 match.
template <>
inline bool IsZero(Half v) {
  return float(v) == 0.0f;
}

template <>
inline bool IsZero(c10::complex<double> v) {
  return v == 0.0;
}

template <>
inline bool IsZero(c10::complex<float> v) {
  return v == 0.0f;
}
137
+
138
// Human-readable scalar type names used to build tunable-op signatures
// (see the Signature() overrides below). The unused value parameter exists
// only to select the overload. Unspecialized types report "unknown".
template <typename T>
inline std::string TypeName(T v) {
  return "unknown";
}

template <>
inline std::string TypeName(float v) {
  return "float";
}

template <>
inline std::string TypeName(double v) {
  return "double";
}

template <>
inline std::string TypeName(BFloat16 v) {
  return "BFloat16";
}

template <>
inline std::string TypeName(Half v) {
  return "Half";
}

template <>
inline std::string TypeName(Float8_e4m3fn v) {
  return "Float8_e4m3fn";
}

template <>
inline std::string TypeName(Float8_e5m2 v) {
  return "Float8_e5m2";
}

template <>
inline std::string TypeName(Float8_e4m3fnuz v) {
  return "Float8_e4m3fnuz";
}

template <>
inline std::string TypeName(Float8_e5m2fnuz v) {
  return "Float8_e5m2fnuz";
}

template <>
inline std::string TypeName(c10::complex<double> v) {
  return "c10::complex<double>";
}

template <>
inline std::string TypeName(c10::complex<float> v) {
  return "c10::complex<float>";
}
192
+
193
// Tunable GEMM. Registers the "Default" op and, under ROCm, rocBLAS and
// hipBLASLt candidates. Each ROCm backend is enabled when its env var
// (PYTORCH_TUNABLEOP_ROCBLAS_ENABLED / PYTORCH_TUNABLEOP_HIPBLASLT_ENABLED)
// is unset or "1"; any other value disables it.
template <typename T, BlasOp ALayout, BlasOp BLayout>
class GemmTunableOp : public TunableOp<GemmParams<T>, StreamTimer> {
  public:
    GemmTunableOp() {
      this->RegisterOp(std::string("Default"), std::make_unique<DefaultGemmOp<T>>());

#ifdef USE_ROCM
      static const char *env_rocblas = std::getenv("PYTORCH_TUNABLEOP_ROCBLAS_ENABLED");
      if (env_rocblas == nullptr || strcmp(env_rocblas, "1") == 0) {
        for (auto&& [name, op] : GetRocBlasGemmTypeStringAndOps<T>()) {
          this->RegisterOp(std::move(name), std::move(op));
        }
      }

      static const char *env_hipblaslt = std::getenv("PYTORCH_TUNABLEOP_HIPBLASLT_ENABLED");
      if (env_hipblaslt == nullptr || strcmp(env_hipblaslt, "1") == 0) {
        // disallow tuning of hipblaslt with c10::complex
        if constexpr (
            !std::is_same_v<T, c10::complex<float>> &&
            !std::is_same_v<T, c10::complex<double>>) {
          for (auto&& [name, op] : GetHipBlasLtGemmTypeStringAndOps<T, ALayout, BLayout>()) {
            this->RegisterOp(std::move(name), std::move(op));
          }
        }
      }
#endif
    }

    // Stable identifier combining op family, scalar type, and layout pair.
    std::string Signature() override {
      return c10::str("GemmTunableOp_", TypeName<T>(T{}), "_", BlasOpToString(ALayout), BlasOpToString(BLayout));
    }
};
225
+
226
// Tunable GEMM with fused bias/activation epilogue. Registers the "Default"
// op and, under ROCm, hipBLASLt candidates gated by
// PYTORCH_TUNABLEOP_HIPBLASLT_ENABLED (unset or "1" enables).
template <typename T, BlasOp ALayout, BlasOp BLayout>
class GemmAndBiasTunableOp : public TunableOp<GemmAndBiasParams<T>, StreamTimer> {
  public:
    GemmAndBiasTunableOp() {
      this->RegisterOp(std::string("Default"), std::make_unique<DefaultGemmAndBiasOp<T>>());

#ifdef USE_ROCM
      static const char *env_hipblaslt = std::getenv("PYTORCH_TUNABLEOP_HIPBLASLT_ENABLED");
      if (env_hipblaslt == nullptr || strcmp(env_hipblaslt, "1") == 0) {
        // disallow tuning of hipblaslt with c10::complex
        if constexpr (
            !std::is_same_v<T, c10::complex<float>> &&
            !std::is_same_v<T, c10::complex<double>>) {
          for (auto&& [name, op] : GetHipBlasLtGemmAndBiasTypeStringAndOps<T, ALayout, BLayout>()) {
            this->RegisterOp(std::move(name), std::move(op));
          }
        }
      }
#endif
    }

    // Stable identifier combining op family, scalar type, and layout pair.
    std::string Signature() override {
      return c10::str("GemmAndBiasTunableOp_", TypeName<T>(T{}), "_", BlasOpToString(ALayout), BlasOpToString(BLayout));
    }
};
251
+
252
// Tunable strided-batched GEMM. Registers the "Default" op and, under ROCm,
// rocBLAS and hipBLASLt candidates gated by the same env vars as
// GemmTunableOp (unset or "1" enables).
template <typename T, BlasOp ALayout, BlasOp BLayout>
class GemmStridedBatchedTunableOp : public TunableOp<GemmStridedBatchedParams<T>, StreamTimer> {
  public:
    GemmStridedBatchedTunableOp() {
      this->RegisterOp(std::string("Default"), std::make_unique<DefaultGemmStridedBatchedOp<T>>());

#ifdef USE_ROCM
      static const char *env_rocblas = std::getenv("PYTORCH_TUNABLEOP_ROCBLAS_ENABLED");
      if (env_rocblas == nullptr || strcmp(env_rocblas, "1") == 0) {
        for (auto&& [name, op] : GetRocBlasGemmStridedBatchedTypeStringAndOps<T>()) {
          this->RegisterOp(std::move(name), std::move(op));
        }
      }

      static const char *env_hipblaslt = std::getenv("PYTORCH_TUNABLEOP_HIPBLASLT_ENABLED");
      if (env_hipblaslt == nullptr || strcmp(env_hipblaslt, "1") == 0) {
        // disallow tuning of hipblaslt with c10::complex
        if constexpr (
            !std::is_same_v<T, c10::complex<float>> &&
            !std::is_same_v<T, c10::complex<double>>) {
          for (auto&& [name, op] : GetHipBlasLtGemmStridedBatchedTypeStringAndOps<T, ALayout, BLayout>()) {
            this->RegisterOp(std::move(name), std::move(op));
          }
        }
      }
#endif
    }

    // Stable identifier combining op family, scalar type, and layout pair.
    std::string Signature() override {
      return c10::str("GemmStridedBatchedTunableOp_", TypeName<T>(T{}), "_", BlasOpToString(ALayout), BlasOpToString(BLayout));
    }
};
284
+
285
// Tunable scaled GEMM with distinct A/B/C scalar types. Registers the
// "Default" op and, under ROCm, hipBLASLt candidates unconditionally
// (no env-var gate, unlike the other tunable GEMMs above).
template <typename AT, typename BT, typename CT, BlasOp ALayout, BlasOp BLayout>
class ScaledGemmTunableOp : public TunableOp<ScaledGemmParams<CT>, StreamTimer> {
  public:
    ScaledGemmTunableOp() {
      this->RegisterOp(std::string("Default"), std::make_unique<DefaultScaledGemmOp<CT>>());

#ifdef USE_ROCM
      for (auto&& [name, op] : GetHipBlasLtScaledGemmTypeStringAndOps<AT, BT, CT, ALayout, BLayout>()) {
        this->RegisterOp(std::move(name), std::move(op));
      }
#endif
    }

    // Stable identifier combining op family, all three scalar types, and layouts.
    std::string Signature() override {
      return c10::str("ScaledGemmTunableOp",
          "_", TypeName<AT>(AT{}),
          "_", TypeName<BT>(BT{}),
          "_", TypeName<CT>(CT{}),
          "_", BlasOpToString(ALayout), BlasOpToString(BLayout));
    }
};
306
+
307
+ } // namespace at::cuda::tunable
.venv/lib/python3.11/site-packages/torch/include/ATen/cuda/tunable/TunableOp.h ADDED
@@ -0,0 +1,286 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // Original TunableOp is from onnxruntime.
2
+ // https://github.com/microsoft/onnxruntime/blob/main/onnxruntime/core/framework/tunable.h
3
+ // https://github.com/microsoft/onnxruntime/tree/main/onnxruntime/core/providers/rocm/tunable
4
+ // Copyright (c) Microsoft Corporation.
5
+ // Licensed under the MIT license.
6
+ //
7
+ // Adapting TunableOp into PyTorch
8
+ // Copyright (c) Advanced Micro Devices, Inc.
9
+ //
10
+ #pragma once
11
+
12
#include <ATen/cuda/tunable/Tunable.h>
#include <ATen/cuda/Sleep.h>
#include <c10/cuda/CUDACachingAllocator.h>

#ifndef _WIN32
#include <cxxabi.h>
#endif

#include <cstdlib>
#include <memory>
#include <string>
#include <type_traits>
#include <unordered_map>
#include <vector>
24
+
25
+ namespace at::cuda::tunable {
26
+
27
+ template <typename ParamsT>
28
+ class Callable {
29
+ public:
30
+ Callable() = default;
31
+ Callable(Callable&&) = default;
32
+ virtual ~Callable() = default;
33
+ virtual TuningStatus Call(const ParamsT*) {
34
+ return FAIL;
35
+ }
36
+ virtual TuningStatus IsSupported(const ParamsT* params) {
37
+ return Call(params);
38
+ }
39
+ };
40
+
41
+ template <typename ParamsT, typename TimerT>
42
+ class TunableOp {
43
+ public:
44
+ TunableOp() = default;
45
+ TunableOp(TunableOp&&) = default;
46
+ virtual ~TunableOp() = default;
47
+
48
+ TuningStatus operator()(const ParamsT* params) {
49
+ ResultEntry result = ResultEntry::Null();
50
+ TuningContext* ctx = getTuningContext();
51
+ if (ctx->IsTunableOpEnabled()) {
52
+ auto& mgr = ctx->GetTuningResultsManager();
53
+ auto op_sig = Signature();
54
+ auto params_sig = params->Signature();
55
+ result = mgr.Lookup(op_sig, params_sig);
56
+ // If there is not previous tuning result been found, we do the tuning iff tuning is enabled
57
+ if (result == ResultEntry::Null() && ctx->IsTuningEnabled()) {
58
+ result = FindFastest(params);
59
+ mgr.Add(op_sig, params_sig, result);
60
+ }
61
+ }
62
+ else {
63
+ result = ResultEntry::Default();
64
+ }
65
+ if (result == ResultEntry::Null()) {
66
+ TUNABLE_LOG2("no result, using default");
67
+ result = ResultEntry::Default();
68
+ }
69
+ auto iter = ops_.find(result);
70
+ TORCH_CHECK(iter != ops_.end());
71
+ return iter->second->Call(params);
72
+ }
73
+
74
+ virtual std::string Signature() {
75
+ // According to C++17 standard https://wg21.link/n4659 section 15.7.4
76
+ // > if the operand of typeid refers to the
77
+ // > object under construction or destruction, typeid yields the std::type_info object representing the constructor
78
+ // > or destructor’s class.
79
+ // So delay the op signature generation.
80
+ c10::call_once(signature_init_once_, [this]() { signature_ = CreateSignature(); });
81
+ return signature_;
82
+ }
83
+
84
+ protected:
85
+ void RegisterOp(const std::string& name, std::unique_ptr<Callable<ParamsT>> op) {
86
+ this->op_names_.emplace_back(name);
87
+ this->ops_.emplace(name, std::move(op));
88
+ }
89
+
90
+ private:
91
+ static void WarmUp(Callable<ParamsT> *op, const std::vector<ParamsT*> &param, size_t num_iter, size_t &offset) {
92
+ TuningContext* ctx = getTuningContext();
93
+ bool do_flush = ctx->IsICacheFlushEnabled();
94
+ for (size_t i = 0; i < num_iter; i++) {
95
+ if (do_flush) {
96
+ at::cuda::flush_icache();
97
+ }
98
+ TORCH_CHECK(op->Call(param[(i+offset++)%param.size()]) == OK);
99
+ }
100
+ }
101
+
102
+ static double Profile(Callable<ParamsT> *op, const std::vector<ParamsT*> &param, size_t num_iter, size_t &offset) {
103
+ TuningContext* ctx = getTuningContext();
104
+ bool do_flush = ctx->IsICacheFlushEnabled();
105
+ TimerT timer{};
106
+ timer.Start();
107
+ for (size_t i = 0; i < num_iter; i++) {
108
+ if (do_flush) {
109
+ at::cuda::flush_icache();
110
+ }
111
+ TORCH_CHECK(op->Call(param[(i+offset++)%param.size()]) == OK);
112
+ }
113
+ timer.End();
114
+ return timer.Duration() / num_iter;
115
+ }
116
+
117
+ protected:
118
+ virtual ResultEntry FindFastest(const ParamsT* params) {
119
+ TuningContext* ctx = getTuningContext();
120
+ auto op_sig = Signature();
121
+ auto params_sig = params->Signature();
122
+ TUNABLE_LOG2("finding fastest for ", op_sig, '(', params_sig, ')', " out of ", op_names_.size(), " candidates");
123
+ auto min_duration_ms = std::numeric_limits<double>::infinity();
124
+ std::string id_name = "Default";
125
+ ParamsT* reference_params = nullptr;
126
+
127
+ // numeric check option is controlled by non-static env var, so check it once per tuned operator
128
+ bool do_numerics_check = ctx->IsNumericsCheckEnabled();
129
+
130
+ // calcaulte a reference answer for numerical check
131
+ if (do_numerics_check) {
132
+ reference_params = params->DeepCopy(false);
133
+ TORCH_CHECK(ops_[ResultEntry::Default()]->Call(reference_params) == OK);
134
+ }
135
+
136
+ // need copies of params to reuse
137
+ // make as many copies as will fill the requested rotating buffer size, if requested
138
+ // rotating_size guaranteed to be >= 0 even though GetRotatingBufferSize() returns int
139
+ size_t rotating_size = ctx->GetRotatingBufferSize();
140
+ bool use_buffer_rotation = (rotating_size > 0);
141
+ size_t param_size = params->GetSize(use_buffer_rotation);
142
+ size_t param_count = (rotating_size / param_size) + 1;
143
+ constexpr size_t MB = 1024*1024;
144
+ if (use_buffer_rotation) {
145
+ TUNABLE_LOG2("Rotating buffer ", rotating_size/MB, " MiB. ",
146
+ "Needed Size: ", param_size/MB, " MiB. ",
147
+ "Needed number of param copies: ", param_count);
148
+ }
149
+ TORCH_CHECK(param_count > 0);
150
+
151
+ std::vector<ParamsT*> reusable_params(param_count);
152
+ for (size_t i = 0; i < param_count; i++) {
153
+ reusable_params[i] = params->DeepCopy(use_buffer_rotation);
154
+ }
155
+
156
+ // for rotating buffer
157
+ size_t offset = 0;
158
+
159
+ for (size_t i = 0; i < op_names_.size(); i++) {
160
+ auto* candidate = ops_[op_names_[i]].get(); // borrow pointer
161
+
162
+ if (do_numerics_check) {
163
+ ParamsT* numerical_params = params->DeepCopy(false);
164
+ auto status = candidate->Call(numerical_params);
165
+ if (status != OK) {
166
+ numerical_params->Delete();
167
+ TUNABLE_LOG3("├──unsupported id=", i, ", ", op_sig, '(', params_sig, ") ", op_names_[i]);
168
+ continue;
169
+ }
170
+ status = reference_params->NumericalCheck(numerical_params);
171
+ numerical_params->Delete();
172
+ if (status != OK) {
173
+ TUNABLE_LOG3("├──numerics check failed for id=", i, ", ", op_sig, '(', params_sig, ") ", op_names_[i]);
174
+ continue;
175
+ }
176
+ }
177
+ else {
178
+ auto status = candidate->Call(reusable_params[0]);
179
+ if (status != OK) {
180
+ TUNABLE_LOG3("├──unsupported id=", i, ", ", op_sig, '(', params_sig, ") ", op_names_[i]);
181
+ continue;
182
+ }
183
+ }
184
+
185
+ // collect a small profile
186
+ constexpr const int approx_num_iter = 3;
187
+ auto approx_duration = Profile(candidate, reusable_params, approx_num_iter, offset);
188
+ // bail if too slow
189
+ if (approx_duration > 2 * min_duration_ms) {
190
+ TUNABLE_LOG3("├──skip slow instance id=", i, ", ", op_sig, '(', params_sig, ") ", op_names_[i]);
191
+ continue;
192
+ }
193
+
194
+ // for warmup does user set max duration, max iters, or both?
195
+ // warmup is allowed to be skipped by setting either iterations or duration to 0
196
+ double max_warmup_duration = ctx->GetMaxWarmupDurationMs();
197
+ int max_warmup_iter = ctx->GetMaxWarmupIterations();
198
+ int warmup_iter = 1; // default
199
+ if (max_warmup_duration >= 0) {
200
+ int duration_iters = max_warmup_duration / approx_duration;
201
+ if (max_warmup_iter >= 0) {
202
+ warmup_iter = std::min(max_warmup_iter, duration_iters);
203
+ }
204
+ else {
205
+ warmup_iter = duration_iters;
206
+ }
207
+ }
208
+ else if (max_warmup_iter >= 0) {
209
+ warmup_iter = max_warmup_iter;
210
+ }
211
+
212
+ // for tuning does user set max duration, max iters, or both?
213
+ double max_tuning_duration = ctx->GetMaxTuningDurationMs();
214
+ int max_tuning_iter = ctx->GetMaxTuningIterations();
215
+ int tuning_iter = 100; // default
216
+ if (max_tuning_duration > 0) {
217
+ int duration_iters = max_tuning_duration / approx_duration;
218
+ if (max_tuning_iter > 0) {
219
+ tuning_iter = std::min(max_tuning_iter, duration_iters);
220
+ }
221
+ else {
222
+ tuning_iter = duration_iters;
223
+ }
224
+ }
225
+ else if (max_tuning_iter > 0) {
226
+ tuning_iter = max_tuning_iter;
227
+ }
228
+ // tuning must run at least 1 iteration
229
+ tuning_iter = std::max(1, tuning_iter);
230
+
231
+ // do the full warmup followed by tuning
232
+ double warmup_ms = warmup_iter * approx_duration;
233
+ double tuning_ms = tuning_iter * approx_duration;
234
+ TUNABLE_LOG3("├──tuning using "
235
+ "warmup iters ", warmup_iter, " [", warmup_ms, " ms] "
236
+ "and tuning iters ", tuning_iter, " [", tuning_ms, " ms] ",
237
+ "instance id=", i, ", ", op_sig, "(", params_sig, ") ", op_names_[i]);
238
+ TUNABLE_LOG3("├──offset at ", offset);
239
+ WarmUp(candidate, reusable_params, warmup_iter, offset);
240
+ auto duration_ms = Profile(candidate, reusable_params, tuning_iter, offset);
241
+ if (duration_ms < min_duration_ms) {
242
+ TUNABLE_LOG3("├──found better instance id=", i, ". " , duration_ms, "ms. ", op_names_[i]);
243
+ min_duration_ms = duration_ms;
244
+ id_name = op_names_[i];
245
+ }
246
+ }
247
+
248
+ for (size_t i = 0; i < reusable_params.size(); i++) {
249
+ reusable_params[i]->Delete();
250
+ }
251
+ if (reference_params) {
252
+ reference_params->Delete();
253
+ }
254
+
255
+ TUNABLE_LOG2("└──found fastest for ", op_sig, '(', params_sig, ") ", id_name);
256
+ return ResultEntry(id_name, min_duration_ms);
257
+ }
258
+
259
+ private:
260
+ std::string CreateSignature() {
261
+ #ifndef _WIN32
262
+ const auto* name = typeid(*this).name();
263
+ char buf[256];
264
+ size_t buf_len = 256;
265
+ abi::__cxa_demangle(name, buf, &buf_len, nullptr);
266
+ buf[255] = '\0';
267
+ return buf;
268
+ #else
269
+ return typeid(*this).name();
270
+ #endif
271
+ }
272
+
273
+ mutable c10::once_flag signature_init_once_;
274
+ std::string signature_;
275
+
276
+ std::unordered_map<std::string, std::unique_ptr<Callable<ParamsT>>> ops_;
277
+ std::vector<std::string> op_names_;
278
+ };
279
+
280
+ struct OpParams {
281
+ OpParams() {}
282
+ virtual ~OpParams() = default;
283
+ virtual std::string Signature() const = 0;
284
+ };
285
+
286
+ } // namespace at::cuda::tunable
.venv/lib/python3.11/site-packages/torch/include/ATen/native/Activation.h ADDED
@@ -0,0 +1,98 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #pragma once
2
+
3
+ #include <ATen/native/DispatchStub.h>
4
+ #include <c10/util/Exception.h>
5
+ #include <c10/util/string_view.h>
6
+
7
+ namespace c10 {
8
+ class Scalar;
9
+ }
10
+
11
+ namespace at {
12
+ struct TensorIterator;
13
+ struct TensorIteratorBase;
14
+ class TensorBase;
15
+ }
16
+
17
+ namespace at::native {
18
+
19
// These constants control the approximation behavior of gelu function.
enum class GeluType {
  None, // Baseline Gelu
  Tanh, // Tanh Gelu Approximation
  END   // Sentinel, not a valid mode
};
25
+
26
+ inline GeluType get_gelutype_enum(const c10::string_view approximate) {
27
+ if (approximate == "none") {
28
+ return GeluType::None;
29
+ } else if (approximate == "tanh") {
30
+ return GeluType::Tanh;
31
+ } else {
32
+ TORCH_CHECK(false, "approximate argument must be either none or tanh.");
33
+ }
34
+ }
35
+
36
+ inline std::string gelutype_to_string(const GeluType type) {
37
+ switch(type) {
38
+ case GeluType::None: return "none";
39
+ case GeluType::Tanh: return "tanh";
40
+ default: TORCH_CHECK(false, "unknown GELU type: ", static_cast<int>(type));
41
+ }
42
+ }
43
+
44
// Kernel function-pointer signatures for activation ops. The "structured_*"
// aliases take TensorIteratorBase (structured kernels); the plain activation
// aliases take TensorIterator. Scalar parameters carry op-specific arguments
// (e.g. beta/threshold for softplus, alpha/scale/input_scale for elu).
using structured_activation_fn = void (*)(TensorIteratorBase&);
using structured_activation_backward_fn = void (*)(TensorIteratorBase&);

using activation_fn = void (*)(TensorIterator&);
using activation_backward_fn = void (*)(TensorIterator&);
using softplus_fn = void (*)(TensorIteratorBase&, const c10::Scalar&, const c10::Scalar&);
using softplus_backward_fn = void (*)(TensorIteratorBase&, const c10::Scalar&, const c10::Scalar&);
using threshold_fn = void (*)(TensorIteratorBase&, const c10::Scalar&, const c10::Scalar&);
using hardtanh_backward_fn = void (*)(TensorIterator&, const c10::Scalar&, const c10::Scalar&);
using hardsigmoid_fn = void(*)(TensorIteratorBase&);
using hardsigmoid_backward_fn = void(*)(TensorIteratorBase&);
using hardswish_fn = void(*)(TensorIterator&);
using hardswish_backward_fn = void(*)(TensorIterator&);
using shrink_fn = void (*)(TensorIteratorBase&, const c10::Scalar&);
using softshrink_fn = void (*)(TensorIteratorBase&, const c10::Scalar&);
using shrink_backward_fn = void (*)(TensorIteratorBase&, const c10::Scalar&);
using elu_fn = void (*)(TensorIteratorBase&, const c10::Scalar&, const c10::Scalar&, const c10::Scalar&);
using elu_backward_fn = void (*)(TensorIteratorBase&, const c10::Scalar&, const c10::Scalar&, const c10::Scalar&, bool);
using leaky_relu_fn = void (*)(TensorIteratorBase&, const c10::Scalar&);
using leaky_relu_backward_fn = void (*)(TensorIteratorBase&, const c10::Scalar&);
using log_sigmoid_cpu_fn = void (*)(TensorBase&, TensorBase&, const TensorBase&);
using gelu_fn = void (*)(TensorIteratorBase&, GeluType);
using gelu_backward_fn = void (*)(TensorIteratorBase&, GeluType);
using glu_jvp_fn = void (*)(TensorIteratorBase&);

// Per-op dispatch stubs; device-specific kernels register themselves against
// these via the DispatchStub machinery (see ATen/native/DispatchStub.h).
DECLARE_DISPATCH(elu_fn, elu_stub);
DECLARE_DISPATCH(elu_backward_fn, elu_backward_stub);
DECLARE_DISPATCH(softplus_fn, softplus_stub);
DECLARE_DISPATCH(softplus_backward_fn, softplus_backward_stub);
DECLARE_DISPATCH(log_sigmoid_cpu_fn, log_sigmoid_cpu_stub);
DECLARE_DISPATCH(activation_backward_fn, log_sigmoid_backward_stub);
DECLARE_DISPATCH(threshold_fn, threshold_stub);
DECLARE_DISPATCH(gelu_fn, GeluKernel);
DECLARE_DISPATCH(gelu_backward_fn, GeluBackwardKernel);
DECLARE_DISPATCH(hardtanh_backward_fn, hardtanh_backward_stub);
DECLARE_DISPATCH(hardsigmoid_fn, hardsigmoid_stub);
DECLARE_DISPATCH(hardsigmoid_backward_fn, hardsigmoid_backward_stub);
DECLARE_DISPATCH(hardswish_fn, hardswish_stub);
DECLARE_DISPATCH(hardswish_backward_fn, hardswish_backward_stub);
DECLARE_DISPATCH(shrink_fn, hardshrink_stub);
DECLARE_DISPATCH(softshrink_fn, softshrink_stub);
DECLARE_DISPATCH(shrink_backward_fn, shrink_backward_stub);
DECLARE_DISPATCH(leaky_relu_fn, leaky_relu_stub);
DECLARE_DISPATCH(leaky_relu_backward_fn, leaky_relu_backward_stub);
DECLARE_DISPATCH(structured_activation_fn, glu_stub);
DECLARE_DISPATCH(activation_backward_fn, glu_backward_stub);
DECLARE_DISPATCH(glu_jvp_fn, glu_jvp_stub);
DECLARE_DISPATCH(structured_activation_fn, silu_stub);
DECLARE_DISPATCH(structured_activation_backward_fn, silu_backward_stub);
DECLARE_DISPATCH(structured_activation_fn, mish_stub);
DECLARE_DISPATCH(activation_backward_fn, mish_backward_stub);
DECLARE_DISPATCH(activation_fn, prelu_stub);
DECLARE_DISPATCH(activation_backward_fn, prelu_backward_stub);
97
+
98
+ } // namespace at::native
.venv/lib/python3.11/site-packages/torch/include/ATen/native/AdaptivePooling.h ADDED
@@ -0,0 +1,49 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #pragma once
2
+
3
+ #include <ATen/core/Tensor.h>
4
+ #include <ATen/native/DispatchStub.h>
5
+ #include <c10/util/ArrayRef.h>
6
+ #include <c10/util/irange.h>
7
+ #include <cmath>
8
+
9
+ namespace at::native {
10
+
11
+ using adaptive_avg_pooling2d_fn = void(*)(Tensor& output, const Tensor& input, IntArrayRef output_size);
12
+ using adaptive_avg_pooling2d_backward_fn = void(*)(Tensor& grad_input, const Tensor& grad_output);
13
+ DECLARE_DISPATCH(adaptive_avg_pooling2d_fn, adaptive_avg_pool2d_kernel);
14
+ DECLARE_DISPATCH(adaptive_avg_pooling2d_backward_fn, adaptive_avg_pool2d_backward_kernel);
15
+
16
+ using adaptive_max_pooling2d_fn = void(*)(const Tensor& output, const Tensor& indices, const Tensor& input, IntArrayRef output_size);
17
+ using adaptive_max_pooling2d_backward_fn = void(*)(const Tensor& grad_input, const Tensor& grad_output, const Tensor& indices);
18
+ DECLARE_DISPATCH(adaptive_max_pooling2d_fn, adaptive_max_pool2d_kernel);
19
+ DECLARE_DISPATCH(adaptive_max_pooling2d_backward_fn, adaptive_max_pool2d_backward_kernel);
20
+
21
+ using adaptive_avg_pooling3d_fn = void(*)(Tensor& output, const Tensor& input, IntArrayRef output_size);
22
+ using adaptive_avg_pooling3d_backward_fn = void(*)(Tensor& grad_input, const Tensor& grad_output);
23
+ DECLARE_DISPATCH(adaptive_avg_pooling3d_fn, adaptive_avg_pool3d_kernel);
24
+ DECLARE_DISPATCH(adaptive_avg_pooling3d_backward_fn, adaptive_avg_pool3d_backward_kernel);
25
+
26
+ using adaptive_max_pooling3d_fn = void(*)(const Tensor& output, const Tensor& indices, const Tensor& input, IntArrayRef output_size);
27
+ using adaptive_max_pooling3d_backward_fn = void(*)(const Tensor& grad_input, const Tensor& grad_output, const Tensor& indices);
28
+ DECLARE_DISPATCH(adaptive_max_pooling3d_fn, adaptive_max_pool3d_kernel);
29
+ DECLARE_DISPATCH(adaptive_max_pooling3d_backward_fn, adaptive_max_pool3d_backward_kernel);
30
+
31
+ inline int64_t start_index(int64_t a, int64_t b, int64_t c) {
32
+ return (a / b) * c + ((a % b) * c) / b;
33
+ }
34
+
35
+ inline int64_t end_index(int64_t a, int64_t b, int64_t c) {
36
+ return 1 + ((a + 1) * c - 1) / b;
37
+ }
38
+
39
+ inline void adaptive_pool_empty_output_check(const Tensor& gradOutput_, const char* arg_name) {
40
+ int64_t ndim = gradOutput_.ndimension();
41
+ for (const auto i : c10::irange(1, ndim)) {
42
+ TORCH_CHECK(gradOutput_.size(i) > 0,
43
+ arg_name, "(): Expected grad_output to have non-zero size for non-batch dimensions, "
44
+ "but grad_output has sizes ", gradOutput_.sizes(), " with dimension ", i,
45
+ " being empty");
46
+ }
47
+ }
48
+
49
+ } // namespace at::native
.venv/lib/python3.11/site-packages/torch/include/ATen/native/BatchLinearAlgebra.h ADDED
@@ -0,0 +1,321 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #pragma once
2
+
3
+ #include <optional>
4
+ #include <c10/util/string_view.h>
5
+ #include <ATen/Config.h>
6
+ #include <ATen/native/DispatchStub.h>
7
+
8
+ // Forward declare TI
9
+ namespace at {
10
+ class Tensor;
11
+ struct TensorIterator;
12
+
13
+ namespace native {
14
+ enum class TransposeType;
15
+ }
16
+
17
+ }
18
+
19
+ namespace at::native {
20
+
21
+ enum class LapackLstsqDriverType : int64_t { Gels, Gelsd, Gelsy, Gelss};
22
+
23
+ #if AT_BUILD_WITH_LAPACK()
24
+ // Define per-batch functions to be used in the implementation of batched
25
+ // linear algebra operations
26
+
27
+ template <class scalar_t>
28
+ void lapackCholesky(char uplo, int n, scalar_t *a, int lda, int *info);
29
+
30
+ template <class scalar_t>
31
+ void lapackCholeskyInverse(char uplo, int n, scalar_t *a, int lda, int *info);
32
+
33
+ template <class scalar_t, class value_t=scalar_t>
34
+ void lapackEig(char jobvl, char jobvr, int n, scalar_t *a, int lda, scalar_t *w, scalar_t* vl, int ldvl, scalar_t *vr, int ldvr, scalar_t *work, int lwork, value_t *rwork, int *info);
35
+
36
+ template <class scalar_t>
37
+ void lapackGeqrf(int m, int n, scalar_t *a, int lda, scalar_t *tau, scalar_t *work, int lwork, int *info);
38
+
39
+ template <class scalar_t>
40
+ void lapackOrgqr(int m, int n, int k, scalar_t *a, int lda, scalar_t *tau, scalar_t *work, int lwork, int *info);
41
+
42
+ template <class scalar_t>
43
+ void lapackOrmqr(char side, char trans, int m, int n, int k, scalar_t *a, int lda, scalar_t *tau, scalar_t *c, int ldc, scalar_t *work, int lwork, int *info);
44
+
45
+ template <class scalar_t, class value_t = scalar_t>
46
+ void lapackSyevd(char jobz, char uplo, int n, scalar_t* a, int lda, value_t* w, scalar_t* work, int lwork, value_t* rwork, int lrwork, int* iwork, int liwork, int* info);
47
+
48
+ template <class scalar_t>
49
+ void lapackGels(char trans, int m, int n, int nrhs,
50
+ scalar_t *a, int lda, scalar_t *b, int ldb,
51
+ scalar_t *work, int lwork, int *info);
52
+
53
+ template <class scalar_t, class value_t = scalar_t>
54
+ void lapackGelsd(int m, int n, int nrhs,
55
+ scalar_t *a, int lda, scalar_t *b, int ldb,
56
+ value_t *s, value_t rcond, int *rank,
57
+ scalar_t* work, int lwork,
58
+ value_t *rwork, int* iwork, int *info);
59
+
60
+ template <class scalar_t, class value_t = scalar_t>
61
+ void lapackGelsy(int m, int n, int nrhs,
62
+ scalar_t *a, int lda, scalar_t *b, int ldb,
63
+ int *jpvt, value_t rcond, int *rank,
64
+ scalar_t *work, int lwork, value_t* rwork, int *info);
65
+
66
+ template <class scalar_t, class value_t = scalar_t>
67
+ void lapackGelss(int m, int n, int nrhs,
68
+ scalar_t *a, int lda, scalar_t *b, int ldb,
69
+ value_t *s, value_t rcond, int *rank,
70
+ scalar_t *work, int lwork,
71
+ value_t *rwork, int *info);
72
+
73
+ template <LapackLstsqDriverType, class scalar_t, class value_t = scalar_t>
74
+ struct lapackLstsq_impl;
75
+
76
+ template <class scalar_t, class value_t>
77
+ struct lapackLstsq_impl<LapackLstsqDriverType::Gels, scalar_t, value_t> {
78
+ static void call(
79
+ char trans, int m, int n, int nrhs,
80
+ scalar_t *a, int lda, scalar_t *b, int ldb,
81
+ scalar_t *work, int lwork, int *info, // Gels flavor
82
+ int *jpvt, value_t rcond, int *rank, value_t* rwork, // Gelsy flavor
83
+ value_t *s, // Gelss flavor
84
+ int *iwork // Gelsd flavor
85
+ ) {
86
+ lapackGels<scalar_t>(
87
+ trans, m, n, nrhs,
88
+ a, lda, b, ldb,
89
+ work, lwork, info);
90
+ }
91
+ };
92
+
93
+ template <class scalar_t, class value_t>
94
+ struct lapackLstsq_impl<LapackLstsqDriverType::Gelsy, scalar_t, value_t> {
95
+ static void call(
96
+ char trans, int m, int n, int nrhs,
97
+ scalar_t *a, int lda, scalar_t *b, int ldb,
98
+ scalar_t *work, int lwork, int *info, // Gels flavor
99
+ int *jpvt, value_t rcond, int *rank, value_t* rwork, // Gelsy flavor
100
+ value_t *s, // Gelss flavor
101
+ int *iwork // Gelsd flavor
102
+ ) {
103
+ lapackGelsy<scalar_t, value_t>(
104
+ m, n, nrhs,
105
+ a, lda, b, ldb,
106
+ jpvt, rcond, rank,
107
+ work, lwork, rwork, info);
108
+ }
109
+ };
110
+
111
+ template <class scalar_t, class value_t>
112
+ struct lapackLstsq_impl<LapackLstsqDriverType::Gelsd, scalar_t, value_t> {
113
+ static void call(
114
+ char trans, int m, int n, int nrhs,
115
+ scalar_t *a, int lda, scalar_t *b, int ldb,
116
+ scalar_t *work, int lwork, int *info, // Gels flavor
117
+ int *jpvt, value_t rcond, int *rank, value_t* rwork, // Gelsy flavor
118
+ value_t *s, // Gelss flavor
119
+ int *iwork // Gelsd flavor
120
+ ) {
121
+ lapackGelsd<scalar_t, value_t>(
122
+ m, n, nrhs,
123
+ a, lda, b, ldb,
124
+ s, rcond, rank,
125
+ work, lwork,
126
+ rwork, iwork, info);
127
+ }
128
+ };
129
+
130
+ template <class scalar_t, class value_t>
131
+ struct lapackLstsq_impl<LapackLstsqDriverType::Gelss, scalar_t, value_t> {
132
+ static void call(
133
+ char trans, int m, int n, int nrhs,
134
+ scalar_t *a, int lda, scalar_t *b, int ldb,
135
+ scalar_t *work, int lwork, int *info, // Gels flavor
136
+ int *jpvt, value_t rcond, int *rank, value_t* rwork, // Gelsy flavor
137
+ value_t *s, // Gelss flavor
138
+ int *iwork // Gelsd flavor
139
+ ) {
140
+ lapackGelss<scalar_t, value_t>(
141
+ m, n, nrhs,
142
+ a, lda, b, ldb,
143
+ s, rcond, rank,
144
+ work, lwork,
145
+ rwork, info);
146
+ }
147
+ };
148
+
149
+ template <LapackLstsqDriverType driver_type, class scalar_t, class value_t = scalar_t>
150
+ void lapackLstsq(
151
+ char trans, int m, int n, int nrhs,
152
+ scalar_t *a, int lda, scalar_t *b, int ldb,
153
+ scalar_t *work, int lwork, int *info, // Gels flavor
154
+ int *jpvt, value_t rcond, int *rank, value_t* rwork, // Gelsy flavor
155
+ value_t *s, // Gelss flavor
156
+ int *iwork // Gelsd flavor
157
+ ) {
158
+ lapackLstsq_impl<driver_type, scalar_t, value_t>::call(
159
+ trans, m, n, nrhs,
160
+ a, lda, b, ldb,
161
+ work, lwork, info,
162
+ jpvt, rcond, rank, rwork,
163
+ s,
164
+ iwork);
165
+ }
166
+
167
+ template <class scalar_t>
168
+ void lapackLuSolve(char trans, int n, int nrhs, scalar_t *a, int lda, int *ipiv, scalar_t *b, int ldb, int *info);
169
+
170
+ template <class scalar_t>
171
+ void lapackLu(int m, int n, scalar_t *a, int lda, int *ipiv, int *info);
172
+
173
+ template <class scalar_t>
174
+ void lapackLdlHermitian(
175
+ char uplo,
176
+ int n,
177
+ scalar_t* a,
178
+ int lda,
179
+ int* ipiv,
180
+ scalar_t* work,
181
+ int lwork,
182
+ int* info);
183
+
184
+ template <class scalar_t>
185
+ void lapackLdlSymmetric(
186
+ char uplo,
187
+ int n,
188
+ scalar_t* a,
189
+ int lda,
190
+ int* ipiv,
191
+ scalar_t* work,
192
+ int lwork,
193
+ int* info);
194
+
195
+ template <class scalar_t>
196
+ void lapackLdlSolveHermitian(
197
+ char uplo,
198
+ int n,
199
+ int nrhs,
200
+ scalar_t* a,
201
+ int lda,
202
+ int* ipiv,
203
+ scalar_t* b,
204
+ int ldb,
205
+ int* info);
206
+
207
+ template <class scalar_t>
208
+ void lapackLdlSolveSymmetric(
209
+ char uplo,
210
+ int n,
211
+ int nrhs,
212
+ scalar_t* a,
213
+ int lda,
214
+ int* ipiv,
215
+ scalar_t* b,
216
+ int ldb,
217
+ int* info);
218
+
219
+ template<class scalar_t, class value_t=scalar_t>
220
+ void lapackSvd(char jobz, int m, int n, scalar_t *a, int lda, value_t *s, scalar_t *u, int ldu, scalar_t *vt, int ldvt, scalar_t *work, int lwork, value_t *rwork, int *iwork, int *info);
221
+ #endif
222
+
223
+ #if AT_BUILD_WITH_BLAS()
224
+ template <class scalar_t>
225
+ void blasTriangularSolve(char side, char uplo, char trans, char diag, int n, int nrhs, scalar_t* a, int lda, scalar_t* b, int ldb);
226
+ #endif
227
+
228
+ using cholesky_fn = void (*)(const Tensor& /*input*/, const Tensor& /*info*/, bool /*upper*/);
229
+ DECLARE_DISPATCH(cholesky_fn, cholesky_stub);
230
+
231
+ using cholesky_inverse_fn = Tensor& (*)(Tensor& /*result*/, Tensor& /*infos*/, bool /*upper*/);
232
+
233
+ DECLARE_DISPATCH(cholesky_inverse_fn, cholesky_inverse_stub);
234
+
235
+ using linalg_eig_fn = void (*)(Tensor& /*eigenvalues*/, Tensor& /*eigenvectors*/, Tensor& /*infos*/, const Tensor& /*input*/, bool /*compute_eigenvectors*/);
236
+
237
+ DECLARE_DISPATCH(linalg_eig_fn, linalg_eig_stub);
238
+
239
+ using geqrf_fn = void (*)(const Tensor& /*input*/, const Tensor& /*tau*/);
240
+ DECLARE_DISPATCH(geqrf_fn, geqrf_stub);
241
+
242
+ using orgqr_fn = Tensor& (*)(Tensor& /*result*/, const Tensor& /*tau*/);
243
+ DECLARE_DISPATCH(orgqr_fn, orgqr_stub);
244
+
245
+ using ormqr_fn = void (*)(const Tensor& /*input*/, const Tensor& /*tau*/, const Tensor& /*other*/, bool /*left*/, bool /*transpose*/);
246
+ DECLARE_DISPATCH(ormqr_fn, ormqr_stub);
247
+
248
+ using linalg_eigh_fn = void (*)(
249
+ const Tensor& /*eigenvalues*/,
250
+ const Tensor& /*eigenvectors*/,
251
+ const Tensor& /*infos*/,
252
+ bool /*upper*/,
253
+ bool /*compute_eigenvectors*/);
254
+ DECLARE_DISPATCH(linalg_eigh_fn, linalg_eigh_stub);
255
+
256
+ using lstsq_fn = void (*)(
257
+ const Tensor& /*a*/,
258
+ Tensor& /*b*/,
259
+ Tensor& /*rank*/,
260
+ Tensor& /*singular_values*/,
261
+ Tensor& /*infos*/,
262
+ double /*rcond*/,
263
+ std::string /*driver_name*/);
264
+ DECLARE_DISPATCH(lstsq_fn, lstsq_stub);
265
+
266
+ using triangular_solve_fn = void (*)(
267
+ const Tensor& /*A*/,
268
+ const Tensor& /*B*/,
269
+ bool /*left*/,
270
+ bool /*upper*/,
271
+ TransposeType /*transpose*/,
272
+ bool /*unitriangular*/);
273
+ DECLARE_DISPATCH(triangular_solve_fn, triangular_solve_stub);
274
+
275
+ using lu_factor_fn = void (*)(
276
+ const Tensor& /*input*/,
277
+ const Tensor& /*pivots*/,
278
+ const Tensor& /*infos*/,
279
+ bool /*compute_pivots*/);
280
+ DECLARE_DISPATCH(lu_factor_fn, lu_factor_stub);
281
+
282
+ using unpack_pivots_fn = void(*)(
283
+ TensorIterator& iter,
284
+ const int64_t dim_size,
285
+ const int64_t max_pivot);
286
+ DECLARE_DISPATCH(unpack_pivots_fn, unpack_pivots_stub);
287
+
288
+ using lu_solve_fn = void (*)(
289
+ const Tensor& /*LU*/,
290
+ const Tensor& /*pivots*/,
291
+ const Tensor& /*B*/,
292
+ TransposeType /*trans*/);
293
+ DECLARE_DISPATCH(lu_solve_fn, lu_solve_stub);
294
+
295
+ using ldl_factor_fn = void (*)(
296
+ const Tensor& /*LD*/,
297
+ const Tensor& /*pivots*/,
298
+ const Tensor& /*info*/,
299
+ bool /*upper*/,
300
+ bool /*hermitian*/);
301
+ DECLARE_DISPATCH(ldl_factor_fn, ldl_factor_stub);
302
+
303
+ using svd_fn = void (*)(
304
+ const Tensor& /*A*/,
305
+ const bool /*full_matrices*/,
306
+ const bool /*compute_uv*/,
307
+ const std::optional<c10::string_view>& /*driver*/,
308
+ const Tensor& /*U*/,
309
+ const Tensor& /*S*/,
310
+ const Tensor& /*Vh*/,
311
+ const Tensor& /*info*/);
312
+ DECLARE_DISPATCH(svd_fn, svd_stub);
313
+
314
+ using ldl_solve_fn = void (*)(
315
+ const Tensor& /*LD*/,
316
+ const Tensor& /*pivots*/,
317
+ const Tensor& /*result*/,
318
+ bool /*upper*/,
319
+ bool /*hermitian*/);
320
+ DECLARE_DISPATCH(ldl_solve_fn, ldl_solve_stub);
321
+ } // namespace at::native
.venv/lib/python3.11/site-packages/torch/include/ATen/native/BinaryOps.h ADDED
@@ -0,0 +1,119 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #pragma once
2
+
3
+ #include <ATen/core/TensorBase.h>
4
+ #include <ATen/native/DispatchStub.h>
5
+ #include <c10/core/Scalar.h>
6
+ #include <c10/util/TypeSafeSignMath.h>
7
+
8
+
9
+ namespace at {
10
+ struct TensorIterator;
11
+ struct TensorIteratorBase;
12
+ }
13
+
14
+ namespace at::native {
15
+
16
+ inline void alpha_check(const ScalarType dtype, const Scalar& alpha) {
17
+ TORCH_CHECK(! alpha.isBoolean() || dtype == ScalarType::Bool,
18
+ "Boolean alpha only supported for Boolean results.");
19
+ TORCH_CHECK(isFloatingType(dtype) || isComplexType(dtype)
20
+ || alpha.isIntegral(true),
21
+ "For integral input tensors, argument alpha must not be a floating point number.");
22
+ TORCH_CHECK(isComplexType(dtype) || !alpha.isComplex(),
23
+ "For non-complex input tensors, argument alpha must not be a complex number.")
24
+ }
25
+
26
+ // Basic checking for all sub functions.
27
+ inline void sub_check(const TensorBase& self, const TensorBase& other) {
28
+ TORCH_CHECK(self.scalar_type() != kBool || other.scalar_type() != kBool,
29
+ "Subtraction, the `-` operator, with two bool tensors is not supported. "
30
+ "Use the `^` or `logical_xor()` operator instead.")
31
+ TORCH_CHECK(self.scalar_type() != kBool && other.scalar_type() != kBool,
32
+ "Subtraction, the `-` operator, with a bool tensor is not supported. "
33
+ "If you are trying to invert a mask, use the `~` or `logical_not()` operator instead.");
34
+ }
35
+
36
+ inline void sub_check(const TensorBase& self, const Scalar& scalar) {
37
+ TORCH_CHECK(self.scalar_type() != kBool || !scalar.isBoolean(),
38
+ "Subtraction, the `-` operator, with two bool tensors is not supported. "
39
+ "Use the `^` or `logical_xor()` operator instead.")
40
+ TORCH_CHECK(self.scalar_type() != kBool && !scalar.isBoolean(),
41
+ "Subtraction, the `-` operator, with a bool tensor is not supported. "
42
+ "If you are trying to invert a mask, use the `~` or `logical_not()` operator instead.");
43
+ }
44
+
45
+ using structured_binary_fn_alpha = void(*)(TensorIteratorBase&, const Scalar& alpha);
46
+ using structured_binary_fn_double = void(*)(TensorIteratorBase&, double);
47
+ using structured_binary_fn = void(*)(TensorIteratorBase&);
48
+
49
+ using binary_fn_alpha = void(*)(TensorIteratorBase&, const Scalar& alpha);
50
+ using binary_fn_double = void(*)(TensorIterator&, double);
51
+ using binary_fn = void(*)(TensorIterator&);
52
+ using binary_clamp_fn_alpha =
53
+ void(*)(TensorIterator&, const Scalar& alpha, const Scalar& min_val, const Scalar& max_val);
54
+
55
+ // NB: codegenned
56
+ DECLARE_DISPATCH(structured_binary_fn_alpha, add_stub);
57
+
58
+ DECLARE_DISPATCH(binary_clamp_fn_alpha, add_clamp_stub);
59
+ DECLARE_DISPATCH(structured_binary_fn_alpha, sub_stub);
60
+ DECLARE_DISPATCH(structured_binary_fn, mul_stub);
61
+ DECLARE_DISPATCH(structured_binary_fn, div_true_stub);
62
+ DECLARE_DISPATCH(structured_binary_fn, div_floor_stub);
63
+ DECLARE_DISPATCH(structured_binary_fn, div_trunc_stub);
64
+ DECLARE_DISPATCH(structured_binary_fn, atan2_stub);
65
+ DECLARE_DISPATCH(structured_binary_fn, remainder_stub);
66
+ DECLARE_DISPATCH(structured_binary_fn, bitwise_and_stub);
67
+ DECLARE_DISPATCH(structured_binary_fn, bitwise_or_stub);
68
+ DECLARE_DISPATCH(structured_binary_fn, bitwise_xor_stub);
69
+ DECLARE_DISPATCH(structured_binary_fn, lshift_stub);
70
+ DECLARE_DISPATCH(structured_binary_fn, rshift_stub);
71
+ DECLARE_DISPATCH(binary_fn, logical_xor_stub);
72
+ DECLARE_DISPATCH(binary_fn, logical_and_stub);
73
+ DECLARE_DISPATCH(binary_fn, logical_or_stub);
74
+ DECLARE_DISPATCH(structured_binary_fn, lt_stub);
75
+ DECLARE_DISPATCH(structured_binary_fn, le_stub);
76
+ DECLARE_DISPATCH(structured_binary_fn, gt_stub);
77
+ DECLARE_DISPATCH(structured_binary_fn, ge_stub);
78
+ DECLARE_DISPATCH(structured_binary_fn, eq_stub);
79
+ DECLARE_DISPATCH(structured_binary_fn, ne_stub);
80
+ DECLARE_DISPATCH(binary_fn, max_elementwise_stub);
81
+ DECLARE_DISPATCH(binary_fn, min_elementwise_stub);
82
+ DECLARE_DISPATCH(structured_binary_fn, maximum_stub);
83
+ DECLARE_DISPATCH(structured_binary_fn, minimum_stub);
84
+ DECLARE_DISPATCH(structured_binary_fn, fmax_stub);
85
+ DECLARE_DISPATCH(structured_binary_fn, fmin_stub);
86
+ DECLARE_DISPATCH(structured_binary_fn_double, smooth_l1_stub);
87
+ DECLARE_DISPATCH(binary_fn_double, huber_stub);
88
+ DECLARE_DISPATCH(structured_binary_fn, sigmoid_backward_stub);
89
+ DECLARE_DISPATCH(binary_fn_alpha, logit_backward_stub);
90
+ DECLARE_DISPATCH(structured_binary_fn, tanh_backward_stub);
91
+ DECLARE_DISPATCH(structured_binary_fn, mse_stub);
92
+ DECLARE_DISPATCH(structured_binary_fn, fmod_stub);
93
+ DECLARE_DISPATCH(structured_binary_fn, logaddexp_stub);
94
+ DECLARE_DISPATCH(structured_binary_fn, logaddexp2_stub);
95
+ DECLARE_DISPATCH(structured_binary_fn, gcd_stub);
96
+ DECLARE_DISPATCH(structured_binary_fn, lcm_stub);
97
+ DECLARE_DISPATCH(structured_binary_fn, hypot_stub);
98
+ DECLARE_DISPATCH(structured_binary_fn, igamma_stub);
99
+ DECLARE_DISPATCH(structured_binary_fn, igammac_stub);
100
+ DECLARE_DISPATCH(structured_binary_fn, nextafter_stub);
101
+ DECLARE_DISPATCH(structured_binary_fn, heaviside_stub);
102
+ DECLARE_DISPATCH(structured_binary_fn, copysign_stub);
103
+ DECLARE_DISPATCH(structured_binary_fn, xlogy_stub);
104
+ DECLARE_DISPATCH(structured_binary_fn, xlog1py_stub);
105
+ DECLARE_DISPATCH(structured_binary_fn, zeta_stub);
106
+ DECLARE_DISPATCH(structured_binary_fn, chebyshev_polynomial_t_stub);
107
+ DECLARE_DISPATCH(structured_binary_fn, chebyshev_polynomial_u_stub);
108
+ DECLARE_DISPATCH(structured_binary_fn, chebyshev_polynomial_v_stub);
109
+ DECLARE_DISPATCH(structured_binary_fn, chebyshev_polynomial_w_stub);
110
+ DECLARE_DISPATCH(structured_binary_fn, hermite_polynomial_h_stub);
111
+ DECLARE_DISPATCH(structured_binary_fn, hermite_polynomial_he_stub);
112
+ DECLARE_DISPATCH(structured_binary_fn, laguerre_polynomial_l_stub);
113
+ DECLARE_DISPATCH(structured_binary_fn, legendre_polynomial_p_stub);
114
+ DECLARE_DISPATCH(structured_binary_fn, shifted_chebyshev_polynomial_t_stub);
115
+ DECLARE_DISPATCH(structured_binary_fn, shifted_chebyshev_polynomial_u_stub);
116
+ DECLARE_DISPATCH(structured_binary_fn, shifted_chebyshev_polynomial_v_stub);
117
+ DECLARE_DISPATCH(structured_binary_fn, shifted_chebyshev_polynomial_w_stub);
118
+
119
+ } // namespace at::native
.venv/lib/python3.11/site-packages/torch/include/ATen/native/ComplexHelper.h ADDED
@@ -0,0 +1,97 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #pragma once
2
+
3
+ #include <ATen/core/Tensor.h>
4
+ #include <c10/util/irange.h>
5
+
6
+ #ifndef AT_PER_OPERATOR_HEADERS
7
+ #include <ATen/NativeFunctions.h>
8
+ #else
9
+ #include <ATen/ops/view_as_real_native.h>
10
+ #include <ATen/ops/view_as_complex_native.h>
11
+
12
+ #include <utility>
13
+ #endif
14
+
15
+ // WARNING: this header contains non-inline functions and should be only
16
+ // included from ONE cpp file
17
+
18
+ namespace at::native {
19
+
20
+ // View tensor with new dtype, storage offset, sizes and strides
21
+ inline Tensor view_tensor(
22
+ const Tensor &tensor, ScalarType dtype,
23
+ c10::SymInt offset, SymIntArrayRef sizes, SymIntArrayRef strides) {
24
+ Storage storage = tensor.storage();
25
+ auto key_set = tensor.key_set().remove(DispatchKey::Conjugate);
26
+ auto new_tensor = detail::make_tensor<TensorImpl>(
27
+ c10::TensorImpl::VIEW, std::move(storage), key_set, scalarTypeToTypeMeta(dtype));
28
+ auto * impl = new_tensor.unsafeGetTensorImpl();
29
+ impl->set_sizes_and_strides(sizes, strides, offset);
30
+ return new_tensor;
31
+ }
32
+
33
+ inline SymDimVector computeStrideForViewAsReal(SymIntArrayRef oldstride) {
34
+ SymDimVector res(oldstride.size() + 1);
35
+ for (const auto i : c10::irange(oldstride.size())) {
36
+ res[i] = oldstride[i] * 2;
37
+ }
38
+ res.back() = 1;
39
+ return res;
40
+ }
41
+
42
+ inline Tensor _view_as_real_physical(const Tensor& self) {
43
+ TORCH_CHECK(self.is_complex(), "view_as_real is only supported for complex tensors");
44
+ auto old_sizes = self.sym_sizes();
45
+ SymDimVector new_sizes(old_sizes.size() + 1);
46
+ std::copy(old_sizes.begin(), old_sizes.end(), new_sizes.begin());
47
+ // last dimension will always have two elements containing the real and imag vals
48
+ new_sizes.back() = 2;
49
+ auto new_strides = computeStrideForViewAsReal(self.sym_strides());
50
+ auto new_storage_offset = self.sym_storage_offset() * 2;
51
+ const auto float_type = c10::toRealValueType(self.scalar_type());
52
+ auto real_tensor = view_tensor(self, float_type, std::move(new_storage_offset), new_sizes, new_strides);
53
+ return real_tensor;
54
+ }
55
+
56
+ // expects as input a complex tensor and returns back a tensor
57
+ // with corresponding real dtype containing the complex values
58
+ // in the last two dimensions
59
+ Tensor view_as_real(const Tensor& self) {
60
+ TORCH_CHECK(!self.is_conj(), "view_as_real doesn't work on unresolved conjugated tensors. To resolve the conjugate tensor so you can view it as real, use self.resolve_conj(); however, be warned that the resulting tensor will NOT alias the original.");
61
+ return _view_as_real_physical(self);
62
+ }
63
+
64
+ inline SymDimVector computeStrideForViewAsComplex(SymIntArrayRef oldstride) {
65
+ const auto dim = oldstride.size();
66
+ TORCH_CHECK(dim > 0 && oldstride[dim - 1] == 1, "Tensor must have a last dimension with stride 1");
67
+
68
+ SymDimVector res(dim - 1);
69
+ for (const auto i : c10::irange(res.size())) {
70
+ TORCH_CHECK(oldstride[i] % 2 == 0, "Tensor must have a stride divisible by 2 for all but last dimension");
71
+ res[i] = oldstride[i] / 2;
72
+ }
73
+ return res;
74
+ }
75
+
76
+ // expects as input a float or double tensor with last dimension of size 2
77
+ // and returns back a tensor with corresponding complex dtype
78
+ Tensor view_as_complex(const Tensor& self) {
79
+ TORCH_CHECK(
80
+ self.scalar_type() == kFloat || self.scalar_type() == kDouble || self.scalar_type() == kHalf,
81
+ "view_as_complex is only supported for half, float and double tensors, but got a tensor of scalar type: ", self.scalar_type());
82
+
83
+ auto old_sizes = self.sym_sizes();
84
+ TORCH_CHECK(!old_sizes.empty(), "Input tensor must have one or more dimensions");
85
+ TORCH_CHECK(old_sizes[old_sizes.size()-1] == 2, "Tensor must have a last dimension of size 2");
86
+ SymDimVector new_sizes(old_sizes.begin(), old_sizes.end() - 1);
87
+
88
+ const auto new_strides = computeStrideForViewAsComplex(self.sym_strides());
89
+ const auto complex_type = c10::toComplexType(self.scalar_type());
90
+
91
+ TORCH_CHECK(self.sym_storage_offset() % 2 == 0, "Tensor must have a storage_offset divisible by 2");
92
+ const auto new_storage_offset = self.sym_storage_offset() / 2;
93
+
94
+ return view_tensor(self, complex_type, new_storage_offset, new_sizes, new_strides);
95
+ }
96
+
97
+ } // namespace at::native
.venv/lib/python3.11/site-packages/torch/include/ATen/native/Distance.h ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #pragma once
2
+
3
+ #include <ATen/native/DispatchStub.h>
4
+
5
+ namespace at {
6
+ class Tensor;
7
+
8
+ namespace native {
9
+
10
+ using pdist_forward_fn = void(*)(Tensor&, const Tensor&, const double p);
11
+ using pdist_backward_fn = void(*)(Tensor&, const Tensor&, const Tensor&, const double p, const Tensor&);
12
+ using cdist_fn = void(*)(Tensor&, const Tensor&, const Tensor&, const double p);
13
+ using cdist_backward_fn = void(*)(Tensor&, const Tensor&, const Tensor&, const Tensor&, const double p, const Tensor&);
14
+
15
+ DECLARE_DISPATCH(pdist_forward_fn, pdist_forward_stub);
16
+ DECLARE_DISPATCH(pdist_backward_fn, pdist_backward_stub);
17
+ DECLARE_DISPATCH(cdist_fn, cdist_stub);
18
+ DECLARE_DISPATCH(cdist_backward_fn, cdist_backward_stub);
19
+
20
+ }} // namespace at::native
.venv/lib/python3.11/site-packages/torch/include/ATen/native/Fill.h ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // Functions that fill Tensors with constants. Implementations are in Fill.cpp.
2
+
3
+ #pragma once
4
+
5
+ #include <ATen/native/DispatchStub.h>
6
+
7
+ namespace c10 {
8
+ class Scalar;
9
+ }
10
+
11
+ namespace at {
12
+ class Tensor;
13
+ struct TensorIterator;
14
+
15
+ namespace native {
16
+
17
+ DECLARE_DISPATCH(void(*)(TensorIterator&, const c10::Scalar&), fill_stub);
18
+
19
+ Tensor& fill_out(Tensor& self, const Scalar& value);
20
+
21
+ }} // namespace at::native
.venv/lib/python3.11/site-packages/torch/include/ATen/native/FractionalMaxPooling.h ADDED
@@ -0,0 +1,80 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #pragma once
2
+ #include <ATen/core/Tensor.h>
3
+ #include <ATen/TensorUtils.h>
4
+ #include <c10/util/irange.h>
5
+
6
+ namespace at::native {
7
+
8
+ template<typename scalar_t>
9
+ inline std::vector<int64_t> generate_intervals(
10
+ scalar_t sample,
11
+ int64_t inputSize,
12
+ int64_t outputSize,
13
+ int64_t poolSize) {
14
+ std::vector<int64_t> sequence(outputSize);
15
+ if (outputSize > 1) {
16
+ scalar_t alpha = static_cast<scalar_t>(inputSize - poolSize) /
17
+ static_cast<scalar_t>(outputSize - 1);
18
+
19
+ for (const auto i : c10::irange(outputSize - 1)) {
20
+ sequence[i] =
21
+ static_cast<int>((i + sample) * alpha) - static_cast<int>(sample * alpha);
22
+ }
23
+ }
24
+ if (outputSize > 0) {
25
+ sequence[outputSize - 1] = inputSize - poolSize;
26
+ }
27
+ return sequence;
28
+ }
29
+
30
+ template <int64_t ndim>
31
+ inline void fractional_max_pool_check_shape(
32
+ const Tensor& input,
33
+ const Tensor& randomSamples) {
34
+
35
+ TORCH_CHECK(
36
+ input.scalar_type() == randomSamples.scalar_type(),
37
+ "Expect _random_samples to have the same dtype as input");
38
+
39
+ int64_t ndimension = randomSamples.ndimension();
40
+ TORCH_CHECK(
41
+ ndimension == 3,
42
+ "Expect _random_samples to have 3 dimensions, got ", ndimension);
43
+
44
+ int64_t N = randomSamples.size(0);
45
+ int64_t C = randomSamples.size(1);
46
+ int64_t D = randomSamples.size(2);
47
+
48
+ int64_t input_batch = 0, input_channel = 0;
49
+ if (ndim == 2) {
50
+ // fractional_max_pool2d
51
+ if (input.ndimension() == 3) {
52
+ input_batch = 1;
53
+ input_channel = input.size(0);
54
+ } else {
55
+ input_batch = input.size(0);
56
+ input_channel = input.size(1);
57
+ }
58
+ } else {
59
+ // factional_max_pool3d
60
+ if (input.ndimension() == 4) {
61
+ input_batch = 1;
62
+ input_channel = input.size(0);
63
+ } else {
64
+ input_batch = input.size(0);
65
+ input_channel = input.size(1);
66
+ }
67
+ }
68
+
69
+ TORCH_CHECK(
70
+ N >= input_batch,
71
+ "Expect _random_samples.size(0) no less then input batch size.");
72
+ TORCH_CHECK(
73
+ C == input_channel,
74
+ "Expect _random_samples.size(1) equals to input channel size.");
75
+ TORCH_CHECK(
76
+ D == ndim,
77
+ "Expect _random_samples.size(2) equals to ", ndim, "; got ", D, ".");
78
+ }
79
+
80
+ } // namespace at::native
.venv/lib/python3.11/site-packages/torch/include/ATen/native/FunctionOfAMatrixUtils.h ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #pragma once
2
+
3
+ #include <ATen/native/DispatchStub.h>
4
+ #include <cstdint>
5
+
6
+ namespace at {
7
+ struct TensorIterator;
8
+
9
+ namespace native {
10
+
11
+ using _compute_linear_combination_fn = void(*)(
12
+ TensorIterator& iter,
13
+ int64_t in_stride,
14
+ int64_t coeff_stride,
15
+ int64_t num_summations
16
+ );
17
+
18
+ DECLARE_DISPATCH(_compute_linear_combination_fn, _compute_linear_combination_stub);
19
+
20
+ }} // namespace at::native
.venv/lib/python3.11/site-packages/torch/include/ATen/native/GridSampler.h ADDED
@@ -0,0 +1,298 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #pragma once
2
+
3
+ #include <algorithm>
4
+ #include <cmath>
5
+ #include <cstdint>
6
+ #include <utility>
7
+
8
+ #include <ATen/native/GridSamplerUtils.h>
9
+
10
+ namespace at::native {
11
+
12
+ using detail::GridSamplerInterpolation;
13
+ using detail::GridSamplerPadding;
14
+
15
// Maps a normalized grid coordinate in [-1, 1] onto pixel-index space,
// viewing each pixel as the unit area between (idx - 0.5) and (idx + 0.5).
//   align_corners == true : -1 / +1 map to the centers of the corner pixels,
//                           i.e. 0 and (size - 1); slope (size - 1) / 2.
//   align_corners == false: -1 / +1 map to the image edges, i.e. -0.5 and
//                           (size - 0.5); slope size / 2.
template <typename scalar_t>
static inline scalar_t grid_sampler_unnormalize(scalar_t coord, int64_t size,
                                                bool align_corners) {
  if (!align_corners) {
    // [-1, 1] -> [-0.5, size - 0.5]
    return ((coord + 1) * size - 1) / 2;
  }
  // [-1, 1] -> [0, size - 1]
  return ((coord + 1) / 2) * (size - 1);
}
36
+
37
// Same mapping as grid_sampler_unnormalize, but additionally reports the
// (constant) derivative `d output / d coord` through `grad_in`.
// Used by the grid_sampler backward pass.
template <typename scalar_t>
static inline scalar_t grid_sampler_unnormalize_set_grad(scalar_t coord, int64_t size,
                                                         bool align_corners, scalar_t *grad_in) {
  if (!align_corners) {
    // [-1, 1] -> [-0.5, size - 0.5]; slope is size / 2
    *grad_in = static_cast<scalar_t>(size) / 2;
    return ((coord + 1) * size - 1) / 2;
  }
  // [-1, 1] -> [0, size - 1]; slope is (size - 1) / 2
  *grad_in = static_cast<scalar_t>(size - 1) / 2;
  return ((coord + 1) / 2) * (size - 1);
}
54
+
55
// Clamps a coordinate into the valid pixel range [0, clip_limit - 1].
template<typename scalar_t>
static inline scalar_t clip_coordinates(scalar_t in, int64_t clip_limit) {
  const scalar_t lowest = static_cast<scalar_t>(0);
  const scalar_t highest = static_cast<scalar_t>(clip_limit - 1);
  // Keep the exact min/max argument order of the reference implementation so
  // non-finite inputs propagate identically through the comparisons.
  return std::min(highest, std::max(in, lowest));
}
60
+
61
// Clamp plus derivative: behaves like clip_coordinates but also writes
// `d output / d input` to `grad_in`.  Used by the grid_sampler backward pass.
template<typename scalar_t>
static inline scalar_t clip_coordinates_set_grad(scalar_t in, int64_t clip_limit,
                                                 scalar_t *grad_in) {
  // For the gradient computation it is important that the borders themselves
  // count as out of bounds (hence <= and >=), giving them zero gradient.
  if (in <= static_cast<scalar_t>(0)) {
    *grad_in = static_cast<scalar_t>(0);
    return static_cast<scalar_t>(0);
  }
  const scalar_t upper = static_cast<scalar_t>(clip_limit - 1);
  if (in >= upper) {
    *grad_in = static_cast<scalar_t>(0);
    return upper;
  }
  *grad_in = static_cast<scalar_t>(1);
  return in;
}
83
+
84
// Reflects a coordinate off the interval [twice_low / 2, twice_high / 2]
// until it falls inside (bounds inclusive).  The bounds arrive doubled so
// that half-integer borders can be expressed with integer arguments.
template<typename scalar_t>
static inline scalar_t reflect_coordinates(scalar_t in, int64_t twice_low,
                                           int64_t twice_high) {
  if (twice_low == twice_high) {
    // Degenerate zero-width interval: everything collapses onto 0.
    return static_cast<scalar_t>(0);
  }
  const scalar_t low = static_cast<scalar_t>(twice_low) / 2;
  const scalar_t span = static_cast<scalar_t>(twice_high - twice_low) / 2;
  in = std::fabs(in - low);
  // `fmod` returns the same sign as `in`, which is non-negative after fabs.
  const scalar_t rem = std::fmod(in, span);
  const int num_flips = static_cast<int>(std::floor(in / span));
  // An even number of flips lands on the forward pass, odd on the mirrored one.
  return (num_flips % 2 == 0) ? (rem + low) : (span - rem + low);
}
105
+
106
// Reflection plus derivative: like reflect_coordinates, but also writes
// `d output / d input` to `grad_in` (+1 or -1 per reflection parity, 0 for a
// zero-width interval).  Used by the grid_sampler backward pass.
template<typename scalar_t>
static inline scalar_t reflect_coordinates_set_grad(scalar_t in, int64_t twice_low,
                                                    int64_t twice_high, scalar_t *grad_in) {
  if (twice_low == twice_high) {
    *grad_in = static_cast<scalar_t>(0);
    return static_cast<scalar_t>(0);
  }
  const scalar_t low = static_cast<scalar_t>(twice_low) / 2;
  const scalar_t span = static_cast<scalar_t>(twice_high - twice_low) / 2;
  int sign = 1;  // remembers the absolute-value branch so the slope keeps its sign
  in = in - low;
  if (in < static_cast<scalar_t>(0)) {
    sign = -1;
    in = -in;
  }
  // `fmod` returns the same sign as `in`, which is non-negative here.
  const scalar_t rem = std::fmod(in, span);
  const int num_flips = static_cast<int>(std::floor(in / span));
  if (num_flips % 2 == 0) {
    *grad_in = static_cast<scalar_t>(sign);
    return rem + low;
  }
  *grad_in = static_cast<scalar_t>(-sign);
  return span - rem + low;
}
138
+
139
+ // Mapping the out-of-boundary points back into boundary
140
+ // This would only affect padding_mode=border or reflection
141
+ template<typename scalar_t>
142
+ static inline scalar_t compute_coordinates(scalar_t coord, int64_t size,
143
+ GridSamplerPadding padding_mode,
144
+ bool align_corners) {
145
+ if (padding_mode == GridSamplerPadding::Border) {
146
+ // clip coordinates to image borders
147
+ coord = clip_coordinates(coord, size);
148
+ } else if (padding_mode == GridSamplerPadding::Reflection) {
149
+ // reflect coordinates by image borders
150
+ if (align_corners) {
151
+ coord = reflect_coordinates(coord, 0, 2*(size - 1));
152
+ } else {
153
+ coord = reflect_coordinates(coord, -1, 2*size - 1);
154
+ }
155
+ // clip coordinates to image borders
156
+ coord = clip_coordinates(coord, size);
157
+ }
158
+ return coord;
159
+ }
160
+
161
+ // Computes the pixel source index value for a grid coordinate
162
+ template <typename scalar_t>
163
+ static inline scalar_t grid_sampler_compute_source_index(
164
+ scalar_t coord,
165
+ int64_t size,
166
+ GridSamplerPadding padding_mode,
167
+ bool align_corners) {
168
+ coord = grid_sampler_unnormalize(coord, size, align_corners);
169
+ coord = compute_coordinates(coord, size, padding_mode, align_corners);
170
+ return coord;
171
+ }
172
+
173
+ // grid_sampler_compute_source_index_set_grad works similarly to
174
+ // grid_sampler_compute_source_index except that it also returns the
175
+ // `d output / d input` via pointer argument `grad_in`.
176
+ // This is useful in the backward pass of grid_sampler.
177
+ template <typename scalar_t>
178
+ static inline scalar_t grid_sampler_compute_source_index_set_grad(
179
+ scalar_t coord,
180
+ int64_t size,
181
+ GridSamplerPadding padding_mode,
182
+ bool align_corners,
183
+ scalar_t *grad_in) {
184
+ scalar_t grad_clip, grad_refl;
185
+ coord = grid_sampler_unnormalize_set_grad(coord, size, align_corners, grad_in);
186
+ if (padding_mode == GridSamplerPadding::Border) {
187
+ // clip coordinates to image borders
188
+ coord = clip_coordinates_set_grad(coord, size, &grad_clip);
189
+ *grad_in = (*grad_in) * grad_clip;
190
+ } else if (padding_mode == GridSamplerPadding::Reflection) {
191
+ // reflect coordinates by image borders
192
+ if (align_corners) {
193
+ coord = reflect_coordinates_set_grad(coord, 0, 2*(size - 1), &grad_refl);
194
+ } else {
195
+ coord = reflect_coordinates_set_grad(coord, -1, 2*size - 1, &grad_refl);
196
+ }
197
+ // clip coordinates to image borders
198
+ coord = clip_coordinates_set_grad(coord, size, &grad_clip);
199
+ *grad_in = (*grad_in) * grad_refl * grad_clip;
200
+ }
201
+ return coord;
202
+ }
203
+
204
// True iff (h, w) indexes inside an H x W image.
static inline bool within_bounds_2d(int64_t h, int64_t w, int64_t H, int64_t W) {
  return 0 <= h && h < H && 0 <= w && w < W;
}
207
+
208
// True iff (d, h, w) indexes inside a D x H x W volume.
static inline bool within_bounds_3d(int64_t d, int64_t h, int64_t w, int64_t D, int64_t H, int64_t W) {
  return 0 <= d && d < D && 0 <= h && h < H && 0 <= w && w < W;
}
211
+
212
+ template<typename scalar_t>
213
+ static inline scalar_t get_value_bounded(
214
+ const scalar_t* data,
215
+ scalar_t x,
216
+ scalar_t y,
217
+ int64_t W,
218
+ int64_t H,
219
+ int64_t sW,
220
+ int64_t sH,
221
+ GridSamplerPadding padding_mode,
222
+ bool align_corners) {
223
+
224
+ x = compute_coordinates(x, W, padding_mode, align_corners);
225
+ y = compute_coordinates(y, H, padding_mode, align_corners);
226
+
227
+ int64_t ix = static_cast<int64_t>(x);
228
+ int64_t iy = static_cast<int64_t>(y);
229
+
230
+ if (within_bounds_2d(iy, ix, H, W)) {
231
+ return data[iy * sH + ix * sW];
232
+ }
233
+ return static_cast<scalar_t>(0);
234
+ }
235
+
236
// Accumulates `delta` into data[h * sH + w * sW], silently dropping writes
// whose (h, w) falls outside the H x W bounds.
template<typename scalar_t>
static inline void safe_add_2d(scalar_t *data, int64_t h, int64_t w,
                               int64_t sH, int64_t sW, int64_t H, int64_t W,
                               scalar_t delta) {
  // same predicate as within_bounds_2d, inlined
  const bool in_bounds = h >= 0 && h < H && w >= 0 && w < W;
  if (in_bounds) {
    data[h * sH + w * sW] += delta;
  }
}
244
+
245
// Accumulates `delta` into data[d * sD + h * sH + w * sW], silently dropping
// writes whose (d, h, w) falls outside the D x H x W bounds.
template<typename scalar_t>
static inline void safe_add_3d(scalar_t *data, int64_t d, int64_t h, int64_t w,
                               int64_t sD, int64_t sH, int64_t sW,
                               int64_t D, int64_t H, int64_t W,
                               scalar_t delta) {
  // same predicate as within_bounds_3d, inlined
  const bool in_bounds =
      d >= 0 && d < D && h >= 0 && h < H && w >= 0 && w < W;
  if (in_bounds) {
    data[d * sD + h * sH + w * sW] += delta;
  }
}
254
+
255
+ template<typename scalar_t>
256
+ static inline void add_value_bounded(
257
+ scalar_t* data,
258
+ scalar_t x,
259
+ scalar_t y,
260
+ int64_t W,
261
+ int64_t H,
262
+ int64_t sW,
263
+ int64_t sH,
264
+ scalar_t delta,
265
+ GridSamplerPadding padding_mode,
266
+ bool align_corners) {
267
+
268
+ x = compute_coordinates(x, W, padding_mode, align_corners);
269
+ y = compute_coordinates(y, H, padding_mode, align_corners);
270
+
271
+ int64_t ix = static_cast<int64_t>(x);
272
+ int64_t iy = static_cast<int64_t>(y);
273
+
274
+ safe_add_2d(data, iy, ix, sH, sW, H, W, delta);
275
+ }
276
+
277
// Differential of the cubic-convolution interpolation weights, i.e.
// `d coeff / d x` for each of the four taps.
// Must stay in sync with the forward computation in
// aten/src/ATen/native/UpSample.h:get_cubic_upsample_coefficients.
template<typename scalar_t>
static inline void get_cubic_coefficients_grad(
    scalar_t coeffs[4],
    scalar_t t) {
  const scalar_t A = -0.75;  // standard cubic-convolution constant

  const scalar_t x0 = -1 - t;  // outer-left tap: 1 < |x| < 2
  coeffs[0] = (-3 * A * x0 - 10 * A ) * x0 - 8 * A;
  const scalar_t x1 = -t;      // inner-left tap: |x| <= 1
  coeffs[1] = (-3 * (A + 2) * x1 - 2 * (A + 3)) * x1;
  const scalar_t x2 = 1 - t;   // inner-right tap: |x| <= 1
  coeffs[2] = (3 * (A + 2) * x2 - 2 * (A + 3)) * x2;
  const scalar_t x3 = 2 - t;   // outer-right tap: 1 < |x| < 2
  coeffs[3] = (3 * A * x3 - 10 * A) * x3 + 8 * A;
}
297
+
298
+ } // namespace at::native
.venv/lib/python3.11/site-packages/torch/include/ATen/native/LossMulti.h ADDED
@@ -0,0 +1,69 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #pragma once
2
+ #include <ATen/core/Tensor.h>
3
+ #include <ATen/AccumulateType.h>
4
+ #include <ATen/Dispatch.h>
5
+ #include <ATen/TensorUtils.h>
6
+
7
+ namespace at::native {
8
+ inline void multilabel_margin_loss_shape_check(
9
+ int64_t& nframe,
10
+ int64_t& dim,
11
+ const int64_t& ndims,
12
+ const Tensor& input,
13
+ const Tensor& target) {
14
+ TORCH_CHECK(
15
+ (ndims == 2 && input.size(1) != 0) || (ndims == 1 && input.size(0) != 0) || ndims == 0,
16
+ "Expected non-empty vector or matrix with optional 0-dim batch size, but got: ",
17
+ input.sizes());
18
+
19
+ if (ndims <= 1) {
20
+ nframe = 1;
21
+ dim = ndims == 0 ? 1 : input.size(0);
22
+ TORCH_CHECK(
23
+ target.dim() <= 1 && target.numel() == dim,
24
+ "inconsistent target size: ", target.sizes(), " for input of size: ",
25
+ input.sizes());
26
+ } else {
27
+ nframe = input.size(0);
28
+ dim = input.size(1);
29
+ TORCH_CHECK(
30
+ target.dim() == 2 && target.size(0) == nframe &&
31
+ target.size(1) == dim,
32
+ "inconsistent target size: ", target.sizes(), " for input of size: ",
33
+ input.sizes());
34
+ }
35
+ }
36
+
37
+ inline void multi_margin_loss_shape_check(
38
+ int64_t& nframe,
39
+ int64_t& dim,
40
+ const int64_t& ndims,
41
+ const Tensor& input,
42
+ const Tensor& target,
43
+ const std::optional<Tensor>& weight) {
44
+ TORCH_CHECK(
45
+ (ndims == 2 && input.size(1) != 0) || (ndims == 1 && input.size(0) != 0) || ndims == 0,
46
+ "Expected non-empty vector or matrix with optional 0-dim batch size, but got: ",
47
+ input.sizes());
48
+
49
+ if (ndims <= 1) {
50
+ nframe = 1;
51
+ dim = ndims == 0 ? 1 : input.size(0);
52
+ } else {
53
+ nframe = input.size(0);
54
+ dim = input.size(1);
55
+ }
56
+
57
+ TORCH_CHECK(
58
+ target.dim() <= 1 && target.numel() == nframe,
59
+ "inconsistent target size, expected ", nframe, " but got ",
60
+ target.sizes());
61
+ if (weight && weight->defined()) {
62
+ TORCH_CHECK(
63
+ weight->dim() <= 1 && weight->numel() == dim,
64
+ "inconsistent weight size, expected ", dim, " but got ",
65
+ weight->sizes());
66
+ }
67
+ }
68
+
69
+ } // namespace at::native
.venv/lib/python3.11/site-packages/torch/include/ATen/native/Math.h ADDED
The diff for this file is too large to render. See raw diff
 
.venv/lib/python3.11/site-packages/torch/include/ATen/native/MathBitFallThroughLists.h ADDED
@@ -0,0 +1,71 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #pragma once
2
+
3
namespace at {
// Ops that merely create views (plus their in-place variants).  Math-bit
// fallback kernels (conjugate/negative) register these as fallthrough so the
// bit propagates to the view instead of being materialized.
#define TORCH_VIEW_FNS(m) \
  m.impl("as_strided_", torch::CppFunction::makeFallthrough()); \
  m.impl("detach", torch::CppFunction::makeFallthrough()); \
  m.impl("detach_", torch::CppFunction::makeFallthrough()); \
  m.impl("diagonal", torch::CppFunction::makeFallthrough()); \
  m.impl("expand", torch::CppFunction::makeFallthrough()); \
  m.impl("expand_as", torch::CppFunction::makeFallthrough()); \
  m.impl("movedim.int", torch::CppFunction::makeFallthrough()); \
  m.impl("movedim.intlist", torch::CppFunction::makeFallthrough()); \
  m.impl("narrow", torch::CppFunction::makeFallthrough()); \
  m.impl("permute", torch::CppFunction::makeFallthrough()); \
  m.impl("select.Dimname", torch::CppFunction::makeFallthrough()); \
  m.impl("select.int", torch::CppFunction::makeFallthrough()); \
  m.impl("squeeze", torch::CppFunction::makeFallthrough()); \
  m.impl("squeeze_", torch::CppFunction::makeFallthrough()); \
  m.impl("transpose.int", torch::CppFunction::makeFallthrough()); \
  m.impl("transpose.Dimname", torch::CppFunction::makeFallthrough()); \
  m.impl("transpose_", torch::CppFunction::makeFallthrough()); \
  m.impl("t", torch::CppFunction::makeFallthrough()); \
  m.impl("t_", torch::CppFunction::makeFallthrough()); \
  m.impl("real", torch::CppFunction::makeFallthrough()); \
  m.impl("imag", torch::CppFunction::makeFallthrough()); \
  m.impl("view_as_real", torch::CppFunction::makeFallthrough()); \
  m.impl("unflatten.int", torch::CppFunction::makeFallthrough()); \
  m.impl("unflatten.Dimname", torch::CppFunction::makeFallthrough()); \
  m.impl("unfold", torch::CppFunction::makeFallthrough()); \
  m.impl("unsqueeze", torch::CppFunction::makeFallthrough()); \
  m.impl("unsqueeze_", torch::CppFunction::makeFallthrough()); \
  m.impl("view_as", torch::CppFunction::makeFallthrough()); \
  m.impl("unbind.int", torch::CppFunction::makeFallthrough()); \
  m.impl("unbind.Dimname", torch::CppFunction::makeFallthrough()); \
  m.impl("split.Tensor", torch::CppFunction::makeFallthrough()); \
  m.impl("split_with_sizes", torch::CppFunction::makeFallthrough()); \
  m.impl("swapaxes", torch::CppFunction::makeFallthrough()); \
  m.impl("swapdims", torch::CppFunction::makeFallthrough()); \
  m.impl("chunk", torch::CppFunction::makeFallthrough()); \
  m.impl("reshape", torch::CppFunction::makeFallthrough()); \
  m.impl("alias", torch::CppFunction::makeFallthrough()); \
  m.impl("hsplit.int", torch::CppFunction::makeFallthrough()); \
  m.impl("hsplit.array", torch::CppFunction::makeFallthrough()); \
  m.impl("dsplit.int", torch::CppFunction::makeFallthrough()); \
  m.impl("dsplit.array", torch::CppFunction::makeFallthrough()); \
  m.impl("vsplit.int", torch::CppFunction::makeFallthrough()); \
  m.impl("vsplit.array", torch::CppFunction::makeFallthrough()); \
  m.impl("conj", torch::CppFunction::makeFallthrough()); \
  m.impl("_conj", torch::CppFunction::makeFallthrough()); \
  m.impl("_unsafe_view", torch::CppFunction::makeFallthrough()); \
  m.impl("resize_", torch::CppFunction::makeFallthrough());

// Shape/metadata queries and constructors that never need the math bit
// resolved, so they can fall through as well.
#define TENSOR_UTILITIES_AND_CONSTRUCTORS(m) \
  m.impl("empty_like", torch::CppFunction::makeFallthrough()); \
  m.impl("empty.memory_format", torch::CppFunction::makeFallthrough()); \
  m.impl("empty.out", torch::CppFunction::makeFallthrough()); \
  m.impl("empty_strided", torch::CppFunction::makeFallthrough()); \
  m.impl("full_like", torch::CppFunction::makeFallthrough()); \
  m.impl("stride.int", torch::CppFunction::makeFallthrough()); \
  m.impl("stride.Dimname", torch::CppFunction::makeFallthrough()); \
  m.impl("size.int", torch::CppFunction::makeFallthrough()); \
  m.impl("size.Dimname", torch::CppFunction::makeFallthrough()); \
  m.impl("is_complex", torch::CppFunction::makeFallthrough()); \
  m.impl("is_floating_point", torch::CppFunction::makeFallthrough()); \
  m.impl("requires_grad_", torch::CppFunction::makeFallthrough());
} // namespace at

// View ops whose registrations live with the native functions; kept in a
// separate list (outside the at namespace, matching the original layout).
#define TORCH_VIEW_FNS_NATIVE_FN_REGISTRATION(m) \
  m.impl("as_strided", torch::CppFunction::makeFallthrough()); \
  m.impl("view", torch::CppFunction::makeFallthrough());
.venv/lib/python3.11/site-packages/torch/include/ATen/native/NonEmptyUtils.h ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #include <ATen/core/TensorBase.h>
2
+ #include <algorithm>
3
+ #include <vector>
4
+
5
+ namespace at::native {
6
+
7
// Treats a 0 (or negative) dimension count as 1 so downstream loops always
// have at least one dimension to walk over.
inline int64_t ensure_nonempty_dim(int64_t dim) {
  return dim < 1 ? 1 : dim;
}
10
+
11
+ inline int64_t ensure_nonempty_size(const TensorBase &t, int64_t dim) {
12
+ return t.dim() == 0 ? 1 : t.size(dim);
13
+ }
14
+
15
+ inline int64_t ensure_nonempty_stride(const TensorBase &t, int64_t dim) {
16
+ return t.dim() == 0 ? 1 : t.stride(dim);
17
+ }
18
+
19
using IdxVec = std::vector<int64_t>;
// Returns `vec` unchanged unless it is empty, in which case it is replaced
// by the single-entry shape {1} (a fake non-empty scalar).
inline IdxVec ensure_nonempty_vec(IdxVec vec) {
  if (vec.empty()) {
    vec.assign(1, 1);
  }
  return vec;
}
26
+
27
+ } // namespace at::native
.venv/lib/python3.11/site-packages/torch/include/ATen/native/Normalization.h ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #pragma once
2
+
3
+ #include <ATen/TensorIterator.h>
4
+ #include <ATen/native/DispatchStub.h>
5
+
6
+ namespace at::native {
7
+
8
+ using renorm_scale_factor_fn = void (*) (TensorIteratorBase& iter, double maxnorm);
9
+ DECLARE_DISPATCH(renorm_scale_factor_fn, renorm_scale_factor_stub);
10
+
11
+ enum class BatchNormBackend {
12
+ Native,
13
+ Cudnn,
14
+ Miopen,
15
+ };
16
+
17
+ TORCH_API BatchNormBackend _select_batch_norm_backend(const Tensor& input, const Tensor& weight, const Tensor& bias, const Tensor& running_mean, const Tensor& running_var, bool training, double eps);
18
+
19
+ } // namespace at::native