| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| |
|
| | |
| | |
| | |
| | |
| |
|
| | #include <tiny-cuda-nn/object.h> |
| |
|
| | #include <tiny-cuda-nn/common.h> |
| | #include <tiny-cuda-nn/common_device.h> |
| |
|
| | TCNN_NAMESPACE_BEGIN |
| |
|
// Fills `out` with a repeating one-hot pattern.
//
// Each thread handles one flat index computed from the x components of the
// block/thread indices. For every index below `num_elements`, the position
// inside its group of `width` consecutive entries (index % width) is compared
// with `one_hot_dim`: the matching entry receives `scale` converted to T, all
// other entries receive zero.
//
// num_elements: number of entries of `out` to write.
// width:        period of the pattern; used as a modulus, so it must be non-zero.
// one_hot_dim:  position within each period that receives `scale`.
// out:          destination buffer with at least `num_elements` entries.
// scale:        value written at the hot position (cast to T).
template <typename T>
__global__ void one_hot_batched_kernel(const uint32_t num_elements, const uint32_t width, const uint32_t one_hot_dim, T* out, float scale) {
	const uint32_t idx = blockIdx.x * blockDim.x + threadIdx.x;

	// Threads past the end of the buffer have nothing to do.
	if (idx < num_elements) {
		const bool is_hot = (idx % width) == one_hot_dim;
		out[idx] = is_hot ? (T)scale : (T)0.0f;
	}
}
| |
|
// Host-side launcher for one_hot_batched_kernel<T>.
//
// Forwards `num_elements`, `width`, `one_hot_dim`, `out` and `scale` to the
// kernel through linear_kernel on the given `stream`.
// NOTE(review): the literal 0 handed to linear_kernel is presumably the
// dynamic shared-memory size — confirm against linear_kernel's declaration.
template <typename T>
void one_hot_batched(cudaStream_t stream, const uint32_t num_elements, const uint32_t width, const uint32_t one_hot_dim, T* out, float scale) {
	auto kernel = one_hot_batched_kernel<T>;
	linear_kernel(
		kernel,
		0,
		stream,
		num_elements,
		width,
		one_hot_dim,
		out,
		scale
	);
}

// Explicit instantiations for the element types used with this launcher.
template void one_hot_batched<float>(
	cudaStream_t stream,
	const uint32_t num_elements,
	const uint32_t width,
	const uint32_t one_hot_dim,
	float* out,
	float scale
);
template void one_hot_batched<__half>(
	cudaStream_t stream,
	const uint32_t num_elements,
	const uint32_t width,
	const uint32_t one_hot_dim,
	__half* out,
	float scale
);
| |
|
// Host-side launcher for mult_scalar_kernel<T>.
//
// Forwards `num_elements`, `inout` and `factor` to the kernel through
// linear_kernel on the given `stream`.
// NOTE(review): mult_scalar_kernel is defined outside this file; it presumably
// scales each of the `num_elements` entries of `inout` in place by `factor` —
// confirm against its definition.
template <typename T>
void mult(cudaStream_t stream, const uint32_t num_elements, T* inout, float factor) {
	auto kernel = mult_scalar_kernel<T>;
	linear_kernel(
		kernel,
		0,
		stream,
		num_elements,
		inout,
		factor
	);
}

// Explicit instantiations for the element types used with this launcher.
template void mult<float>(
	cudaStream_t stream,
	const uint32_t num_elements,
	float* inout,
	float factor
);
template void mult<__half>(
	cudaStream_t stream,
	const uint32_t num_elements,
	__half* inout,
	float factor
);
| |
|
// Host-side launcher that converts `in` (element type T) into the float
// buffer `out`, choosing the kernel by `layout`.
//
// - Any layout other than RM: launches trim_and_cast<T>, forwarding
//   `input_width` and `output_width` together with the buffers.
// - layout == RM: launches cast_from<T> with only the buffers; `input_width`
//   and `output_width` are not used on this path.
//
// Both launches go through linear_kernel on `stream` with `num_elements`.
// NOTE(review): cast_from and trim_and_cast are defined outside this file;
// what "trim" means for the two widths should be confirmed there.
template <typename T>
void trim_and_cast_from(cudaStream_t stream, const MatrixLayout layout, const uint32_t num_elements, const uint32_t input_width, const uint32_t output_width, const T* in, float* out) {
	if (layout != RM) {
		linear_kernel(
			trim_and_cast<T>,
			0,
			stream,
			num_elements,
			input_width,
			output_width,
			in,
			out
		);
		return;
	}

	linear_kernel(cast_from<T>, 0, stream, num_elements, in, out);
}

// Explicit instantiations for the input element types used with this launcher.
template void trim_and_cast_from<float>(
	cudaStream_t stream,
	const MatrixLayout layout,
	const uint32_t num_elements,
	const uint32_t input_width,
	const uint32_t output_width,
	const float* in,
	float* out
);
template void trim_and_cast_from<__half>(
	cudaStream_t stream,
	const MatrixLayout layout,
	const uint32_t num_elements,
	const uint32_t input_width,
	const uint32_t output_width,
	const __half* in,
	float* out
);
| |
|
| | TCNN_NAMESPACE_END |
| |
|