Add files using upload-large-folder tool

c1af2fa verified 3 months ago

4.14 kB

	#pragma once

	#include <assert.h>
	#if defined(__CUDA_ARCH__) \|\| defined(__HIP_DEVICE_COMPILE__)
	#include <cuda_runtime.h>
	#endif

	namespace at::cuda::detail {

	// A utility class to implement integer division by multiplication, given a fixed
	// divisor.
	//
	// WARNING: The fast divider algorithm is only implemented for unsigned int;
	// otherwise we default to plain integer division. For unsigned int,
	// we further assume that the dividend is at most INT32_MAX. Thus,
	// IntDivider must NOT be used for general integer division.
	//
	// This reduced range is enough for our purpose, and it allows us to
	// slightly simplify the computation.
	//
	// (NOTE: Below, "2^k" denotes exponentiation, i.e., 1<<k.)
	//
	// For any N-bit unsigned integer d (> 0), we can find a "magic number" m (2^N
	// <= m < 2^(N+1)) and shift s such that:
	//
	// \floor(n / d) = \floor((m * n) / 2^(N+s)).
	//
	// Given such m and s, the integer division can be then implemented as:
	//
	// let m' = m - 2^N // 0 <= m' < 2^N
	//
	// fast_integer_division(n):
	// // Multiply two N-bit unsigned integers: the result is a 2N-bit unsigned
	// // integer. Then take the higher N bits.
	// t = (m' * n) >> N
	//
	// // Here we use the fact that n is less than 2^(N-1): otherwise the value
	// // of (t + n) may not fit in an N-bit integer.
	// return (t + n) >> s
	//
	// Finding such a magic number is surprisingly easy:
	//
	// s = \ceil(\log_2 d)
	// m' = \floor(2^N * (2^s - d) / d) + 1 // Need 2N-bit integer arithmetic.
	//
	// See also:
	// - Division by Invariant Integers Using Multiplication,
	// Torbjörn Granlund and Peter L. Montgomery, 1994.
	//
	// - http://www.hackersdelight.org/magic.htm
	//
	// - http://ridiculousfish.com/blog/posts/labor-of-division-episode-i.html

	// Result of div/mod operation stored together.
	template <typename Value>
	struct DivMod {
	Value div, mod;

	C10_HOST_DEVICE DivMod(Value div, Value mod) : div(div), mod(mod) { }
	};

	// Base case: we only have an implementation for uint32_t for now. For
	// everything else, we use plain division.
	template <typename Value>
	struct IntDivider {
	IntDivider() = default;
	IntDivider(Value d) : divisor(d) { }

	C10_HOST_DEVICE inline Value div(Value n) const { return n / divisor; }
	C10_HOST_DEVICE inline Value mod(Value n) const { return n % divisor; }
	C10_HOST_DEVICE inline DivMod<Value> divmod(Value n) const {
	return DivMod<Value>(n / divisor, n % divisor);
	}

	Value divisor;
	};

	// Implement fast integer division.
	template <>
	struct IntDivider<unsigned int> {
	static_assert(sizeof(unsigned int) == 4, "Assumes 32-bit unsigned int.");

	IntDivider() = default;

	IntDivider(unsigned int d) : divisor(d) {
	assert(divisor >= 1 && divisor <= INT32_MAX);

	// TODO: gcc/clang has __builtin_clz() but it's not portable.
	for (shift = 0; shift < 32; shift++) if ((1U << shift) >= divisor) break;

	uint64_t one = 1;
	uint64_t magic = ((one << 32) * ((one << shift) - divisor)) / divisor + 1;
	m1 = magic;
	assert(m1 > 0 && m1 == magic); // m1 must fit in 32 bits.
	}

	C10_HOST_DEVICE inline unsigned int div(unsigned int n) const {
	#if defined(__CUDA_ARCH__) \|\| defined(__HIP_DEVICE_COMPILE__)
	// 't' is the higher 32-bits of unsigned 32-bit multiplication of 'n' and
	// 'm1'.
	unsigned int t = __umulhi(n, m1);
	return (t + n) >> shift;
	#else
	// Using uint64_t so that the addition does not overflow.
	uint64_t t = ((uint64_t) n * m1) >> 32;
	return (t + n) >> shift;
	#endif
	}

	C10_HOST_DEVICE inline unsigned int mod(unsigned int n) const {
	return n - div(n) * divisor;
	}

	C10_HOST_DEVICE inline DivMod<unsigned int> divmod(unsigned int n) const {
	unsigned int q = div(n);
	return DivMod<unsigned int>(q, n - q * divisor);
	}

	unsigned int divisor; // d above.
	unsigned int m1; // Magic number: m' above.
	unsigned int shift; // Shift amounts.
	};

	} // namespace at::cuda::detail