|
|
#pragma once
|
|
|
|
|
|
#include <c10/macros/Macros.h>
|
|
|
#include <c10/util/floating_point_utils.h>
|
|
|
#include <cstring>
|
|
|
#include <limits>
|
|
|
|
|
|
|
|
|
// Scoped warning suppression for the remainder of this header: the current
// diagnostic state is pushed here and restored by the matching
// C10_CLANG_DIAGNOSTIC_POP() at the end of the file. The IGNORE is emitted
// only when C10_CLANG_HAS_WARNING reports that the compiler knows the
// "-Wimplicit-int-float-conversion" flag.
// NOTE(review): nothing in this header obviously performs an implicit
// int -> float conversion; confirm whether the suppression is still needed.
C10_CLANG_DIAGNOSTIC_PUSH()
|
|
|
#if C10_CLANG_HAS_WARNING("-Wimplicit-int-float-conversion")
|
|
|
C10_CLANG_DIAGNOSTIC_IGNORE("-Wimplicit-int-float-conversion")
|
|
|
#endif
|
|
|
|
|
|
namespace c10 {
|
|
|
|
|
|
|
|
|
|
|
|
/// Constructs a Float8_e8m0fnu from a single-precision float.
///
/// The stored byte `x` is whatever detail::fp8e8m0fnu_from_fp32_value returns
/// for `value`; rounding and special-value handling are defined by that
/// helper, not by this constructor.
inline C10_HOST_DEVICE Float8_e8m0fnu::Float8_e8m0fnu(float value)
    : x(c10::detail::fp8e8m0fnu_from_fp32_value(value)) {}
|
|
|
|
|
|
|
|
|
|
|
|
/// Converts the stored byte to a single-precision float.
///
/// In the general case the byte `x` is shifted into the fp32 exponent field
/// (bits 23..30) with the sign and mantissa bits left at zero. Two byte
/// values cannot be produced that way and are special-cased:
///   * x == 0    -> fp32 bits 0x00400000 (exponent field 0, mantissa bit 22
///                  set), instead of the +0.0f a plain shift would give.
///   * isnan()   -> fp32 bits 0x7f800001 (exponent field all ones, non-zero
///                  mantissa), i.e. an fp32 NaN.
inline C10_HOST_DEVICE Float8_e8m0fnu::operator float() const {
  uint32_t fp32_bits = 0;

  if (x == 0) {
    // A zero byte must not become 0.0f: emit the fp32 subnormal 2^-127.
    fp32_bits = 0x00400000;
  } else if (isnan()) {
    // NOTE(review): the mantissa MSB (bit 22) is clear in this pattern, which
    // IEEE 754-2008 treats as a signaling NaN, while numeric_limits in this
    // file advertises has_signaling_NaN = false — confirm this is intended.
    fp32_bits = 0x7f800001;
  } else {
    // Sign stays 0, the byte becomes the exponent field, mantissa stays 0.
    fp32_bits = x << 23;
  }

  return c10::detail::fp32_from_bits(fp32_bits);
}
|
|
|
|
|
|
|
|
|
|
|
|
/// Returns true iff the stored byte is the all-ones pattern (0xFF), which is
/// the byte numeric_limits<Float8_e8m0fnu>::quiet_NaN() is built from and the
/// one operator float() maps to an fp32 NaN.
inline C10_HOST_DEVICE bool Float8_e8m0fnu::isnan() const {
  return x == 0xFF;
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
namespace std {
|
|
|
|
|
|
template <>
|
|
|
class numeric_limits<c10::Float8_e8m0fnu> {
|
|
|
public:
|
|
|
static constexpr bool is_specialized = true;
|
|
|
static constexpr bool is_signed = false;
|
|
|
static constexpr bool is_integer = false;
|
|
|
static constexpr bool is_exact = false;
|
|
|
static constexpr bool has_infinity = false;
|
|
|
static constexpr bool has_quiet_NaN = true;
|
|
|
static constexpr bool has_signaling_NaN = false;
|
|
|
static constexpr auto has_denorm = false;
|
|
|
static constexpr auto has_denorm_loss = false;
|
|
|
static constexpr auto round_style = numeric_limits<float>::round_style;
|
|
|
static constexpr bool is_iec559 = false;
|
|
|
static constexpr bool is_bounded = true;
|
|
|
static constexpr bool is_modulo = false;
|
|
|
static constexpr int digits = 1;
|
|
|
static constexpr int digits10 = 0;
|
|
|
static constexpr int max_digits10 = 1;
|
|
|
static constexpr int radix = 2;
|
|
|
static constexpr int min_exponent = -126;
|
|
|
static constexpr int min_exponent10 = -38;
|
|
|
static constexpr int max_exponent = 128;
|
|
|
static constexpr int max_exponent10 = 38;
|
|
|
static constexpr auto traps = numeric_limits<float>::traps;
|
|
|
static constexpr auto tinyness_before = false;
|
|
|
|
|
|
static constexpr c10::Float8_e8m0fnu min() {
|
|
|
|
|
|
return c10::Float8_e8m0fnu(0b00000000, c10::Float8_e8m0fnu::from_bits());
|
|
|
}
|
|
|
static constexpr c10::Float8_e8m0fnu lowest() {
|
|
|
|
|
|
return c10::Float8_e8m0fnu(0b00000000, c10::Float8_e8m0fnu::from_bits());
|
|
|
}
|
|
|
static constexpr c10::Float8_e8m0fnu max() {
|
|
|
|
|
|
return c10::Float8_e8m0fnu(0b11111110, c10::Float8_e8m0fnu::from_bits());
|
|
|
}
|
|
|
static constexpr c10::Float8_e8m0fnu epsilon() {
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
return c10::Float8_e8m0fnu(0b01111111, c10::Float8_e8m0fnu::from_bits());
|
|
|
}
|
|
|
static constexpr c10::Float8_e8m0fnu round_error() {
|
|
|
|
|
|
return c10::Float8_e8m0fnu(0b01111110, c10::Float8_e8m0fnu::from_bits());
|
|
|
}
|
|
|
static constexpr c10::Float8_e8m0fnu quiet_NaN() {
|
|
|
return c10::Float8_e8m0fnu(0b11111111, c10::Float8_e8m0fnu::from_bits());
|
|
|
}
|
|
|
};
|
|
|
|
|
|
}
|
|
|
|
|
|
// Closes the diagnostic scope opened by C10_CLANG_DIAGNOSTIC_PUSH() near the
// top of this header, so the warning suppression does not leak to includers.
C10_CLANG_DIAGNOSTIC_POP()
|
|
|
|