File size: 4,344 Bytes
e36aeda | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 | // Copyright 2010 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#include "textflag.h"
// The method is based on a paper by Naoki Shibata: "Efficient evaluation
// methods of elementary functions suitable for SIMD computation", Proc.
// of International Supercomputing Conference 2010 (ISC'10), pp. 25 -- 32
// (May 2010). The paper is available at
// https://link.springer.com/article/10.1007/s00450-010-0108-2
//
// The original code and the constants below are from the author's
// implementation available at http://freshmeat.net/projects/sleef.
// The README file says, "The software is in public domain.
// You can use the software without any obligation."
//
// This code is a simplified version of the original.
#define LN2 0.6931471805599453094172321214581766 // log_e(2)
#define LOG2E 1.4426950408889634073599246810018920 // 1/LN2
#define LN2U 0.69314718055966295651160180568695068359375 // upper half LN2
#define LN2L 0.28235290563031577122588448175013436025525412068e-12 // lower half LN2
#define PosInf 0x7FF0000000000000
#define NegInf 0xFFF0000000000000
#define Overflow 7.09782712893384e+02
DATA exprodata<>+0(SB)/8, $0.5
DATA exprodata<>+8(SB)/8, $1.0
DATA exprodata<>+16(SB)/8, $2.0
DATA exprodata<>+24(SB)/8, $1.6666666666666666667e-1
DATA exprodata<>+32(SB)/8, $4.1666666666666666667e-2
DATA exprodata<>+40(SB)/8, $8.3333333333333333333e-3
DATA exprodata<>+48(SB)/8, $1.3888888888888888889e-3
DATA exprodata<>+56(SB)/8, $1.9841269841269841270e-4
DATA exprodata<>+64(SB)/8, $2.4801587301587301587e-5
GLOBL exprodata<>+0(SB), RODATA, $72
// func Exp(x float64) float64
TEXT ·archExp(SB),NOSPLIT,$0
// test bits for not-finite
MOVQ x+0(FP), BX
MOVQ $~(1<<63), AX // sign bit mask
MOVQ BX, DX
ANDQ AX, DX
MOVQ $PosInf, AX
CMPQ AX, DX
JLE notFinite
// check if argument will overflow
MOVQ BX, X0
MOVSD $Overflow, X1
COMISD X1, X0
JA overflow
MOVSD $LOG2E, X1
MULSD X0, X1
CVTSD2SL X1, BX // BX = exponent
CVTSL2SD BX, X1
CMPB ·useFMA(SB), $1
JE avxfma
MOVSD $LN2U, X2
MULSD X1, X2
SUBSD X2, X0
MOVSD $LN2L, X2
MULSD X1, X2
SUBSD X2, X0
// reduce argument
MULSD $0.0625, X0
// Taylor series evaluation
MOVSD exprodata<>+64(SB), X1
MULSD X0, X1
ADDSD exprodata<>+56(SB), X1
MULSD X0, X1
ADDSD exprodata<>+48(SB), X1
MULSD X0, X1
ADDSD exprodata<>+40(SB), X1
MULSD X0, X1
ADDSD exprodata<>+32(SB), X1
MULSD X0, X1
ADDSD exprodata<>+24(SB), X1
MULSD X0, X1
ADDSD exprodata<>+0(SB), X1
MULSD X0, X1
ADDSD exprodata<>+8(SB), X1
MULSD X1, X0
MOVSD exprodata<>+16(SB), X1
ADDSD X0, X1
MULSD X1, X0
MOVSD exprodata<>+16(SB), X1
ADDSD X0, X1
MULSD X1, X0
MOVSD exprodata<>+16(SB), X1
ADDSD X0, X1
MULSD X1, X0
MOVSD exprodata<>+16(SB), X1
ADDSD X0, X1
MULSD X1, X0
ADDSD exprodata<>+8(SB), X0
// return fr * 2**exponent
ldexp:
ADDL $0x3FF, BX // add bias
JLE denormal
CMPL BX, $0x7FF
JGE overflow
lastStep:
SHLQ $52, BX
MOVQ BX, X1
MULSD X1, X0
MOVSD X0, ret+8(FP)
RET
notFinite:
// test bits for -Inf
MOVQ $NegInf, AX
CMPQ AX, BX
JNE notNegInf
// -Inf, return 0
underflow: // return 0
MOVQ $0, ret+8(FP)
RET
overflow: // return +Inf
MOVQ $PosInf, BX
notNegInf: // NaN or +Inf, return x
MOVQ BX, ret+8(FP)
RET
denormal:
CMPL BX, $-52
JL underflow
ADDL $0x3FE, BX // add bias - 1
SHLQ $52, BX
MOVQ BX, X1
MULSD X1, X0
MOVQ $1, BX
JMP lastStep
avxfma:
MOVSD $LN2U, X2
VFNMADD231SD X2, X1, X0
MOVSD $LN2L, X2
VFNMADD231SD X2, X1, X0
// reduce argument
MULSD $0.0625, X0
// Taylor series evaluation
MOVSD exprodata<>+64(SB), X1
VFMADD213SD exprodata<>+56(SB), X0, X1
VFMADD213SD exprodata<>+48(SB), X0, X1
VFMADD213SD exprodata<>+40(SB), X0, X1
VFMADD213SD exprodata<>+32(SB), X0, X1
VFMADD213SD exprodata<>+24(SB), X0, X1
VFMADD213SD exprodata<>+0(SB), X0, X1
VFMADD213SD exprodata<>+8(SB), X0, X1
MULSD X1, X0
VADDSD exprodata<>+16(SB), X0, X1
MULSD X1, X0
VADDSD exprodata<>+16(SB), X0, X1
MULSD X1, X0
VADDSD exprodata<>+16(SB), X0, X1
MULSD X1, X0
VADDSD exprodata<>+16(SB), X0, X1
VFMADD213SD exprodata<>+8(SB), X1, X0
JMP ldexp
|