|
|
#include "vec.h" |
|
|
|
|
|
#include <cassert> |
|
|
|
|
|
|
|
|
ggml_fp16_t ggml_table_gelu_f16[1 << 16]; |
|
|
|
|
|
|
|
|
ggml_fp16_t ggml_table_gelu_quick_f16[1 << 16]; |
|
|
|
|
|
void ggml_vec_dot_f32(int n, float * GGML_RESTRICT s, size_t bs, const float * GGML_RESTRICT x, size_t bx, const float * GGML_RESTRICT y, size_t by, int nrc) { |
|
|
assert(nrc == 1); |
|
|
GGML_UNUSED(nrc); |
|
|
GGML_UNUSED(bx); |
|
|
GGML_UNUSED(by); |
|
|
GGML_UNUSED(bs); |
|
|
|
|
|
#if defined(GGML_SIMD) |
|
|
float sumf = 0.0f; |
|
|
|
|
|
#if defined(__ARM_FEATURE_SVE) |
|
|
const int sve_register_length = ggml_cpu_get_sve_cnt() * 8; |
|
|
const int ggml_f32_epr = sve_register_length / 32; |
|
|
const int ggml_f32_step = 8 * ggml_f32_epr; |
|
|
|
|
|
const int np = (n & ~(ggml_f32_step - 1)); |
|
|
svfloat32_t sum1 = svdup_n_f32(0.0f); |
|
|
svfloat32_t sum2 = svdup_n_f32(0.0f); |
|
|
svfloat32_t sum3 = svdup_n_f32(0.0f); |
|
|
svfloat32_t sum4 = svdup_n_f32(0.0f); |
|
|
svfloat32_t sum5 = svdup_n_f32(0.0f); |
|
|
svfloat32_t sum6 = svdup_n_f32(0.0f); |
|
|
svfloat32_t sum7 = svdup_n_f32(0.0f); |
|
|
svfloat32_t sum8 = svdup_n_f32(0.0f); |
|
|
svfloat32_t ax1,ax2,ax3,ax4,ax5,ax6,ax7,ax8; |
|
|
svfloat32_t ay1,ay2,ay3,ay4,ay5,ay6,ay7,ay8; |
|
|
for (int i = 0; i < np; i += ggml_f32_step) { |
|
|
ax1 = GGML_F32_VEC_LOAD(x + i); |
|
|
ay1 = GGML_F32_VEC_LOAD(y + i); |
|
|
sum1 = GGML_F32_VEC_FMA(sum1, ax1, ay1); |
|
|
|
|
|
ax2 = GGML_F32_VEC_LOAD(x + i + 1*ggml_f32_epr); |
|
|
ay2 = GGML_F32_VEC_LOAD(y + i + 1*ggml_f32_epr); |
|
|
sum2 = GGML_F32_VEC_FMA(sum2, ax2, ay2); |
|
|
|
|
|
ax3 = GGML_F32_VEC_LOAD(x + i + 2*ggml_f32_epr); |
|
|
ay3 = GGML_F32_VEC_LOAD(y + i + 2*ggml_f32_epr); |
|
|
sum3 = GGML_F32_VEC_FMA(sum3, ax3, ay3); |
|
|
|
|
|
ax4 = GGML_F32_VEC_LOAD(x + i + 3*ggml_f32_epr); |
|
|
ay4 = GGML_F32_VEC_LOAD(y + i + 3*ggml_f32_epr); |
|
|
sum4 = GGML_F32_VEC_FMA(sum4, ax4, ay4); |
|
|
|
|
|
ax5 = GGML_F32_VEC_LOAD(x + i + 4*ggml_f32_epr); |
|
|
ay5 = GGML_F32_VEC_LOAD(y + i + 4*ggml_f32_epr); |
|
|
sum5 = GGML_F32_VEC_FMA(sum5, ax5, ay5); |
|
|
|
|
|
ax6 = GGML_F32_VEC_LOAD(x + i + 5*ggml_f32_epr); |
|
|
ay6 = GGML_F32_VEC_LOAD(y + i + 5*ggml_f32_epr); |
|
|
sum6 = GGML_F32_VEC_FMA(sum6, ax6, ay6); |
|
|
|
|
|
ax7 = GGML_F32_VEC_LOAD(x + i + 6*ggml_f32_epr); |
|
|
ay7 = GGML_F32_VEC_LOAD(y + i + 6*ggml_f32_epr); |
|
|
sum7 = GGML_F32_VEC_FMA(sum7, ax7, ay7); |
|
|
|
|
|
ax8 = GGML_F32_VEC_LOAD(x + i + 7*ggml_f32_epr); |
|
|
ay8 = GGML_F32_VEC_LOAD(y + i + 7*ggml_f32_epr); |
|
|
sum8 = GGML_F32_VEC_FMA(sum8, ax8, ay8); |
|
|
} |
|
|
|
|
|
|
|
|
const int np2 = (n & ~(ggml_f32_epr - 1)); |
|
|
for (int i = np; i < np2; i += ggml_f32_epr) { |
|
|
ax1 = GGML_F32_VEC_LOAD(x + i); |
|
|
ay1 = GGML_F32_VEC_LOAD(y + i); |
|
|
sum1 = GGML_F32_VEC_FMA(sum1, ax1, ay1); |
|
|
} |
|
|
|
|
|
if (np2 < n) { |
|
|
svbool_t pg = svwhilelt_b32(np2, n); |
|
|
ax1 = svld1_f32(pg, x + np2); |
|
|
ay1 = svld1_f32(pg, y + np2); |
|
|
sum1 = svmad_f32_m(pg, ax1, ay1, sum1); |
|
|
} |
|
|
|
|
|
GGML_F32_VEC_REDUCE(sumf, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8); |
|
|
#elif defined(__riscv_v_intrinsic) |
|
|
int vl = __riscv_vsetvlmax_e32m8(); |
|
|
vfloat32m1_t vs = __riscv_vfmv_v_f_f32m1(0.0f, 1); |
|
|
vfloat32m8_t vsum; |
|
|
vfloat32m8_t ax; |
|
|
vfloat32m8_t ay; |
|
|
vsum = __riscv_vfmv_v_f_f32m8_tu(vsum, 0.0f, vl); |
|
|
for (int i = 0; i < n; i += vl) { |
|
|
vl = __riscv_vsetvl_e32m8(n - i); |
|
|
ax = __riscv_vle32_v_f32m8_tu(ax, &x[i], vl); |
|
|
ay = __riscv_vle32_v_f32m8_tu(ay, &y[i], vl); |
|
|
vsum = __riscv_vfmacc_vv_f32m8_tu(vsum, ax, ay, vl); |
|
|
} |
|
|
vl = __riscv_vsetvlmax_e32m8(); |
|
|
vs = __riscv_vfredusum_vs_f32m8_f32m1(vsum, vs, vl); |
|
|
sumf += __riscv_vfmv_f_s_f32m1_f32(vs); |
|
|
#else |
|
|
const int np = (n & ~(GGML_F32_STEP - 1)); |
|
|
|
|
|
GGML_F32_VEC sum[GGML_F32_ARR] = { GGML_F32_VEC_ZERO }; |
|
|
|
|
|
GGML_F32_VEC ax[GGML_F32_ARR]; |
|
|
GGML_F32_VEC ay[GGML_F32_ARR]; |
|
|
|
|
|
for (int i = 0; i < np; i += GGML_F32_STEP) { |
|
|
for (int j = 0; j < GGML_F32_ARR; j++) { |
|
|
ax[j] = GGML_F32_VEC_LOAD(x + i + j*GGML_F32_EPR); |
|
|
ay[j] = GGML_F32_VEC_LOAD(y + i + j*GGML_F32_EPR); |
|
|
|
|
|
sum[j] = GGML_F32_VEC_FMA(sum[j], ax[j], ay[j]); |
|
|
} |
|
|
} |
|
|
|
|
|
|
|
|
GGML_F32_VEC_REDUCE(sumf, sum); |
|
|
|
|
|
|
|
|
for (int i = np; i < n; ++i) { |
|
|
sumf += x[i]*y[i]; |
|
|
} |
|
|
#endif |
|
|
#else |
|
|
|
|
|
ggml_float sumf = 0.0; |
|
|
for (int i = 0; i < n; ++i) { |
|
|
sumf += (ggml_float)(x[i]*y[i]); |
|
|
} |
|
|
#endif |
|
|
|
|
|
*s = sumf; |
|
|
} |
|
|
|
|
|
void ggml_vec_dot_bf16(int n, float * GGML_RESTRICT s, size_t bs, ggml_bf16_t * GGML_RESTRICT x, size_t bx, ggml_bf16_t * GGML_RESTRICT y, size_t by, int nrc) { |
|
|
assert(nrc == 1); |
|
|
GGML_UNUSED(nrc); |
|
|
GGML_UNUSED(bx); |
|
|
GGML_UNUSED(by); |
|
|
GGML_UNUSED(bs); |
|
|
int i = 0; |
|
|
ggml_float sumf = 0; |
|
|
|
|
|
#if defined(__AVX512BF16__) |
|
|
__m512 c1 = _mm512_setzero_ps(); |
|
|
__m512 c2 = _mm512_setzero_ps(); |
|
|
for (; i + 64 <= n; i += 64) { |
|
|
c1 = _mm512_dpbf16_ps(c1, m512bh(_mm512_loadu_si512((x + i))), |
|
|
m512bh(_mm512_loadu_si512((y + i)))); |
|
|
c2 = _mm512_dpbf16_ps(c2, m512bh(_mm512_loadu_si512((x + i + 32))), |
|
|
m512bh(_mm512_loadu_si512((y + i + 32)))); |
|
|
} |
|
|
sumf += (ggml_float)_mm512_reduce_add_ps(c1); |
|
|
sumf += (ggml_float)_mm512_reduce_add_ps(c2); |
|
|
|
|
|
#elif defined(__AVX512F__) |
|
|
#define LOAD(p) _mm512_castsi512_ps(_mm512_slli_epi32(_mm512_cvtepu16_epi32(_mm256_loadu_si256((const __m256i *)(p))), 16)) |
|
|
__m512 c1 = _mm512_setzero_ps(); |
|
|
__m512 c2 = _mm512_setzero_ps(); |
|
|
for (; i + 32 <= n; i += 32) { |
|
|
c1 = _mm512_add_ps(_mm512_mul_ps(LOAD(x + i), LOAD(y + i)), c1); |
|
|
c2 = _mm512_add_ps(_mm512_mul_ps(LOAD(x + i + 16), LOAD(y + i + 16)), c2); |
|
|
} |
|
|
sumf += (ggml_float)_mm512_reduce_add_ps(c1); |
|
|
sumf += (ggml_float)_mm512_reduce_add_ps(c2); |
|
|
|
|
|
#undef LOAD |
|
|
#elif defined(__AVX2__) || defined(__AVX__) |
|
|
#if defined(__AVX2__) |
|
|
#define LOAD(p) _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_cvtepu16_epi32(_mm_loadu_si128((const __m128i *)(p))), 16)) |
|
|
#else |
|
|
#define LOAD(p) _mm256_castsi256_ps(_mm256_insertf128_si256(_mm256_castsi128_si256(_mm_slli_epi32(_mm_cvtepu16_epi32(_mm_loadu_si128((const __m128i *)(p))), 16)), (_mm_slli_epi32(_mm_cvtepu16_epi32(_mm_bsrli_si128(_mm_loadu_si128((const __m128i *)(p)), 8)), 16)), 1)) |
|
|
#endif |
|
|
__m256 c1 = _mm256_setzero_ps(); |
|
|
__m256 c2 = _mm256_setzero_ps(); |
|
|
__m256 c3 = _mm256_setzero_ps(); |
|
|
__m256 c4 = _mm256_setzero_ps(); |
|
|
for (; i + 32 <= n; i += 32) { |
|
|
c1 = _mm256_add_ps(_mm256_mul_ps(LOAD(x + i), LOAD(y + i)), c1); |
|
|
c2 = _mm256_add_ps(_mm256_mul_ps(LOAD(x + i + 8), LOAD(y + i + 8)), c2); |
|
|
c3 = _mm256_add_ps(_mm256_mul_ps(LOAD(x + i + 16), LOAD(y + i + 16)), c3); |
|
|
c4 = _mm256_add_ps(_mm256_mul_ps(LOAD(x + i + 24), LOAD(y + i + 24)), c4); |
|
|
} |
|
|
__m128 g; |
|
|
c1 = _mm256_add_ps(_mm256_add_ps(c1, c3), |
|
|
_mm256_add_ps(c2, c4)); |
|
|
g = _mm_add_ps(_mm256_extractf128_ps(c1, 1), |
|
|
_mm256_castps256_ps128(c1)); |
|
|
g = _mm_add_ps(g, _mm_movehl_ps(g, g)); |
|
|
g = _mm_add_ss(g, _mm_movehdup_ps(g)); |
|
|
sumf += (ggml_float)_mm_cvtss_f32(g); |
|
|
|
|
|
#undef LOAD |
|
|
#endif |
|
|
|
|
|
for (; i < n; ++i) { |
|
|
sumf += (ggml_float)(GGML_BF16_TO_FP32(x[i]) * |
|
|
GGML_BF16_TO_FP32(y[i])); |
|
|
} |
|
|
*s = sumf; |
|
|
} |
|
|
|
|
|
void ggml_vec_dot_f16(int n, float * GGML_RESTRICT s, size_t bs, ggml_fp16_t * GGML_RESTRICT x, size_t bx, ggml_fp16_t * GGML_RESTRICT y, size_t by, int nrc) { |
|
|
assert(nrc == 1); |
|
|
GGML_UNUSED(nrc); |
|
|
GGML_UNUSED(bx); |
|
|
GGML_UNUSED(by); |
|
|
GGML_UNUSED(bs); |
|
|
|
|
|
ggml_float sumf = 0.0; |
|
|
|
|
|
|
|
|
#if defined(GGML_SIMD) |
|
|
#if defined(__ARM_FEATURE_SVE) |
|
|
const int sve_register_length = svcntb() * 8; |
|
|
const int ggml_f16_epr = sve_register_length / 16; |
|
|
const int ggml_f16_step = 8 * ggml_f16_epr; |
|
|
|
|
|
const int np= (n & ~(ggml_f16_step - 1)); |
|
|
svfloat16_t sum1 = svdup_n_f16(0.0f); |
|
|
svfloat16_t sum2 = svdup_n_f16(0.0f); |
|
|
svfloat16_t sum3 = svdup_n_f16(0.0f); |
|
|
svfloat16_t sum4 = svdup_n_f16(0.0f); |
|
|
|
|
|
svfloat16_t ax1, ax2, ax3, ax4, ax5, ax6, ax7, ax8; |
|
|
svfloat16_t ay1, ay2, ay3, ay4, ay5, ay6, ay7, ay8; |
|
|
for (int i = 0; i < np; i += ggml_f16_step) { |
|
|
ax1 = GGML_F16x_VEC_LOAD(x + i + 0 * ggml_f16_epr, 0); |
|
|
ay1 = GGML_F16x_VEC_LOAD(y + i + 0 * ggml_f16_epr, 0); |
|
|
sum1 = GGML_F16x_VEC_FMA(sum1, ax1, ay1); |
|
|
|
|
|
ax2 = GGML_F16x_VEC_LOAD(x + i + 1 * ggml_f16_epr, 1); |
|
|
ay2 = GGML_F16x_VEC_LOAD(y + i + 1 * ggml_f16_epr, 1); |
|
|
sum2 = GGML_F16x_VEC_FMA(sum2, ax2, ay2); |
|
|
|
|
|
ax3 = GGML_F16x_VEC_LOAD(x + i + 2 * ggml_f16_epr, 2); |
|
|
ay3 = GGML_F16x_VEC_LOAD(y + i + 2 * ggml_f16_epr, 2); |
|
|
sum3 = GGML_F16x_VEC_FMA(sum3, ax3, ay3); |
|
|
|
|
|
ax4 = GGML_F16x_VEC_LOAD(x + i + 3 * ggml_f16_epr, 3); |
|
|
ay4 = GGML_F16x_VEC_LOAD(y + i + 3 * ggml_f16_epr, 3); |
|
|
sum4 = GGML_F16x_VEC_FMA(sum4, ax4, ay4); |
|
|
|
|
|
ax5 = GGML_F16x_VEC_LOAD(x + i + 4 * ggml_f16_epr, 4); |
|
|
ay5 = GGML_F16x_VEC_LOAD(y + i + 4 * ggml_f16_epr, 4); |
|
|
sum1 = GGML_F16x_VEC_FMA(sum1, ax5, ay5); |
|
|
|
|
|
ax6 = GGML_F16x_VEC_LOAD(x + i + 5 * ggml_f16_epr, 5); |
|
|
ay6 = GGML_F16x_VEC_LOAD(y + i + 5 * ggml_f16_epr, 5); |
|
|
sum2 = GGML_F16x_VEC_FMA(sum2, ax6, ay6); |
|
|
|
|
|
ax7 = GGML_F16x_VEC_LOAD(x + i + 6 * ggml_f16_epr, 6); |
|
|
ay7 = GGML_F16x_VEC_LOAD(y + i + 6 * ggml_f16_epr, 6); |
|
|
sum3 = GGML_F16x_VEC_FMA(sum3, ax7, ay7); |
|
|
|
|
|
ax8 = GGML_F16x_VEC_LOAD(x + i + 7 * ggml_f16_epr, 7); |
|
|
ay8 = GGML_F16x_VEC_LOAD(y + i + 7 * ggml_f16_epr, 7); |
|
|
sum4 = GGML_F16x_VEC_FMA(sum4, ax8, ay8); |
|
|
} |
|
|
|
|
|
const int np2 = (n & ~(ggml_f16_epr - 1)); |
|
|
for (int k = np; k < np2; k += ggml_f16_epr) { |
|
|
svfloat16_t rx = GGML_F16x_VEC_LOAD(x + k, 0); |
|
|
svfloat16_t ry = GGML_F16x_VEC_LOAD(y + k, 0); |
|
|
sum1 = GGML_F16x_VEC_FMA(sum1, rx, ry); |
|
|
} |
|
|
|
|
|
if (np2 < n) { |
|
|
svbool_t pg = svwhilelt_b16(np2, n); |
|
|
svfloat16_t hx = svld1_f16(pg, (const __fp16 *)(x + np2)); |
|
|
svfloat16_t hy = svld1_f16(pg, (const __fp16 *)(y + np2)); |
|
|
|
|
|
sum1 = svmad_f16_x(pg, hx, hy, sum1); |
|
|
} |
|
|
GGML_F16x_VEC_REDUCE(sumf, sum1, sum2, sum3, sum4); |
|
|
#elif defined(__riscv_v_intrinsic) |
|
|
#if defined(__riscv_zvfh) |
|
|
int vl = __riscv_vsetvlmax_e32m2(); |
|
|
vfloat32m1_t vs = __riscv_vfmv_v_f_f32m1(0.0f, 1); |
|
|
vfloat32m2_t vsum; |
|
|
vfloat16m1_t ax; |
|
|
vfloat16m1_t ay; |
|
|
vsum = __riscv_vreinterpret_v_u32m2_f32m2(__riscv_vmv_v_x_u32m2(0, vl)); |
|
|
for (int i = 0; i < n; i += vl) { |
|
|
vl = __riscv_vsetvl_e16m1(n - i); |
|
|
ax = __riscv_vle16_v_f16m1_tu(ax, (const _Float16 *)&x[i], vl); |
|
|
ay = __riscv_vle16_v_f16m1_tu(ay, (const _Float16 *)&y[i], vl); |
|
|
vsum = __riscv_vfwmacc_vv_f32m2_tu(vsum, ax, ay, vl); |
|
|
} |
|
|
vl = __riscv_vsetvlmax_e32m1(); |
|
|
vfloat32m1_t ac0 = __riscv_vfadd_vv_f32m1(__riscv_vget_v_f32m2_f32m1(vsum, 0), __riscv_vget_v_f32m2_f32m1(vsum, 1), vl); |
|
|
vs = __riscv_vfredusum_vs_f32m1_f32m1(ac0, vs, vl); |
|
|
sumf += __riscv_vfmv_f_s_f32m1_f32(vs); |
|
|
#else |
|
|
for (int i = 0; i < n; ++i) { |
|
|
sumf += (ggml_float)(GGML_CPU_FP16_TO_FP32(x[i])*GGML_CPU_FP16_TO_FP32(y[i])); |
|
|
} |
|
|
#endif |
|
|
#else |
|
|
const int np = (n & ~(GGML_F16_STEP - 1)); |
|
|
|
|
|
GGML_F16_VEC sum[GGML_F16_ARR] = { GGML_F16_VEC_ZERO }; |
|
|
|
|
|
GGML_F16_VEC ax[GGML_F16_ARR]; |
|
|
GGML_F16_VEC ay[GGML_F16_ARR]; |
|
|
|
|
|
for (int i = 0; i < np; i += GGML_F16_STEP) { |
|
|
for (int j = 0; j < GGML_F16_ARR; j++) { |
|
|
ax[j] = GGML_F16_VEC_LOAD(x + i + j*GGML_F16_EPR, j); |
|
|
ay[j] = GGML_F16_VEC_LOAD(y + i + j*GGML_F16_EPR, j); |
|
|
|
|
|
sum[j] = GGML_F16_VEC_FMA(sum[j], ax[j], ay[j]); |
|
|
} |
|
|
} |
|
|
|
|
|
|
|
|
GGML_F16_VEC_REDUCE(sumf, sum); |
|
|
|
|
|
|
|
|
for (int i = np; i < n; ++i) { |
|
|
sumf += (ggml_float)(GGML_CPU_FP16_TO_FP32(x[i])*GGML_CPU_FP16_TO_FP32(y[i])); |
|
|
} |
|
|
|
|
|
assert(!isnan(sumf) && !isinf(sumf)); |
|
|
#endif |
|
|
#else |
|
|
for (int i = 0; i < n; ++i) { |
|
|
sumf += (ggml_float)(GGML_CPU_FP16_TO_FP32(x[i])*GGML_CPU_FP16_TO_FP32(y[i])); |
|
|
} |
|
|
#endif |
|
|
|
|
|
*s = sumf; |
|
|
} |
|
|
|
|
|
void ggml_vec_silu_f32(const int n, float * y, const float * x) { |
|
|
int i = 0; |
|
|
#if defined(__AVX512F__) && defined(__AVX512DQ__) |
|
|
for (; i + 15 < n; i += 16) { |
|
|
_mm512_storeu_ps(y + i, ggml_v_silu(_mm512_loadu_ps(x + i))); |
|
|
} |
|
|
#elif defined(__AVX2__) && defined(__FMA__) |
|
|
for (; i + 7 < n; i += 8) { |
|
|
_mm256_storeu_ps(y + i, ggml_v_silu(_mm256_loadu_ps(x + i))); |
|
|
} |
|
|
#elif defined(__SSE2__) |
|
|
for (; i + 3 < n; i += 4) { |
|
|
_mm_storeu_ps(y + i, ggml_v_silu(_mm_loadu_ps(x + i))); |
|
|
} |
|
|
#elif defined(__ARM_FEATURE_SVE) && defined(__aarch64__) |
|
|
const int vlen = svcntw(); |
|
|
for (; i < n; i += vlen) { |
|
|
const svbool_t pg = svwhilelt_b32_s32(i, n); |
|
|
svst1_f32(pg, y + i, ggml_v_silu(pg, svld1_f32(pg, x + i))); |
|
|
} |
|
|
#elif defined(__ARM_NEON) && defined(__aarch64__) |
|
|
for (; i + 3 < n; i += 4) { |
|
|
vst1q_f32(y + i, ggml_v_silu(vld1q_f32(x + i))); |
|
|
} |
|
|
#endif |
|
|
for (; i < n; ++i) { |
|
|
y[i] = ggml_silu_f32(x[i]); |
|
|
} |
|
|
} |
|
|
|
|
|
void ggml_vec_swiglu_f32(const int n, float * y, const float * x, const float * g) { |
|
|
int i = 0; |
|
|
#if defined(__AVX512F__) && defined(__AVX512DQ__) |
|
|
for (; i + 15 < n; i += 16) { |
|
|
_mm512_storeu_ps(y + i, _mm512_mul_ps(ggml_v_silu(_mm512_loadu_ps(x + i)), _mm512_loadu_ps(g + i))); |
|
|
} |
|
|
#elif defined(__AVX2__) && defined(__FMA__) |
|
|
for (; i + 7 < n; i += 8) { |
|
|
_mm256_storeu_ps(y + i, _mm256_mul_ps(ggml_v_silu(_mm256_loadu_ps(x + i)), _mm256_loadu_ps(g + i))); |
|
|
} |
|
|
#elif defined(__SSE2__) |
|
|
for (; i + 3 < n; i += 4) { |
|
|
_mm_storeu_ps(y + i, _mm_mul_ps(ggml_v_silu(_mm_loadu_ps(x + i)), _mm_loadu_ps(g + i))); |
|
|
} |
|
|
#elif defined(__ARM_FEATURE_SVE) && defined(__aarch64__) |
|
|
const int vlen = svcntw(); |
|
|
for (; i < n; i += vlen) { |
|
|
const svbool_t pg = svwhilelt_b32_s32(i, n); |
|
|
svst1_f32(pg, y + i, svmul_f32_x(pg, ggml_v_silu(pg, svld1_f32(pg, x + i)), svld1_f32(pg, g + i))); |
|
|
} |
|
|
#elif defined(__ARM_NEON) && defined(__aarch64__) |
|
|
for (; i + 3 < n; i += 4) { |
|
|
vst1q_f32(y + i, vmulq_f32(ggml_v_silu(vld1q_f32(x + i)), vld1q_f32(g + i))); |
|
|
} |
|
|
#elif defined(__riscv_v_intrinsic) |
|
|
for (int vl; i < n; i += vl) { |
|
|
vl = __riscv_vsetvl_e32m2(n - i); |
|
|
vfloat32m2_t vx = __riscv_vle32_v_f32m2(&x[i], vl); |
|
|
vfloat32m2_t vg = __riscv_vle32_v_f32m2(&g[i], vl); |
|
|
vfloat32m2_t vy = __riscv_vfmul_vv_f32m2(ggml_v_silu_m2(vx, vl), vg, vl); |
|
|
__riscv_vse32_v_f32m2(&y[i], vy, vl); |
|
|
} |
|
|
#endif |
|
|
for (; i < n; ++i) { |
|
|
y[i] = ggml_silu_f32(x[i]) * g[i]; |
|
|
} |
|
|
} |
|
|
|
|
|
ggml_float ggml_vec_cvar_f32(const int n, float * y, const float * x, const float mean) { |
|
|
int i = 0; |
|
|
ggml_float sum = 0; |
|
|
|
|
|
|
|
|
#if defined(__AVX512F__) && defined(__AVX512DQ__) |
|
|
for (; i + 15 < n; i += 16) { |
|
|
__m512 val = _mm512_sub_ps(_mm512_loadu_ps(x + i), |
|
|
_mm512_set1_ps(mean)); |
|
|
_mm512_storeu_ps(y + i, val); |
|
|
sum += (ggml_float)_mm512_reduce_add_ps(_mm512_mul_ps(val, val)); |
|
|
} |
|
|
#elif defined(__AVX2__) && defined(__FMA__) |
|
|
for (; i + 7 < n; i += 8) { |
|
|
__m256 val = _mm256_sub_ps(_mm256_loadu_ps(x + i), |
|
|
_mm256_set1_ps(mean)); |
|
|
_mm256_storeu_ps(y + i, val); |
|
|
val = _mm256_mul_ps(val,val); |
|
|
__m128 val2 = _mm_add_ps(_mm256_extractf128_ps(val, 1), |
|
|
_mm256_castps256_ps128(val)); |
|
|
val2 = _mm_add_ps(val2, _mm_movehl_ps(val2, val2)); |
|
|
val2 = _mm_add_ss(val2, _mm_movehdup_ps(val2)); |
|
|
sum += (ggml_float)_mm_cvtss_f32(val2); |
|
|
} |
|
|
#elif defined(__SSE2__) |
|
|
for (; i + 3 < n; i += 4) { |
|
|
__m128 val = _mm_sub_ps(_mm_loadu_ps(x + i), |
|
|
_mm_set1_ps(mean)); |
|
|
_mm_storeu_ps(y + i, val); |
|
|
val = _mm_mul_ps(val, val); |
|
|
#if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) |
|
|
val = _mm_add_ps(val, _mm_movehl_ps(val, val)); |
|
|
val = _mm_add_ss(val, _mm_movehdup_ps(val)); |
|
|
#else |
|
|
__m128 tmp = _mm_shuffle_ps(val, val, _MM_SHUFFLE(2, 3, 0, 1)); |
|
|
val = _mm_add_ps(val, tmp); |
|
|
tmp = _mm_movehl_ps(tmp, val); |
|
|
val = _mm_add_ss(val, tmp); |
|
|
#endif |
|
|
sum += (ggml_float)_mm_cvtss_f32(val); |
|
|
} |
|
|
#elif defined(__ARM_NEON) && defined(__aarch64__) |
|
|
for (; i + 3 < n; i += 4) { |
|
|
float32x4_t val = vsubq_f32(vld1q_f32(x + i), |
|
|
vdupq_n_f32(mean)); |
|
|
vst1q_f32(y + i, val); |
|
|
val = vmulq_f32(val, val); |
|
|
sum += (ggml_float)vaddvq_f32(val); |
|
|
} |
|
|
#elif defined(__VXE__) || defined(__VXE2__) |
|
|
for (; i + 3 < n; i += 4) { |
|
|
float32x4_t val = vec_sub(vec_xl(0, x + i), vec_splats(mean)); |
|
|
vec_xst(val, 0, y + i); |
|
|
val = vec_mul(val, val); |
|
|
sum += (ggml_float)vec_hsum_f32x4(val); |
|
|
} |
|
|
#endif |
|
|
for (; i < n; ++i) { |
|
|
float val = x[i] - mean; |
|
|
y[i] = val; |
|
|
val *= val; |
|
|
sum += (ggml_float)val; |
|
|
} |
|
|
return sum/n; |
|
|
} |
|
|
|
|
|
ggml_float ggml_vec_soft_max_f32(const int n, float * y, const float * x, float max) { |
|
|
int i = 0; |
|
|
ggml_float sum = 0; |
|
|
#if defined(__AVX512F__) && defined(__AVX512DQ__) |
|
|
for (; i + 15 < n; i += 16) { |
|
|
__m512 val = ggml_v_expf(_mm512_sub_ps(_mm512_loadu_ps(x + i), |
|
|
_mm512_set1_ps(max))); |
|
|
_mm512_storeu_ps(y + i, val); |
|
|
sum += (ggml_float)_mm512_reduce_add_ps(val); |
|
|
} |
|
|
#elif defined(__AVX2__) && defined(__FMA__) |
|
|
for (; i + 7 < n; i += 8) { |
|
|
__m256 val = ggml_v_expf(_mm256_sub_ps(_mm256_loadu_ps(x + i), |
|
|
_mm256_set1_ps(max))); |
|
|
_mm256_storeu_ps(y + i, val); |
|
|
__m128 val2 = _mm_add_ps(_mm256_extractf128_ps(val, 1), |
|
|
_mm256_castps256_ps128(val)); |
|
|
val2 = _mm_add_ps(val2, _mm_movehl_ps(val2, val2)); |
|
|
val2 = _mm_add_ss(val2, _mm_movehdup_ps(val2)); |
|
|
sum += (ggml_float)_mm_cvtss_f32(val2); |
|
|
} |
|
|
#elif defined(__SSE2__) |
|
|
for (; i + 3 < n; i += 4) { |
|
|
__m128 val = ggml_v_expf(_mm_sub_ps(_mm_loadu_ps(x + i), |
|
|
_mm_set1_ps(max))); |
|
|
_mm_storeu_ps(y + i, val); |
|
|
#if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) |
|
|
val = _mm_add_ps(val, _mm_movehl_ps(val, val)); |
|
|
val = _mm_add_ss(val, _mm_movehdup_ps(val)); |
|
|
#else |
|
|
__m128 tmp = _mm_shuffle_ps(val, val, _MM_SHUFFLE(2, 3, 0, 1)); |
|
|
val = _mm_add_ps(val, tmp); |
|
|
tmp = _mm_movehl_ps(tmp, val); |
|
|
val = _mm_add_ss(val, tmp); |
|
|
#endif |
|
|
sum += (ggml_float)_mm_cvtss_f32(val); |
|
|
} |
|
|
#elif defined(__ARM_FEATURE_SVE) && defined(__aarch64__) |
|
|
const int vlen = svcntw(); |
|
|
for (; i < n; i += vlen) { |
|
|
const svbool_t pg = svwhilelt_b32_s32(i, n); |
|
|
svfloat32_t val = ggml_v_expf(pg, svsub_f32_x(pg, svld1_f32(pg, x + i), |
|
|
svdup_n_f32_x(pg, max))); |
|
|
svst1_f32(pg, y + i, val); |
|
|
sum += (ggml_float)svaddv_f32(pg, val); |
|
|
} |
|
|
#elif defined(__ARM_NEON) && defined(__aarch64__) |
|
|
for (; i + 3 < n; i += 4) { |
|
|
float32x4_t val = ggml_v_expf(vsubq_f32(vld1q_f32(x + i), |
|
|
vdupq_n_f32(max))); |
|
|
vst1q_f32(y + i, val); |
|
|
sum += (ggml_float)vaddvq_f32(val); |
|
|
} |
|
|
#elif defined(__riscv_v_intrinsic) |
|
|
vfloat64m1_t vsum = __riscv_vfmv_v_f_f64m1(0, 1); |
|
|
for (int avl; i < n; i += avl) { |
|
|
avl = __riscv_vsetvl_e32m2(n - i); |
|
|
vfloat32m2_t val = ggml_v_expf_m2(__riscv_vfsub_vf_f32m2(__riscv_vle32_v_f32m2(&x[i], avl), max, avl), avl); |
|
|
__riscv_vse32_v_f32m2(&y[i], val, avl); |
|
|
vsum = __riscv_vfwredusum_vs_f32m2_f64m1(val, vsum, avl); |
|
|
} |
|
|
return (ggml_float)__riscv_vfmv_f_s_f64m1_f64(vsum); |
|
|
#endif |
|
|
for (; i < n; ++i) { |
|
|
float val = expf(x[i] - max); |
|
|
sum += (ggml_float)val; |
|
|
y[i] = val; |
|
|
} |
|
|
return sum; |
|
|
} |
|
|
|
|
|
ggml_float ggml_vec_log_soft_max_f32(const int n, float * y, const float * x, float max) { |
|
|
|
|
|
|
|
|
int i = 0; |
|
|
ggml_float sum = 0; |
|
|
for (; i < n; ++i) { |
|
|
float val = x[i] - max; |
|
|
y[i] = val; |
|
|
sum += (ggml_float)expf(val); |
|
|
} |
|
|
return sum = (ggml_float)logf(sum); |
|
|
} |
|
|
|