Buckets:
arudradey/ml-cpu-storage / emsdk /upstream /emscripten /cache /sysroot /include /compat /avxintrin.h
| /* | |
| * Copyright 2020 The Emscripten Authors. All rights reserved. | |
| * Emscripten is available under two separate licenses, the MIT license and the | |
| * University of Illinois/NCSA Open Source License. Both these licenses can be | |
| * found in the LICENSE file. | |
| */ | |
// 256-bit vector types built on the compiler's generic vector extension.
// Each is 32 bytes wide and 32-byte aligned.
typedef float __m256
  __attribute__((__vector_size__(32), __aligned__(32)));
typedef double __m256d
  __attribute__((__vector_size__(32), __aligned__(32)));
typedef int32_t __m256i
  __attribute__((__vector_size__(32), __aligned__(32)));

// Integer vector types (16 and 32 bytes wide) whose alignment requirement is
// lowered to a single byte.
typedef int32_t __m128i_u
  __attribute__((__vector_size__(16), __aligned__(1)));
typedef int32_t __m256i_u
  __attribute__((__vector_size__(32), __aligned__(1)));
| typedef struct { | |
| __m128d v0; | |
| __m128d v1; | |
| } __m256d_internal; | |
| typedef struct { | |
| __m128 v0; | |
| __m128 v1; | |
| } __m256_internal; | |
| typedef struct { | |
| __m128i v0; | |
| __m128i v1; | |
| } __m256i_internal; | |
| static __inline__ __m256_internal __m256_to_internal(__m256 a) { | |
| union { | |
| __m256 in; | |
| __m256_internal out; | |
| } ret; | |
| ret.in = a; | |
| return ret.out; | |
| } | |
| static __inline__ __m256 __m256_from_internal(__m256_internal a) { | |
| union { | |
| __m256_internal in; | |
| __m256 out; | |
| } ret; | |
| ret.in = a; | |
| return ret.out; | |
| } | |
| static __inline__ __m256d_internal __m256d_to_internal(__m256d a) { | |
| union { | |
| __m256d in; | |
| __m256d_internal out; | |
| } ret; | |
| ret.in = a; | |
| return ret.out; | |
| } | |
| static __inline__ __m256d __m256d_from_internal(__m256d_internal a) { | |
| union { | |
| __m256d_internal in; | |
| __m256d out; | |
| } ret; | |
| ret.in = a; | |
| return ret.out; | |
| } | |
| static __inline__ __m256i_internal __m256i_to_internal(__m256i a) { | |
| union { | |
| __m256i in; | |
| __m256i_internal out; | |
| } ret; | |
| ret.in = a; | |
| return ret.out; | |
| } | |
| static __inline__ __m256i __m256i_from_internal(__m256i_internal a) { | |
| union { | |
| __m256i_internal in; | |
| __m256i out; | |
| } ret; | |
| ret.in = a; | |
| return ret.out; | |
| } | |
| union __m256_data { | |
| __m256i int_view; | |
| __m256d double_view; | |
| __m256 float_view; | |
| __m128i_u int_u_view; | |
| }; | |
| static __inline__ __m256d __attribute__((__always_inline__, __nodebug__)) | |
| _mm256_add_pd(__m256d __a, __m256d __b) { | |
| __m256d_internal ret, a, b; | |
| a = __m256d_to_internal(__a); | |
| b = __m256d_to_internal(__b); | |
| ret.v0 = _mm_add_pd(a.v0, b.v0); | |
| ret.v1 = _mm_add_pd(a.v1, b.v1); | |
| return __m256d_from_internal(ret); | |
| } | |
| static __inline__ __m256 __attribute__((__always_inline__, __nodebug__)) | |
| _mm256_add_ps(__m256 __a, __m256 __b) { | |
| __m256_internal ret, a, b; | |
| a = __m256_to_internal(__a); | |
| b = __m256_to_internal(__b); | |
| ret.v0 = _mm_add_ps(a.v0, b.v0); | |
| ret.v1 = _mm_add_ps(a.v1, b.v1); | |
| return __m256_from_internal(ret); | |
| } | |
| static __inline__ __m256d __attribute__((__always_inline__, __nodebug__)) | |
| _mm256_sub_pd(__m256d __a, __m256d __b) { | |
| __m256d_internal ret, a, b; | |
| a = __m256d_to_internal(__a); | |
| b = __m256d_to_internal(__b); | |
| ret.v0 = _mm_sub_pd(a.v0, b.v0); | |
| ret.v1 = _mm_sub_pd(a.v1, b.v1); | |
| return __m256d_from_internal(ret); | |
| } | |
| static __inline__ __m256 __attribute__((__always_inline__, __nodebug__)) | |
| _mm256_sub_ps(__m256 __a, __m256 __b) { | |
| __m256_internal ret, a, b; | |
| a = __m256_to_internal(__a); | |
| b = __m256_to_internal(__b); | |
| ret.v0 = _mm_sub_ps(a.v0, b.v0); | |
| ret.v1 = _mm_sub_ps(a.v1, b.v1); | |
| return __m256_from_internal(ret); | |
| } | |
| static __inline__ __m256d __attribute__((__always_inline__, __nodebug__)) | |
| _mm256_addsub_pd(__m256d __a, __m256d __b) { | |
| __m256d_internal ret, a, b; | |
| a = __m256d_to_internal(__a); | |
| b = __m256d_to_internal(__b); | |
| ret.v0 = _mm_addsub_pd(a.v0, b.v0); | |
| ret.v1 = _mm_addsub_pd(a.v1, b.v1); | |
| return __m256d_from_internal(ret); | |
| } | |
| static __inline__ __m256 __attribute__((__always_inline__, __nodebug__)) | |
| _mm256_addsub_ps(__m256 __a, __m256 __b) { | |
| __m256_internal ret, a, b; | |
| a = __m256_to_internal(__a); | |
| b = __m256_to_internal(__b); | |
| ret.v0 = _mm_addsub_ps(a.v0, b.v0); | |
| ret.v1 = _mm_addsub_ps(a.v1, b.v1); | |
| return __m256_from_internal(ret); | |
| } | |
| static __inline__ __m256d __attribute__((__always_inline__, __nodebug__)) | |
| _mm256_div_pd(__m256d __a, __m256d __b) { | |
| __m256d_internal ret, a, b; | |
| a = __m256d_to_internal(__a); | |
| b = __m256d_to_internal(__b); | |
| ret.v0 = _mm_div_pd(a.v0, b.v0); | |
| ret.v1 = _mm_div_pd(a.v1, b.v1); | |
| return __m256d_from_internal(ret); | |
| } | |
| static __inline__ __m256 __attribute__((__always_inline__, __nodebug__)) | |
| _mm256_div_ps(__m256 __a, __m256 __b) { | |
| __m256_internal ret, a, b; | |
| a = __m256_to_internal(__a); | |
| b = __m256_to_internal(__b); | |
| ret.v0 = _mm_div_ps(a.v0, b.v0); | |
| ret.v1 = _mm_div_ps(a.v1, b.v1); | |
| return __m256_from_internal(ret); | |
| } | |
| static __inline__ __m256d __attribute__((__always_inline__, __nodebug__)) | |
| _mm256_max_pd(__m256d __a, __m256d __b) { | |
| __m256d_internal ret, a, b; | |
| a = __m256d_to_internal(__a); | |
| b = __m256d_to_internal(__b); | |
| ret.v0 = _mm_max_pd(a.v0, b.v0); | |
| ret.v1 = _mm_max_pd(a.v1, b.v1); | |
| return __m256d_from_internal(ret); | |
| } | |
| static __inline__ __m256 __attribute__((__always_inline__, __nodebug__)) | |
| _mm256_max_ps(__m256 __a, __m256 __b) { | |
| __m256_internal ret, a, b; | |
| a = __m256_to_internal(__a); | |
| b = __m256_to_internal(__b); | |
| ret.v0 = _mm_max_ps(a.v0, b.v0); | |
| ret.v1 = _mm_max_ps(a.v1, b.v1); | |
| return __m256_from_internal(ret); | |
| } | |
| static __inline__ __m256d __attribute__((__always_inline__, __nodebug__)) | |
| _mm256_min_pd(__m256d __a, __m256d __b) { | |
| __m256d_internal ret, a, b; | |
| a = __m256d_to_internal(__a); | |
| b = __m256d_to_internal(__b); | |
| ret.v0 = _mm_min_pd(a.v0, b.v0); | |
| ret.v1 = _mm_min_pd(a.v1, b.v1); | |
| return __m256d_from_internal(ret); | |
| } | |
| static __inline__ __m256 __attribute__((__always_inline__, __nodebug__)) | |
| _mm256_min_ps(__m256 __a, __m256 __b) { | |
| __m256_internal ret, a, b; | |
| a = __m256_to_internal(__a); | |
| b = __m256_to_internal(__b); | |
| ret.v0 = _mm_min_ps(a.v0, b.v0); | |
| ret.v1 = _mm_min_ps(a.v1, b.v1); | |
| return __m256_from_internal(ret); | |
| } | |
| static __inline__ __m256d __attribute__((__always_inline__, __nodebug__)) | |
| _mm256_mul_pd(__m256d __a, __m256d __b) { | |
| __m256d_internal ret, a, b; | |
| a = __m256d_to_internal(__a); | |
| b = __m256d_to_internal(__b); | |
| ret.v0 = _mm_mul_pd(a.v0, b.v0); | |
| ret.v1 = _mm_mul_pd(a.v1, b.v1); | |
| return __m256d_from_internal(ret); | |
| } | |
| static __inline__ __m256 __attribute__((__always_inline__, __nodebug__)) | |
| _mm256_mul_ps(__m256 __a, __m256 __b) { | |
| __m256_internal ret, a, b; | |
| a = __m256_to_internal(__a); | |
| b = __m256_to_internal(__b); | |
| ret.v0 = _mm_mul_ps(a.v0, b.v0); | |
| ret.v1 = _mm_mul_ps(a.v1, b.v1); | |
| return __m256_from_internal(ret); | |
| } | |
| static __inline__ __m256d __attribute__((__always_inline__, __nodebug__)) | |
| _mm256_sqrt_pd(__m256d __a) { | |
| __m256d_internal ret, a; | |
| a = __m256d_to_internal(__a); | |
| ret.v0 = _mm_sqrt_pd(a.v0); | |
| ret.v1 = _mm_sqrt_pd(a.v1); | |
| return __m256d_from_internal(ret); | |
| } | |
| static __inline__ __m256 __attribute__((__always_inline__, __nodebug__)) | |
| _mm256_sqrt_ps(__m256 __a) { | |
| __m256_internal ret, a; | |
| a = __m256_to_internal(__a); | |
| ret.v0 = _mm_sqrt_ps(a.v0); | |
| ret.v1 = _mm_sqrt_ps(a.v1); | |
| return __m256_from_internal(ret); | |
| } | |
| static __inline__ __m256 __attribute__((__always_inline__, __nodebug__)) | |
| _mm256_rsqrt_ps(__m256 __a) { | |
| __m256_internal ret, a; | |
| a = __m256_to_internal(__a); | |
| ret.v0 = _mm_rsqrt_ps(a.v0); | |
| ret.v1 = _mm_rsqrt_ps(a.v1); | |
| return __m256_from_internal(ret); | |
| } | |
| static __inline__ __m256 __attribute__((__always_inline__, __nodebug__)) | |
| _mm256_rcp_ps(__m256 __a) { | |
| __m256_internal ret, a; | |
| a = __m256_to_internal(__a); | |
| ret.v0 = _mm_rcp_ps(a.v0); | |
| ret.v1 = _mm_rcp_ps(a.v1); | |
| return __m256_from_internal(ret); | |
| } | |
| static __inline__ __m256d __attribute__((__always_inline__, __nodebug__)) | |
| _mm256_round_pd(__m256d __a, int __rounding) { | |
| __m256d_internal ret, a; | |
| a = __m256d_to_internal(__a); | |
| ret.v0 = _mm_round_pd(a.v0, __rounding); | |
| ret.v1 = _mm_round_pd(a.v1, __rounding); | |
| return __m256d_from_internal(ret); | |
| } | |
| static __inline__ __m256 __attribute__((__always_inline__, __nodebug__)) | |
| _mm256_round_ps(__m256 __a, int __rounding) { | |
| __m256_internal ret, a; | |
| a = __m256_to_internal(__a); | |
| ret.v0 = _mm_round_ps(a.v0, __rounding); | |
| ret.v1 = _mm_round_ps(a.v1, __rounding); | |
| return __m256_from_internal(ret); | |
| } | |
| static __inline__ __m256d | |
| __attribute__((__always_inline__, __nodebug__)) _mm256_and_pd(__m256d __a, | |
| __m256d __b) { | |
| __m256d_internal ret, a, b; | |
| a = __m256d_to_internal(__a); | |
| b = __m256d_to_internal(__b); | |
| ret.v0 = _mm_and_pd(a.v0, b.v0); | |
| ret.v1 = _mm_and_pd(a.v1, b.v1); | |
| return __m256d_from_internal(ret); | |
| } | |
| static __inline__ __m256 __attribute__((__always_inline__, __nodebug__)) | |
| _mm256_and_ps(__m256 __a, __m256 __b) { | |
| __m256_internal ret, a, b; | |
| a = __m256_to_internal(__a); | |
| b = __m256_to_internal(__b); | |
| ret.v0 = _mm_and_ps(a.v0, b.v0); | |
| ret.v1 = _mm_and_ps(a.v1, b.v1); | |
| return __m256_from_internal(ret); | |
| } | |
| static __inline__ __m256d __attribute__((__always_inline__, __nodebug__)) | |
| _mm256_andnot_pd(__m256d __a, __m256d __b) { | |
| __m256d_internal ret, a, b; | |
| a = __m256d_to_internal(__a); | |
| b = __m256d_to_internal(__b); | |
| ret.v0 = _mm_andnot_pd(a.v0, b.v0); | |
| ret.v1 = _mm_andnot_pd(a.v1, b.v1); | |
| return __m256d_from_internal(ret); | |
| } | |
| static __inline__ __m256 __attribute__((__always_inline__, __nodebug__)) | |
| _mm256_andnot_ps(__m256 __a, __m256 __b) { | |
| __m256_internal ret, a, b; | |
| a = __m256_to_internal(__a); | |
| b = __m256_to_internal(__b); | |
| ret.v0 = _mm_andnot_ps(a.v0, b.v0); | |
| ret.v1 = _mm_andnot_ps(a.v1, b.v1); | |
| return __m256_from_internal(ret); | |
| } | |
| static __inline__ __m256d __attribute__((__always_inline__, __nodebug__)) | |
| _mm256_or_pd(__m256d __a, __m256d __b) { | |
| __m256d_internal ret, a, b; | |
| a = __m256d_to_internal(__a); | |
| b = __m256d_to_internal(__b); | |
| ret.v0 = _mm_or_pd(a.v0, b.v0); | |
| ret.v1 = _mm_or_pd(a.v1, b.v1); | |
| return __m256d_from_internal(ret); | |
| } | |
| static __inline__ __m256 __attribute__((__always_inline__, __nodebug__)) | |
| _mm256_or_ps(__m256 __a, __m256 __b) { | |
| __m256_internal ret, a, b; | |
| a = __m256_to_internal(__a); | |
| b = __m256_to_internal(__b); | |
| ret.v0 = _mm_or_ps(a.v0, b.v0); | |
| ret.v1 = _mm_or_ps(a.v1, b.v1); | |
| return __m256_from_internal(ret); | |
| } | |
| static __inline__ __m256d __attribute__((__always_inline__, __nodebug__)) | |
| _mm256_xor_pd(__m256d __a, __m256d __b) { | |
| __m256d_internal ret, a, b; | |
| a = __m256d_to_internal(__a); | |
| b = __m256d_to_internal(__b); | |
| ret.v0 = _mm_xor_pd(a.v0, b.v0); | |
| ret.v1 = _mm_xor_pd(a.v1, b.v1); | |
| return __m256d_from_internal(ret); | |
| } | |
| static __inline__ __m256 __attribute__((__always_inline__, __nodebug__)) | |
| _mm256_xor_ps(__m256 __a, __m256 __b) { | |
| __m256_internal ret, a, b; | |
| a = __m256_to_internal(__a); | |
| b = __m256_to_internal(__b); | |
| ret.v0 = _mm_xor_ps(a.v0, b.v0); | |
| ret.v1 = _mm_xor_ps(a.v1, b.v1); | |
| return __m256_from_internal(ret); | |
| } | |
| static __inline__ __m256d __attribute__((__always_inline__, __nodebug__)) | |
| _mm256_hadd_pd(__m256d __a, __m256d __b) { | |
| __m256d_internal ret, a, b; | |
| a = __m256d_to_internal(__a); | |
| b = __m256d_to_internal(__b); | |
| ret.v0 = _mm_hadd_pd(a.v0, b.v0); | |
| ret.v1 = _mm_hadd_pd(a.v1, b.v1); | |
| return __m256d_from_internal(ret); | |
| } | |
| static __inline__ __m256 __attribute__((__always_inline__, __nodebug__)) | |
| _mm256_hadd_ps(__m256 __a, __m256 __b) { | |
| __m256_internal ret, a, b; | |
| a = __m256_to_internal(__a); | |
| b = __m256_to_internal(__b); | |
| ret.v0 = _mm_hadd_ps(a.v0, b.v0); | |
| ret.v1 = _mm_hadd_ps(a.v1, b.v1); | |
| return __m256_from_internal(ret); | |
| } | |
| static __inline__ __m256d __attribute__((__always_inline__, __nodebug__)) | |
| _mm256_hsub_pd(__m256d __a, __m256d __b) { | |
| __m256d_internal ret, a, b; | |
| a = __m256d_to_internal(__a); | |
| b = __m256d_to_internal(__b); | |
| ret.v0 = _mm_hsub_pd(a.v0, b.v0); | |
| ret.v1 = _mm_hsub_pd(a.v1, b.v1); | |
| return __m256d_from_internal(ret); | |
| } | |
| static __inline__ __m256 __attribute__((__always_inline__, __nodebug__)) | |
| _mm256_hsub_ps(__m256 __a, __m256 __b) { | |
| __m256_internal ret, a, b; | |
| a = __m256_to_internal(__a); | |
| b = __m256_to_internal(__b); | |
| ret.v0 = _mm_hsub_ps(a.v0, b.v0); | |
| ret.v1 = _mm_hsub_ps(a.v1, b.v1); | |
| return __m256_from_internal(ret); | |
| } | |
| static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) | |
| _mm_permutevar_pd(__m128d __a, __m128i __c) { | |
| return (__m128d)wasm_f64x2_make( | |
| ((__f64x2)__a)[(wasm_i64x2_extract_lane(__c, 0) >> 1) & 1], | |
| ((__f64x2)__a)[(wasm_i64x2_extract_lane(__c, 1) >> 1) & 1]); | |
| } | |
| static __inline__ __m256d __attribute__((__always_inline__, __nodebug__)) | |
| _mm256_permutevar_pd(__m256d __a, __m256i __c) { | |
| __m256d_internal ret, a; | |
| __m256i_internal c; | |
| a = __m256d_to_internal(__a); | |
| c = __m256i_to_internal(__c); | |
| ret.v0 = _mm_permutevar_pd(a.v0, c.v0); | |
| ret.v1 = _mm_permutevar_pd(a.v1, c.v1); | |
| return __m256d_from_internal(ret); | |
| } | |
| static __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) | |
| _mm_permutevar_ps(__m128 __a, __m128i __c) { | |
| return (__m128)wasm_f32x4_make( | |
| ((__f32x4)__a)[wasm_i32x4_extract_lane(__c, 0) & 3], | |
| ((__f32x4)__a)[wasm_i32x4_extract_lane(__c, 1) & 3], | |
| ((__f32x4)__a)[wasm_i32x4_extract_lane(__c, 2) & 3], | |
| ((__f32x4)__a)[wasm_i32x4_extract_lane(__c, 3) & 3]); | |
| } | |
| static __inline__ __m256 __attribute__((__always_inline__, __nodebug__)) | |
| _mm256_permutevar_ps(__m256 __a, __m256i __c) { | |
| __m256_internal ret, a; | |
| __m256i_internal c; | |
| a = __m256_to_internal(__a); | |
| c = __m256i_to_internal(__c); | |
| ret.v0 = _mm_permutevar_ps(a.v0, c.v0); | |
| ret.v1 = _mm_permutevar_ps(a.v1, c.v1); | |
| return __m256_from_internal(ret); | |
| } | |
| static __inline__ __m128d | |
| __avx_select4d(__m256d __a, __m256d __b, const int imm8) { | |
| __m256d_internal a, b; | |
| a = __m256d_to_internal(__a); | |
| b = __m256d_to_internal(__b); | |
| switch (imm8 & 0xF) { | |
| case 0: | |
| case 4: | |
| return a.v0; | |
| case 1: | |
| case 5: | |
| return a.v1; | |
| case 2: | |
| case 6: | |
| return b.v0; | |
| case 3: | |
| case 7: | |
| return b.v1; | |
| default: | |
| return (__m128d)wasm_i64x2_const_splat(0); | |
| } | |
| } | |
| static __inline__ __m128 __avx_select4(__m256 __a, __m256 __b, const int imm8) { | |
| __m256_internal a, b; | |
| a = __m256_to_internal(__a); | |
| b = __m256_to_internal(__b); | |
| switch (imm8 & 0xF) { | |
| case 0: | |
| case 4: | |
| return a.v0; | |
| case 1: | |
| case 5: | |
| return a.v1; | |
| case 2: | |
| case 6: | |
| return b.v0; | |
| case 3: | |
| case 7: | |
| return b.v1; | |
| default: | |
| return (__m128)wasm_i64x2_const_splat(0); | |
| } | |
| } | |
| static __inline__ __m128i | |
| __avx_select4i(__m256i __a, __m256i __b, const int imm8) { | |
| __m256i_internal a, b; | |
| a = __m256i_to_internal(__a); | |
| b = __m256i_to_internal(__b); | |
| switch (imm8 & 0xF) { | |
| case 0: | |
| case 4: | |
| return a.v0; | |
| case 1: | |
| case 5: | |
| return a.v1; | |
| case 2: | |
| case 6: | |
| return b.v0; | |
| case 3: | |
| case 7: | |
| return b.v1; | |
| default: | |
| return wasm_i64x2_const_splat(0); | |
| } | |
| } | |
| static __inline__ __m256d __attribute__((__always_inline__, __nodebug__)) | |
| _mm256_permute2f128_pd(__m256d __a, __m256d __b, const int imm8) { | |
| __m256d_internal ret; | |
| ret.v0 = __avx_select4d(__a, __b, imm8); | |
| ret.v1 = __avx_select4d(__a, __b, imm8 >> 4); | |
| return __m256d_from_internal(ret); | |
| } | |
| static __inline__ __m256 __attribute__((__always_inline__, __nodebug__)) | |
| _mm256_permute2f128_ps(__m256 __a, __m256 __b, const int imm8) { | |
| __m256_internal ret; | |
| ret.v0 = __avx_select4(__a, __b, imm8); | |
| ret.v1 = __avx_select4(__a, __b, imm8 >> 4); | |
| return __m256_from_internal(ret); | |
| } | |
| static __inline__ __m256i __attribute__((__always_inline__, __nodebug__)) | |
| _mm256_permute2f128_si256(__m256i __a, __m256i __b, const int imm8) { | |
| __m256i_internal ret; | |
| ret.v0 = __avx_select4i(__a, __b, imm8); | |
| ret.v1 = __avx_select4i(__a, __b, imm8 >> 4); | |
| return __m256i_from_internal(ret); | |
| } | |
| static __inline__ __m256d __attribute__((__always_inline__, __nodebug__)) | |
| _mm256_blendv_pd(__m256d __a, __m256d __b, __m256d __c) { | |
| __m256d_internal ret, a, b, c; | |
| a = __m256d_to_internal(__a); | |
| b = __m256d_to_internal(__b); | |
| c = __m256d_to_internal(__c); | |
| ret.v0 = _mm_blendv_pd(a.v0, b.v0, c.v0); | |
| ret.v1 = _mm_blendv_pd(a.v1, b.v1, c.v1); | |
| return __m256d_from_internal(ret); | |
| } | |
| static __inline__ __m256 __attribute__((__always_inline__, __nodebug__)) | |
| _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c) { | |
| __m256_internal ret, a, b, c; | |
| a = __m256_to_internal(__a); | |
| b = __m256_to_internal(__b); | |
| c = __m256_to_internal(__c); | |
| ret.v0 = _mm_blendv_ps(a.v0, b.v0, c.v0); | |
| ret.v1 = _mm_blendv_ps(a.v1, b.v1, c.v1); | |
| return __m256_from_internal(ret); | |
| } | |
| static __inline__ __m256d __attribute__((__always_inline__, __nodebug__)) | |
| _mm256_cmp_pd(__m256d __a, __m256d __b, const int imm8) { | |
| __m256d_internal ret, a, b; | |
| a = __m256d_to_internal(__a); | |
| b = __m256d_to_internal(__b); | |
| ret.v0 = _mm_cmp_pd(a.v0, b.v0, imm8); | |
| ret.v1 = _mm_cmp_pd(a.v1, b.v1, imm8); | |
| return __m256d_from_internal(ret); | |
| } | |
| static __inline__ __m256 __attribute__((__always_inline__, __nodebug__)) | |
| _mm256_cmp_ps(__m256 __a, __m256 __b, const int imm8) { | |
| __m256_internal ret, a, b; | |
| a = __m256_to_internal(__a); | |
| b = __m256_to_internal(__b); | |
| ret.v0 = _mm_cmp_ps(a.v0, b.v0, imm8); | |
| ret.v1 = _mm_cmp_ps(a.v1, b.v1, imm8); | |
| return __m256_from_internal(ret); | |
| } | |
| static __inline__ __m256d __attribute__((__always_inline__, __nodebug__)) | |
| _mm256_cvtepi32_pd(__m128i __a) { | |
| __m256d_internal ret; | |
| ret.v0 = _mm_cvtepi32_pd(__a); | |
| __m128i __a1 = wasm_i32x4_shuffle(__a, __a, 2, 3, 0, 0); | |
| ret.v1 = _mm_cvtepi32_pd(__a1); | |
| return __m256d_from_internal(ret); | |
| } | |
| static __inline__ __m256 __attribute__((__always_inline__, __nodebug__)) | |
| _mm256_cvtepi32_ps(__m256i __a) { | |
| __m256_internal ret; | |
| __m256i_internal a = __m256i_to_internal(__a); | |
| ret.v0 = _mm_cvtepi32_ps(a.v0); | |
| ret.v1 = _mm_cvtepi32_ps(a.v1); | |
| return __m256_from_internal(ret); | |
| } | |
| static __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) | |
| _mm256_cvtpd_ps(__m256d __a) { | |
| __m256d_internal a = __m256d_to_internal(__a); | |
| __m128 low = _mm_cvtpd_ps(a.v0); | |
| __m128 high = _mm_cvtpd_ps(a.v1); | |
| __m128 ret = (__m128)wasm_i32x4_shuffle(low, high, 0, 1, 4, 5); | |
| return ret; | |
| } | |
| static __inline__ __m256i __attribute__((__always_inline__, __nodebug__)) | |
| _mm256_cvtps_epi32(__m256 __a) { | |
| __m256i_internal ret; | |
| __m256_internal a = __m256_to_internal(__a); | |
| ret.v0 = _mm_cvtps_epi32(a.v0); | |
| ret.v1 = _mm_cvtps_epi32(a.v1); | |
| return __m256i_from_internal(ret); | |
| } | |
| static __inline__ __m256d __attribute__((__always_inline__, __nodebug__)) | |
| _mm256_cvtps_pd(__m128 __a) { | |
| __m256d_internal ret; | |
| ret.v0 = _mm_cvtps_pd(__a); | |
| __m128 __a1 = (__m128)wasm_i32x4_shuffle(__a, __a, 2, 3, 0, 0); | |
| ret.v1 = _mm_cvtps_pd(__a1); | |
| return __m256d_from_internal(ret); | |
| } | |
| static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) | |
| _mm256_cvttpd_epi32(__m256d __a) { | |
| __m256d_internal a = __m256d_to_internal(__a); | |
| __m128i low = _mm_cvttpd_epi32(a.v0); | |
| __m128i high = _mm_cvttpd_epi32(a.v1); | |
| __m128i ret = wasm_i32x4_shuffle(low, high, 0, 1, 4, 5); | |
| return ret; | |
| } | |
| static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) | |
| _mm256_cvtpd_epi32(__m256d __a) { | |
| __m256d_internal a = __m256d_to_internal(__a); | |
| __m128i low = _mm_cvtpd_epi32(a.v0); | |
| __m128i high = _mm_cvtpd_epi32(a.v1); | |
| __m128i ret = wasm_i32x4_shuffle(low, high, 0, 1, 4, 5); | |
| return ret; | |
| } | |
| static __inline__ __m256i __attribute__((__always_inline__, __nodebug__)) | |
| _mm256_cvttps_epi32(__m256 __a) { | |
| __m256i_internal ret; | |
| __m256_internal a = __m256_to_internal(__a); | |
| ret.v0 = _mm_cvttps_epi32(a.v0); | |
| ret.v1 = _mm_cvttps_epi32(a.v1); | |
| return __m256i_from_internal(ret); | |
| } | |
| static __inline__ double __attribute__((__always_inline__, __nodebug__)) | |
| _mm256_cvtsd_f64(__m256d __a) { | |
| __m256d_internal a = __m256d_to_internal(__a); | |
| return _mm_cvtsd_f64(a.v0); | |
| } | |
| static __inline__ int __attribute__((__always_inline__, __nodebug__)) | |
| _mm256_cvtsi256_si32(__m256i __a) { | |
| __m256i_internal a = __m256i_to_internal(__a); | |
| return _mm_cvtsi128_si32(a.v0); | |
| } | |
| static __inline__ float __attribute__((__always_inline__, __nodebug__)) | |
| _mm256_cvtss_f32(__m256 __a) { | |
| __m256_internal a = __m256_to_internal(__a); | |
| return _mm_cvtss_f32(a.v0); | |
| } | |
| static __inline__ __m256 __attribute__((__always_inline__, __nodebug__)) | |
| _mm256_movehdup_ps(__m256 __a) { | |
| __m256_internal ret, a; | |
| a = __m256_to_internal(__a); | |
| ret.v0 = _mm_movehdup_ps(a.v0); | |
| ret.v1 = _mm_movehdup_ps(a.v1); | |
| return __m256_from_internal(ret); | |
| } | |
| static __inline__ __m256 __attribute__((__always_inline__, __nodebug__)) | |
| _mm256_moveldup_ps(__m256 __a) { | |
| __m256_internal ret, a; | |
| a = __m256_to_internal(__a); | |
| ret.v0 = _mm_moveldup_ps(a.v0); | |
| ret.v1 = _mm_moveldup_ps(a.v1); | |
| return __m256_from_internal(ret); | |
| } | |
| static __inline__ __m256d __attribute__((__always_inline__, __nodebug__)) | |
| _mm256_movedup_pd(__m256d __a) { | |
| __m256d_internal ret, a; | |
| a = __m256d_to_internal(__a); | |
| ret.v0 = _mm_movedup_pd(a.v0); | |
| ret.v1 = _mm_movedup_pd(a.v1); | |
| return __m256d_from_internal(ret); | |
| } | |
| static __inline__ __m256d __attribute__((__always_inline__, __nodebug__)) | |
| _mm256_unpackhi_pd(__m256d __a, __m256d __b) { | |
| __m256d_internal ret, a, b; | |
| a = __m256d_to_internal(__a); | |
| b = __m256d_to_internal(__b); | |
| ret.v0 = _mm_unpackhi_pd(a.v0, b.v0); | |
| ret.v1 = _mm_unpackhi_pd(a.v1, b.v1); | |
| return __m256d_from_internal(ret); | |
| } | |
| static __inline__ __m256d __attribute__((__always_inline__, __nodebug__)) | |
| _mm256_unpacklo_pd(__m256d __a, __m256d __b) { | |
| __m256d_internal ret, a, b; | |
| a = __m256d_to_internal(__a); | |
| b = __m256d_to_internal(__b); | |
| ret.v0 = _mm_unpacklo_pd(a.v0, b.v0); | |
| ret.v1 = _mm_unpacklo_pd(a.v1, b.v1); | |
| return __m256d_from_internal(ret); | |
| } | |
| static __inline__ __m256 __attribute__((__always_inline__, __nodebug__)) | |
| _mm256_unpackhi_ps(__m256 __a, __m256 __b) { | |
| __m256_internal ret, a, b; | |
| a = __m256_to_internal(__a); | |
| b = __m256_to_internal(__b); | |
| ret.v0 = _mm_unpackhi_ps(a.v0, b.v0); | |
| ret.v1 = _mm_unpackhi_ps(a.v1, b.v1); | |
| return __m256_from_internal(ret); | |
| } | |
| static __inline__ __m256 __attribute__((__always_inline__, __nodebug__)) | |
| _mm256_unpacklo_ps(__m256 __a, __m256 __b) { | |
| __m256_internal ret, a, b; | |
| a = __m256_to_internal(__a); | |
| b = __m256_to_internal(__b); | |
| ret.v0 = _mm_unpacklo_ps(a.v0, b.v0); | |
| ret.v1 = _mm_unpacklo_ps(a.v1, b.v1); | |
| return __m256_from_internal(ret); | |
| } | |
| static __inline__ int __attribute__((__always_inline__, __nodebug__)) | |
| _mm_testz_pd(__m128d __a, __m128d __b) { | |
| v128_t __m = | |
| wasm_u64x2_shr(wasm_v128_not(wasm_v128_and((v128_t)__a, (v128_t)__b)), 63); | |
| return wasm_i64x2_extract_lane(__m, 0) & wasm_i64x2_extract_lane(__m, 1); | |
| } | |
| static __inline__ int __attribute__((__always_inline__, __nodebug__)) | |
| _mm_testc_pd(__m128d __a, __m128d __b) { | |
| v128_t __m = | |
| wasm_u64x2_shr(wasm_v128_or(wasm_v128_not((v128_t)__b), (v128_t)__a), 63); | |
| return wasm_i64x2_extract_lane(__m, 0) & wasm_i64x2_extract_lane(__m, 1); | |
| } | |
| static __inline__ int __attribute__((__always_inline__, __nodebug__)) | |
| _mm_testnzc_pd(__m128d __a, __m128d __b) { | |
| v128_t __m = wasm_u64x2_shr(wasm_v128_and((v128_t)__a, (v128_t)__b), 63); | |
| v128_t __m2 = wasm_u64x2_shr(wasm_v128_andnot((v128_t)__b, (v128_t)__a), 63); | |
| return (wasm_i64x2_extract_lane(__m, 0) | wasm_i64x2_extract_lane(__m, 1)) & | |
| (wasm_i64x2_extract_lane(__m2, 0) | wasm_i64x2_extract_lane(__m2, 1)); | |
| } | |
| static __inline__ int __attribute__((__always_inline__, __nodebug__)) | |
| _mm_testz_ps(__m128 __a, __m128 __b) { | |
| v128_t __m = | |
| wasm_u32x4_shr(wasm_v128_not(wasm_v128_and((v128_t)__a, (v128_t)__b)), 31); | |
| __m = wasm_v128_and(__m, (v128_t)_mm_movehl_ps((__m128)__m, (__m128)__m)); | |
| __m = wasm_v128_and(__m, _mm_shuffle_epi32(__m, _MM_SHUFFLE(3, 2, 0, 1))); | |
| return wasm_i32x4_extract_lane(__m, 0); | |
| } | |
| static __inline__ int __attribute__((__always_inline__, __nodebug__)) | |
| _mm_testc_ps(__m128 __a, __m128 __b) { | |
| v128_t __m = | |
| wasm_u32x4_shr(wasm_v128_or(wasm_v128_not((v128_t)__b), (v128_t)__a), 31); | |
| __m = wasm_v128_and(__m, (v128_t)_mm_movehl_ps((__m128)__m, (__m128)__m)); | |
| __m = wasm_v128_and(__m, _mm_shuffle_epi32(__m, _MM_SHUFFLE(3, 2, 0, 1))); | |
| return wasm_i32x4_extract_lane(__m, 0); | |
| } | |
| static __inline__ int __attribute__((__always_inline__, __nodebug__)) | |
| _mm_testnzc_ps(__m128 __a, __m128 __b) { | |
| v128_t __m = wasm_u32x4_shr(wasm_v128_and((v128_t)__a, (v128_t)__b), 31); | |
| v128_t __m2 = wasm_u32x4_shr(wasm_v128_andnot((v128_t)__b, (v128_t)__a), 31); | |
| __m = wasm_v128_or(__m, (v128_t)_mm_movehl_ps((__m128)__m, (__m128)__m)); | |
| __m2 = wasm_v128_or(__m2, (v128_t)_mm_movehl_ps((__m128)__m2, (__m128)__m2)); | |
| __m = wasm_v128_or(__m, _mm_shuffle_epi32(__m, _MM_SHUFFLE(3, 2, 0, 1))); | |
| __m2 = wasm_v128_or(__m2, _mm_shuffle_epi32(__m2, _MM_SHUFFLE(3, 2, 0, 1))); | |
| return wasm_i32x4_extract_lane(__m, 0) & wasm_i32x4_extract_lane(__m2, 0); | |
| } | |
| static __inline__ int __attribute__((__always_inline__, __nodebug__)) | |
| _mm256_testz_pd(__m256d __a, __m256d __b) { | |
| __m256d_internal a, b; | |
| a = __m256d_to_internal(__a); | |
| b = __m256d_to_internal(__b); | |
| return _mm_testz_pd(a.v0, b.v0) & _mm_testz_pd(a.v1, b.v1); | |
| } | |
| static __inline__ int __attribute__((__always_inline__, __nodebug__)) | |
| _mm256_testc_pd(__m256d __a, __m256d __b) { | |
| __m256d_internal a, b; | |
| a = __m256d_to_internal(__a); | |
| b = __m256d_to_internal(__b); | |
| return _mm_testc_pd(a.v0, b.v0) & _mm_testc_pd(a.v1, b.v1); | |
| } | |
| static __inline__ int __attribute__((__always_inline__, __nodebug__)) | |
| _mm256_testnzc_pd(__m256d __a, __m256d __b) { | |
| __m256d_internal a, b; | |
| a = __m256d_to_internal(__a); | |
| b = __m256d_to_internal(__b); | |
| v128_t __m = | |
| wasm_u64x2_shr(wasm_v128_and((v128_t)a.v0, (v128_t)b.v0), 63); | |
| v128_t __m1 = | |
| wasm_u64x2_shr(wasm_v128_and((v128_t)a.v1, (v128_t)b.v1), 63); | |
| v128_t __m2 = | |
| wasm_u64x2_shr(wasm_v128_andnot((v128_t)b.v0, (v128_t)a.v0), 63); | |
| v128_t __m3 = | |
| wasm_u64x2_shr(wasm_v128_andnot((v128_t)b.v1, (v128_t)a.v1), 63); | |
| return wasm_v128_any_true(wasm_v128_or(__m, __m1)) & | |
| wasm_v128_any_true(wasm_v128_or(__m2, __m3)); | |
| } | |
| static __inline__ int __attribute__((__always_inline__, __nodebug__)) | |
| _mm256_testz_ps(__m256 __a, __m256 __b) { | |
| __m256_internal a, b; | |
| a = __m256_to_internal(__a); | |
| b = __m256_to_internal(__b); | |
| return _mm_testz_ps(a.v0, b.v0) & _mm_testz_ps(a.v1, b.v1); | |
| } | |
| static __inline__ int __attribute__((__always_inline__, __nodebug__)) | |
| _mm256_testc_ps(__m256 __a, __m256 __b) { | |
| __m256_internal a, b; | |
| a = __m256_to_internal(__a); | |
| b = __m256_to_internal(__b); | |
| return _mm_testc_ps(a.v0, b.v0) & _mm_testc_ps(a.v1, b.v1); | |
| } | |
| static __inline__ int __attribute__((__always_inline__, __nodebug__)) | |
| _mm256_testnzc_ps(__m256 __a, __m256 __b) { | |
| __m256_internal a, b; | |
| a = __m256_to_internal(__a); | |
| b = __m256_to_internal(__b); | |
| v128_t __m = | |
| wasm_u32x4_shr(wasm_v128_and((v128_t)a.v0, (v128_t)b.v0), 31); | |
| v128_t __m1 = | |
| wasm_u32x4_shr(wasm_v128_and((v128_t)a.v1, (v128_t)b.v1), 31); | |
| v128_t __m2 = | |
| wasm_u32x4_shr(wasm_v128_andnot((v128_t)b.v0, (v128_t)a.v0), 31); | |
| v128_t __m3 = | |
| wasm_u32x4_shr(wasm_v128_andnot((v128_t)b.v1, (v128_t)a.v1), 31); | |
| return wasm_v128_any_true(wasm_v128_or(__m, __m1)) & | |
| wasm_v128_any_true(wasm_v128_or(__m2, __m3)); | |
| } | |
| static __inline__ int __attribute__((__always_inline__, __nodebug__)) | |
| _mm256_testz_si256(__m256i __a, __m256i __b) { | |
| __m256i_internal a, b; | |
| a = __m256i_to_internal(__a); | |
| b = __m256i_to_internal(__b); | |
| return _mm_testz_si128(a.v0, b.v0) & _mm_testz_si128(a.v1, b.v1); | |
| } | |
| static __inline__ int __attribute__((__always_inline__, __nodebug__)) | |
| _mm256_testc_si256(__m256i __a, __m256i __b) { | |
| __m256i_internal a, b; | |
| a = __m256i_to_internal(__a); | |
| b = __m256i_to_internal(__b); | |
| return _mm_testc_si128(a.v0, b.v0) & _mm_testc_si128(a.v1, b.v1); | |
| } | |
| static __inline__ int __attribute__((__always_inline__, __nodebug__)) | |
| _mm256_testnzc_si256(__m256i __a, __m256i __b) { | |
| __m256i_internal a, b; | |
| a = __m256i_to_internal(__a); | |
| b = __m256i_to_internal(__b); | |
| v128_t __m = wasm_v128_and(a.v0, b.v0); | |
| v128_t __m1 = wasm_v128_and(a.v1, b.v1); | |
| v128_t __m2 = wasm_v128_andnot(b.v0, a.v0); | |
| v128_t __m3 = wasm_v128_andnot(b.v1, a.v1); | |
| return wasm_v128_any_true(wasm_v128_or(__m, __m1)) & | |
| wasm_v128_any_true(wasm_v128_or(__m2, __m3)); | |
| } | |
| static __inline__ int __attribute__((__always_inline__, __nodebug__)) | |
| _mm256_movemask_pd(__m256d __a) { | |
| __m256d_internal a = __m256d_to_internal(__a); | |
| return _mm_movemask_pd(a.v0) | (_mm_movemask_pd(a.v1) << 2); | |
| } | |
| static __inline__ int __attribute__((__always_inline__, __nodebug__)) | |
| _mm256_movemask_ps(__m256 __a) { | |
| __m256_internal a = __m256_to_internal(__a); | |
| return _mm_movemask_ps(a.v0) | (_mm_movemask_ps(a.v1) << 4); | |
| } | |
// Intentionally a no-op.
static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm256_zeroall(void) {
  // Nothing to do here. Any hand-written assembly that would surround calls
  // to this function cannot be compiled when porting in the first place, so
  // there is no register state that needs clearing.
}
// Intentionally a no-op.
// These calls normally surround hand-written assembly; when porting, such
// assembly would not compile in the first place, so there is nothing to do.
static __inline__ void __attribute__((__always_inline__, __nodebug__))
_mm256_zeroupper(void) {
}
| static __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) | |
| _mm_broadcast_ss(float const* __a) { | |
| return (__m128)wasm_v128_load32_splat(__a); | |
| } | |
| static __inline__ __m256d __attribute__((__always_inline__, __nodebug__)) | |
| _mm256_broadcast_sd(double const* __a) { | |
| __m256d_internal ret; | |
| ret.v1 = ret.v0 = (__m128d)wasm_v128_load64_splat(__a); | |
| return __m256d_from_internal(ret); | |
| } | |
| static __inline__ __m256 __attribute__((__always_inline__, __nodebug__)) | |
| _mm256_broadcast_ss(float const* __a) { | |
| __m256_internal ret; | |
| ret.v1 = ret.v0 = _mm_broadcast_ss(__a); | |
| return __m256_from_internal(ret); | |
| } | |
| static __inline__ __m256d __attribute__((__always_inline__, __nodebug__)) | |
| _mm256_broadcast_pd(__m128d const* __a) { | |
| __m256d_internal ret; | |
| ret.v1 = ret.v0 = (__m128d)wasm_v128_load(__a); | |
| return __m256d_from_internal(ret); | |
| } | |
| static __inline__ __m256 __attribute__((__always_inline__, __nodebug__)) | |
| _mm256_broadcast_ps(__m128 const* __a) { | |
| __m256_internal ret; | |
| ret.v1 = ret.v0 = (__m128)wasm_v128_load(__a); | |
| return __m256_from_internal(ret); | |
| } | |
| static __inline__ __m256d __attribute__((__always_inline__, __nodebug__)) | |
| _mm256_load_pd(double const* __p) { | |
| __m256d_internal ret; | |
| ret.v0 = _mm_load_pd(__p); | |
| ret.v1 = _mm_load_pd(__p + 2); | |
| return __m256d_from_internal(ret); | |
| } | |
| static __inline__ __m256 __attribute__((__always_inline__, __nodebug__)) | |
| _mm256_load_ps(float const* __p) { | |
| __m256_internal ret; | |
| ret.v0 = _mm_load_ps(__p); | |
| ret.v1 = _mm_load_ps(__p + 4); | |
| return __m256_from_internal(ret); | |
| } | |
| static __inline__ __m256d __attribute__((__always_inline__, __nodebug__)) | |
| _mm256_loadu_pd(double const* __p) { | |
| __m256d_internal ret; | |
| ret.v0 = _mm_loadu_pd(__p); | |
| ret.v1 = _mm_loadu_pd(__p + 2); | |
| return __m256d_from_internal(ret); | |
| } | |
| static __inline__ __m256 __attribute__((__always_inline__, __nodebug__)) | |
| _mm256_loadu_ps(float const* __p) { | |
| __m256_internal ret; | |
| ret.v0 = _mm_loadu_ps(__p); | |
| ret.v1 = _mm_loadu_ps(__p + 4); | |
| return __m256_from_internal(ret); | |
| } | |
| static __inline__ __m256i __attribute__((__always_inline__, __nodebug__)) | |
| _mm256_load_si256(__m256i const* __p) { | |
| __m256i_internal ret; | |
| ret.v0 = _mm_load_si128((__m128i const*)__p); | |
| ret.v1 = _mm_load_si128(((__m128i const*)__p) + 1); | |
| return __m256i_from_internal(ret); | |
| } | |
| static __inline__ __m256i __attribute__((__always_inline__, __nodebug__)) | |
| _mm256_loadu_si256(__m256i_u const* __p) { | |
| __m256i_internal ret; | |
| ret.v0 = _mm_loadu_si128((__m128i const*)__p); | |
| ret.v1 = _mm_loadu_si128(((__m128i const*)__p) + 1); | |
| return __m256i_from_internal(ret); | |
| } | |
| static __inline__ __m256i __attribute__((__always_inline__, __nodebug__)) | |
| _mm256_lddqu_si256(__m256i_u const* __p) { | |
| __m256i_internal ret; | |
| ret.v0 = _mm_lddqu_si128((__m128i const*)__p); | |
| ret.v1 = _mm_lddqu_si128(((__m128i const*)__p) + 1); | |
| return __m256i_from_internal(ret); | |
| } | |
| static __inline__ void __attribute__((__always_inline__, __nodebug__)) | |
| _mm256_store_pd(double* __p, __m256d __a) { | |
| __m256d_internal a = __m256d_to_internal(__a); | |
| _mm_store_pd(__p, a.v0); | |
| _mm_store_pd(__p + 2, a.v1); | |
| } | |
| static __inline__ void __attribute__((__always_inline__, __nodebug__)) | |
| _mm256_store_ps(float* __p, __m256 __a) { | |
| __m256_internal a = __m256_to_internal(__a); | |
| _mm_store_ps(__p, a.v0); | |
| _mm_store_ps(__p + 4, a.v1); | |
| } | |
| static __inline__ void __attribute__((__always_inline__, __nodebug__)) | |
| _mm256_storeu_pd(double* __p, __m256d __a) { | |
| __m256d_internal a = __m256d_to_internal(__a); | |
| _mm_storeu_pd(__p, a.v0); | |
| _mm_storeu_pd(__p + 2, a.v1); | |
| } | |
| static __inline__ void __attribute__((__always_inline__, __nodebug__)) | |
| _mm256_storeu_ps(float* __p, __m256 __a) { | |
| __m256_internal a = __m256_to_internal(__a); | |
| _mm_storeu_ps(__p, a.v0); | |
| _mm_storeu_ps(__p + 4, a.v1); | |
| } | |
| static __inline__ void __attribute__((__always_inline__, __nodebug__)) | |
| _mm256_store_si256(__m256i* __p, __m256i __a) { | |
| __m256i_internal a = __m256i_to_internal(__a); | |
| _mm_store_si128((__m128i*)__p, a.v0); | |
| _mm_store_si128(((__m128i*)__p) + 1, a.v1); | |
| } | |
| static __inline__ void __attribute__((__always_inline__, __nodebug__)) | |
| _mm256_storeu_si256(__m256i_u* __p, __m256i __a) { | |
| __m256i_internal a = __m256i_to_internal(__a); | |
| _mm_storeu_si128((__m128i*)__p, a.v0); | |
| _mm_storeu_si128(((__m128i*)__p) + 1, a.v1); | |
| } | |
| static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) | |
| _mm_maskload_pd(double const* __p, __m128i __m) { | |
| // This may cause an out-of-bounds memory load since we first load and | |
| // then mask, but since there are no segmentation faults in Wasm memory | |
| // accesses, that is ok (as long as we are within the heap bounds - | |
| // a negligible limitation in practice) | |
| return _mm_and_pd(_mm_load_pd(__p), (__m128d)wasm_i64x2_shr(__m, 63)); | |
| } | |
| static __inline__ __m256d __attribute__((__always_inline__, __nodebug__)) | |
| _mm256_maskload_pd(double const* __p, __m256i __m) { | |
| __m256d_internal ret; | |
| __m256i_internal m = __m256i_to_internal(__m); | |
| ret.v0 = _mm_maskload_pd(__p, m.v0); | |
| ret.v1 = _mm_maskload_pd(__p + 2, m.v1); | |
| return __m256d_from_internal(ret); | |
| } | |
| static __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) | |
| _mm_maskload_ps(float const* __p, __m128i __m) { | |
| // This may cause an out-of-bounds memory load since we first load and | |
| // then mask, but since there are no segmentation faults in Wasm memory | |
| // accesses, that is ok (as long as we are within the heap bounds - | |
| // a negligible limitation in practice) | |
| return _mm_and_ps(_mm_load_ps(__p), (__m128)_mm_srai_epi32(__m, 31)); | |
| } | |
| static __inline__ __m256 __attribute__((__always_inline__, __nodebug__)) | |
| _mm256_maskload_ps(float const* __p, __m256i __m) { | |
| __m256_internal ret; | |
| __m256i_internal m = __m256i_to_internal(__m); | |
| ret.v0 = _mm_maskload_ps(__p, m.v0); | |
| ret.v1 = _mm_maskload_ps(__p + 4, m.v1); | |
| return __m256_from_internal(ret); | |
| } | |
| static __inline__ void | |
| __attribute__((__always_inline__, __nodebug__, DIAGNOSE_SLOW)) | |
| _mm_maskstore_ps(float* __p, __m128i __m, __m128 __a) { | |
| if ((wasm_i32x4_extract_lane(__m, 0) & 0x80000000ull) != 0) | |
| __p[0] = wasm_f32x4_extract_lane((v128_t)__a, 0); | |
| if ((wasm_i32x4_extract_lane(__m, 1) & 0x80000000ull) != 0) | |
| __p[1] = wasm_f32x4_extract_lane((v128_t)__a, 1); | |
| if ((wasm_i32x4_extract_lane(__m, 2) & 0x80000000ull) != 0) | |
| __p[2] = wasm_f32x4_extract_lane((v128_t)__a, 2); | |
| if ((wasm_i32x4_extract_lane(__m, 3) & 0x80000000ull) != 0) | |
| __p[3] = wasm_f32x4_extract_lane((v128_t)__a, 3); | |
| } | |
| static __inline__ void __attribute__((__always_inline__, __nodebug__)) | |
| _mm256_maskstore_ps(float* __p, __m256i __m, __m256 __a) { | |
| __m256_internal a = __m256_to_internal(__a); | |
| __m256i_internal m = __m256i_to_internal(__m); | |
| _mm_maskstore_ps(__p, m.v0, a.v0); | |
| _mm_maskstore_ps(__p + 4, m.v1, a.v1); | |
| } | |
| static __inline__ void | |
| __attribute__((__always_inline__, __nodebug__, DIAGNOSE_SLOW)) | |
| _mm_maskstore_pd(double* __p, __m128i __m, __m128d __a) { | |
| if ((wasm_i64x2_extract_lane(__m, 0) & 0x8000000000000000ull) != 0) | |
| __p[0] = wasm_f64x2_extract_lane((v128_t)__a, 0); | |
| if ((wasm_i64x2_extract_lane(__m, 1) & 0x8000000000000000ull) != 0) | |
| __p[1] = wasm_f64x2_extract_lane((v128_t)__a, 1); | |
| } | |
| static __inline__ void __attribute__((__always_inline__, __nodebug__)) | |
| _mm256_maskstore_pd(double* __p, __m256i __m, __m256d __a) { | |
| __m256i_internal m = __m256i_to_internal(__m); | |
| __m256d_internal a = __m256d_to_internal(__a); | |
| _mm_maskstore_pd(__p, m.v0, a.v0); | |
| _mm_maskstore_pd(__p + 2, m.v1, a.v1); | |
| } | |
| static __inline__ void __attribute__((__always_inline__, __nodebug__)) | |
| _mm256_stream_si256(void* __a, __m256i __b) { | |
| __m256i_internal b = __m256i_to_internal(__b); | |
| _mm_stream_si128((__m128i*)__a, b.v0); | |
| _mm_stream_si128(((__m128i*)__a) + 1, b.v1); | |
| } | |
| static __inline__ void __attribute__((__always_inline__, __nodebug__)) | |
| _mm256_stream_pd(void* __a, __m256d __b) { | |
| __m256d_internal b = __m256d_to_internal(__b); | |
| _mm_stream_pd((double*)__a, b.v0); | |
| _mm_stream_pd(((double*)__a) + 2, b.v1); | |
| } | |
| static __inline__ void __attribute__((__always_inline__, __nodebug__)) | |
| _mm256_stream_ps(void* __p, __m256 __a) { | |
| __m256_internal a = __m256_to_internal(__a); | |
| _mm_stream_ps((float*)__p, a.v0); | |
| _mm_stream_ps(((float*)__p) + 4, a.v1); | |
| } | |
| static __inline__ __m256d __attribute__((__always_inline__, __nodebug__)) | |
| _mm256_undefined_pd(void) { | |
| __m256d val; | |
| return val; | |
| } | |
| static __inline__ __m256 __attribute__((__always_inline__, __nodebug__)) | |
| _mm256_undefined_ps(void) { | |
| __m256 val; | |
| return val; | |
| } | |
| static __inline__ __m256i __attribute__((__always_inline__, __nodebug__)) | |
| _mm256_undefined_si256(void) { | |
| __m256i val; | |
| return val; | |
| } | |
| static __inline__ __m256d __attribute__((__always_inline__, __nodebug__)) | |
| _mm256_set_pd(double __a, double __b, double __c, double __d) { | |
| __m256d_internal ret; | |
| ret.v0 = _mm_set_pd(__c, __d); | |
| ret.v1 = _mm_set_pd(__a, __b); | |
| return __m256d_from_internal(ret); | |
| } | |
| static __inline__ __m256 __attribute__((__always_inline__, __nodebug__)) | |
| _mm256_set_ps(float __a, | |
| float __b, | |
| float __c, | |
| float __d, | |
| float __e, | |
| float __f, | |
| float __g, | |
| float __h) { | |
| __m256_internal ret; | |
| ret.v0 = _mm_set_ps(__e, __f, __g, __h); | |
| ret.v1 = _mm_set_ps(__a, __b, __c, __d); | |
| return __m256_from_internal(ret); | |
| } | |
| static __inline__ __m256i __attribute__((__always_inline__, __nodebug__)) | |
| _mm256_set_epi32(int __i0, | |
| int __i1, | |
| int __i2, | |
| int __i3, | |
| int __i4, | |
| int __i5, | |
| int __i6, | |
| int __i7) { | |
| __m256i_internal ret; | |
| ret.v0 = _mm_set_epi32(__i4, __i5, __i6, __i7); | |
| ret.v1 = _mm_set_epi32(__i0, __i1, __i2, __i3); | |
| return __m256i_from_internal(ret); | |
| } | |
| static __inline__ __m256i __attribute__((__always_inline__, __nodebug__)) | |
| _mm256_set_epi16(short __w15, | |
| short __w14, | |
| short __w13, | |
| short __w12, | |
| short __w11, | |
| short __w10, | |
| short __w09, | |
| short __w08, | |
| short __w07, | |
| short __w06, | |
| short __w05, | |
| short __w04, | |
| short __w03, | |
| short __w02, | |
| short __w01, | |
| short __w00) { | |
| __m256i_internal ret; | |
| ret.v0 = | |
| _mm_set_epi16(__w07, __w06, __w05, __w04, __w03, __w02, __w01, __w00); | |
| ret.v1 = | |
| _mm_set_epi16(__w15, __w14, __w13, __w12, __w11, __w10, __w09, __w08); | |
| return __m256i_from_internal(ret); | |
| } | |
| static __inline__ __m256i __attribute__((__always_inline__, __nodebug__)) | |
| _mm256_set_epi8(char __b31, | |
| char __b30, | |
| char __b29, | |
| char __b28, | |
| char __b27, | |
| char __b26, | |
| char __b25, | |
| char __b24, | |
| char __b23, | |
| char __b22, | |
| char __b21, | |
| char __b20, | |
| char __b19, | |
| char __b18, | |
| char __b17, | |
| char __b16, | |
| char __b15, | |
| char __b14, | |
| char __b13, | |
| char __b12, | |
| char __b11, | |
| char __b10, | |
| char __b09, | |
| char __b08, | |
| char __b07, | |
| char __b06, | |
| char __b05, | |
| char __b04, | |
| char __b03, | |
| char __b02, | |
| char __b01, | |
| char __b00) { | |
| __m256i_internal ret; | |
| ret.v0 = _mm_set_epi8(__b15, | |
| __b14, | |
| __b13, | |
| __b12, | |
| __b11, | |
| __b10, | |
| __b09, | |
| __b08, | |
| __b07, | |
| __b06, | |
| __b05, | |
| __b04, | |
| __b03, | |
| __b02, | |
| __b01, | |
| __b00); | |
| ret.v1 = _mm_set_epi8(__b31, | |
| __b30, | |
| __b29, | |
| __b28, | |
| __b27, | |
| __b26, | |
| __b25, | |
| __b24, | |
| __b23, | |
| __b22, | |
| __b21, | |
| __b20, | |
| __b19, | |
| __b18, | |
| __b17, | |
| __b16); | |
| return __m256i_from_internal(ret); | |
| } | |
| static __inline__ __m256i __attribute__((__always_inline__, __nodebug__)) | |
| _mm256_set_epi64x(long long __a, long long __b, long long __c, long long __d) { | |
| __m256i_internal ret; | |
| ret.v0 = _mm_set_epi64x(__c, __d); | |
| ret.v1 = _mm_set_epi64x(__a, __b); | |
| return __m256i_from_internal(ret); | |
| } | |
| static __inline__ __m256d __attribute__((__always_inline__, __nodebug__)) | |
| _mm256_setr_pd(double __a, double __b, double __c, double __d) { | |
| return _mm256_set_pd(__d, __c, __b, __a); | |
| } | |
| static __inline__ __m256 __attribute__((__always_inline__, __nodebug__)) | |
| _mm256_setr_ps(float __a, | |
| float __b, | |
| float __c, | |
| float __d, | |
| float __e, | |
| float __f, | |
| float __g, | |
| float __h) { | |
| return _mm256_set_ps(__h, __g, __f, __e, __d, __c, __b, __a); | |
| } | |
| static __inline__ __m256i __attribute__((__always_inline__, __nodebug__)) | |
| _mm256_setr_epi32(int __i0, | |
| int __i1, | |
| int __i2, | |
| int __i3, | |
| int __i4, | |
| int __i5, | |
| int __i6, | |
| int __i7) { | |
| return _mm256_set_epi32(__i7, __i6, __i5, __i4, __i3, __i2, __i1, __i0); | |
| } | |
| static __inline__ __m256i __attribute__((__always_inline__, __nodebug__)) | |
| _mm256_setr_epi16(short __w15, | |
| short __w14, | |
| short __w13, | |
| short __w12, | |
| short __w11, | |
| short __w10, | |
| short __w09, | |
| short __w08, | |
| short __w07, | |
| short __w06, | |
| short __w05, | |
| short __w04, | |
| short __w03, | |
| short __w02, | |
| short __w01, | |
| short __w00) { | |
| return _mm256_set_epi16(__w00, | |
| __w01, | |
| __w02, | |
| __w03, | |
| __w04, | |
| __w05, | |
| __w06, | |
| __w07, | |
| __w08, | |
| __w09, | |
| __w10, | |
| __w11, | |
| __w12, | |
| __w13, | |
| __w14, | |
| __w15); | |
| } | |
| static __inline__ __m256i __attribute__((__always_inline__, __nodebug__)) | |
| _mm256_setr_epi8(char __b31, | |
| char __b30, | |
| char __b29, | |
| char __b28, | |
| char __b27, | |
| char __b26, | |
| char __b25, | |
| char __b24, | |
| char __b23, | |
| char __b22, | |
| char __b21, | |
| char __b20, | |
| char __b19, | |
| char __b18, | |
| char __b17, | |
| char __b16, | |
| char __b15, | |
| char __b14, | |
| char __b13, | |
| char __b12, | |
| char __b11, | |
| char __b10, | |
| char __b09, | |
| char __b08, | |
| char __b07, | |
| char __b06, | |
| char __b05, | |
| char __b04, | |
| char __b03, | |
| char __b02, | |
| char __b01, | |
| char __b00) { | |
| return _mm256_set_epi8(__b00, | |
| __b01, | |
| __b02, | |
| __b03, | |
| __b04, | |
| __b05, | |
| __b06, | |
| __b07, | |
| __b08, | |
| __b09, | |
| __b10, | |
| __b11, | |
| __b12, | |
| __b13, | |
| __b14, | |
| __b15, | |
| __b16, | |
| __b17, | |
| __b18, | |
| __b19, | |
| __b20, | |
| __b21, | |
| __b22, | |
| __b23, | |
| __b24, | |
| __b25, | |
| __b26, | |
| __b27, | |
| __b28, | |
| __b29, | |
| __b30, | |
| __b31); | |
| } | |
| static __inline__ __m256i __attribute__((__always_inline__, __nodebug__)) | |
| _mm256_setr_epi64x(long long __a, long long __b, long long __c, long long __d) { | |
| return _mm256_set_epi64x(__d, __c, __b, __a); | |
| } | |
| static __inline__ __m256d __attribute__((__always_inline__, __nodebug__)) | |
| _mm256_set1_pd(double __w) { | |
| __m256d_internal ret; | |
| ret.v1 = ret.v0 = (__m128d)wasm_f64x2_splat(__w); | |
| return __m256d_from_internal(ret); | |
| } | |
| static __inline__ __m256 __attribute__((__always_inline__, __nodebug__)) | |
| _mm256_set1_ps(float __w) { | |
| __m256_internal ret; | |
| ret.v1 = ret.v0 = (__m128)wasm_f32x4_splat(__w); | |
| return __m256_from_internal(ret); | |
| } | |
| static __inline__ __m256i __attribute__((__always_inline__, __nodebug__)) | |
| _mm256_set1_epi32(int __i) { | |
| __m256i_internal ret; | |
| ret.v1 = ret.v0 = wasm_i32x4_splat(__i); | |
| return __m256i_from_internal(ret); | |
| } | |
| static __inline__ __m256i __attribute__((__always_inline__, __nodebug__)) | |
| _mm256_set1_epi16(short __w) { | |
| __m256i_internal ret; | |
| ret.v1 = ret.v0 = wasm_i16x8_splat(__w); | |
| return __m256i_from_internal(ret); | |
| } | |
| static __inline__ __m256i __attribute__((__always_inline__, __nodebug__)) | |
| _mm256_set1_epi8(char __b) { | |
| __m256i_internal ret; | |
| ret.v1 = ret.v0 = wasm_i8x16_splat(__b); | |
| return __m256i_from_internal(ret); | |
| } | |
| static __inline__ __m256i __attribute__((__always_inline__, __nodebug__)) | |
| _mm256_set1_epi64x(long long __q) { | |
| __m256i_internal ret; | |
| ret.v1 = ret.v0 = wasm_i64x2_splat(__q); | |
| return __m256i_from_internal(ret); | |
| } | |
| static __inline__ __m256d __attribute__((__always_inline__, __nodebug__)) | |
| _mm256_setzero_pd(void) { | |
| __m256d_internal ret; | |
| ret.v1 = ret.v0 = _mm_setzero_pd(); | |
| return __m256d_from_internal(ret); | |
| } | |
| static __inline__ __m256 __attribute__((__always_inline__, __nodebug__)) | |
| _mm256_setzero_ps(void) { | |
| __m256_internal ret; | |
| ret.v1 = ret.v0 = _mm_setzero_ps(); | |
| return __m256_from_internal(ret); | |
| } | |
| static __inline__ __m256i __attribute__((__always_inline__, __nodebug__)) | |
| _mm256_setzero_si256(void) { | |
| __m256i_internal ret; | |
| ret.v1 = ret.v0 = _mm_setzero_si128(); | |
| return __m256i_from_internal(ret); | |
| } | |
| static __inline__ __m256 __attribute__((__always_inline__, __nodebug__)) | |
| _mm256_castpd_ps(__m256d __a) { | |
| union __m256_data ret; | |
| ret.double_view = __a; | |
| return ret.float_view; | |
| } | |
| static __inline__ __m256i __attribute__((__always_inline__, __nodebug__)) | |
| _mm256_castpd_si256(__m256d __a) { | |
| union __m256_data ret; | |
| ret.double_view = __a; | |
| return ret.int_view; | |
| } | |
| static __inline__ __m256d __attribute__((__always_inline__, __nodebug__)) | |
| _mm256_castps_pd(__m256 __a) { | |
| union __m256_data ret; | |
| ret.float_view = __a; | |
| return ret.double_view; | |
| } | |
| static __inline__ __m256i __attribute__((__always_inline__, __nodebug__)) | |
| _mm256_castps_si256(__m256 __a) { | |
| union __m256_data ret; | |
| ret.float_view = __a; | |
| return ret.int_view; | |
| } | |
| static __inline__ __m256 __attribute__((__always_inline__, __nodebug__)) | |
| _mm256_castsi256_ps(__m256i __a) { | |
| union __m256_data ret; | |
| ret.int_view = __a; | |
| return ret.float_view; | |
| } | |
| static __inline__ __m256d __attribute__((__always_inline__, __nodebug__)) | |
| _mm256_castsi256_pd(__m256i __a) { | |
| union __m256_data ret; | |
| ret.int_view = __a; | |
| return ret.double_view; | |
| } | |
| static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) | |
| _mm256_castpd256_pd128(__m256d __a) { | |
| __m256d_internal a = __m256d_to_internal(__a); | |
| return a.v0; | |
| } | |
| static __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) | |
| _mm256_castps256_ps128(__m256 __a) { | |
| __m256_internal a = __m256_to_internal(__a); | |
| return a.v0; | |
| } | |
| static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) | |
| _mm256_castsi256_si128(__m256i __a) { | |
| __m256i_internal a = __m256i_to_internal(__a); | |
| return a.v0; | |
| } | |
| static __inline__ __m256d __attribute__((__always_inline__, __nodebug__)) | |
| _mm256_castpd128_pd256(__m128d __a) { | |
| __m256d_internal ret; | |
| ret.v0 = __a; | |
| ret.v1 = _mm_setzero_pd(); | |
| return __m256d_from_internal(ret); | |
| } | |
| static __inline__ __m256 __attribute__((__always_inline__, __nodebug__)) | |
| _mm256_castps128_ps256(__m128 __a) { | |
| __m256_internal ret; | |
| ret.v0 = __a; | |
| ret.v1 = _mm_setzero_ps(); | |
| return __m256_from_internal(ret); | |
| } | |
| static __inline__ __m256i __attribute__((__always_inline__, __nodebug__)) | |
| _mm256_castsi128_si256(__m128i __a) { | |
| __m256i_internal ret; | |
| ret.v0 = __a; | |
| ret.v1 = _mm_setzero_si128(); | |
| return __m256i_from_internal(ret); | |
| } | |
| static __inline__ __m256d __attribute__((__always_inline__, __nodebug__)) | |
| _mm256_zextpd128_pd256(__m128d __a) { | |
| __m256d_internal ret; | |
| ret.v0 = __a; | |
| ret.v1 = _mm_setzero_pd(); | |
| return __m256d_from_internal(ret); | |
| } | |
| static __inline__ __m256 __attribute__((__always_inline__, __nodebug__)) | |
| _mm256_zextps128_ps256(__m128 __a) { | |
| __m256_internal ret; | |
| ret.v0 = __a; | |
| ret.v1 = _mm_setzero_ps(); | |
| return __m256_from_internal(ret); | |
| } | |
| static __inline__ __m256i __attribute__((__always_inline__, __nodebug__)) | |
| _mm256_zextsi128_si256(__m128i __a) { | |
| __m256i_internal ret; | |
| ret.v0 = __a; | |
| ret.v1 = _mm_setzero_si128(); | |
| return __m256i_from_internal(ret); | |
| } | |
| static __inline__ __m256 __attribute__((__always_inline__, __nodebug__)) | |
| _mm256_insertf128_ps(__m256 __a, __m128 __b, const int imm8) { | |
| __m256_internal ret = __m256_to_internal(__a); | |
| if (imm8 & 0x1) { | |
| ret.v1 = __b; | |
| } else { | |
| ret.v0 = __b; | |
| } | |
| return __m256_from_internal(ret); | |
| } | |
| static __inline__ __m256d __attribute__((__always_inline__, __nodebug__)) | |
| _mm256_insertf128_pd(__m256d __a, __m128d __b, const int imm8) { | |
| __m256d_internal ret = __m256d_to_internal(__a); | |
| if (imm8 & 0x1) { | |
| ret.v1 = __b; | |
| } else { | |
| ret.v0 = __b; | |
| } | |
| return __m256d_from_internal(ret); | |
| } | |
| static __inline__ __m256i __attribute__((__always_inline__, __nodebug__)) | |
| _mm256_insertf128_si256(__m256i __a, __m128i __b, const int imm8) { | |
| __m256i_internal ret = __m256i_to_internal(__a); | |
| if (imm8 & 0x1) { | |
| ret.v1 = __b; | |
| } else { | |
| ret.v0 = __b; | |
| } | |
| return __m256i_from_internal(ret); | |
| } | |
| static __inline__ __m128 __attribute__((__always_inline__, __nodebug__)) | |
| _mm256_extractf128_ps(__m256 __a, const int imm8) { | |
| __m256_internal a = __m256_to_internal(__a); | |
| if (imm8 & 0x1) { | |
| return a.v1; | |
| } else { | |
| return a.v0; | |
| } | |
| } | |
| static __inline__ __m128d __attribute__((__always_inline__, __nodebug__)) | |
| _mm256_extractf128_pd(__m256d __a, const int imm8) { | |
| __m256d_internal a = __m256d_to_internal(__a); | |
| if (imm8 & 0x1) { | |
| return a.v1; | |
| } else { | |
| return a.v0; | |
| } | |
| } | |
| static __inline__ __m128i __attribute__((__always_inline__, __nodebug__)) | |
| _mm256_extractf128_si256(__m256i __a, const int imm8) { | |
| __m256i_internal a = __m256i_to_internal(__a); | |
| if (imm8 & 0x1) { | |
| return a.v1; | |
| } else { | |
| return a.v0; | |
| } | |
| } | |
| static __inline__ __m256 __attribute__((__always_inline__, __nodebug__)) | |
| _mm256_set_m128(__m128 __hi, __m128 __lo) { | |
| __m256_internal ret; | |
| ret.v0 = __lo; | |
| ret.v1 = __hi; | |
| return __m256_from_internal(ret); | |
| } | |
| static __inline__ __m256d __attribute__((__always_inline__, __nodebug__)) | |
| _mm256_set_m128d(__m128d __hi, __m128d __lo) { | |
| __m256d_internal ret; | |
| ret.v0 = __lo; | |
| ret.v1 = __hi; | |
| return __m256d_from_internal(ret); | |
| } | |
| static __inline__ __m256i __attribute__((__always_inline__, __nodebug__)) | |
| _mm256_set_m128i(__m128i __hi, __m128i __lo) { | |
| __m256i_internal ret; | |
| ret.v0 = __lo; | |
| ret.v1 = __hi; | |
| return __m256i_from_internal(ret); | |
| } | |
| static __inline__ __m256 __attribute__((__always_inline__, __nodebug__)) | |
| _mm256_setr_m128(__m128 __lo, __m128 __hi) { | |
| return _mm256_set_m128(__hi, __lo); | |
| } | |
| static __inline__ __m256d __attribute__((__always_inline__, __nodebug__)) | |
| _mm256_setr_m128d(__m128d __lo, __m128d __hi) { | |
| return (__m256d)_mm256_set_m128d(__hi, __lo); | |
| } | |
| static __inline__ __m256i __attribute__((__always_inline__, __nodebug__)) | |
| _mm256_setr_m128i(__m128i __lo, __m128i __hi) { | |
| return (__m256i)_mm256_set_m128i(__hi, __lo); | |
| } | |
| static __inline__ __m256 __attribute__((__always_inline__, __nodebug__)) | |
| _mm256_loadu2_m128(float const* __addr_hi, float const* __addr_lo) { | |
| return _mm256_set_m128(_mm_loadu_ps(__addr_hi), _mm_loadu_ps(__addr_lo)); | |
| } | |
| static __inline__ __m256d __attribute__((__always_inline__, __nodebug__)) | |
| _mm256_loadu2_m128d(double const* __addr_hi, double const* __addr_lo) { | |
| return _mm256_set_m128d(_mm_loadu_pd(__addr_hi), _mm_loadu_pd(__addr_lo)); | |
| } | |
| static __inline__ __m256i __attribute__((__always_inline__, __nodebug__)) | |
| _mm256_loadu2_m128i(__m128i_u const* __addr_hi, __m128i_u const* __addr_lo) { | |
| return _mm256_set_m128i(_mm_loadu_si128((__m128i const*)__addr_hi), | |
| _mm_loadu_si128((__m128i const*)__addr_lo)); | |
| } | |
| static __inline__ void __attribute__((__always_inline__, __nodebug__)) | |
| _mm256_storeu2_m128(float* __addr_hi, float* __addr_lo, __m256 __a) { | |
| __m256_internal a = __m256_to_internal(__a); | |
| _mm_storeu_ps(__addr_lo, a.v0); | |
| _mm_storeu_ps(__addr_hi, a.v1); | |
| } | |
| static __inline__ void __attribute__((__always_inline__, __nodebug__)) | |
| _mm256_storeu2_m128d(double* __addr_hi, double* __addr_lo, __m256d __a) { | |
| __m256d_internal a = __m256d_to_internal(__a); | |
| _mm_storeu_pd(__addr_lo, a.v0); | |
| _mm_storeu_pd(__addr_hi, a.v1); | |
| } | |
| static __inline__ void __attribute__((__always_inline__, __nodebug__)) | |
| _mm256_storeu2_m128i(__m128i_u* __addr_hi, __m128i_u* __addr_lo, __m256i __a) { | |
| __m256i_internal a = __m256i_to_internal(__a); | |
| _mm_storeu_si128((__m128i*)__addr_lo, a.v0); | |
| _mm_storeu_si128((__m128i*)__addr_hi, a.v1); | |
| } | |
Xet Storage Details
- Size:
- 88.2 kB
- Xet hash:
- c9d3dbaecad21e5c15865acc60210a56337a9e4a1e3305ee70d5db91ce5fee17
·
Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.