| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| |
|
| | #ifndef ARM_USABILITY_H |
| | #define ARM_USABILITY_H |
| |
|
| | static inline signed char float2int8(float v) |
| | { |
| | int int32 = round(v); |
| | if (int32 > 127) return 127; |
| | if (int32 < -127) return -127; |
| | return (signed char)int32; |
| | } |
| |
|
| | #if __ARM_NEON |
| | #include <arm_neon.h> |
| |
|
| | static inline uint16x4_t float2bfloat(float32x4_t _v) |
| | { |
| | return vshrn_n_u32(vreinterpretq_u32_f32(_v), 16); |
| | } |
| | static inline float32x4_t bfloat2float(uint16x4_t _v) |
| | { |
| | return vreinterpretq_f32_u32(vshll_n_u16(_v, 16)); |
| | } |
| |
|
| | static inline int8x8_t float2int8(float32x4_t _vlow, float32x4_t _vhigh) |
| | { |
| | #if __aarch64__ |
| | int32x4_t _vlow32 = vcvtaq_s32_f32(_vlow); |
| | int32x4_t _vhigh32 = vcvtaq_s32_f32(_vhigh); |
| | #else |
| | |
| | |
| | float32x4_t _p5 = vdupq_n_f32(0.5f); |
| | int32x4_t _signmask = vdupq_n_s32(1 << 31); |
| | int32x4_t _signlow = vandq_s32(vreinterpretq_s32_f32(_vlow), _signmask); |
| | int32x4_t _signhigh = vandq_s32(vreinterpretq_s32_f32(_vhigh), _signmask); |
| | float32x4_t _p5low = vreinterpretq_f32_s32(vorrq_s32(vreinterpretq_s32_f32(_p5), _signlow)); |
| | float32x4_t _p5high = vreinterpretq_f32_s32(vorrq_s32(vreinterpretq_s32_f32(_p5), _signhigh)); |
| | float32x4_t _vlow5 = vaddq_f32(_vlow, _p5low); |
| | float32x4_t _vhigh5 = vaddq_f32(_vhigh, _p5high); |
| | int32x4_t _vlow32 = vcvtq_s32_f32(_vlow5); |
| | int32x4_t _vhigh32 = vcvtq_s32_f32(_vhigh5); |
| | #endif |
| | int16x8_t _v16 = vcombine_s16(vqmovn_s32(_vlow32), vqmovn_s32(_vhigh32)); |
| | int8x8_t _v8 = vqmovn_s16(_v16); |
| | return vmax_s8(_v8, vdup_n_s8(-127)); |
| | } |
| |
|
| | static inline int8x8_t float2int8relu(float32x4_t _vlow, float32x4_t _vhigh) |
| | { |
| | #if __aarch64__ |
| | int32x4_t _vlow32 = vcvtaq_s32_f32(_vlow); |
| | int32x4_t _vhigh32 = vcvtaq_s32_f32(_vhigh); |
| | #else |
| | |
| | |
| | float32x4_t _p5 = vdupq_n_f32(0.5f); |
| | int32x4_t _signmask = vdupq_n_s32(1 << 31); |
| | int32x4_t _signlow = vandq_s32(vreinterpretq_s32_f32(_vlow), _signmask); |
| | int32x4_t _signhigh = vandq_s32(vreinterpretq_s32_f32(_vhigh), _signmask); |
| | float32x4_t _p5low = vreinterpretq_f32_s32(vorrq_s32(vreinterpretq_s32_f32(_p5), _signlow)); |
| | float32x4_t _p5high = vreinterpretq_f32_s32(vorrq_s32(vreinterpretq_s32_f32(_p5), _signhigh)); |
| | float32x4_t _vlow5 = vaddq_f32(_vlow, _p5low); |
| | float32x4_t _vhigh5 = vaddq_f32(_vhigh, _p5high); |
| | int32x4_t _vlow32 = vcvtq_s32_f32(_vlow5); |
| | int32x4_t _vhigh32 = vcvtq_s32_f32(_vhigh5); |
| | #endif |
| | int16x8_t _v16 = vcombine_s16(vqmovn_s32(_vlow32), vqmovn_s32(_vhigh32)); |
| | int8x8_t _v8 = vqmovn_s16(_v16); |
| | return vmax_s8(_v8, vdup_n_s8(0)); |
| | } |
| |
|
| | static inline int8x8_t float2int8leakyrelu(float32x4_t _vlow, float32x4_t _vhigh, float32x4_t _slope) |
| | { |
| | float32x4_t _vlow_leaky = vmulq_f32(_vlow, _slope); |
| | float32x4_t _vhigh_leaky = vmulq_f32(_vhigh, _slope); |
| | #if __aarch64__ |
| | int32x4_t _vlow32 = vcvtaq_s32_f32(_vlow); |
| | int32x4_t _vhigh32 = vcvtaq_s32_f32(_vhigh); |
| | int32x4_t _vlow32_leaky = vcvtaq_s32_f32(_vlow_leaky); |
| | int32x4_t _vhigh32_leaky = vcvtaq_s32_f32(_vhigh_leaky); |
| | #else |
| | |
| | |
| | float32x4_t _p5 = vdupq_n_f32(0.5f); |
| | int32x4_t _signmask = vdupq_n_s32(1 << 31); |
| | int32x4_t _signlow = vandq_s32(vreinterpretq_s32_f32(_vlow), _signmask); |
| | int32x4_t _signhigh = vandq_s32(vreinterpretq_s32_f32(_vhigh), _signmask); |
| | float32x4_t _p5low = vreinterpretq_f32_s32(vorrq_s32(vreinterpretq_s32_f32(_p5), _signlow)); |
| | float32x4_t _p5high = vreinterpretq_f32_s32(vorrq_s32(vreinterpretq_s32_f32(_p5), _signhigh)); |
| | float32x4_t _vlow5 = vaddq_f32(_vlow, _p5low); |
| | float32x4_t _vhigh5 = vaddq_f32(_vhigh, _p5high); |
| | int32x4_t _vlow32 = vcvtq_s32_f32(_vlow5); |
| | int32x4_t _vhigh32 = vcvtq_s32_f32(_vhigh5); |
| |
|
| | int32x4_t _signlow_leaky = vandq_s32(vreinterpretq_s32_f32(_vlow_leaky), _signmask); |
| | int32x4_t _signhigh_leaky = vandq_s32(vreinterpretq_s32_f32(_vhigh_leaky), _signmask); |
| | float32x4_t _p5low_leaky = vreinterpretq_f32_s32(vorrq_s32(vreinterpretq_s32_f32(_p5), _signlow_leaky)); |
| | float32x4_t _p5high_leaky = vreinterpretq_f32_s32(vorrq_s32(vreinterpretq_s32_f32(_p5), _signhigh_leaky)); |
| | float32x4_t _vlow5_leaky = vaddq_f32(_vlow_leaky, _p5low_leaky); |
| | float32x4_t _vhigh5_leaky = vaddq_f32(_vhigh_leaky, _p5high_leaky); |
| | int32x4_t _vlow32_leaky = vcvtq_s32_f32(_vlow5_leaky); |
| | int32x4_t _vhigh32_leaky = vcvtq_s32_f32(_vhigh5_leaky); |
| | #endif |
| | int16x8_t _v16 = vcombine_s16(vqmovn_s32(_vlow32), vqmovn_s32(_vhigh32)); |
| | int16x8_t _v16_leaky = vcombine_s16(vqmovn_s32(_vlow32_leaky), vqmovn_s32(_vhigh32_leaky)); |
| | int8x8_t _v8 = vqmovn_s16(_v16); |
| | int8x8_t _v8_leaky = vqmovn_s16(_v16_leaky); |
| | return vmax_s8(_v8, _v8_leaky); |
| | } |
| |
|
| | #if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC |
| | static inline signed char float2int8(__fp16 v) |
| | { |
| | int int32 = round(v); |
| | if (int32 > 127) return 127; |
| | if (int32 < -127) return -127; |
| | return (signed char)int32; |
| | } |
| |
|
| | static inline int8x8_t float2int8(float16x8_t _v) |
| | { |
| | int16x8_t _v16 = vcvtaq_s16_f16(_v); |
| | int8x8_t _v8 = vqmovn_s16(_v16); |
| | return vmax_s8(_v8, vdup_n_s8(-127)); |
| | } |
| | #endif |
| |
|
| | static inline void transpose4x4_u16(uint16x4_t& _r0, uint16x4_t& _r1, uint16x4_t& _r2, uint16x4_t& _r3) |
| | { |
| | uint16x4x2_t _r01z = vzip_u16(_r0, _r1); |
| | uint16x4x2_t _r23z = vzip_u16(_r2, _r3); |
| | uint32x2x2_t _r01 = vzip_u32(vreinterpret_u32_u16(_r01z.val[0]), vreinterpret_u32_u16(_r23z.val[0])); |
| | uint32x2x2_t _r23 = vzip_u32(vreinterpret_u32_u16(_r01z.val[1]), vreinterpret_u32_u16(_r23z.val[1])); |
| | _r0 = vreinterpret_u16_u32(_r01.val[0]); |
| | _r1 = vreinterpret_u16_u32(_r01.val[1]); |
| | _r2 = vreinterpret_u16_u32(_r23.val[0]); |
| | _r3 = vreinterpret_u16_u32(_r23.val[1]); |
| | } |
| |
|
| | static inline void transpose4x8_u16(uint16x4_t& _r0, uint16x4_t& _r1, uint16x4_t& _r2, uint16x4_t& _r3, uint16x4_t& _r4, uint16x4_t& _r5, uint16x4_t& _r6, uint16x4_t& _r7) |
| | { |
| | uint16x4x2_t _r01z = vzip_u16(_r0, _r1); |
| | uint16x4x2_t _r23z = vzip_u16(_r2, _r3); |
| | uint16x4x2_t _r45z = vzip_u16(_r4, _r5); |
| | uint16x4x2_t _r67z = vzip_u16(_r6, _r7); |
| | uint32x2x2_t _r01_0 = vzip_u32(vreinterpret_u32_u16(_r01z.val[0]), vreinterpret_u32_u16(_r23z.val[0])); |
| | uint32x2x2_t _r23_0 = vzip_u32(vreinterpret_u32_u16(_r01z.val[1]), vreinterpret_u32_u16(_r23z.val[1])); |
| | uint32x2x2_t _r01_1 = vzip_u32(vreinterpret_u32_u16(_r45z.val[0]), vreinterpret_u32_u16(_r67z.val[0])); |
| | uint32x2x2_t _r23_1 = vzip_u32(vreinterpret_u32_u16(_r45z.val[1]), vreinterpret_u32_u16(_r67z.val[1])); |
| | _r0 = vreinterpret_u16_u32(_r01_0.val[0]); |
| | _r1 = vreinterpret_u16_u32(_r01_1.val[0]); |
| | _r2 = vreinterpret_u16_u32(_r01_0.val[1]); |
| | _r3 = vreinterpret_u16_u32(_r01_1.val[1]); |
| | _r4 = vreinterpret_u16_u32(_r23_0.val[0]); |
| | _r5 = vreinterpret_u16_u32(_r23_1.val[0]); |
| | _r6 = vreinterpret_u16_u32(_r23_0.val[1]); |
| | _r7 = vreinterpret_u16_u32(_r23_1.val[1]); |
| | } |
| |
|
| | static inline void transpose4x12_u16(uint16x4_t& _r0, uint16x4_t& _r1, uint16x4_t& _r2, uint16x4_t& _r3, uint16x4_t& _r4, uint16x4_t& _r5, uint16x4_t& _r6, uint16x4_t& _r7, uint16x4_t& _r8, uint16x4_t& _r9, uint16x4_t& _ra, uint16x4_t& _rb) |
| | { |
| | uint16x4x2_t _r01z = vzip_u16(_r0, _r1); |
| | uint16x4x2_t _r23z = vzip_u16(_r2, _r3); |
| | uint16x4x2_t _r45z = vzip_u16(_r4, _r5); |
| | uint16x4x2_t _r67z = vzip_u16(_r6, _r7); |
| | uint16x4x2_t _r89z = vzip_u16(_r8, _r9); |
| | uint16x4x2_t _rabz = vzip_u16(_ra, _rb); |
| | uint32x2x2_t _r01_0 = vzip_u32(vreinterpret_u32_u16(_r01z.val[0]), vreinterpret_u32_u16(_r23z.val[0])); |
| | uint32x2x2_t _r23_0 = vzip_u32(vreinterpret_u32_u16(_r01z.val[1]), vreinterpret_u32_u16(_r23z.val[1])); |
| | uint32x2x2_t _r01_1 = vzip_u32(vreinterpret_u32_u16(_r45z.val[0]), vreinterpret_u32_u16(_r67z.val[0])); |
| | uint32x2x2_t _r23_1 = vzip_u32(vreinterpret_u32_u16(_r45z.val[1]), vreinterpret_u32_u16(_r67z.val[1])); |
| | uint32x2x2_t _r01_2 = vzip_u32(vreinterpret_u32_u16(_r89z.val[0]), vreinterpret_u32_u16(_rabz.val[0])); |
| | uint32x2x2_t _r23_2 = vzip_u32(vreinterpret_u32_u16(_r89z.val[1]), vreinterpret_u32_u16(_rabz.val[1])); |
| | _r0 = vreinterpret_u16_u32(_r01_0.val[0]); |
| | _r1 = vreinterpret_u16_u32(_r01_1.val[0]); |
| | _r2 = vreinterpret_u16_u32(_r01_2.val[0]); |
| | _r3 = vreinterpret_u16_u32(_r01_0.val[1]); |
| | _r4 = vreinterpret_u16_u32(_r01_1.val[1]); |
| | _r5 = vreinterpret_u16_u32(_r01_2.val[1]); |
| | _r6 = vreinterpret_u16_u32(_r23_0.val[0]); |
| | _r7 = vreinterpret_u16_u32(_r23_1.val[0]); |
| | _r8 = vreinterpret_u16_u32(_r23_2.val[0]); |
| | _r9 = vreinterpret_u16_u32(_r23_0.val[1]); |
| | _ra = vreinterpret_u16_u32(_r23_1.val[1]); |
| | _rb = vreinterpret_u16_u32(_r23_2.val[1]); |
| | } |
| |
|
| | static inline void transpose8x4_u16(uint16x8_t& _r0, uint16x8_t& _r1, uint16x8_t& _r2, uint16x8_t& _r3) |
| | { |
| | uint16x8x2_t _r01t = vzipq_u16(_r0, _r1); |
| | uint16x8x2_t _r23t = vzipq_u16(_r2, _r3); |
| | uint32x4x2_t _r01_0 = vzipq_u32(vreinterpretq_u32_u16(_r01t.val[0]), vreinterpretq_u32_u16(_r23t.val[0])); |
| | uint32x4x2_t _r23_0 = vzipq_u32(vreinterpretq_u32_u16(_r01t.val[1]), vreinterpretq_u32_u16(_r23t.val[1])); |
| | _r0 = vreinterpretq_u16_u32(_r01_0.val[0]); |
| | _r1 = vreinterpretq_u16_u32(_r01_0.val[1]); |
| | _r2 = vreinterpretq_u16_u32(_r23_0.val[0]); |
| | _r3 = vreinterpretq_u16_u32(_r23_0.val[1]); |
| | } |
| |
|
| | static inline void transpose8x8_u16(uint16x8_t& _r0, uint16x8_t& _r1, uint16x8_t& _r2, uint16x8_t& _r3, uint16x8_t& _r4, uint16x8_t& _r5, uint16x8_t& _r6, uint16x8_t& _r7) |
| | { |
| | uint16x8x2_t _r01t = vzipq_u16(_r0, _r1); |
| | uint16x8x2_t _r23t = vzipq_u16(_r2, _r3); |
| | uint16x8x2_t _r45t = vzipq_u16(_r4, _r5); |
| | uint16x8x2_t _r67t = vzipq_u16(_r6, _r7); |
| | uint32x4x2_t _r01_0 = vzipq_u32(vreinterpretq_u32_u16(_r01t.val[0]), vreinterpretq_u32_u16(_r23t.val[0])); |
| | uint32x4x2_t _r23_0 = vzipq_u32(vreinterpretq_u32_u16(_r01t.val[1]), vreinterpretq_u32_u16(_r23t.val[1])); |
| | uint32x4x2_t _r01_1 = vzipq_u32(vreinterpretq_u32_u16(_r45t.val[0]), vreinterpretq_u32_u16(_r67t.val[0])); |
| | uint32x4x2_t _r23_1 = vzipq_u32(vreinterpretq_u32_u16(_r45t.val[1]), vreinterpretq_u32_u16(_r67t.val[1])); |
| | _r0 = vreinterpretq_u16_u32(vcombine_u32(vget_low_u32(_r01_0.val[0]), vget_low_u32(_r01_1.val[0]))); |
| | _r1 = vreinterpretq_u16_u32(vcombine_u32(vget_high_u32(_r01_0.val[0]), vget_high_u32(_r01_1.val[0]))); |
| | _r2 = vreinterpretq_u16_u32(vcombine_u32(vget_low_u32(_r01_0.val[1]), vget_low_u32(_r01_1.val[1]))); |
| | _r3 = vreinterpretq_u16_u32(vcombine_u32(vget_high_u32(_r01_0.val[1]), vget_high_u32(_r01_1.val[1]))); |
| | _r4 = vreinterpretq_u16_u32(vcombine_u32(vget_low_u32(_r23_0.val[0]), vget_low_u32(_r23_1.val[0]))); |
| | _r5 = vreinterpretq_u16_u32(vcombine_u32(vget_high_u32(_r23_0.val[0]), vget_high_u32(_r23_1.val[0]))); |
| | _r6 = vreinterpretq_u16_u32(vcombine_u32(vget_low_u32(_r23_0.val[1]), vget_low_u32(_r23_1.val[1]))); |
| | _r7 = vreinterpretq_u16_u32(vcombine_u32(vget_high_u32(_r23_0.val[1]), vget_high_u32(_r23_1.val[1]))); |
| | } |
| |
|
| | static inline void transpose8x12_u16(uint16x8_t& _r0, uint16x8_t& _r1, uint16x8_t& _r2, uint16x8_t& _r3, uint16x8_t& _r4, uint16x8_t& _r5, uint16x8_t& _r6, uint16x8_t& _r7, uint16x8_t& _r8, uint16x8_t& _r9, uint16x8_t& _ra, uint16x8_t& _rb) |
| | { |
| | uint16x8x2_t _r01t = vzipq_u16(_r0, _r1); |
| | uint16x8x2_t _r23t = vzipq_u16(_r2, _r3); |
| | uint16x8x2_t _r45t = vzipq_u16(_r4, _r5); |
| | uint16x8x2_t _r67t = vzipq_u16(_r6, _r7); |
| | uint16x8x2_t _r89t = vzipq_u16(_r8, _r9); |
| | uint16x8x2_t _rabt = vzipq_u16(_ra, _rb); |
| | uint32x4x2_t _r01_0 = vzipq_u32(vreinterpretq_u32_u16(_r01t.val[0]), vreinterpretq_u32_u16(_r23t.val[0])); |
| | uint32x4x2_t _r23_0 = vzipq_u32(vreinterpretq_u32_u16(_r01t.val[1]), vreinterpretq_u32_u16(_r23t.val[1])); |
| | uint32x4x2_t _r01_1 = vzipq_u32(vreinterpretq_u32_u16(_r45t.val[0]), vreinterpretq_u32_u16(_r67t.val[0])); |
| | uint32x4x2_t _r23_1 = vzipq_u32(vreinterpretq_u32_u16(_r45t.val[1]), vreinterpretq_u32_u16(_r67t.val[1])); |
| | uint32x4x2_t _r01_2 = vzipq_u32(vreinterpretq_u32_u16(_r89t.val[0]), vreinterpretq_u32_u16(_rabt.val[0])); |
| | uint32x4x2_t _r23_2 = vzipq_u32(vreinterpretq_u32_u16(_r89t.val[1]), vreinterpretq_u32_u16(_rabt.val[1])); |
| | _r0 = vreinterpretq_u16_u32(vcombine_u32(vget_low_u32(_r01_0.val[0]), vget_low_u32(_r01_1.val[0]))); |
| | _r1 = vreinterpretq_u16_u32(vcombine_u32(vget_low_u32(_r01_2.val[0]), vget_high_u32(_r01_0.val[0]))); |
| | _r2 = vreinterpretq_u16_u32(vcombine_u32(vget_high_u32(_r01_1.val[0]), vget_high_u32(_r01_2.val[0]))); |
| | _r3 = vreinterpretq_u16_u32(vcombine_u32(vget_low_u32(_r01_0.val[1]), vget_low_u32(_r01_1.val[1]))); |
| | _r4 = vreinterpretq_u16_u32(vcombine_u32(vget_low_u32(_r01_2.val[1]), vget_high_u32(_r01_0.val[1]))); |
| | _r5 = vreinterpretq_u16_u32(vcombine_u32(vget_high_u32(_r01_1.val[1]), vget_high_u32(_r01_2.val[1]))); |
| | _r6 = vreinterpretq_u16_u32(vcombine_u32(vget_low_u32(_r23_0.val[0]), vget_low_u32(_r23_1.val[0]))); |
| | _r7 = vreinterpretq_u16_u32(vcombine_u32(vget_low_u32(_r23_2.val[0]), vget_high_u32(_r23_0.val[0]))); |
| | _r8 = vreinterpretq_u16_u32(vcombine_u32(vget_high_u32(_r23_1.val[0]), vget_high_u32(_r23_2.val[0]))); |
| | _r9 = vreinterpretq_u16_u32(vcombine_u32(vget_low_u32(_r23_0.val[1]), vget_low_u32(_r23_1.val[1]))); |
| | _ra = vreinterpretq_u16_u32(vcombine_u32(vget_low_u32(_r23_2.val[1]), vget_high_u32(_r23_0.val[1]))); |
| | _rb = vreinterpretq_u16_u32(vcombine_u32(vget_high_u32(_r23_1.val[1]), vget_high_u32(_r23_2.val[1]))); |
| | } |
| |
|
| | static inline void transpose4x4_ps(float32x4_t& _r0, float32x4_t& _r1, float32x4_t& _r2, float32x4_t& _r3) |
| | { |
| | float32x4x2_t _r01z = vzipq_f32(_r0, _r1); |
| | float32x4x2_t _r23z = vzipq_f32(_r2, _r3); |
| | _r0 = vcombine_f32(vget_low_f32(_r01z.val[0]), vget_low_f32(_r23z.val[0])); |
| | _r1 = vcombine_f32(vget_high_f32(_r01z.val[0]), vget_high_f32(_r23z.val[0])); |
| | _r2 = vcombine_f32(vget_low_f32(_r01z.val[1]), vget_low_f32(_r23z.val[1])); |
| | _r3 = vcombine_f32(vget_high_f32(_r01z.val[1]), vget_high_f32(_r23z.val[1])); |
| | } |
| |
|
| | static inline void transpose4x8_ps(float32x4_t& _r0, float32x4_t& _r1, float32x4_t& _r2, float32x4_t& _r3, float32x4_t& _r4, float32x4_t& _r5, float32x4_t& _r6, float32x4_t& _r7) |
| | { |
| | float32x4x2_t _r01z = vzipq_f32(_r0, _r1); |
| | float32x4x2_t _r23z = vzipq_f32(_r2, _r3); |
| | float32x4x2_t _r45z = vzipq_f32(_r4, _r5); |
| | float32x4x2_t _r67z = vzipq_f32(_r6, _r7); |
| | _r0 = vcombine_f32(vget_low_f32(_r01z.val[0]), vget_low_f32(_r23z.val[0])); |
| | _r1 = vcombine_f32(vget_low_f32(_r45z.val[0]), vget_low_f32(_r67z.val[0])); |
| | _r2 = vcombine_f32(vget_high_f32(_r01z.val[0]), vget_high_f32(_r23z.val[0])); |
| | _r3 = vcombine_f32(vget_high_f32(_r45z.val[0]), vget_high_f32(_r67z.val[0])); |
| | _r4 = vcombine_f32(vget_low_f32(_r01z.val[1]), vget_low_f32(_r23z.val[1])); |
| | _r5 = vcombine_f32(vget_low_f32(_r45z.val[1]), vget_low_f32(_r67z.val[1])); |
| | _r6 = vcombine_f32(vget_high_f32(_r01z.val[1]), vget_high_f32(_r23z.val[1])); |
| | _r7 = vcombine_f32(vget_high_f32(_r45z.val[1]), vget_high_f32(_r67z.val[1])); |
| | } |
| |
|
| | static inline void transpose4x12_ps(float32x4_t& _r0, float32x4_t& _r1, float32x4_t& _r2, float32x4_t& _r3, float32x4_t& _r4, float32x4_t& _r5, float32x4_t& _r6, float32x4_t& _r7, float32x4_t& _r8, float32x4_t& _r9, float32x4_t& _ra, float32x4_t& _rb) |
| | { |
| | float32x4x2_t _r01z = vzipq_f32(_r0, _r1); |
| | float32x4x2_t _r23z = vzipq_f32(_r2, _r3); |
| | float32x4x2_t _r45z = vzipq_f32(_r4, _r5); |
| | float32x4x2_t _r67z = vzipq_f32(_r6, _r7); |
| | float32x4x2_t _r89z = vzipq_f32(_r8, _r9); |
| | float32x4x2_t _rabz = vzipq_f32(_ra, _rb); |
| | _r0 = vcombine_f32(vget_low_f32(_r01z.val[0]), vget_low_f32(_r23z.val[0])); |
| | _r1 = vcombine_f32(vget_low_f32(_r45z.val[0]), vget_low_f32(_r67z.val[0])); |
| | _r2 = vcombine_f32(vget_low_f32(_r89z.val[0]), vget_low_f32(_rabz.val[0])); |
| | _r3 = vcombine_f32(vget_high_f32(_r01z.val[0]), vget_high_f32(_r23z.val[0])); |
| | _r4 = vcombine_f32(vget_high_f32(_r45z.val[0]), vget_high_f32(_r67z.val[0])); |
| | _r5 = vcombine_f32(vget_high_f32(_r89z.val[0]), vget_high_f32(_rabz.val[0])); |
| | _r6 = vcombine_f32(vget_low_f32(_r01z.val[1]), vget_low_f32(_r23z.val[1])); |
| | _r7 = vcombine_f32(vget_low_f32(_r45z.val[1]), vget_low_f32(_r67z.val[1])); |
| | _r8 = vcombine_f32(vget_low_f32(_r89z.val[1]), vget_low_f32(_rabz.val[1])); |
| | _r9 = vcombine_f32(vget_high_f32(_r01z.val[1]), vget_high_f32(_r23z.val[1])); |
| | _ra = vcombine_f32(vget_high_f32(_r45z.val[1]), vget_high_f32(_r67z.val[1])); |
| | _rb = vcombine_f32(vget_high_f32(_r89z.val[1]), vget_high_f32(_rabz.val[1])); |
| | } |
| |
|
| | static inline void transpose8x4_ps(float32x4_t& _r0l, float32x4_t& _r0h, |
| | float32x4_t& _r1l, float32x4_t& _r1h, |
| | float32x4_t& _r2l, float32x4_t& _r2h, |
| | float32x4_t& _r3l, float32x4_t& _r3h) |
| | { |
| | float32x4x2_t _r01lz = vzipq_f32(_r0l, _r1l); |
| | float32x4x2_t _r23lz = vzipq_f32(_r2l, _r3l); |
| | float32x4x2_t _r01hz = vzipq_f32(_r0h, _r1h); |
| | float32x4x2_t _r23hz = vzipq_f32(_r2h, _r3h); |
| | _r0l = vcombine_f32(vget_low_f32(_r01lz.val[0]), vget_low_f32(_r23lz.val[0])); |
| | _r0h = vcombine_f32(vget_high_f32(_r01lz.val[0]), vget_high_f32(_r23lz.val[0])); |
| | _r1l = vcombine_f32(vget_low_f32(_r01lz.val[1]), vget_low_f32(_r23lz.val[1])); |
| | _r1h = vcombine_f32(vget_high_f32(_r01lz.val[1]), vget_high_f32(_r23lz.val[1])); |
| | _r2l = vcombine_f32(vget_low_f32(_r01hz.val[0]), vget_low_f32(_r23hz.val[0])); |
| | _r2h = vcombine_f32(vget_high_f32(_r01hz.val[0]), vget_high_f32(_r23hz.val[0])); |
| | _r3l = vcombine_f32(vget_low_f32(_r01hz.val[1]), vget_low_f32(_r23hz.val[1])); |
| | _r3h = vcombine_f32(vget_high_f32(_r01hz.val[1]), vget_high_f32(_r23hz.val[1])); |
| | } |
| |
|
| | static inline void transpose12x4_ps(float32x4_t& _r0l, float32x4_t& _r0m, float32x4_t& _r0h, |
| | float32x4_t& _r1l, float32x4_t& _r1m, float32x4_t& _r1h, |
| | float32x4_t& _r2l, float32x4_t& _r2m, float32x4_t& _r2h, |
| | float32x4_t& _r3l, float32x4_t& _r3m, float32x4_t& _r3h) |
| | { |
| | float32x4x2_t _r01lz = vzipq_f32(_r0l, _r1l); |
| | float32x4x2_t _r23lz = vzipq_f32(_r2l, _r3l); |
| | float32x4x2_t _r01mz = vzipq_f32(_r0m, _r1m); |
| | float32x4x2_t _r23mz = vzipq_f32(_r2m, _r3m); |
| | float32x4x2_t _r01hz = vzipq_f32(_r0h, _r1h); |
| | float32x4x2_t _r23hz = vzipq_f32(_r2h, _r3h); |
| | _r0l = vcombine_f32(vget_low_f32(_r01lz.val[0]), vget_low_f32(_r23lz.val[0])); |
| | _r0m = vcombine_f32(vget_high_f32(_r01lz.val[0]), vget_high_f32(_r23lz.val[0])); |
| | _r0h = vcombine_f32(vget_low_f32(_r01lz.val[1]), vget_low_f32(_r23lz.val[1])); |
| | _r1l = vcombine_f32(vget_high_f32(_r01lz.val[1]), vget_high_f32(_r23lz.val[1])); |
| | _r1m = vcombine_f32(vget_low_f32(_r01mz.val[0]), vget_low_f32(_r23mz.val[0])); |
| | _r1h = vcombine_f32(vget_high_f32(_r01mz.val[0]), vget_high_f32(_r23mz.val[0])); |
| | _r2l = vcombine_f32(vget_low_f32(_r01mz.val[1]), vget_low_f32(_r23mz.val[1])); |
| | _r2m = vcombine_f32(vget_high_f32(_r01mz.val[1]), vget_high_f32(_r23mz.val[1])); |
| | _r2h = vcombine_f32(vget_low_f32(_r01hz.val[0]), vget_low_f32(_r23hz.val[0])); |
| | _r3l = vcombine_f32(vget_high_f32(_r01hz.val[0]), vget_high_f32(_r23hz.val[0])); |
| | _r3m = vcombine_f32(vget_low_f32(_r01hz.val[1]), vget_low_f32(_r23hz.val[1])); |
| | _r3h = vcombine_f32(vget_high_f32(_r01hz.val[1]), vget_high_f32(_r23hz.val[1])); |
| | } |
| |
|
| | #if __aarch64__ |
| | static inline void transpose8x8_ps(float32x4_t& _r0l, float32x4_t& _r0h, |
| | float32x4_t& _r1l, float32x4_t& _r1h, |
| | float32x4_t& _r2l, float32x4_t& _r2h, |
| | float32x4_t& _r3l, float32x4_t& _r3h, |
| | float32x4_t& _r4l, float32x4_t& _r4h, |
| | float32x4_t& _r5l, float32x4_t& _r5h, |
| | float32x4_t& _r6l, float32x4_t& _r6h, |
| | float32x4_t& _r7l, float32x4_t& _r7h) |
| | { |
| | float32x4x2_t _r01lz = vzipq_f32(_r0l, _r1l); |
| | float32x4x2_t _r23lz = vzipq_f32(_r2l, _r3l); |
| | float32x4x2_t _r01hz = vzipq_f32(_r0h, _r1h); |
| | float32x4x2_t _r23hz = vzipq_f32(_r2h, _r3h); |
| | float32x4x2_t _r45lz = vzipq_f32(_r4l, _r5l); |
| | float32x4x2_t _r67lz = vzipq_f32(_r6l, _r7l); |
| | float32x4x2_t _r45hz = vzipq_f32(_r4h, _r5h); |
| | float32x4x2_t _r67hz = vzipq_f32(_r6h, _r7h); |
| | _r0l = vcombine_f32(vget_low_f32(_r01lz.val[0]), vget_low_f32(_r23lz.val[0])); |
| | _r0h = vcombine_f32(vget_low_f32(_r45lz.val[0]), vget_low_f32(_r67lz.val[0])); |
| | _r1l = vcombine_f32(vget_high_f32(_r01lz.val[0]), vget_high_f32(_r23lz.val[0])); |
| | _r1h = vcombine_f32(vget_high_f32(_r45lz.val[0]), vget_high_f32(_r67lz.val[0])); |
| | _r2l = vcombine_f32(vget_low_f32(_r01lz.val[1]), vget_low_f32(_r23lz.val[1])); |
| | _r2h = vcombine_f32(vget_low_f32(_r45lz.val[1]), vget_low_f32(_r67lz.val[1])); |
| | _r3l = vcombine_f32(vget_high_f32(_r01lz.val[1]), vget_high_f32(_r23lz.val[1])); |
| | _r3h = vcombine_f32(vget_high_f32(_r45lz.val[1]), vget_high_f32(_r67lz.val[1])); |
| | _r4l = vcombine_f32(vget_low_f32(_r01hz.val[0]), vget_low_f32(_r23hz.val[0])); |
| | _r4h = vcombine_f32(vget_low_f32(_r45hz.val[0]), vget_low_f32(_r67hz.val[0])); |
| | _r5l = vcombine_f32(vget_high_f32(_r01hz.val[0]), vget_high_f32(_r23hz.val[0])); |
| | _r5h = vcombine_f32(vget_high_f32(_r45hz.val[0]), vget_high_f32(_r67hz.val[0])); |
| | _r6l = vcombine_f32(vget_low_f32(_r01hz.val[1]), vget_low_f32(_r23hz.val[1])); |
| | _r6h = vcombine_f32(vget_low_f32(_r45hz.val[1]), vget_low_f32(_r67hz.val[1])); |
| | _r7l = vcombine_f32(vget_high_f32(_r01hz.val[1]), vget_high_f32(_r23hz.val[1])); |
| | _r7h = vcombine_f32(vget_high_f32(_r45hz.val[1]), vget_high_f32(_r67hz.val[1])); |
| | } |
| |
|
| | static inline void transpose8x12_ps(float32x4_t& _r0l, float32x4_t& _r0h, |
| | float32x4_t& _r1l, float32x4_t& _r1h, |
| | float32x4_t& _r2l, float32x4_t& _r2h, |
| | float32x4_t& _r3l, float32x4_t& _r3h, |
| | float32x4_t& _r4l, float32x4_t& _r4h, |
| | float32x4_t& _r5l, float32x4_t& _r5h, |
| | float32x4_t& _r6l, float32x4_t& _r6h, |
| | float32x4_t& _r7l, float32x4_t& _r7h, |
| | float32x4_t& _r8l, float32x4_t& _r8h, |
| | float32x4_t& _r9l, float32x4_t& _r9h, |
| | float32x4_t& _ral, float32x4_t& _rah, |
| | float32x4_t& _rbl, float32x4_t& _rbh) |
| | { |
| | float32x4x2_t _r01lz = vzipq_f32(_r0l, _r1l); |
| | float32x4x2_t _r23lz = vzipq_f32(_r2l, _r3l); |
| | float32x4x2_t _r01hz = vzipq_f32(_r0h, _r1h); |
| | float32x4x2_t _r23hz = vzipq_f32(_r2h, _r3h); |
| | float32x4x2_t _r45lz = vzipq_f32(_r4l, _r5l); |
| | float32x4x2_t _r67lz = vzipq_f32(_r6l, _r7l); |
| | float32x4x2_t _r45hz = vzipq_f32(_r4h, _r5h); |
| | float32x4x2_t _r67hz = vzipq_f32(_r6h, _r7h); |
| | float32x4x2_t _r89lz = vzipq_f32(_r8l, _r9l); |
| | float32x4x2_t _rablz = vzipq_f32(_ral, _rbl); |
| | float32x4x2_t _r89hz = vzipq_f32(_r8h, _r9h); |
| | float32x4x2_t _rabhz = vzipq_f32(_rah, _rbh); |
| | _r0l = vcombine_f32(vget_low_f32(_r01lz.val[0]), vget_low_f32(_r23lz.val[0])); |
| | _r0h = vcombine_f32(vget_low_f32(_r45lz.val[0]), vget_low_f32(_r67lz.val[0])); |
| | _r1l = vcombine_f32(vget_low_f32(_r89lz.val[0]), vget_low_f32(_rablz.val[0])); |
| | _r1h = vcombine_f32(vget_high_f32(_r01lz.val[0]), vget_high_f32(_r23lz.val[0])); |
| | _r2l = vcombine_f32(vget_high_f32(_r45lz.val[0]), vget_high_f32(_r67lz.val[0])); |
| | _r2h = vcombine_f32(vget_high_f32(_r89lz.val[0]), vget_high_f32(_rablz.val[0])); |
| | _r3l = vcombine_f32(vget_low_f32(_r01lz.val[1]), vget_low_f32(_r23lz.val[1])); |
| | _r3h = vcombine_f32(vget_low_f32(_r45lz.val[1]), vget_low_f32(_r67lz.val[1])); |
| | _r4l = vcombine_f32(vget_low_f32(_r89lz.val[1]), vget_low_f32(_rablz.val[1])); |
| | _r4h = vcombine_f32(vget_high_f32(_r01lz.val[1]), vget_high_f32(_r23lz.val[1])); |
| | _r5l = vcombine_f32(vget_high_f32(_r45lz.val[1]), vget_high_f32(_r67lz.val[1])); |
| | _r5h = vcombine_f32(vget_high_f32(_r89lz.val[1]), vget_high_f32(_rablz.val[1])); |
| | _r6l = vcombine_f32(vget_low_f32(_r01hz.val[0]), vget_low_f32(_r23hz.val[0])); |
| | _r6h = vcombine_f32(vget_low_f32(_r45hz.val[0]), vget_low_f32(_r67hz.val[0])); |
| | _r7l = vcombine_f32(vget_low_f32(_r89hz.val[0]), vget_low_f32(_rabhz.val[0])); |
| | _r7h = vcombine_f32(vget_high_f32(_r01hz.val[0]), vget_high_f32(_r23hz.val[0])); |
| | _r8l = vcombine_f32(vget_high_f32(_r45hz.val[0]), vget_high_f32(_r67hz.val[0])); |
| | _r8h = vcombine_f32(vget_high_f32(_r89hz.val[0]), vget_high_f32(_rabhz.val[0])); |
| | _r9l = vcombine_f32(vget_low_f32(_r01hz.val[1]), vget_low_f32(_r23hz.val[1])); |
| | _r9h = vcombine_f32(vget_low_f32(_r45hz.val[1]), vget_low_f32(_r67hz.val[1])); |
| | _ral = vcombine_f32(vget_low_f32(_r89hz.val[1]), vget_low_f32(_rabhz.val[1])); |
| | _rah = vcombine_f32(vget_high_f32(_r01hz.val[1]), vget_high_f32(_r23hz.val[1])); |
| | _rbl = vcombine_f32(vget_high_f32(_r45hz.val[1]), vget_high_f32(_r67hz.val[1])); |
| | _rbh = vcombine_f32(vget_high_f32(_r89hz.val[1]), vget_high_f32(_rabhz.val[1])); |
| | } |
| |
|
| | static inline void transpose12x8_ps(float32x4_t& _r0l, float32x4_t& _r0m, float32x4_t& _r0h, |
| | float32x4_t& _r1l, float32x4_t& _r1m, float32x4_t& _r1h, |
| | float32x4_t& _r2l, float32x4_t& _r2m, float32x4_t& _r2h, |
| | float32x4_t& _r3l, float32x4_t& _r3m, float32x4_t& _r3h, |
| | float32x4_t& _r4l, float32x4_t& _r4m, float32x4_t& _r4h, |
| | float32x4_t& _r5l, float32x4_t& _r5m, float32x4_t& _r5h, |
| | float32x4_t& _r6l, float32x4_t& _r6m, float32x4_t& _r6h, |
| | float32x4_t& _r7l, float32x4_t& _r7m, float32x4_t& _r7h) |
| | { |
| | float32x4x2_t _r01lz = vzipq_f32(_r0l, _r1l); |
| | float32x4x2_t _r23lz = vzipq_f32(_r2l, _r3l); |
| | float32x4x2_t _r01mz = vzipq_f32(_r0m, _r1m); |
| | float32x4x2_t _r23mz = vzipq_f32(_r2m, _r3m); |
| | float32x4x2_t _r01hz = vzipq_f32(_r0h, _r1h); |
| | float32x4x2_t _r23hz = vzipq_f32(_r2h, _r3h); |
| | float32x4x2_t _r45lz = vzipq_f32(_r4l, _r5l); |
| | float32x4x2_t _r67lz = vzipq_f32(_r6l, _r7l); |
| | float32x4x2_t _r45mz = vzipq_f32(_r4m, _r5m); |
| | float32x4x2_t _r67mz = vzipq_f32(_r6m, _r7m); |
| | float32x4x2_t _r45hz = vzipq_f32(_r4h, _r5h); |
| | float32x4x2_t _r67hz = vzipq_f32(_r6h, _r7h); |
| | _r0l = vcombine_f32(vget_low_f32(_r01lz.val[0]), vget_low_f32(_r23lz.val[0])); |
| | _r0m = vcombine_f32(vget_low_f32(_r45lz.val[0]), vget_low_f32(_r67lz.val[0])); |
| | _r0h = vcombine_f32(vget_high_f32(_r01lz.val[0]), vget_high_f32(_r23lz.val[0])); |
| | _r1l = vcombine_f32(vget_high_f32(_r45lz.val[0]), vget_high_f32(_r67lz.val[0])); |
| | _r1m = vcombine_f32(vget_low_f32(_r01lz.val[1]), vget_low_f32(_r23lz.val[1])); |
| | _r1h = vcombine_f32(vget_low_f32(_r45lz.val[1]), vget_low_f32(_r67lz.val[1])); |
| | _r2l = vcombine_f32(vget_high_f32(_r01lz.val[1]), vget_high_f32(_r23lz.val[1])); |
| | _r2m = vcombine_f32(vget_high_f32(_r45lz.val[1]), vget_high_f32(_r67lz.val[1])); |
| | _r2h = vcombine_f32(vget_low_f32(_r01mz.val[0]), vget_low_f32(_r23mz.val[0])); |
| | _r3l = vcombine_f32(vget_low_f32(_r45mz.val[0]), vget_low_f32(_r67mz.val[0])); |
| | _r3m = vcombine_f32(vget_high_f32(_r01mz.val[0]), vget_high_f32(_r23mz.val[0])); |
| | _r3h = vcombine_f32(vget_high_f32(_r45mz.val[0]), vget_high_f32(_r67mz.val[0])); |
| | _r4l = vcombine_f32(vget_low_f32(_r01mz.val[1]), vget_low_f32(_r23mz.val[1])); |
| | _r4m = vcombine_f32(vget_low_f32(_r45mz.val[1]), vget_low_f32(_r67mz.val[1])); |
| | _r4h = vcombine_f32(vget_high_f32(_r01mz.val[1]), vget_high_f32(_r23mz.val[1])); |
| | _r5l = vcombine_f32(vget_high_f32(_r45mz.val[1]), vget_high_f32(_r67mz.val[1])); |
| | _r5m = vcombine_f32(vget_low_f32(_r01hz.val[0]), vget_low_f32(_r23hz.val[0])); |
| | _r5h = vcombine_f32(vget_low_f32(_r45hz.val[0]), vget_low_f32(_r67hz.val[0])); |
| | _r6l = vcombine_f32(vget_high_f32(_r01hz.val[0]), vget_high_f32(_r23hz.val[0])); |
| | _r6m = vcombine_f32(vget_high_f32(_r45hz.val[0]), vget_high_f32(_r67hz.val[0])); |
| | _r6h = vcombine_f32(vget_low_f32(_r01hz.val[1]), vget_low_f32(_r23hz.val[1])); |
| | _r7l = vcombine_f32(vget_low_f32(_r45hz.val[1]), vget_low_f32(_r67hz.val[1])); |
| | _r7m = vcombine_f32(vget_high_f32(_r01hz.val[1]), vget_high_f32(_r23hz.val[1])); |
| | _r7h = vcombine_f32(vget_high_f32(_r45hz.val[1]), vget_high_f32(_r67hz.val[1])); |
| | } |
| |
|
| | #if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC |
| | static inline void transpose4x4_ph(float16x4_t& _r0, float16x4_t& _r1, float16x4_t& _r2, float16x4_t& _r3) |
| | { |
| | uint16x4x2_t _r01z = vzip_u16(vreinterpret_u16_f16(_r0), vreinterpret_u16_f16(_r1)); |
| | uint16x4x2_t _r23z = vzip_u16(vreinterpret_u16_f16(_r2), vreinterpret_u16_f16(_r3)); |
| | uint32x2x2_t _r01 = vzip_u32(vreinterpret_u32_u16(_r01z.val[0]), vreinterpret_u32_u16(_r23z.val[0])); |
| | uint32x2x2_t _r23 = vzip_u32(vreinterpret_u32_u16(_r01z.val[1]), vreinterpret_u32_u16(_r23z.val[1])); |
| | _r0 = vreinterpret_f16_u32(_r01.val[0]); |
| | _r1 = vreinterpret_f16_u32(_r01.val[1]); |
| | _r2 = vreinterpret_f16_u32(_r23.val[0]); |
| | _r3 = vreinterpret_f16_u32(_r23.val[1]); |
| | } |
| |
|
| | static inline void transpose4x8_ph(float16x4_t& _r0, float16x4_t& _r1, float16x4_t& _r2, float16x4_t& _r3, float16x4_t& _r4, float16x4_t& _r5, float16x4_t& _r6, float16x4_t& _r7) |
| | { |
| | uint16x4x2_t _r01z = vzip_u16(vreinterpret_u16_f16(_r0), vreinterpret_u16_f16(_r1)); |
| | uint16x4x2_t _r23z = vzip_u16(vreinterpret_u16_f16(_r2), vreinterpret_u16_f16(_r3)); |
| | uint16x4x2_t _r45z = vzip_u16(vreinterpret_u16_f16(_r4), vreinterpret_u16_f16(_r5)); |
| | uint16x4x2_t _r67z = vzip_u16(vreinterpret_u16_f16(_r6), vreinterpret_u16_f16(_r7)); |
| | uint32x2x2_t _r01_0 = vzip_u32(vreinterpret_u32_u16(_r01z.val[0]), vreinterpret_u32_u16(_r23z.val[0])); |
| | uint32x2x2_t _r23_0 = vzip_u32(vreinterpret_u32_u16(_r01z.val[1]), vreinterpret_u32_u16(_r23z.val[1])); |
| | uint32x2x2_t _r01_1 = vzip_u32(vreinterpret_u32_u16(_r45z.val[0]), vreinterpret_u32_u16(_r67z.val[0])); |
| | uint32x2x2_t _r23_1 = vzip_u32(vreinterpret_u32_u16(_r45z.val[1]), vreinterpret_u32_u16(_r67z.val[1])); |
| | _r0 = vreinterpret_f16_u32(_r01_0.val[0]); |
| | _r1 = vreinterpret_f16_u32(_r01_1.val[0]); |
| | _r2 = vreinterpret_f16_u32(_r01_0.val[1]); |
| | _r3 = vreinterpret_f16_u32(_r01_1.val[1]); |
| | _r4 = vreinterpret_f16_u32(_r23_0.val[0]); |
| | _r5 = vreinterpret_f16_u32(_r23_1.val[0]); |
| | _r6 = vreinterpret_f16_u32(_r23_0.val[1]); |
| | _r7 = vreinterpret_f16_u32(_r23_1.val[1]); |
| | } |
| |
|
| | static inline void transpose4x12_ph(float16x4_t& _r0, float16x4_t& _r1, float16x4_t& _r2, float16x4_t& _r3, float16x4_t& _r4, float16x4_t& _r5, float16x4_t& _r6, float16x4_t& _r7, float16x4_t& _r8, float16x4_t& _r9, float16x4_t& _ra, float16x4_t& _rb) |
| | { |
| | uint16x4x2_t _r01z = vzip_u16(vreinterpret_u16_f16(_r0), vreinterpret_u16_f16(_r1)); |
| | uint16x4x2_t _r23z = vzip_u16(vreinterpret_u16_f16(_r2), vreinterpret_u16_f16(_r3)); |
| | uint16x4x2_t _r45z = vzip_u16(vreinterpret_u16_f16(_r4), vreinterpret_u16_f16(_r5)); |
| | uint16x4x2_t _r67z = vzip_u16(vreinterpret_u16_f16(_r6), vreinterpret_u16_f16(_r7)); |
| | uint16x4x2_t _r89z = vzip_u16(vreinterpret_u16_f16(_r8), vreinterpret_u16_f16(_r9)); |
| | uint16x4x2_t _rabz = vzip_u16(vreinterpret_u16_f16(_ra), vreinterpret_u16_f16(_rb)); |
| | uint32x2x2_t _r01_0 = vzip_u32(vreinterpret_u32_u16(_r01z.val[0]), vreinterpret_u32_u16(_r23z.val[0])); |
| | uint32x2x2_t _r23_0 = vzip_u32(vreinterpret_u32_u16(_r01z.val[1]), vreinterpret_u32_u16(_r23z.val[1])); |
| | uint32x2x2_t _r01_1 = vzip_u32(vreinterpret_u32_u16(_r45z.val[0]), vreinterpret_u32_u16(_r67z.val[0])); |
| | uint32x2x2_t _r23_1 = vzip_u32(vreinterpret_u32_u16(_r45z.val[1]), vreinterpret_u32_u16(_r67z.val[1])); |
| | uint32x2x2_t _r01_2 = vzip_u32(vreinterpret_u32_u16(_r89z.val[0]), vreinterpret_u32_u16(_rabz.val[0])); |
| | uint32x2x2_t _r23_2 = vzip_u32(vreinterpret_u32_u16(_r89z.val[1]), vreinterpret_u32_u16(_rabz.val[1])); |
| | _r0 = vreinterpret_f16_u32(_r01_0.val[0]); |
| | _r1 = vreinterpret_f16_u32(_r01_1.val[0]); |
| | _r2 = vreinterpret_f16_u32(_r01_2.val[0]); |
| | _r3 = vreinterpret_f16_u32(_r01_0.val[1]); |
| | _r4 = vreinterpret_f16_u32(_r01_1.val[1]); |
| | _r5 = vreinterpret_f16_u32(_r01_2.val[1]); |
| | _r6 = vreinterpret_f16_u32(_r23_0.val[0]); |
| | _r7 = vreinterpret_f16_u32(_r23_1.val[0]); |
| | _r8 = vreinterpret_f16_u32(_r23_2.val[0]); |
| | _r9 = vreinterpret_f16_u32(_r23_0.val[1]); |
| | _ra = vreinterpret_f16_u32(_r23_1.val[1]); |
| | _rb = vreinterpret_f16_u32(_r23_2.val[1]); |
| | } |
| |
|
| | static inline void transpose8x4_ph(float16x8_t& _r0, float16x8_t& _r1, float16x8_t& _r2, float16x8_t& _r3) |
| | { |
| | uint16x8x2_t _r01t = vzipq_u16(vreinterpretq_u16_f16(_r0), vreinterpretq_u16_f16(_r1)); |
| | uint16x8x2_t _r23t = vzipq_u16(vreinterpretq_u16_f16(_r2), vreinterpretq_u16_f16(_r3)); |
| | uint32x4x2_t _r01 = vzipq_u32(vreinterpretq_u32_u16(_r01t.val[0]), vreinterpretq_u32_u16(_r23t.val[0])); |
| | uint32x4x2_t _r23 = vzipq_u32(vreinterpretq_u32_u16(_r01t.val[1]), vreinterpretq_u32_u16(_r23t.val[1])); |
| | _r0 = vreinterpretq_f16_u32(_r01.val[0]); |
| | _r1 = vreinterpretq_f16_u32(_r01.val[1]); |
| | _r2 = vreinterpretq_f16_u32(_r23.val[0]); |
| | _r3 = vreinterpretq_f16_u32(_r23.val[1]); |
| | } |
| |
|
| | static inline void transpose8x8_ph(float16x8_t& _r0, float16x8_t& _r1, float16x8_t& _r2, float16x8_t& _r3, float16x8_t& _r4, float16x8_t& _r5, float16x8_t& _r6, float16x8_t& _r7) |
| | { |
| | uint16x8x2_t _r01t = vzipq_u16(vreinterpretq_u16_f16(_r0), vreinterpretq_u16_f16(_r1)); |
| | uint16x8x2_t _r23t = vzipq_u16(vreinterpretq_u16_f16(_r2), vreinterpretq_u16_f16(_r3)); |
| | uint16x8x2_t _r45t = vzipq_u16(vreinterpretq_u16_f16(_r4), vreinterpretq_u16_f16(_r5)); |
| | uint16x8x2_t _r67t = vzipq_u16(vreinterpretq_u16_f16(_r6), vreinterpretq_u16_f16(_r7)); |
| | uint32x4x2_t _r01_0 = vzipq_u32(vreinterpretq_u32_u16(_r01t.val[0]), vreinterpretq_u32_u16(_r23t.val[0])); |
| | uint32x4x2_t _r23_0 = vzipq_u32(vreinterpretq_u32_u16(_r01t.val[1]), vreinterpretq_u32_u16(_r23t.val[1])); |
| | uint32x4x2_t _r01_1 = vzipq_u32(vreinterpretq_u32_u16(_r45t.val[0]), vreinterpretq_u32_u16(_r67t.val[0])); |
| | uint32x4x2_t _r23_1 = vzipq_u32(vreinterpretq_u32_u16(_r45t.val[1]), vreinterpretq_u32_u16(_r67t.val[1])); |
| | _r0 = vreinterpretq_f16_u32(vcombine_u32(vget_low_u32(_r01_0.val[0]), vget_low_u32(_r01_1.val[0]))); |
| | _r1 = vreinterpretq_f16_u32(vcombine_u32(vget_high_u32(_r01_0.val[0]), vget_high_u32(_r01_1.val[0]))); |
| | _r2 = vreinterpretq_f16_u32(vcombine_u32(vget_low_u32(_r01_0.val[1]), vget_low_u32(_r01_1.val[1]))); |
| | _r3 = vreinterpretq_f16_u32(vcombine_u32(vget_high_u32(_r01_0.val[1]), vget_high_u32(_r01_1.val[1]))); |
| | _r4 = vreinterpretq_f16_u32(vcombine_u32(vget_low_u32(_r23_0.val[0]), vget_low_u32(_r23_1.val[0]))); |
| | _r5 = vreinterpretq_f16_u32(vcombine_u32(vget_high_u32(_r23_0.val[0]), vget_high_u32(_r23_1.val[0]))); |
| | _r6 = vreinterpretq_f16_u32(vcombine_u32(vget_low_u32(_r23_0.val[1]), vget_low_u32(_r23_1.val[1]))); |
| | _r7 = vreinterpretq_f16_u32(vcombine_u32(vget_high_u32(_r23_0.val[1]), vget_high_u32(_r23_1.val[1]))); |
| | } |
| |
|
| | static inline void transpose8x12_ph(float16x8_t& _r0, float16x8_t& _r1, float16x8_t& _r2, float16x8_t& _r3, float16x8_t& _r4, float16x8_t& _r5, float16x8_t& _r6, float16x8_t& _r7, float16x8_t& _r8, float16x8_t& _r9, float16x8_t& _ra, float16x8_t& _rb) |
| | { |
| | uint16x8x2_t _r01t = vzipq_u16(vreinterpretq_u16_f16(_r0), vreinterpretq_u16_f16(_r1)); |
| | uint16x8x2_t _r23t = vzipq_u16(vreinterpretq_u16_f16(_r2), vreinterpretq_u16_f16(_r3)); |
| | uint16x8x2_t _r45t = vzipq_u16(vreinterpretq_u16_f16(_r4), vreinterpretq_u16_f16(_r5)); |
| | uint16x8x2_t _r67t = vzipq_u16(vreinterpretq_u16_f16(_r6), vreinterpretq_u16_f16(_r7)); |
| | uint16x8x2_t _r89t = vzipq_u16(vreinterpretq_u16_f16(_r8), vreinterpretq_u16_f16(_r9)); |
| | uint16x8x2_t _rabt = vzipq_u16(vreinterpretq_u16_f16(_ra), vreinterpretq_u16_f16(_rb)); |
| | uint32x4x2_t _r01_0 = vzipq_u32(vreinterpretq_u32_u16(_r01t.val[0]), vreinterpretq_u32_u16(_r23t.val[0])); |
| | uint32x4x2_t _r23_0 = vzipq_u32(vreinterpretq_u32_u16(_r01t.val[1]), vreinterpretq_u32_u16(_r23t.val[1])); |
| | uint32x4x2_t _r01_1 = vzipq_u32(vreinterpretq_u32_u16(_r45t.val[0]), vreinterpretq_u32_u16(_r67t.val[0])); |
| | uint32x4x2_t _r23_1 = vzipq_u32(vreinterpretq_u32_u16(_r45t.val[1]), vreinterpretq_u32_u16(_r67t.val[1])); |
| | uint32x4x2_t _r01_2 = vzipq_u32(vreinterpretq_u32_u16(_r89t.val[0]), vreinterpretq_u32_u16(_rabt.val[0])); |
| | uint32x4x2_t _r23_2 = vzipq_u32(vreinterpretq_u32_u16(_r89t.val[1]), vreinterpretq_u32_u16(_rabt.val[1])); |
| | _r0 = vreinterpretq_f16_u32(vcombine_u32(vget_low_u32(_r01_0.val[0]), vget_low_u32(_r01_1.val[0]))); |
| | _r1 = vreinterpretq_f16_u32(vcombine_u32(vget_low_u32(_r01_2.val[0]), vget_high_u32(_r01_0.val[0]))); |
| | _r2 = vreinterpretq_f16_u32(vcombine_u32(vget_high_u32(_r01_1.val[0]), vget_high_u32(_r01_2.val[0]))); |
| | _r3 = vreinterpretq_f16_u32(vcombine_u32(vget_low_u32(_r01_0.val[1]), vget_low_u32(_r01_1.val[1]))); |
| | _r4 = vreinterpretq_f16_u32(vcombine_u32(vget_low_u32(_r01_2.val[1]), vget_high_u32(_r01_0.val[1]))); |
| | _r5 = vreinterpretq_f16_u32(vcombine_u32(vget_high_u32(_r01_1.val[1]), vget_high_u32(_r01_2.val[1]))); |
| | _r6 = vreinterpretq_f16_u32(vcombine_u32(vget_low_u32(_r23_0.val[0]), vget_low_u32(_r23_1.val[0]))); |
| | _r7 = vreinterpretq_f16_u32(vcombine_u32(vget_low_u32(_r23_2.val[0]), vget_high_u32(_r23_0.val[0]))); |
| | _r8 = vreinterpretq_f16_u32(vcombine_u32(vget_high_u32(_r23_1.val[0]), vget_high_u32(_r23_2.val[0]))); |
| | _r9 = vreinterpretq_f16_u32(vcombine_u32(vget_low_u32(_r23_0.val[1]), vget_low_u32(_r23_1.val[1]))); |
| | _ra = vreinterpretq_f16_u32(vcombine_u32(vget_low_u32(_r23_2.val[1]), vget_high_u32(_r23_0.val[1]))); |
| | _rb = vreinterpretq_f16_u32(vcombine_u32(vget_high_u32(_r23_1.val[1]), vget_high_u32(_r23_2.val[1]))); |
| | } |
| |
|
| | static inline void transpose12x4_ph(float16x4_t& _r0l, float16x4_t& _r0m, float16x4_t& _r0h, |
| | float16x4_t& _r1l, float16x4_t& _r1m, float16x4_t& _r1h, |
| | float16x4_t& _r2l, float16x4_t& _r2m, float16x4_t& _r2h, |
| | float16x4_t& _r3l, float16x4_t& _r3m, float16x4_t& _r3h) |
| | { |
| | uint16x4x2_t _r01lz = vzip_u16(vreinterpret_u16_f16(_r0l), vreinterpret_u16_f16(_r1l)); |
| | uint16x4x2_t _r23lz = vzip_u16(vreinterpret_u16_f16(_r2l), vreinterpret_u16_f16(_r3l)); |
| | uint16x4x2_t _r01mz = vzip_u16(vreinterpret_u16_f16(_r0m), vreinterpret_u16_f16(_r1m)); |
| | uint16x4x2_t _r23mz = vzip_u16(vreinterpret_u16_f16(_r2m), vreinterpret_u16_f16(_r3m)); |
| | uint16x4x2_t _r01hz = vzip_u16(vreinterpret_u16_f16(_r0h), vreinterpret_u16_f16(_r1h)); |
| | uint16x4x2_t _r23hz = vzip_u16(vreinterpret_u16_f16(_r2h), vreinterpret_u16_f16(_r3h)); |
| | uint32x2x2_t _r01 = vzip_u32(vreinterpret_u32_u16(_r01lz.val[0]), vreinterpret_u32_u16(_r23lz.val[0])); |
| | uint32x2x2_t _r23 = vzip_u32(vreinterpret_u32_u16(_r01lz.val[1]), vreinterpret_u32_u16(_r23lz.val[1])); |
| | uint32x2x2_t _r45 = vzip_u32(vreinterpret_u32_u16(_r01mz.val[0]), vreinterpret_u32_u16(_r23mz.val[0])); |
| | uint32x2x2_t _r67 = vzip_u32(vreinterpret_u32_u16(_r01mz.val[1]), vreinterpret_u32_u16(_r23mz.val[1])); |
| | uint32x2x2_t _r89 = vzip_u32(vreinterpret_u32_u16(_r01hz.val[0]), vreinterpret_u32_u16(_r23hz.val[0])); |
| | uint32x2x2_t _rab = vzip_u32(vreinterpret_u32_u16(_r01hz.val[1]), vreinterpret_u32_u16(_r23hz.val[1])); |
| | _r0l = vreinterpret_f16_u32(_r01.val[0]); |
| | _r0m = vreinterpret_f16_u32(_r01.val[1]); |
| | _r0h = vreinterpret_f16_u32(_r23.val[0]); |
| | _r1l = vreinterpret_f16_u32(_r23.val[1]); |
| | _r1m = vreinterpret_f16_u32(_r45.val[0]); |
| | _r1h = vreinterpret_f16_u32(_r45.val[1]); |
| | _r2l = vreinterpret_f16_u32(_r67.val[0]); |
| | _r2m = vreinterpret_f16_u32(_r67.val[1]); |
| | _r2h = vreinterpret_f16_u32(_r89.val[0]); |
| | _r3l = vreinterpret_f16_u32(_r89.val[1]); |
| | _r3m = vreinterpret_f16_u32(_rab.val[0]); |
| | _r3h = vreinterpret_f16_u32(_rab.val[1]); |
| | } |
| | #endif |
| |
|
| | #endif |
| | #endif |
| |
|
| | #endif |
| |
|