// Definition of the public simd interfaces -*- C++ -*- // Copyright (C) 2020-2021 Free Software Foundation, Inc. // // This file is part of the GNU ISO C++ Library. This library is free // software; you can redistribute it and/or modify it under the // terms of the GNU General Public License as published by the // Free Software Foundation; either version 3, or (at your option) // any later version. // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU General Public License for more details. // Under Section 7 of GPL version 3, you are granted additional // permissions described in the GCC Runtime Library Exception, version // 3.1, as published by the Free Software Foundation. // You should have received a copy of the GNU General Public License and // a copy of the GCC Runtime Library Exception along with this program; // see the files COPYING3 and COPYING.RUNTIME respectively. If not, see // . #ifndef _GLIBCXX_EXPERIMENTAL_SIMD_H #define _GLIBCXX_EXPERIMENTAL_SIMD_H #if __cplusplus >= 201703L #include "simd_detail.h" #include "numeric_traits.h" #include #include #ifdef _GLIBCXX_DEBUG_UB #include // for stderr #endif #include #include #include #include #if _GLIBCXX_SIMD_X86INTRIN #include #elif _GLIBCXX_SIMD_HAVE_NEON #include #endif /* There are several closely related types, with the following naming * convention: * _Tp: vectorizable (arithmetic) type (or any type) * _TV: __vector_type_t<_Tp, _Np> * _TW: _SimdWrapper<_Tp, _Np> * _TI: __intrinsic_type_t<_Tp, _Np> * _TVT: _VectorTraits<_TV> or _VectorTraits<_TW> * If one additional type is needed use _U instead of _T. * Otherwise use _T\d, _TV\d, _TW\d, TI\d, _TVT\d. * * More naming conventions: * _Ap or _Abi: An ABI tag from the simd_abi namespace * _Ip: often used for integer types with sizeof(_Ip) == sizeof(_Tp), * _IV, _IW as for _TV, _TW * _Np: number of elements (not bytes) * _Bytes: number of bytes * * Variable names: * __k: mask object (vector- or bitmask) */ _GLIBCXX_SIMD_BEGIN_NAMESPACE #if !_GLIBCXX_SIMD_X86INTRIN using __m128 [[__gnu__::__vector_size__(16)]] = float; using __m128d [[__gnu__::__vector_size__(16)]] = double; using __m128i [[__gnu__::__vector_size__(16)]] = long long; using __m256 [[__gnu__::__vector_size__(32)]] = float; using __m256d [[__gnu__::__vector_size__(32)]] = double; using __m256i [[__gnu__::__vector_size__(32)]] = long long; using __m512 [[__gnu__::__vector_size__(64)]] = float; using __m512d [[__gnu__::__vector_size__(64)]] = double; using __m512i [[__gnu__::__vector_size__(64)]] = long long; #endif namespace simd_abi { // simd_abi forward declarations {{{ // implementation details: struct _Scalar; template struct _Fixed; // There are two major ABIs that appear on different architectures. // Both have non-boolean values packed into an N Byte register // -> #elements = N / sizeof(T) // Masks differ: // 1. Use value vector registers for masks (all 0 or all 1) // 2. Use bitmasks (mask registers) with one bit per value in the corresponding // value vector // // Both can be partially used, masking off the rest when doing horizontal // operations or operations that can trap (e.g. FP_INVALID or integer division // by 0). This is encoded as the number of used bytes. template struct _VecBuiltin; template struct _VecBltnBtmsk; template using _VecN = _VecBuiltin; template using _Sse = _VecBuiltin<_UsedBytes>; template using _Avx = _VecBuiltin<_UsedBytes>; template using _Avx512 = _VecBltnBtmsk<_UsedBytes>; template using _Neon = _VecBuiltin<_UsedBytes>; // implementation-defined: using __sse = _Sse<>; using __avx = _Avx<>; using __avx512 = _Avx512<>; using __neon = _Neon<>; using __neon128 = _Neon<16>; using __neon64 = _Neon<8>; // standard: template struct deduce; template using fixed_size = _Fixed<_Np>; using scalar = _Scalar; // }}} } // namespace simd_abi // forward declarations is_simd(_mask), simd(_mask), simd_size {{{ template struct is_simd; template struct is_simd_mask; template class simd; template class simd_mask; template struct simd_size; // }}} // load/store flags {{{ struct element_aligned_tag { template static constexpr size_t _S_alignment = alignof(_Up); template _GLIBCXX_SIMD_INTRINSIC static constexpr _Up* _S_apply(_Up* __ptr) { return __ptr; } }; struct vector_aligned_tag { template static constexpr size_t _S_alignment = std::__bit_ceil(sizeof(_Up) * _Tp::size()); template _GLIBCXX_SIMD_INTRINSIC static constexpr _Up* _S_apply(_Up* __ptr) { return static_cast<_Up*>( __builtin_assume_aligned(__ptr, _S_alignment<_Tp, _Up>)); } }; template struct overaligned_tag { template static constexpr size_t _S_alignment = _Np; template _GLIBCXX_SIMD_INTRINSIC static constexpr _Up* _S_apply(_Up* __ptr) { return static_cast<_Up*>(__builtin_assume_aligned(__ptr, _Np)); } }; inline constexpr element_aligned_tag element_aligned = {}; inline constexpr vector_aligned_tag vector_aligned = {}; template inline constexpr overaligned_tag<_Np> overaligned = {}; // }}} template using _SizeConstant = integral_constant; namespace __detail { struct _Minimum { template _GLIBCXX_SIMD_INTRINSIC constexpr _Tp operator()(_Tp __a, _Tp __b) const { using std::min; return min(__a, __b); } }; struct _Maximum { template _GLIBCXX_SIMD_INTRINSIC constexpr _Tp operator()(_Tp __a, _Tp __b) const { using std::max; return max(__a, __b); } }; } // namespace __detail // unrolled/pack execution helpers // __execute_n_times{{{ template _GLIBCXX_SIMD_INTRINSIC constexpr void __execute_on_index_sequence(_Fp&& __f, index_sequence<_I...>) { ((void)__f(_SizeConstant<_I>()), ...); } template _GLIBCXX_SIMD_INTRINSIC constexpr void __execute_on_index_sequence(_Fp&&, index_sequence<>) { } template _GLIBCXX_SIMD_INTRINSIC constexpr void __execute_n_times(_Fp&& __f) { __execute_on_index_sequence(static_cast<_Fp&&>(__f), make_index_sequence<_Np>{}); } // }}} // __generate_from_n_evaluations{{{ template _GLIBCXX_SIMD_INTRINSIC constexpr _R __execute_on_index_sequence_with_return(_Fp&& __f, index_sequence<_I...>) { return _R{__f(_SizeConstant<_I>())...}; } template _GLIBCXX_SIMD_INTRINSIC constexpr _R __generate_from_n_evaluations(_Fp&& __f) { return __execute_on_index_sequence_with_return<_R>( static_cast<_Fp&&>(__f), make_index_sequence<_Np>{}); } // }}} // __call_with_n_evaluations{{{ template _GLIBCXX_SIMD_INTRINSIC constexpr auto __call_with_n_evaluations(index_sequence<_I...>, _F0&& __f0, _FArgs&& __fargs) { return __f0(__fargs(_SizeConstant<_I>())...); } template _GLIBCXX_SIMD_INTRINSIC constexpr auto __call_with_n_evaluations(_F0&& __f0, _FArgs&& __fargs) { return __call_with_n_evaluations(make_index_sequence<_Np>{}, static_cast<_F0&&>(__f0), static_cast<_FArgs&&>(__fargs)); } // }}} // __call_with_subscripts{{{ template _GLIBCXX_SIMD_INTRINSIC constexpr auto __call_with_subscripts(_Tp&& __x, index_sequence<_It...>, _Fp&& __fun) { return __fun(__x[_First + _It]...); } template _GLIBCXX_SIMD_INTRINSIC constexpr auto __call_with_subscripts(_Tp&& __x, _Fp&& __fun) { return __call_with_subscripts<_First>(static_cast<_Tp&&>(__x), make_index_sequence<_Np>(), static_cast<_Fp&&>(__fun)); } // }}} // vvv ---- type traits ---- vvv // integer type aliases{{{ using _UChar = unsigned char; using _SChar = signed char; using _UShort = unsigned short; using _UInt = unsigned int; using _ULong = unsigned long; using _ULLong = unsigned long long; using _LLong = long long; //}}} // __first_of_pack{{{ template struct __first_of_pack { using type = _T0; }; template using __first_of_pack_t = typename __first_of_pack<_Ts...>::type; //}}} // __value_type_or_identity_t {{{ template typename _Tp::value_type __value_type_or_identity_impl(int); template _Tp __value_type_or_identity_impl(float); template using __value_type_or_identity_t = decltype(__value_type_or_identity_impl<_Tp>(int())); // }}} // __is_vectorizable {{{ template struct __is_vectorizable : public is_arithmetic<_Tp> {}; template <> struct __is_vectorizable : public false_type {}; template inline constexpr bool __is_vectorizable_v = __is_vectorizable<_Tp>::value; // Deduces to a vectorizable type template >> using _Vectorizable = _Tp; // }}} // _LoadStorePtr / __is_possible_loadstore_conversion {{{ template struct __is_possible_loadstore_conversion : conjunction<__is_vectorizable<_Ptr>, __is_vectorizable<_ValueType>> {}; template <> struct __is_possible_loadstore_conversion : true_type {}; // Deduces to a type allowed for load/store with the given value type. template ::value>> using _LoadStorePtr = _Ptr; // }}} // __is_bitmask{{{ template > struct __is_bitmask : false_type {}; template inline constexpr bool __is_bitmask_v = __is_bitmask<_Tp>::value; // the __mmaskXX case: template struct __is_bitmask<_Tp, void_t() = declval<_Tp>() & 1u)>> : true_type {}; // }}} // __int_for_sizeof{{{ #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wpedantic" template constexpr auto __int_for_sizeof() { if constexpr (_Bytes == sizeof(int)) return int(); #ifdef __clang__ else if constexpr (_Bytes == sizeof(char)) return char(); #else else if constexpr (_Bytes == sizeof(_SChar)) return _SChar(); #endif else if constexpr (_Bytes == sizeof(short)) return short(); #ifndef __clang__ else if constexpr (_Bytes == sizeof(long)) return long(); #endif else if constexpr (_Bytes == sizeof(_LLong)) return _LLong(); #ifdef __SIZEOF_INT128__ else if constexpr (_Bytes == sizeof(__int128)) return __int128(); #endif // __SIZEOF_INT128__ else if constexpr (_Bytes % sizeof(int) == 0) { constexpr size_t _Np = _Bytes / sizeof(int); struct _Ip { int _M_data[_Np]; _GLIBCXX_SIMD_INTRINSIC constexpr _Ip operator&(_Ip __rhs) const { return __generate_from_n_evaluations<_Np, _Ip>( [&](auto __i) { return __rhs._M_data[__i] & _M_data[__i]; }); } _GLIBCXX_SIMD_INTRINSIC constexpr _Ip operator|(_Ip __rhs) const { return __generate_from_n_evaluations<_Np, _Ip>( [&](auto __i) { return __rhs._M_data[__i] | _M_data[__i]; }); } _GLIBCXX_SIMD_INTRINSIC constexpr _Ip operator^(_Ip __rhs) const { return __generate_from_n_evaluations<_Np, _Ip>( [&](auto __i) { return __rhs._M_data[__i] ^ _M_data[__i]; }); } _GLIBCXX_SIMD_INTRINSIC constexpr _Ip operator~() const { return __generate_from_n_evaluations<_Np, _Ip>( [&](auto __i) { return ~_M_data[__i]; }); } }; return _Ip{}; } else static_assert(_Bytes != _Bytes, "this should be unreachable"); } #pragma GCC diagnostic pop template using __int_for_sizeof_t = decltype(__int_for_sizeof()); template using __int_with_sizeof_t = decltype(__int_for_sizeof<_Np>()); // }}} // __is_fixed_size_abi{{{ template struct __is_fixed_size_abi : false_type {}; template struct __is_fixed_size_abi> : true_type {}; template inline constexpr bool __is_fixed_size_abi_v = __is_fixed_size_abi<_Tp>::value; // }}} // constexpr feature detection{{{ constexpr inline bool __have_mmx = _GLIBCXX_SIMD_HAVE_MMX; constexpr inline bool __have_sse = _GLIBCXX_SIMD_HAVE_SSE; constexpr inline bool __have_sse2 = _GLIBCXX_SIMD_HAVE_SSE2; constexpr inline bool __have_sse3 = _GLIBCXX_SIMD_HAVE_SSE3; constexpr inline bool __have_ssse3 = _GLIBCXX_SIMD_HAVE_SSSE3; constexpr inline bool __have_sse4_1 = _GLIBCXX_SIMD_HAVE_SSE4_1; constexpr inline bool __have_sse4_2 = _GLIBCXX_SIMD_HAVE_SSE4_2; constexpr inline bool __have_xop = _GLIBCXX_SIMD_HAVE_XOP; constexpr inline bool __have_avx = _GLIBCXX_SIMD_HAVE_AVX; constexpr inline bool __have_avx2 = _GLIBCXX_SIMD_HAVE_AVX2; constexpr inline bool __have_bmi = _GLIBCXX_SIMD_HAVE_BMI1; constexpr inline bool __have_bmi2 = _GLIBCXX_SIMD_HAVE_BMI2; constexpr inline bool __have_lzcnt = _GLIBCXX_SIMD_HAVE_LZCNT; constexpr inline bool __have_sse4a = _GLIBCXX_SIMD_HAVE_SSE4A; constexpr inline bool __have_fma = _GLIBCXX_SIMD_HAVE_FMA; constexpr inline bool __have_fma4 = _GLIBCXX_SIMD_HAVE_FMA4; constexpr inline bool __have_f16c = _GLIBCXX_SIMD_HAVE_F16C; constexpr inline bool __have_popcnt = _GLIBCXX_SIMD_HAVE_POPCNT; constexpr inline bool __have_avx512f = _GLIBCXX_SIMD_HAVE_AVX512F; constexpr inline bool __have_avx512dq = _GLIBCXX_SIMD_HAVE_AVX512DQ; constexpr inline bool __have_avx512vl = _GLIBCXX_SIMD_HAVE_AVX512VL; constexpr inline bool __have_avx512bw = _GLIBCXX_SIMD_HAVE_AVX512BW; constexpr inline bool __have_avx512dq_vl = __have_avx512dq && __have_avx512vl; constexpr inline bool __have_avx512bw_vl = __have_avx512bw && __have_avx512vl; constexpr inline bool __have_neon = _GLIBCXX_SIMD_HAVE_NEON; constexpr inline bool __have_neon_a32 = _GLIBCXX_SIMD_HAVE_NEON_A32; constexpr inline bool __have_neon_a64 = _GLIBCXX_SIMD_HAVE_NEON_A64; constexpr inline bool __support_neon_float = #if defined __GCC_IEC_559 __GCC_IEC_559 == 0; #elif defined __FAST_MATH__ true; #else false; #endif #ifdef _ARCH_PWR10 constexpr inline bool __have_power10vec = true; #else constexpr inline bool __have_power10vec = false; #endif #ifdef __POWER9_VECTOR__ constexpr inline bool __have_power9vec = true; #else constexpr inline bool __have_power9vec = false; #endif #if defined __POWER8_VECTOR__ constexpr inline bool __have_power8vec = true; #else constexpr inline bool __have_power8vec = __have_power9vec; #endif #if defined __VSX__ constexpr inline bool __have_power_vsx = true; #else constexpr inline bool __have_power_vsx = __have_power8vec; #endif #if defined __ALTIVEC__ constexpr inline bool __have_power_vmx = true; #else constexpr inline bool __have_power_vmx = __have_power_vsx; #endif // }}} // __is_scalar_abi {{{ template constexpr bool __is_scalar_abi() { return is_same_v; } // }}} // __abi_bytes_v {{{ template