#ifndef EIGEN_TYPE_CASTING_AVX_H
#define EIGEN_TYPE_CASTING_AVX_H
#include "../../InternalHeaderCheck.h"

namespace Eigen {
namespace internal {

#ifndef EIGEN_VECTORIZE_AVX512
template <>
struct type_casting_traits<float, bool> : vectorized_type_casting_traits<float, bool> {};
template <>
struct type_casting_traits<bool, float> : vectorized_type_casting_traits<bool, float> {};

template <>
struct type_casting_traits<float, int> : vectorized_type_casting_traits<float, int> {};
template <>
struct type_casting_traits<int, float> : vectorized_type_casting_traits<int, float> {};

template <>
struct type_casting_traits<float, double> : vectorized_type_casting_traits<float, double> {};
template <>
struct type_casting_traits<double, float> : vectorized_type_casting_traits<double, float> {};

template <>
struct type_casting_traits<double, int> : vectorized_type_casting_traits<double, int> {};
template <>
struct type_casting_traits<int, double> : vectorized_type_casting_traits<int, double> {};

template <>
struct type_casting_traits<half, float> : vectorized_type_casting_traits<half, float> {};
template <>
struct type_casting_traits<float, half> : vectorized_type_casting_traits<float, half> {};

template <>
struct type_casting_traits<bfloat16, float> : vectorized_type_casting_traits<bfloat16, float> {};
template <>
struct type_casting_traits<float, bfloat16> : vectorized_type_casting_traits<float, bfloat16> {};
#ifdef EIGEN_VECTORIZE_AVX2
template <>
struct type_casting_traits<double, int64_t> : vectorized_type_casting_traits<double, int64_t> {};
template <>
struct type_casting_traits<int64_t, double> : vectorized_type_casting_traits<int64_t, double> {};
#endif
#endif  // EIGEN_VECTORIZE_AVX512

template <>
EIGEN_STRONG_INLINE Packet16b pcast<Packet8f, Packet16b>(const Packet8f& a, const Packet8f& b) {
  __m256 nonzero_a = _mm256_cmp_ps(a, pzero(a), _CMP_NEQ_UQ);
  __m256 nonzero_b = _mm256_cmp_ps(b, pzero(b), _CMP_NEQ_UQ);
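
  // Each 32-bit lane of nonzero_a/nonzero_b is all-ones when the corresponding float is nonzero.
  // The byte shuffles below gather byte 0 of every lane into 16 consecutive bytes (a -> bytes 0-7,
  // b -> bytes 8-15); the final AND with 1 turns 0xFF into boolean true. Shuffle-control bytes with
  // the most significant bit set (kFF) zero out the destination byte.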
  constexpr char kFF = '\255';
#ifndef EIGEN_VECTORIZE_AVX2
  // Without AVX2 there is no 256-bit byte shuffle, so work on 128-bit halves and OR the pieces together.
  __m128i shuffle_mask128_a_lo = _mm_set_epi8(kFF, kFF, kFF, kFF, kFF, kFF, kFF, kFF, kFF, kFF, kFF, kFF, 12, 8, 4, 0);
  __m128i shuffle_mask128_a_hi = _mm_set_epi8(kFF, kFF, kFF, kFF, kFF, kFF, kFF, kFF, 12, 8, 4, 0, kFF, kFF, kFF, kFF);
  __m128i shuffle_mask128_b_lo = _mm_set_epi8(kFF, kFF, kFF, kFF, 12, 8, 4, 0, kFF, kFF, kFF, kFF, kFF, kFF, kFF, kFF);
  __m128i shuffle_mask128_b_hi = _mm_set_epi8(12, 8, 4, 0, kFF, kFF, kFF, kFF, kFF, kFF, kFF, kFF, kFF, kFF, kFF, kFF);
  __m128i a_hi = _mm_shuffle_epi8(_mm256_extractf128_si256(_mm256_castps_si256(nonzero_a), 1), shuffle_mask128_a_hi);
  __m128i a_lo = _mm_shuffle_epi8(_mm256_extractf128_si256(_mm256_castps_si256(nonzero_a), 0), shuffle_mask128_a_lo);
  __m128i b_hi = _mm_shuffle_epi8(_mm256_extractf128_si256(_mm256_castps_si256(nonzero_b), 1), shuffle_mask128_b_hi);
  __m128i b_lo = _mm_shuffle_epi8(_mm256_extractf128_si256(_mm256_castps_si256(nonzero_b), 0), shuffle_mask128_b_lo);
  __m128i merged = _mm_or_si128(_mm_or_si128(b_lo, b_hi), _mm_or_si128(a_lo, a_hi));
  return _mm_and_si128(merged, _mm_set1_epi8(1));
#else
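  // With AVX2 the shuffle is done on the full 256-bit registers. _mm256_shuffle_epi8 works within each
  // 128-bit lane, so the upper lane of a_or_b is folded into the lower 128 bits before masking with 1.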
  __m256i a_shuffle_mask = _mm256_set_epi8(kFF, kFF, kFF, kFF, kFF, kFF, kFF, kFF, 12, 8, 4, 0, kFF, kFF, kFF, kFF, kFF,
                                           kFF, kFF, kFF, kFF, kFF, kFF, kFF, kFF, kFF, kFF, kFF, 12, 8, 4, 0);
  __m256i b_shuffle_mask = _mm256_set_epi8(12, 8, 4, 0, kFF, kFF, kFF, kFF, kFF, kFF, kFF, kFF, kFF, kFF, kFF, kFF, kFF,
                                           kFF, kFF, kFF, 12, 8, 4, 0, kFF, kFF, kFF, kFF, kFF, kFF, kFF, kFF);
  __m256i a_shuff = _mm256_shuffle_epi8(_mm256_castps_si256(nonzero_a), a_shuffle_mask);
  __m256i b_shuff = _mm256_shuffle_epi8(_mm256_castps_si256(nonzero_b), b_shuffle_mask);
  __m256i a_or_b = _mm256_or_si256(a_shuff, b_shuff);
  __m256i merged = _mm256_or_si256(a_or_b, _mm256_castsi128_si256(_mm256_extractf128_si256(a_or_b, 1)));
  return _mm256_castsi256_si128(_mm256_and_si256(merged, _mm256_set1_epi8(1)));
#endif
}

template <>
EIGEN_STRONG_INLINE Packet8f pcast<Packet16b, Packet8f>(const Packet16b& a) {
  const __m256 cst_one = _mm256_set1_ps(1.0f);
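  // Build a 32-bit all-ones mask for every input byte that is zero (false); the andnot with 1.0f
  // below then yields 0.0f for false and 1.0f for true.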
#ifdef EIGEN_VECTORIZE_AVX2
  __m256i a_extended = _mm256_cvtepi8_epi32(a);
  __m256i abcd_efgh = _mm256_cmpeq_epi32(a_extended, _mm256_setzero_si256());
#else
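  // Without AVX2, widen the first eight byte-masks to 32-bit masks by duplicating each byte
  // four times with unpack operations.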
  __m128i abcd_efhg_ijkl_mnop = _mm_cmpeq_epi8(a, _mm_setzero_si128());
  __m128i aabb_ccdd_eeff_gghh = _mm_unpacklo_epi8(abcd_efhg_ijkl_mnop, abcd_efhg_ijkl_mnop);
  __m128i aaaa_bbbb_cccc_dddd = _mm_unpacklo_epi8(aabb_ccdd_eeff_gghh, aabb_ccdd_eeff_gghh);
  __m128i eeee_ffff_gggg_hhhh = _mm_unpackhi_epi8(aabb_ccdd_eeff_gghh, aabb_ccdd_eeff_gghh);
  __m256i abcd_efgh = _mm256_setr_m128i(aaaa_bbbb_cccc_dddd, eeee_ffff_gggg_hhhh);
#endif
  __m256 result = _mm256_andnot_ps(_mm256_castsi256_ps(abcd_efgh), cst_one);
  return result;
}

template <>
EIGEN_STRONG_INLINE Packet8i pcast<Packet8f, Packet8i>(const Packet8f& a) {
  return _mm256_cvttps_epi32(a);
}

template <>
EIGEN_STRONG_INLINE Packet8i pcast<Packet4d, Packet8i>(const Packet4d& a, const Packet4d& b) {
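  // Truncate each Packet4d to four ints and concatenate, with 'a' in the lower 128 bits.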
  return _mm256_set_m128i(_mm256_cvttpd_epi32(b), _mm256_cvttpd_epi32(a));
}

template <>
EIGEN_STRONG_INLINE Packet4i pcast<Packet4d, Packet4i>(const Packet4d& a) {
  return _mm256_cvttpd_epi32(a);
}

template <>
EIGEN_STRONG_INLINE Packet8f pcast<Packet8i, Packet8f>(const Packet8i& a) {
  return _mm256_cvtepi32_ps(a);
}

template <>
EIGEN_STRONG_INLINE Packet8f pcast<Packet4d, Packet8f>(const Packet4d& a, const Packet4d& b) {
  return _mm256_set_m128(_mm256_cvtpd_ps(b), _mm256_cvtpd_ps(a));
}

template <>
EIGEN_STRONG_INLINE Packet4f pcast<Packet4d, Packet4f>(const Packet4d& a) {
  return _mm256_cvtpd_ps(a);
}

template <>
EIGEN_STRONG_INLINE Packet4d pcast<Packet8i, Packet4d>(const Packet8i& a) {
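  // Only the lower four ints fit into a Packet4d.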
  return _mm256_cvtepi32_pd(_mm256_castsi256_si128(a));
}

template <>
EIGEN_STRONG_INLINE Packet4d pcast<Packet4i, Packet4d>(const Packet4i& a) {
  return _mm256_cvtepi32_pd(a);
}

template <>
EIGEN_STRONG_INLINE Packet4d pcast<Packet8f, Packet4d>(const Packet8f& a) {
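  // Only the lower four floats fit into a Packet4d.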
  return _mm256_cvtps_pd(_mm256_castps256_ps128(a));
}

template <>
EIGEN_STRONG_INLINE Packet4d pcast<Packet4f, Packet4d>(const Packet4f& a) {
  return _mm256_cvtps_pd(a);
}

template <>
EIGEN_STRONG_INLINE Packet8i preinterpret<Packet8i, Packet8f>(const Packet8f& a) {
  return _mm256_castps_si256(a);
}

template <>
EIGEN_STRONG_INLINE Packet8f preinterpret<Packet8f, Packet8i>(const Packet8i& a) {
  return _mm256_castsi256_ps(a);
}

template <>
EIGEN_STRONG_INLINE Packet8ui preinterpret<Packet8ui, Packet8i>(const Packet8i& a) {
  // Signed and unsigned 32-bit integer packets share the same __m256i representation.
  return Packet8ui(a);
}

template <>
EIGEN_STRONG_INLINE Packet8i preinterpret<Packet8i, Packet8ui>(const Packet8ui& a) {
  return Packet8i(a);
}

template <>
EIGEN_STRONG_INLINE Packet4f preinterpret<Packet4f, Packet8f>(const Packet8f& a) {
  return _mm256_castps256_ps128(a);
}

template <>
EIGEN_STRONG_INLINE Packet2d preinterpret<Packet2d, Packet4d>(const Packet4d& a) {
  return _mm256_castpd256_pd128(a);
}

template <>
EIGEN_STRONG_INLINE Packet4i preinterpret<Packet4i, Packet8i>(const Packet8i& a) {
  return _mm256_castsi256_si128(a);
}

template <>
EIGEN_STRONG_INLINE Packet4ui preinterpret<Packet4ui, Packet8ui>(const Packet8ui& a) {
  return _mm256_castsi256_si128(a);
}

#ifdef EIGEN_VECTORIZE_AVX2
template <>
EIGEN_STRONG_INLINE Packet4l pcast<Packet4d, Packet4l>(const Packet4d& a) {
#if defined(EIGEN_VECTORIZE_AVX512DQ) && defined(EIGEN_VECTORIZE_AVX512VL)
  return _mm256_cvttpd_epi64(a);
#else
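  // Emulate the conversion by decoding the IEEE-754 bit pattern directly: extract the unbiased
  // exponent, realign the mantissa, add back the implicit leading bit, and restore the sign.
  // Inputs outside the range of int64_t give unspecified results.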
  constexpr int kTotalBits = sizeof(double) * CHAR_BIT, kMantissaBits = std::numeric_limits<double>::digits - 1,
                kExponentBits = kTotalBits - kMantissaBits - 1, kBias = (1 << (kExponentBits - 1)) - 1;

  const __m256i cst_one = _mm256_set1_epi64x(1);
  const __m256i cst_total_bits = _mm256_set1_epi64x(kTotalBits);
  const __m256i cst_bias = _mm256_set1_epi64x(kBias);

  __m256i a_bits = _mm256_castpd_si256(a);
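  // Shift left by 1 to drop the sign bit, then right by kMantissaBits + 1 to isolate the biased exponent.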
  __m256i biased_e = _mm256_srli_epi64(_mm256_slli_epi64(a_bits, 1), kMantissaBits + 1);
  __m256i e = _mm256_sub_epi64(biased_e, cst_bias);

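  // Shift left past the sign and exponent bits so only the mantissa remains, at the top of the word.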
  __m256i shifted_mantissa = _mm256_slli_epi64(a_bits, kExponentBits + 1);
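  // Shift right by kTotalBits - e to turn the mantissa into the integer part of the value
  // (shift counts >= 64, i.e. e <= 0 and |a| < 1, conveniently produce zero).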
  __m256i result_significand = _mm256_srlv_epi64(shifted_mantissa, _mm256_sub_epi64(cst_total_bits, e));

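  // The implicit leading mantissa bit contributes 2^e.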
  __m256i result_exponent = _mm256_sllv_epi64(cst_one, e);

  __m256i result = _mm256_add_epi64(result_significand, result_exponent);

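  // Negate (two's complement) the lanes whose input was negative.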
  __m256i sign_mask = _mm256_cmpgt_epi64(_mm256_setzero_si256(), a_bits);
  result = _mm256_sub_epi64(_mm256_xor_si256(result, sign_mask), sign_mask);
  return result;
#endif
}

template <>
EIGEN_STRONG_INLINE Packet4d pcast<Packet4l, Packet4d>(const Packet4l& a) {
#if defined(EIGEN_VECTORIZE_AVX512DQ) && defined(EIGEN_VECTORIZE_AVX512VL)
  return _mm256_cvtepi64_pd(a);
#else
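  // No packed int64 -> double conversion below AVX512DQ/VL: spill the packet to memory and
  // convert each lane with scalar code.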
  EIGEN_ALIGN16 int64_t aux[4];
  // aux is only 16-byte aligned, so use an unaligned store.
  pstoreu(aux, a);
  return _mm256_set_pd(static_cast<double>(aux[3]), static_cast<double>(aux[2]), static_cast<double>(aux[1]),
                       static_cast<double>(aux[0]));
#endif
}

template <>
EIGEN_STRONG_INLINE Packet4d pcast<Packet2l, Packet4d>(const Packet2l& a, const Packet2l& b) {
  return _mm256_set_m128d((pcast<Packet2l, Packet2d>(b)), (pcast<Packet2l, Packet2d>(a)));
}

template <>
EIGEN_STRONG_INLINE Packet4ul preinterpret<Packet4ul, Packet4l>(const Packet4l& a) {
  // Signed and unsigned 64-bit integer packets share the same __m256i representation.
  return Packet4ul(a);
}

template <>
EIGEN_STRONG_INLINE Packet4l preinterpret<Packet4l, Packet4ul>(const Packet4ul& a) {
  return Packet4l(a);
}

template <>
EIGEN_STRONG_INLINE Packet4l preinterpret<Packet4l, Packet4d>(const Packet4d& a) {
  return _mm256_castpd_si256(a);
}

template <>
EIGEN_STRONG_INLINE Packet4d preinterpret<Packet4d, Packet4l>(const Packet4l& a) {
  return _mm256_castsi256_pd(a);
}

template <>
EIGEN_STRONG_INLINE Packet2l preinterpret<Packet2l, Packet4l>(const Packet4l& a) {
  return _mm256_castsi256_si128(a);
}
#endif  // EIGEN_VECTORIZE_AVX2

template <>
EIGEN_STRONG_INLINE Packet8f pcast<Packet8h, Packet8f>(const Packet8h& a) {
  return half2float(a);
}

template <>
EIGEN_STRONG_INLINE Packet8f pcast<Packet8bf, Packet8f>(const Packet8bf& a) {
  return Bf16ToF32(a);
}

template <>
EIGEN_STRONG_INLINE Packet8h pcast<Packet8f, Packet8h>(const Packet8f& a) {
  return float2half(a);
}

template <>
EIGEN_STRONG_INLINE Packet8bf pcast<Packet8f, Packet8bf>(const Packet8f& a) {
  return F32ToBf16(a);
}

}  // end namespace internal
}  // end namespace Eigen

#endif  // EIGEN_TYPE_CASTING_AVX_H