13#ifndef EIGEN_PACKET_MATH_MSA_H
14#define EIGEN_PACKET_MATH_MSA_H
20#include "../../InternalHeaderCheck.h"
// Architecture tuning defaults; each macro is defined only if not already set.
// NOTE(review): the matching #endif lines are not present in this listing.
26#ifndef EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD
27#define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 8
// Value-less flag macro advertising a single-instruction multiply-add.
30#ifndef EIGEN_HAS_SINGLE_INSTRUCTION_MADD
31#define EIGEN_HAS_SINGLE_INSTRUCTION_MADD
// Default number of registers assumed for this architecture.
34#ifndef EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS
35#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 32
// Debug-trace macro: the visible variant declares a function-local `firstTime`
// flag and prints file:line:function to std::cout; a second, empty variant
// follows. NOTE(review): the preprocessor conditional choosing between the two
// variants and the remaining macro lines are missing from this listing.
39#define EIGEN_MSA_DEBUG \
40 static bool firstTime = true; \
43 std::cout << __FILE__ << ':' << __LINE__ << ':' << __FUNCTION__ << std::endl; \
48#define EIGEN_MSA_DEBUG
// Packs four 2-bit indices into one 8-bit immediate: `a` in bits 0-1, `b` in
// bits 2-3, `c` in bits 4-5, `d` in bits 6-7. Used below as the shuffle
// control operand of __builtin_msa_shf_w.
51#define EIGEN_MSA_SHF_I8(a, b, c, d) (((d) << 6) | ((c) << 4) | ((b) << 2) | (a))
// Four-lane packet aliases over the MSA vector types: float, signed 32-bit
// and unsigned 32-bit lanes respectively.
53typedef v4f32 Packet4f;
54typedef v4i32 Packet4i;
55typedef v4u32 Packet4ui;
// Helpers declaring a const packet named p4f_/p4i_/p4ui_<NAME> with all four
// lanes initialised to X.
57#define EIGEN_DECLARE_CONST_Packet4f(NAME, X) const Packet4f p4f_##NAME = {X, X, X, X}
58#define EIGEN_DECLARE_CONST_Packet4i(NAME, X) const Packet4i p4i_##NAME = {X, X, X, X}
59#define EIGEN_DECLARE_CONST_Packet4ui(NAME, X) const Packet4ui p4ui_##NAME = {X, X, X, X}
// Streams a Packet4f as "[ v0, v1, v2, v3 ]".
// NOTE(review): the return statement and closing brace are not in this listing.
61inline std::ostream& operator<<(std::ostream& os,
const Packet4f& value) {
62 os <<
"[ " << value[0] <<
", " << value[1] <<
", " << value[2] <<
", " << value[3] <<
" ]";
// Streams a Packet4i as "[ v0, v1, v2, v3 ]".
// NOTE(review): the return statement and closing brace are not in this listing.
66inline std::ostream& operator<<(std::ostream& os,
const Packet4i& value) {
67 os <<
"[ " << value[0] <<
", " << value[1] <<
", " << value[2] <<
", " << value[3] <<
" ]";
// Streams a Packet4ui as "[ v0, v1, v2, v3 ]".
// NOTE(review): the return statement and closing brace are not in this listing.
71inline std::ostream& operator<<(std::ostream& os,
const Packet4ui& value) {
72 os <<
"[ " << value[0] <<
", " << value[1] <<
", " << value[2] <<
", " << value[3] <<
" ]";
// Packet traits for float: both the full and the "half" packet are Packet4f.
// The sin/cos/tanh/erf capability flags follow EIGEN_FAST_MATH.
// NOTE(review): the enum header and the remaining flags are missing from this
// listing, so the full capability set cannot be read from here.
77struct packet_traits<float> : default_packet_traits {
78 typedef Packet4f type;
79 typedef Packet4f half;
86 HasSin = EIGEN_FAST_MATH,
87 HasCos = EIGEN_FAST_MATH,
88 HasTanh = EIGEN_FAST_MATH,
89 HasErf = EIGEN_FAST_MATH,
// Packet traits for int32_t: both the full and the "half" packet are Packet4i.
// NOTE(review): the capability enum is not visible in this listing.
99struct packet_traits<int32_t> : default_packet_traits {
100 typedef Packet4i type;
101 typedef Packet4i half;
// Reverse traits for Packet4f: masked loads/stores are not available and the
// half packet is Packet4f itself.
// NOTE(review): the scalar typedef and the other enum entries are missing here.
113struct unpacket_traits<Packet4f> {
119 masked_load_available =
false,
120 masked_store_available =
false
122 typedef Packet4f half;
// Reverse traits for Packet4i: scalar type is int32_t, masked loads/stores are
// not available and the half packet is Packet4i itself.
// NOTE(review): the other enum entries are missing from this listing.
126struct unpacket_traits<Packet4i> {
127 typedef int32_t type;
132 masked_load_available =
false,
133 masked_store_available =
false
135 typedef Packet4i half;
// Broadcast: builds a Packet4f with `from` in all four lanes.
// NOTE(review): the return statement is not visible in this listing.
139EIGEN_STRONG_INLINE Packet4f pset1<Packet4f>(
const float& from) {
142 Packet4f v = {from, from, from, from};
// Broadcast: fills all four 32-bit lanes with `from` via the MSA fill.w builtin.
147EIGEN_STRONG_INLINE Packet4i pset1<Packet4i>(
const int32_t& from) {
150 return __builtin_msa_fill_w(from);
// Broadcast-load: builds a Packet4f with the scalar `f` in every lane.
// NOTE(review): `f` is presumably *from; its definition and the return
// statement are missing from this listing.
154EIGEN_STRONG_INLINE Packet4f pload1<Packet4f>(
const float* from) {
158 Packet4f v = {f, f, f, f};
// Broadcast-load: reads *from and replicates it into all four 32-bit lanes.
163EIGEN_STRONG_INLINE Packet4i pload1<Packet4i>(
const int32_t* from) {
166 return __builtin_msa_fill_w(*from);
// Lane-wise float addition a + b via the MSA fadd.w builtin.
170EIGEN_STRONG_INLINE Packet4f padd<Packet4f>(
const Packet4f& a,
const Packet4f& b) {
173 return __builtin_msa_fadd_w(a, b);
// Lane-wise 32-bit integer addition a + b via the MSA addv.w builtin.
177EIGEN_STRONG_INLINE Packet4i padd<Packet4i>(
const Packet4i& a,
const Packet4i& b) {
180 return __builtin_msa_addv_w(a, b);
// Linear sequence: returns {a, a+1, a+2, a+3}.
// Despite its name, `countdown` holds the ascending offsets {0, 1, 2, 3}.
184EIGEN_STRONG_INLINE Packet4f plset<Packet4f>(
const float& a) {
187 static const Packet4f countdown = {0.0f, 1.0f, 2.0f, 3.0f};
188 return padd(pset1<Packet4f>(a), countdown);
// Linear sequence: returns {a, a+1, a+2, a+3}.
// Despite its name, `countdown` holds the ascending offsets {0, 1, 2, 3}.
192EIGEN_STRONG_INLINE Packet4i plset<Packet4i>(
const int32_t& a) {
195 static const Packet4i countdown = {0, 1, 2, 3};
196 return padd(pset1<Packet4i>(a), countdown);
// Lane-wise float subtraction a - b via the MSA fsub.w builtin.
200EIGEN_STRONG_INLINE Packet4f psub<Packet4f>(
const Packet4f& a,
const Packet4f& b) {
203 return __builtin_msa_fsub_w(a, b);
// Lane-wise 32-bit integer subtraction a - b via the MSA subv.w builtin.
207EIGEN_STRONG_INLINE Packet4i psub<Packet4i>(
const Packet4i& a,
const Packet4i& b) {
210 return __builtin_msa_subv_w(a, b);
// Negates each float lane by toggling bit 31 (the sign bit) with bnegi.w,
// without any floating-point arithmetic.
214EIGEN_STRONG_INLINE Packet4f pnegate(
const Packet4f& a) {
217 return (Packet4f)__builtin_msa_bnegi_w((v4u32)a, 31);
// Two's-complement negation: nori.b with immediate 0 yields the bitwise NOT
// of `a`, then addvi.w adds 1 to each 32-bit lane.
221EIGEN_STRONG_INLINE Packet4i pnegate(
const Packet4i& a) {
224 return __builtin_msa_addvi_w((v4i32)__builtin_msa_nori_b((v16u8)a, 0), 1);
// Conjugate of a Packet4f.
// NOTE(review): the body is missing from this listing; presumably returns `a`
// unchanged since the lanes are real-valued - confirm against the real source.
228EIGEN_STRONG_INLINE Packet4f pconj(
const Packet4f& a) {
// Conjugate of a Packet4i.
// NOTE(review): the body is missing from this listing; presumably returns `a`
// unchanged - confirm against the real source.
235EIGEN_STRONG_INLINE Packet4i pconj(
const Packet4i& a) {
// Lane-wise float multiplication a * b via the MSA fmul.w builtin.
242EIGEN_STRONG_INLINE Packet4f pmul<Packet4f>(
const Packet4f& a,
const Packet4f& b) {
245 return __builtin_msa_fmul_w(a, b);
// Lane-wise 32-bit integer multiplication a * b via the MSA mulv.w builtin.
249EIGEN_STRONG_INLINE Packet4i pmul<Packet4i>(
const Packet4i& a,
const Packet4i& b) {
252 return __builtin_msa_mulv_w(a, b);
// Lane-wise float division a / b via the MSA fdiv.w builtin.
256EIGEN_STRONG_INLINE Packet4f pdiv<Packet4f>(
const Packet4f& a,
const Packet4f& b) {
259 return __builtin_msa_fdiv_w(a, b);
// Lane-wise signed 32-bit integer division a / b via the MSA div_s.w builtin.
263EIGEN_STRONG_INLINE Packet4i pdiv<Packet4i>(
const Packet4i& a,
const Packet4i& b) {
266 return __builtin_msa_div_s_w(a, b);
// Multiply-add a * b + c. Note the operand order: the fmadd.w builtin takes
// the addend `c` first, followed by the two multiplicands.
270EIGEN_STRONG_INLINE Packet4f pmadd(
const Packet4f& a,
const Packet4f& b,
const Packet4f& c) {
273 return __builtin_msa_fmadd_w(c, a, b);
// Integer multiply-add using inline assembly: maddv.w accumulates a * b into
// `value`, which is both read and written ("+f" constraint).
// NOTE(review): `value` is presumably initialised from `c` and then returned;
// its declaration and the return statement are missing from this listing.
277EIGEN_STRONG_INLINE Packet4i pmadd(
const Packet4i& a,
const Packet4i& b,
const Packet4i& c) {
282 __asm__(
"maddv.w %w[value], %w[a], %w[b]\n"
284 : [value]
"+f"(value)
286 : [a]
"f"(a), [b]
"f"(b));
// Bitwise AND of the raw 128-bit contents of a and b (operands are
// reinterpreted as byte vectors for and.v, result cast back to Packet4f).
291EIGEN_STRONG_INLINE Packet4f pand<Packet4f>(
const Packet4f& a,
const Packet4f& b) {
294 return (Packet4f)__builtin_msa_and_v((v16u8)a, (v16u8)b);
// Bitwise AND of a and b via and.v on the byte-vector view.
298EIGEN_STRONG_INLINE Packet4i pand<Packet4i>(
const Packet4i& a,
const Packet4i& b) {
301 return (Packet4i)__builtin_msa_and_v((v16u8)a, (v16u8)b);
// Bitwise OR of the raw 128-bit contents of a and b via or.v.
305EIGEN_STRONG_INLINE Packet4f por<Packet4f>(
const Packet4f& a,
const Packet4f& b) {
308 return (Packet4f)__builtin_msa_or_v((v16u8)a, (v16u8)b);
// Bitwise OR of a and b via or.v on the byte-vector view.
312EIGEN_STRONG_INLINE Packet4i por<Packet4i>(
const Packet4i& a,
const Packet4i& b) {
315 return (Packet4i)__builtin_msa_or_v((v16u8)a, (v16u8)b);
// Bitwise XOR of the raw 128-bit contents of a and b via xor.v.
319EIGEN_STRONG_INLINE Packet4f pxor<Packet4f>(
const Packet4f& a,
const Packet4f& b) {
322 return (Packet4f)__builtin_msa_xor_v((v16u8)a, (v16u8)b);
// Bitwise XOR of a and b via xor.v on the byte-vector view.
326EIGEN_STRONG_INLINE Packet4i pxor<Packet4i>(
const Packet4i& a,
const Packet4i& b) {
329 return (Packet4i)__builtin_msa_xor_v((v16u8)a, (v16u8)b);
// Computes a & ~b: every byte of b is XOR-ed with 255 (bitwise NOT) and the
// result is AND-ed with a.
333EIGEN_STRONG_INLINE Packet4f pandnot<Packet4f>(
const Packet4f& a,
const Packet4f& b) {
336 return pand(a, (Packet4f)__builtin_msa_xori_b((v16u8)b, 255));
// Computes a & ~b: every byte of b is XOR-ed with 255 (bitwise NOT) and the
// result is AND-ed with a.
340EIGEN_STRONG_INLINE Packet4i pandnot<Packet4i>(
const Packet4i& a,
const Packet4i& b) {
343 return pand(a, (Packet4i)__builtin_msa_xori_b((v16u8)b, 255));
// Lane-wise float minimum. Two alternative bodies are visible:
//  - a direct fmin.w call;
//  - a compare/select path: the mask is set where a < b or where a is NaN
//    (fcun.w of a with itself), and bsel.v then takes `a` in those lanes and
//    `b` elsewhere.
// NOTE(review): the preprocessor conditional selecting between the two bodies
// is missing from this listing.
347EIGEN_STRONG_INLINE Packet4f pmin<Packet4f>(
const Packet4f& a,
const Packet4f& b) {
352 return __builtin_msa_fmin_w(a, b);
355 Packet4i aNaN = __builtin_msa_fcun_w(a, a);
356 Packet4i aMinOrNaN = por(__builtin_msa_fclt_w(a, b), aNaN);
357 return (Packet4f)__builtin_msa_bsel_v((v16u8)aMinOrNaN, (v16u8)b, (v16u8)a);
// Lane-wise signed 32-bit integer minimum via the MSA min_s.w builtin.
362EIGEN_STRONG_INLINE Packet4i pmin<Packet4i>(
const Packet4i& a,
const Packet4i& b) {
365 return __builtin_msa_min_s_w(a, b);
// Lane-wise float maximum. Two alternative bodies are visible:
//  - a direct fmax.w call;
//  - a compare/select path: the mask is set where b < a or where a is NaN
//    (fcun.w of a with itself), and bsel.v then takes `a` in those lanes and
//    `b` elsewhere.
// NOTE(review): the preprocessor conditional selecting between the two bodies
// is missing from this listing.
369EIGEN_STRONG_INLINE Packet4f pmax<Packet4f>(
const Packet4f& a,
const Packet4f& b) {
374 return __builtin_msa_fmax_w(a, b);
377 Packet4i aNaN = __builtin_msa_fcun_w(a, a);
378 Packet4i aMaxOrNaN = por(__builtin_msa_fclt_w(b, a), aNaN);
379 return (Packet4f)__builtin_msa_bsel_v((v16u8)aMaxOrNaN, (v16u8)b, (v16u8)a);
// Lane-wise signed 32-bit integer maximum via the MSA max_s.w builtin.
384EIGEN_STRONG_INLINE Packet4i pmax<Packet4i>(
const Packet4i& a,
const Packet4i& b) {
387 return __builtin_msa_max_s_w(a, b);
// Aligned load of four floats from `from` (ld.w at offset 0), tagged with the
// aligned-load debug hook. The const_cast only satisfies the builtin's
// non-const pointer parameter.
391EIGEN_STRONG_INLINE Packet4f pload<Packet4f>(
const float* from) {
394 EIGEN_DEBUG_ALIGNED_LOAD
return (Packet4f)__builtin_msa_ld_w(
const_cast<float*
>(from), 0);
// Aligned load of four int32_t from `from` (ld.w at offset 0), tagged with the
// aligned-load debug hook. The const_cast only satisfies the builtin's
// non-const pointer parameter.
398EIGEN_STRONG_INLINE Packet4i pload<Packet4i>(
const int32_t* from) {
401 EIGEN_DEBUG_ALIGNED_LOAD
return __builtin_msa_ld_w(
const_cast<int32_t*
>(from), 0);
// Unaligned load of four floats. Uses the same ld.w builtin as pload; only
// the debug hook differs.
405EIGEN_STRONG_INLINE Packet4f ploadu<Packet4f>(
const float* from) {
408 EIGEN_DEBUG_UNALIGNED_LOAD
return (Packet4f)__builtin_msa_ld_w(
const_cast<float*
>(from), 0);
// Unaligned load of four int32_t. Uses the same ld.w builtin as pload; only
// the debug hook differs.
412EIGEN_STRONG_INLINE Packet4i ploadu<Packet4i>(
const int32_t* from) {
415 EIGEN_DEBUG_UNALIGNED_LOAD
return (Packet4i)__builtin_msa_ld_w(
const_cast<int32_t*
>(from), 0);
// Loads from[0] and from[1] and duplicates each: both scalars are splatted
// into full vectors, then ilvr.d combines the low 64-bit halves, which should
// give {from[0], from[0], from[1], from[1]}.
419EIGEN_STRONG_INLINE Packet4f ploaddup<Packet4f>(
const float* from) {
422 float f0 = from[0], f1 = from[1];
423 Packet4f v0 = {f0, f0, f0, f0};
424 Packet4f v1 = {f1, f1, f1, f1};
425 return (Packet4f)__builtin_msa_ilvr_d((v2i64)v1, (v2i64)v0);
// Loads from[0] and from[1] and duplicates each: both scalars are splatted
// into full vectors, then ilvr.d combines the low 64-bit halves, which should
// give {from[0], from[0], from[1], from[1]}.
429EIGEN_STRONG_INLINE Packet4i ploaddup<Packet4i>(
const int32_t* from) {
432 int32_t i0 = from[0], i1 = from[1];
433 Packet4i v0 = {i0, i0, i0, i0};
434 Packet4i v1 = {i1, i1, i1, i1};
435 return (Packet4i)__builtin_msa_ilvr_d((v2i64)v1, (v2i64)v0);
// Aligned store of a Packet4f to `to` (st.w at offset 0). The packet is cast
// to Packet4i because the builtin takes an integer vector.
439EIGEN_STRONG_INLINE
void pstore<float>(
float* to,
const Packet4f& from) {
442 EIGEN_DEBUG_ALIGNED_STORE __builtin_msa_st_w((Packet4i)from, to, 0);
// Aligned store of a Packet4i to `to` (st.w at offset 0).
446EIGEN_STRONG_INLINE
void pstore<int32_t>(int32_t* to,
const Packet4i& from) {
449 EIGEN_DEBUG_ALIGNED_STORE __builtin_msa_st_w(from, to, 0);
// Unaligned store of a Packet4f. Uses the same st.w builtin as pstore; only
// the debug hook differs.
453EIGEN_STRONG_INLINE
void pstoreu<float>(
float* to,
const Packet4f& from) {
456 EIGEN_DEBUG_UNALIGNED_STORE __builtin_msa_st_w((Packet4i)from, to, 0);
// Unaligned store of a Packet4i. Uses the same st.w builtin as pstore; only
// the debug hook differs.
460EIGEN_STRONG_INLINE
void pstoreu<int32_t>(int32_t* to,
const Packet4i& from) {
463 EIGEN_DEBUG_UNALIGNED_STORE __builtin_msa_st_w(from, to, 0);
// Strided gather: lanes 2 and 3 are read from from[2 * stride] and
// from[3 * stride]; the packet starts as a splat of `f`.
// NOTE(review): the definition of `f` (presumably *from), the lane-1
// assignment and the return statement are missing from this listing.
467EIGEN_DEVICE_FUNC
inline Packet4f pgather<float, Packet4f>(
const float* from, Index stride) {
471 Packet4f v = {f, f, f, f};
473 v[2] = from[2 * stride];
474 v[3] = from[3 * stride];
// Strided gather: lanes 2 and 3 are read from from[2 * stride] and
// from[3 * stride]; the packet starts as a splat of `i`.
// NOTE(review): the definition of `i` (presumably *from), the lane-1
// assignment and the return statement are missing from this listing.
479EIGEN_DEVICE_FUNC
inline Packet4i pgather<int32_t, Packet4i>(
const int32_t* from, Index stride) {
483 Packet4i v = {i, i, i, i};
485 v[2] = from[2 * stride];
486 v[3] = from[3 * stride];
// Strided scatter of a Packet4f into `to`.
// NOTE(review): the body is missing from this listing; presumably lane k is
// written to to[k * stride] - confirm against the real source.
491EIGEN_DEVICE_FUNC
inline void pscatter<float, Packet4f>(
float* to,
const Packet4f& from, Index stride) {
// Strided scatter of a Packet4i into `to`.
// NOTE(review): the body is missing from this listing; presumably lane k is
// written to to[k * stride] - confirm against the real source.
504EIGEN_DEVICE_FUNC
inline void pscatter<int32_t, Packet4i>(int32_t* to,
const Packet4i& from, Index stride) {
// Issues a prefetch hint for `addr` through the compiler builtin.
517EIGEN_STRONG_INLINE
void prefetch<float>(
const float* addr) {
520 __builtin_prefetch(addr);
// Issues a prefetch hint for `addr` through the compiler builtin.
524EIGEN_STRONG_INLINE
void prefetch<int32_t>(
const int32_t* addr) {
527 __builtin_prefetch(addr);
// Extracts a scalar from a Packet4f.
// NOTE(review): the body is missing from this listing; presumably returns
// lane 0 - confirm against the real source.
531EIGEN_STRONG_INLINE
float pfirst<Packet4f>(
const Packet4f& a) {
// Extracts a scalar from a Packet4i.
// NOTE(review): the body is missing from this listing; presumably returns
// lane 0 - confirm against the real source.
538EIGEN_STRONG_INLINE int32_t pfirst<Packet4i>(
const Packet4i& a) {
// Reverses lane order: the shf.w control selects source lanes 3, 2, 1, 0 for
// destination lanes 0..3.
545EIGEN_STRONG_INLINE Packet4f preverse(
const Packet4f& a) {
548 return (Packet4f)__builtin_msa_shf_w((v4i32)a, EIGEN_MSA_SHF_I8(3, 2, 1, 0));
// Reverses lane order: the shf.w control selects source lanes 3, 2, 1, 0 for
// destination lanes 0..3.
552EIGEN_STRONG_INLINE Packet4i preverse(
const Packet4i& a) {
555 return __builtin_msa_shf_w(a, EIGEN_MSA_SHF_I8(3, 2, 1, 0));
// Absolute value of each float lane, obtained by clearing bit 31 (the sign
// bit) with bclri.w.
559EIGEN_STRONG_INLINE Packet4f pabs(
const Packet4f& a) {
562 return (Packet4f)__builtin_msa_bclri_w((v4u32)a, 31);
// Absolute value of each 32-bit lane: add_a.w adds the absolute values of its
// operands, so adding a zero vector leaves |a|.
566EIGEN_STRONG_INLINE Packet4i pabs(
const Packet4i& a) {
569 Packet4i zero = __builtin_msa_ldi_w(0);
570 return __builtin_msa_add_a_w(zero, a);
// Horizontal sum in two shuffle+add steps: first lanes are paired with the
// lane two positions away (swap of 64-bit halves), then adjacent lanes are
// swapped and added, leaving the total in every lane.
// NOTE(review): the return statement (presumably lane 0 of `s`) is missing.
574EIGEN_STRONG_INLINE
float predux<Packet4f>(
const Packet4f& a) {
577 Packet4f s = padd(a, (Packet4f)__builtin_msa_shf_w((v4i32)a, EIGEN_MSA_SHF_I8(2, 3, 0, 1)));
578 s = padd(s, (Packet4f)__builtin_msa_shf_w((v4i32)s, EIGEN_MSA_SHF_I8(1, 0, 3, 2)));
// Horizontal sum in two shuffle+add steps (swap 64-bit halves, then swap
// adjacent lanes), leaving the total in every lane.
// NOTE(review): the return statement (presumably lane 0 of `s`) is missing.
583EIGEN_STRONG_INLINE int32_t predux<Packet4i>(
const Packet4i& a) {
586 Packet4i s = padd(a, __builtin_msa_shf_w(a, EIGEN_MSA_SHF_I8(2, 3, 0, 1)));
587 s = padd(s, __builtin_msa_shf_w(s, EIGEN_MSA_SHF_I8(1, 0, 3, 2)));
// Horizontal product in two shuffle+multiply steps (swap 64-bit halves, then
// swap adjacent lanes), leaving the product in every lane.
// NOTE(review): the return statement (presumably lane 0 of `p`) is missing.
594EIGEN_STRONG_INLINE
float predux_mul<Packet4f>(
const Packet4f& a) {
597 Packet4f p = pmul(a, (Packet4f)__builtin_msa_shf_w((v4i32)a, EIGEN_MSA_SHF_I8(2, 3, 0, 1)));
598 p = pmul(p, (Packet4f)__builtin_msa_shf_w((v4i32)p, EIGEN_MSA_SHF_I8(1, 0, 3, 2)));
// Horizontal product in two shuffle+multiply steps (swap 64-bit halves, then
// swap adjacent lanes), leaving the product in every lane.
// NOTE(review): the return statement (presumably lane 0 of `p`) is missing.
603EIGEN_STRONG_INLINE int32_t predux_mul<Packet4i>(
const Packet4i& a) {
606 Packet4i p = pmul(a, __builtin_msa_shf_w(a, EIGEN_MSA_SHF_I8(2, 3, 0, 1)));
607 p = pmul(p, __builtin_msa_shf_w(p, EIGEN_MSA_SHF_I8(1, 0, 3, 2)));
// Horizontal minimum with explicit NaN handling.
//  1. `swapped` is `a` with its 64-bit halves exchanged.
//  2. fcun.w flags lanes where a/swapped are unordered (a NaN is involved);
//     ceqi.d then turns that into a per-64-bit mask that is all-ones only
//     when no lane in that half was flagged.
//  3. Two fmin.w steps reduce the four lanes.
//  4. bsel.v keeps the reduced value where the mask is set and substitutes
//     the quiet-NaN pattern 0x7FC00000 elsewhere.
// NOTE(review): the return statement and any surrounding preprocessor
// conditionals are missing from this listing.
613EIGEN_STRONG_INLINE
float predux_min<Packet4f>(
const Packet4f& a) {
617 Packet4f swapped = (Packet4f)__builtin_msa_shf_w((Packet4i)a, EIGEN_MSA_SHF_I8(2, 3, 0, 1));
621 v16u8 unord = (v16u8)__builtin_msa_fcun_w(a, swapped);
623 unord = (v16u8)__builtin_msa_ceqi_d((v2i64)unord, 0);
626 Packet4f v = __builtin_msa_fmin_w(a, swapped);
627 v = __builtin_msa_fmin_w(v, (Packet4f)__builtin_msa_shf_w((Packet4i)v, EIGEN_MSA_SHF_I8(1, 0, 3, 2)));
630 v16u8 qnans = (v16u8)__builtin_msa_fill_w(0x7FC00000);
631 v = (Packet4f)__builtin_msa_bsel_v(unord, qnans, (v16u8)v);
// Horizontal minimum in two shuffle+pmin steps (swap 64-bit halves, then swap
// adjacent lanes), leaving the minimum in every lane.
// NOTE(review): the return statement (presumably lane 0 of `m`) is missing.
637EIGEN_STRONG_INLINE int32_t predux_min<Packet4i>(
const Packet4i& a) {
640 Packet4i m = pmin(a, __builtin_msa_shf_w(a, EIGEN_MSA_SHF_I8(2, 3, 0, 1)));
641 m = pmin(m, __builtin_msa_shf_w(m, EIGEN_MSA_SHF_I8(1, 0, 3, 2)));
// Horizontal maximum with explicit NaN handling.
//  1. `swapped` is `a` with its 64-bit halves exchanged.
//  2. fcun.w flags lanes where a/swapped are unordered (a NaN is involved);
//     ceqi.d then turns that into a per-64-bit mask that is all-ones only
//     when no lane in that half was flagged.
//  3. Two fmax.w steps reduce the four lanes.
//  4. bsel.v keeps the reduced value where the mask is set and substitutes
//     the quiet-NaN pattern 0x7FC00000 elsewhere.
// NOTE(review): the return statement and any surrounding preprocessor
// conditionals are missing from this listing.
647EIGEN_STRONG_INLINE
float predux_max<Packet4f>(
const Packet4f& a) {
651 Packet4f swapped = (Packet4f)__builtin_msa_shf_w((Packet4i)a, EIGEN_MSA_SHF_I8(2, 3, 0, 1));
655 v16u8 unord = (v16u8)__builtin_msa_fcun_w(a, swapped);
657 unord = (v16u8)__builtin_msa_ceqi_d((v2i64)unord, 0);
660 Packet4f v = __builtin_msa_fmax_w(a, swapped);
661 v = __builtin_msa_fmax_w(v, (Packet4f)__builtin_msa_shf_w((Packet4i)v, EIGEN_MSA_SHF_I8(1, 0, 3, 2)));
664 v16u8 qnans = (v16u8)__builtin_msa_fill_w(0x7FC00000);
665 v = (Packet4f)__builtin_msa_bsel_v(unord, qnans, (v16u8)v);
// Horizontal maximum in two shuffle+pmax steps (swap 64-bit halves, then swap
// adjacent lanes), leaving the maximum in every lane.
// NOTE(review): the return statement (presumably lane 0 of `m`) is missing.
671EIGEN_STRONG_INLINE int32_t predux_max<Packet4i>(
const Packet4i& a) {
674 Packet4i m = pmax(a, __builtin_msa_shf_w(a, EIGEN_MSA_SHF_I8(2, 3, 0, 1)));
675 m = pmax(m, __builtin_msa_shf_w(m, EIGEN_MSA_SHF_I8(1, 0, 3, 2)));
// Streams a 4x4 float packet block, one packet per line, wrapped in "[ ... ]".
// Each std::endl also flushes the stream.
// NOTE(review): the return statement and closing brace are not in this listing.
679inline std::ostream& operator<<(std::ostream& os,
const PacketBlock<Packet4f, 4>& value) {
680 os <<
"[ " << value.packet[0] <<
"," << std::endl
681 <<
" " << value.packet[1] <<
"," << std::endl
682 <<
" " << value.packet[2] <<
"," << std::endl
683 <<
" " << value.packet[3] <<
" ]";
// In-place 4x4 transpose of a float packet block.
// Step 1 interleaves 32-bit lanes of row pairs (0,1) and (2,3): ilvr.w for
// the low halves, ilvl.w for the high halves. Step 2 recombines the 64-bit
// halves of those temporaries (ilvr.d / ilvod.d) into the transposed rows.
687EIGEN_DEVICE_FUNC
inline void ptranspose(PacketBlock<Packet4f, 4>& kernel) {
690 v4i32 tmp1, tmp2, tmp3, tmp4;
692 tmp1 = __builtin_msa_ilvr_w((v4i32)kernel.packet[1], (v4i32)kernel.packet[0]);
693 tmp2 = __builtin_msa_ilvr_w((v4i32)kernel.packet[3], (v4i32)kernel.packet[2]);
694 tmp3 = __builtin_msa_ilvl_w((v4i32)kernel.packet[1], (v4i32)kernel.packet[0]);
695 tmp4 = __builtin_msa_ilvl_w((v4i32)kernel.packet[3], (v4i32)kernel.packet[2]);
697 kernel.packet[0] = (Packet4f)__builtin_msa_ilvr_d((v2i64)tmp2, (v2i64)tmp1);
698 kernel.packet[1] = (Packet4f)__builtin_msa_ilvod_d((v2i64)tmp2, (v2i64)tmp1);
699 kernel.packet[2] = (Packet4f)__builtin_msa_ilvr_d((v2i64)tmp4, (v2i64)tmp3);
700 kernel.packet[3] = (Packet4f)__builtin_msa_ilvod_d((v2i64)tmp4, (v2i64)tmp3);
// Streams a 4x4 int32 packet block, one packet per line, wrapped in "[ ... ]".
// Each std::endl also flushes the stream.
// NOTE(review): the return statement and closing brace are not in this listing.
703inline std::ostream& operator<<(std::ostream& os,
const PacketBlock<Packet4i, 4>& value) {
704 os <<
"[ " << value.packet[0] <<
"," << std::endl
705 <<
" " << value.packet[1] <<
"," << std::endl
706 <<
" " << value.packet[2] <<
"," << std::endl
707 <<
" " << value.packet[3] <<
" ]";
// In-place 4x4 transpose of an int32 packet block.
// Step 1 interleaves 32-bit lanes of row pairs (0,1) and (2,3): ilvr.w for
// the low halves, ilvl.w for the high halves. Step 2 recombines the 64-bit
// halves of those temporaries (ilvr.d / ilvod.d) into the transposed rows.
711EIGEN_DEVICE_FUNC
inline void ptranspose(PacketBlock<Packet4i, 4>& kernel) {
714 v4i32 tmp1, tmp2, tmp3, tmp4;
716 tmp1 = __builtin_msa_ilvr_w(kernel.packet[1], kernel.packet[0]);
717 tmp2 = __builtin_msa_ilvr_w(kernel.packet[3], kernel.packet[2]);
718 tmp3 = __builtin_msa_ilvl_w(kernel.packet[1], kernel.packet[0]);
719 tmp4 = __builtin_msa_ilvl_w(kernel.packet[3], kernel.packet[2]);
721 kernel.packet[0] = (Packet4i)__builtin_msa_ilvr_d((v2i64)tmp2, (v2i64)tmp1);
722 kernel.packet[1] = (Packet4i)__builtin_msa_ilvod_d((v2i64)tmp2, (v2i64)tmp1);
723 kernel.packet[2] = (Packet4i)__builtin_msa_ilvr_d((v2i64)tmp4, (v2i64)tmp3);
724 kernel.packet[3] = (Packet4i)__builtin_msa_ilvod_d((v2i64)tmp4, (v2i64)tmp3);
// Lane-wise float square root via the MSA fsqrt.w builtin.
728EIGEN_STRONG_INLINE Packet4f psqrt(
const Packet4f& a) {
731 return __builtin_msa_fsqrt_w(a);
// Lane-wise reciprocal square root. Two alternative bodies are visible:
//  - the frsqrt.w builtin;
//  - 1 / sqrt(a), where `ones` is integer 1 converted to float by ffint_s.w.
// NOTE(review): the preprocessor conditional selecting between the two bodies
// is missing from this listing.
735EIGEN_STRONG_INLINE Packet4f prsqrt(
const Packet4f& a) {
739 return __builtin_msa_frsqrt_w(a);
741 Packet4f ones = __builtin_msa_ffint_s_w(__builtin_msa_ldi_w(1));
742 return pdiv(ones, psqrt(a));
// Floor via a temporary rounding-mode switch in inline assembly:
// reads MSA control register $1 into old_mode, sets its two low bits to 3,
// rounds `v` to integral with frint.w, then restores the saved register value.
// NOTE(review): low bits = 3 is presumably the round-toward-minus-infinity
// mode - confirm against the MSA control register spec. The `v` declaration,
// the asm statement header, the remaining operands and the return are missing
// from this listing.
747EIGEN_STRONG_INLINE Packet4f pfloor<Packet4f>(
const Packet4f& a) {
749 int32_t old_mode, new_mode;
751 "cfcmsa %[old_mode], $1\n"
752 "ori %[new_mode], %[old_mode], 3\n"
753 "ctcmsa $1, %[new_mode]\n"
754 "frint.w %w[v], %w[v]\n"
755 "ctcmsa $1, %[old_mode]\n"
757 [old_mode]
"=r"(old_mode), [new_mode]
"=r"(new_mode),
// Ceil via a temporary rounding-mode switch in inline assembly:
// reads MSA control register $1 into old_mode, forces its two low bits to 2
// (ori 3 followed by xori 1), rounds `v` with frint.w, then restores the
// saved register value.
// NOTE(review): low bits = 2 is presumably the round-toward-plus-infinity
// mode - confirm against the MSA control register spec. The `v` declaration,
// the asm statement header, the remaining operands and the return are missing
// from this listing.
766EIGEN_STRONG_INLINE Packet4f pceil<Packet4f>(
const Packet4f& a) {
768 int32_t old_mode, new_mode;
770 "cfcmsa %[old_mode], $1\n"
771 "ori %[new_mode], %[old_mode], 3\n"
772 "xori %[new_mode], %[new_mode], 1\n"
773 "ctcmsa $1, %[new_mode]\n"
774 "frint.w %w[v], %w[v]\n"
775 "ctcmsa $1, %[old_mode]\n"
777 [old_mode]
"=r"(old_mode), [new_mode]
"=r"(new_mode),
// Round via a temporary rounding-mode switch in inline assembly:
// reads MSA control register $1 into old_mode, clears its two low bits
// (ori 3 followed by xori 3), rounds `v` with frint.w, then restores the
// saved register value.
// NOTE(review): low bits = 0 is presumably round-to-nearest-even - confirm
// against the MSA control register spec. The `v` declaration, the asm
// statement header, the remaining operands and the return are missing from
// this listing.
786EIGEN_STRONG_INLINE Packet4f pround<Packet4f>(
const Packet4f& a) {
788 int32_t old_mode, new_mode;
790 "cfcmsa %[old_mode], $1\n"
791 "ori %[new_mode], %[old_mode], 3\n"
792 "xori %[new_mode], %[new_mode], 3\n"
793 "ctcmsa $1, %[new_mode]\n"
794 "frint.w %w[v], %w[v]\n"
795 "ctcmsa $1, %[old_mode]\n"
797 [old_mode]
"=r"(old_mode), [new_mode]
"=r"(new_mode),
// Per-lane select between thenPacket and elsePacket.
// The mask is all-ones in lanes whose selector equals 0; bsel.v takes
// elsePacket in those lanes and thenPacket in the others, i.e. a non-zero
// selector picks thenPacket.
806EIGEN_STRONG_INLINE Packet4f pblend(
const Selector<4>& ifPacket,
const Packet4f& thenPacket,
807 const Packet4f& elsePacket) {
808 Packet4ui select = {ifPacket.select[0], ifPacket.select[1], ifPacket.select[2], ifPacket.select[3]};
809 Packet4i mask = __builtin_msa_ceqi_w((Packet4i)select, 0);
810 return (Packet4f)__builtin_msa_bsel_v((v16u8)mask, (v16u8)thenPacket, (v16u8)elsePacket);
// Per-lane select between thenPacket and elsePacket.
// The mask is all-ones in lanes whose selector equals 0; bsel.v takes
// elsePacket in those lanes and thenPacket in the others, i.e. a non-zero
// selector picks thenPacket.
814EIGEN_STRONG_INLINE Packet4i pblend(
const Selector<4>& ifPacket,
const Packet4i& thenPacket,
815 const Packet4i& elsePacket) {
816 Packet4ui select = {ifPacket.select[0], ifPacket.select[1], ifPacket.select[2], ifPacket.select[3]};
817 Packet4i mask = __builtin_msa_ceqi_w((Packet4i)select, 0);
818 return (Packet4i)__builtin_msa_bsel_v((v16u8)mask, (v16u8)thenPacket, (v16u8)elsePacket);
// Two-lane packet aliases over the MSA vector types: double, signed 64-bit
// and unsigned 64-bit lanes respectively.
823typedef v2f64 Packet2d;
824typedef v2i64 Packet2l;
825typedef v2u64 Packet2ul;
// Helpers declaring a const packet named p2d_/p2l_/p2ul_<NAME> with both
// lanes initialised to X.
827#define EIGEN_DECLARE_CONST_Packet2d(NAME, X) const Packet2d p2d_##NAME = {X, X}
828#define EIGEN_DECLARE_CONST_Packet2l(NAME, X) const Packet2l p2l_##NAME = {X, X}
829#define EIGEN_DECLARE_CONST_Packet2ul(NAME, X) const Packet2ul p2ul_##NAME = {X, X}
// Streams a Packet2d as "[ v0, v1 ]".
// NOTE(review): the return statement and closing brace are not in this listing.
831inline std::ostream& operator<<(std::ostream& os,
const Packet2d& value) {
832 os <<
"[ " << value[0] <<
", " << value[1] <<
" ]";
// Streams a Packet2l as "[ v0, v1 ]".
// NOTE(review): the return statement and closing brace are not in this listing.
836inline std::ostream& operator<<(std::ostream& os,
const Packet2l& value) {
837 os <<
"[ " << value[0] <<
", " << value[1] <<
" ]";
// Streams a Packet2ul as "[ v0, v1 ]".
// NOTE(review): the return statement and closing brace are not in this listing.
841inline std::ostream& operator<<(std::ostream& os,
const Packet2ul& value) {
842 os <<
"[ " << value[0] <<
", " << value[1] <<
" ]";
// Packet traits for double: both the full and the "half" packet are Packet2d.
// NOTE(review): the capability enum is not visible in this listing.
847struct packet_traits<double> : default_packet_traits {
848 typedef Packet2d type;
849 typedef Packet2d half;
// Reverse traits for Packet2d: masked loads/stores are not available and the
// half packet is Packet2d itself.
// NOTE(review): the scalar typedef and the other enum entries are missing here.
864struct unpacket_traits<Packet2d> {
870 masked_load_available =
false,
871 masked_store_available =
false
873 typedef Packet2d half;
// Broadcast: builds a Packet2d with `from` in both lanes.
// NOTE(review): the return statement is not visible in this listing.
877EIGEN_STRONG_INLINE Packet2d pset1<Packet2d>(
const double& from) {
880 Packet2d value = {from, from};
// Lane-wise double addition a + b via the MSA fadd.d builtin.
885EIGEN_STRONG_INLINE Packet2d padd<Packet2d>(
const Packet2d& a,
const Packet2d& b) {
888 return __builtin_msa_fadd_d(a, b);
// Linear sequence: returns {a, a+1}.
// Despite its name, `countdown` holds the ascending offsets {0, 1}.
892EIGEN_STRONG_INLINE Packet2d plset<Packet2d>(
const double& a) {
895 static const Packet2d countdown = {0.0, 1.0};
896 return padd(pset1<Packet2d>(a), countdown);
// Lane-wise double subtraction a - b via the MSA fsub.d builtin.
900EIGEN_STRONG_INLINE Packet2d psub<Packet2d>(
const Packet2d& a,
const Packet2d& b) {
903 return __builtin_msa_fsub_d(a, b);
// Negates each double lane by toggling bit 63 (the sign bit) with bnegi.d.
907EIGEN_STRONG_INLINE Packet2d pnegate(
const Packet2d& a) {
910 return (Packet2d)__builtin_msa_bnegi_d((v2u64)a, 63);
// Conjugate of a Packet2d.
// NOTE(review): the body is missing from this listing; presumably returns `a`
// unchanged since the lanes are real-valued - confirm against the real source.
914EIGEN_STRONG_INLINE Packet2d pconj(
const Packet2d& a) {
// Lane-wise double multiplication a * b via the MSA fmul.d builtin.
921EIGEN_STRONG_INLINE Packet2d pmul<Packet2d>(
const Packet2d& a,
const Packet2d& b) {
924 return __builtin_msa_fmul_d(a, b);
// Lane-wise double division a / b via the MSA fdiv.d builtin.
928EIGEN_STRONG_INLINE Packet2d pdiv<Packet2d>(
const Packet2d& a,
const Packet2d& b) {
931 return __builtin_msa_fdiv_d(a, b);
// Multiply-add a * b + c. Note the operand order: the fmadd.d builtin takes
// the addend `c` first, followed by the two multiplicands.
935EIGEN_STRONG_INLINE Packet2d pmadd(
const Packet2d& a,
const Packet2d& b,
const Packet2d& c) {
938 return __builtin_msa_fmadd_d(c, a, b);
// Bitwise AND of the raw 128-bit contents of a and b via and.v.
944EIGEN_STRONG_INLINE Packet2d pand<Packet2d>(
const Packet2d& a,
const Packet2d& b) {
947 return (Packet2d)__builtin_msa_and_v((v16u8)a, (v16u8)b);
// Bitwise OR of the raw 128-bit contents of a and b via or.v.
951EIGEN_STRONG_INLINE Packet2d por<Packet2d>(
const Packet2d& a,
const Packet2d& b) {
954 return (Packet2d)__builtin_msa_or_v((v16u8)a, (v16u8)b);
// Bitwise XOR of the raw 128-bit contents of a and b via xor.v.
958EIGEN_STRONG_INLINE Packet2d pxor<Packet2d>(
const Packet2d& a,
const Packet2d& b) {
961 return (Packet2d)__builtin_msa_xor_v((v16u8)a, (v16u8)b);
// Computes a & ~b: every byte of b is XOR-ed with 255 (bitwise NOT) and the
// result is AND-ed with a.
965EIGEN_STRONG_INLINE Packet2d pandnot<Packet2d>(
const Packet2d& a,
const Packet2d& b) {
968 return pand(a, (Packet2d)__builtin_msa_xori_b((v16u8)b, 255));
// Aligned load of two doubles from `from` (ld.d at offset 0).
// This is the aligned entry point, so it is tagged with the aligned-load
// debug hook, matching the other pload specialisations and pstore<double>;
// the previous UNALIGNED hook mis-attributed these loads in debug counters.
// The const_cast only satisfies the builtin's non-const pointer parameter.
972EIGEN_STRONG_INLINE Packet2d pload<Packet2d>(
const double* from) {
975 EIGEN_DEBUG_ALIGNED_LOAD
return (Packet2d)__builtin_msa_ld_d(
const_cast<double*
>(from), 0);
// Lane-wise double minimum. Two alternative bodies are visible:
//  - a direct fmin.d call;
//  - a compare/select path: the mask is set where a < b or where a is NaN
//    (fcun.d of a with itself), and bsel.v then takes `a` in those lanes and
//    `b` elsewhere.
// NOTE(review): the preprocessor conditional selecting between the two bodies
// is missing from this listing.
979EIGEN_STRONG_INLINE Packet2d pmin<Packet2d>(
const Packet2d& a,
const Packet2d& b) {
984 return __builtin_msa_fmin_d(a, b);
987 v2i64 aNaN = __builtin_msa_fcun_d(a, a);
988 v2i64 aMinOrNaN = por(__builtin_msa_fclt_d(a, b), aNaN);
989 return (Packet2d)__builtin_msa_bsel_v((v16u8)aMinOrNaN, (v16u8)b, (v16u8)a);
// Lane-wise double maximum. Two alternative bodies are visible:
//  - a direct fmax.d call;
//  - a compare/select path: the mask is set where b < a or where a is NaN
//    (fcun.d of a with itself), and bsel.v then takes `a` in those lanes and
//    `b` elsewhere.
// NOTE(review): the preprocessor conditional selecting between the two bodies
// is missing from this listing.
994EIGEN_STRONG_INLINE Packet2d pmax<Packet2d>(
const Packet2d& a,
const Packet2d& b) {
999 return __builtin_msa_fmax_d(a, b);
1002 v2i64 aNaN = __builtin_msa_fcun_d(a, a);
1003 v2i64 aMaxOrNaN = por(__builtin_msa_fclt_d(b, a), aNaN);
1004 return (Packet2d)__builtin_msa_bsel_v((v16u8)aMaxOrNaN, (v16u8)b, (v16u8)a);
// Unaligned load of two doubles from `from` (ld.d at offset 0), tagged with
// the unaligned-load debug hook.
1009EIGEN_STRONG_INLINE Packet2d ploadu<Packet2d>(
const double* from) {
1012 EIGEN_DEBUG_UNALIGNED_LOAD
return (Packet2d)__builtin_msa_ld_d(
const_cast<double*
>(from), 0);
// Duplicating load: builds a Packet2d holding *from in both lanes.
// NOTE(review): the return statement is not visible in this listing.
1016EIGEN_STRONG_INLINE Packet2d ploaddup<Packet2d>(
const double* from) {
1019 Packet2d value = {*from, *from};
// Aligned store of a Packet2d to `to` (st.d at offset 0). The packet is cast
// to v2i64 because the builtin takes an integer vector.
1024EIGEN_STRONG_INLINE
void pstore<double>(
double* to,
const Packet2d& from) {
1027 EIGEN_DEBUG_ALIGNED_STORE __builtin_msa_st_d((v2i64)from, to, 0);
// Unaligned store of a Packet2d. Uses the same st.d builtin as pstore; only
// the debug hook differs.
1031EIGEN_STRONG_INLINE
void pstoreu<double>(
double* to,
const Packet2d& from) {
1034 EIGEN_DEBUG_UNALIGNED_STORE __builtin_msa_st_d((v2i64)from, to, 0);
// Strided gather of two doubles starting at `from`.
// NOTE(review): the body is missing from this listing; presumably reads
// from[0] and from[stride] - confirm against the real source.
1038EIGEN_DEVICE_FUNC
inline Packet2d pgather<double, Packet2d>(
const double* from, Index stride) {
// Strided scatter of a Packet2d into `to`.
// NOTE(review): the body is missing from this listing; presumably writes the
// lanes to to[0] and to[stride] - confirm against the real source.
1049EIGEN_DEVICE_FUNC
inline void pscatter<double, Packet2d>(
double* to,
const Packet2d& from, Index stride) {
// Issues a prefetch hint for `addr` through the compiler builtin.
1058EIGEN_STRONG_INLINE
void prefetch<double>(
const double* addr) {
1061 __builtin_prefetch(addr);
// Extracts a scalar from a Packet2d.
// NOTE(review): the body is missing from this listing; presumably returns
// lane 0 - confirm against the real source.
1065EIGEN_STRONG_INLINE
double pfirst<Packet2d>(
const Packet2d& a) {
// Swaps the two double lanes. Done as a 32-bit shuffle on the v4i32 view:
// selecting words 2, 3, 0, 1 exchanges the two 64-bit halves.
1072EIGEN_STRONG_INLINE Packet2d preverse(
const Packet2d& a) {
1075 return (Packet2d)__builtin_msa_shf_w((v4i32)a, EIGEN_MSA_SHF_I8(2, 3, 0, 1));
// Absolute value of each double lane, obtained by clearing bit 63 (the sign
// bit) with bclri.d.
1079EIGEN_STRONG_INLINE Packet2d pabs(
const Packet2d& a) {
1082 return (Packet2d)__builtin_msa_bclri_d((v2u64)a, 63);
// Horizontal sum: adds the packet to its lane-swapped copy so both lanes hold
// a[0] + a[1].
// NOTE(review): the return statement (presumably lane 0 of `s`) is missing.
1086EIGEN_STRONG_INLINE
double predux<Packet2d>(
const Packet2d& a) {
1089 Packet2d s = padd(a, preverse(a));
// Horizontal product: multiplies the packet by its lane-swapped copy so both
// lanes hold a[0] * a[1].
// NOTE(review): the return statement (presumably lane 0 of `p`) is missing.
1096EIGEN_STRONG_INLINE
double predux_mul<Packet2d>(
const Packet2d& a) {
1099 Packet2d p = pmul(a, preverse(a));
// Horizontal minimum of the two lanes. Two alternative bodies are visible:
//  - vector path: fmin.d of `a` with its lane-swapped copy;
//  - scalar path: returns a0 when a0 is NaN or a0 < a1, otherwise a1.
// NOTE(review): the preprocessor conditional selecting between them and the
// vector path's return statement are missing from this listing.
1105EIGEN_STRONG_INLINE
double predux_min<Packet2d>(
const Packet2d& a) {
1109 Packet2d swapped = (Packet2d)__builtin_msa_shf_w((Packet4i)a, EIGEN_MSA_SHF_I8(2, 3, 0, 1));
1110 Packet2d v = __builtin_msa_fmin_d(a, swapped);
1113 double a0 = a[0], a1 = a[1];
1114 return ((numext::isnan)(a0) || a0 < a1) ? a0 : a1;
// Horizontal maximum of the two lanes. Two alternative bodies are visible:
//  - vector path: fmax.d of `a` with its lane-swapped copy;
//  - scalar path: returns a0 when a0 is NaN or a0 > a1, otherwise a1.
// NOTE(review): the preprocessor conditional selecting between them and the
// vector path's return statement are missing from this listing.
1120EIGEN_STRONG_INLINE
double predux_max<Packet2d>(
const Packet2d& a) {
1124 Packet2d swapped = (Packet2d)__builtin_msa_shf_w((Packet4i)a, EIGEN_MSA_SHF_I8(2, 3, 0, 1));
1125 Packet2d v = __builtin_msa_fmax_d(a, swapped);
1128 double a0 = a[0], a1 = a[1];
1129 return ((numext::isnan)(a0) || a0 > a1) ? a0 : a1;
// Lane-wise double square root via the MSA fsqrt.d builtin.
1134EIGEN_STRONG_INLINE Packet2d psqrt(
const Packet2d& a) {
1137 return __builtin_msa_fsqrt_d(a);
// Lane-wise reciprocal square root. Two alternative bodies are visible:
//  - the frsqrt.d builtin;
//  - 1 / sqrt(a), where `ones` is integer 1 converted to double by ffint_s.d.
// NOTE(review): the preprocessor conditional selecting between the two bodies
// is missing from this listing.
1141EIGEN_STRONG_INLINE Packet2d prsqrt(
const Packet2d& a) {
1145 return __builtin_msa_frsqrt_d(a);
1147 Packet2d ones = __builtin_msa_ffint_s_d(__builtin_msa_ldi_d(1));
1148 return pdiv(ones, psqrt(a));
// Streams a 2x2 double packet block, one packet per line, wrapped in "[ ... ]".
// The std::endl also flushes the stream.
// NOTE(review): the return statement and closing brace are not in this listing.
1152inline std::ostream& operator<<(std::ostream& os,
const PacketBlock<Packet2d, 2>& value) {
1153 os <<
"[ " << value.packet[0] <<
"," << std::endl <<
" " << value.packet[1] <<
" ]";
// In-place 2x2 transpose: ilvev.d gathers the even (first) lanes of both rows
// into the new row 0, ilvod.d gathers the odd (second) lanes into row 1.
// Both results are computed before either row is overwritten.
1157EIGEN_DEVICE_FUNC
inline void ptranspose(PacketBlock<Packet2d, 2>& kernel) {
1160 Packet2d trn1 = (Packet2d)__builtin_msa_ilvev_d((v2i64)kernel.packet[1], (v2i64)kernel.packet[0]);
1161 Packet2d trn2 = (Packet2d)__builtin_msa_ilvod_d((v2i64)kernel.packet[1], (v2i64)kernel.packet[0]);
1162 kernel.packet[0] = trn1;
1163 kernel.packet[1] = trn2;
// Floor via a temporary rounding-mode switch in inline assembly:
// reads MSA control register $1 into old_mode, sets its two low bits to 3,
// rounds `v` to integral with frint.d, then restores the saved register value.
// NOTE(review): low bits = 3 is presumably the round-toward-minus-infinity
// mode - confirm against the MSA control register spec. The `v` declaration,
// the asm statement header, the remaining operands and the return are missing
// from this listing.
1167EIGEN_STRONG_INLINE Packet2d pfloor<Packet2d>(
const Packet2d& a) {
1169 int32_t old_mode, new_mode;
1171 "cfcmsa %[old_mode], $1\n"
1172 "ori %[new_mode], %[old_mode], 3\n"
1173 "ctcmsa $1, %[new_mode]\n"
1174 "frint.d %w[v], %w[v]\n"
1175 "ctcmsa $1, %[old_mode]\n"
1177 [old_mode]
"=r"(old_mode), [new_mode]
"=r"(new_mode),
// Ceil via a temporary rounding-mode switch in inline assembly:
// reads MSA control register $1 into old_mode, forces its two low bits to 2
// (ori 3 followed by xori 1), rounds `v` with frint.d, then restores the
// saved register value.
// NOTE(review): low bits = 2 is presumably the round-toward-plus-infinity
// mode - confirm against the MSA control register spec. The `v` declaration,
// the asm statement header, the remaining operands and the return are missing
// from this listing.
1186EIGEN_STRONG_INLINE Packet2d pceil<Packet2d>(
const Packet2d& a) {
1188 int32_t old_mode, new_mode;
1190 "cfcmsa %[old_mode], $1\n"
1191 "ori %[new_mode], %[old_mode], 3\n"
1192 "xori %[new_mode], %[new_mode], 1\n"
1193 "ctcmsa $1, %[new_mode]\n"
1194 "frint.d %w[v], %w[v]\n"
1195 "ctcmsa $1, %[old_mode]\n"
1197 [old_mode]
"=r"(old_mode), [new_mode]
"=r"(new_mode),
// Round via a temporary rounding-mode switch in inline assembly:
// reads MSA control register $1 into old_mode, clears its two low bits
// (ori 3 followed by xori 3), rounds `v` with frint.d, then restores the
// saved register value.
// NOTE(review): low bits = 0 is presumably round-to-nearest-even - confirm
// against the MSA control register spec. The `v` declaration, the asm
// statement header, the remaining operands and the return are missing from
// this listing.
1206EIGEN_STRONG_INLINE Packet2d pround<Packet2d>(
const Packet2d& a) {
1208 int32_t old_mode, new_mode;
1210 "cfcmsa %[old_mode], $1\n"
1211 "ori %[new_mode], %[old_mode], 3\n"
1212 "xori %[new_mode], %[new_mode], 3\n"
1213 "ctcmsa $1, %[new_mode]\n"
1214 "frint.d %w[v], %w[v]\n"
1215 "ctcmsa $1, %[old_mode]\n"
1217 [old_mode]
"=r"(old_mode), [new_mode]
"=r"(new_mode),
// Per-lane select between thenPacket and elsePacket.
// The mask is all-ones in lanes whose selector equals 0; bsel.v takes
// elsePacket in those lanes and thenPacket in the others, i.e. a non-zero
// selector picks thenPacket.
1226EIGEN_STRONG_INLINE Packet2d pblend(
const Selector<2>& ifPacket,
const Packet2d& thenPacket,
1227 const Packet2d& elsePacket) {
1228 Packet2ul select = {ifPacket.select[0], ifPacket.select[1]};
1229 Packet2l mask = __builtin_msa_ceqi_d((Packet2l)select, 0);
1230 return (Packet2d)__builtin_msa_bsel_v((v16u8)mask, (v16u8)thenPacket, (v16u8)elsePacket);
@ Aligned16
Definition Constants.h:237
Namespace containing all symbols from the Eigen library.
Definition Core:137