HVX/PacketMath.h

#ifndef EIGEN_HVX_PACKET_MATH_H
#define EIGEN_HVX_PACKET_MATH_H

// Only 128-byte HVX is supported for now.
// Floating-point operations are supported only since V68.
#if defined __HVX__ && (__HVX_LENGTH__ == 128) && __HVX_ARCH__ >= 68

// The floating-point operations do not follow the IEEE 754 standard.
// From the HVX documentation:
// There is no concept of infinity or NaN. QFloat saturates to the maximum
// exponent with the maximum positive or minimum negative significand.

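// Illustrative consequence (a sketch, not from the HVX docs): arithmetic
// that would produce +/-inf under IEEE 754 instead saturates to the largest
// finite magnitude, and NaN-propagation tests are not meaningful here, e.g.:
//   Packet32f big = pset1<Packet32f>(3.4e38f);
//   Packet32f sat = pmul(big, big);  // saturates; does not overflow to +inf
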
#ifndef EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS
#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 32
#endif

namespace Eigen {
namespace internal {

// HVX utilities.

template <int D>
EIGEN_STRONG_INLINE HVX_Vector HVX_vmem(const void* m) {
  HVX_Vector v;
#if EIGEN_COMP_CLANG
  // Use inline assembly for an aligned vmem load on unaligned memory.
  // Casting to HVX_Vector* instead may mislead the compiler about the data's
  // alignment.
  __asm__("%0 = vmem(%1+#%2)" : "=v"(v) : "r"(m), "i"(D) : "memory");
#else
  void* aligned_mem =
      reinterpret_cast<void*>((reinterpret_cast<uintptr_t>(m) & ~(__HVX_LENGTH__ - 1)) + D * __HVX_LENGTH__);
  memcpy(&v, aligned_mem, __HVX_LENGTH__);
#endif
  return v;
}
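
// HVX_vmem<D>(m) thus returns the D-th 128-byte-aligned vector at or after
// the aligned vector containing m. A sketch of how the partial loads below
// use it to assemble an unaligned vector from two aligned loads:
//   HVX_Vector lo = HVX_vmem<0>(p);  // aligned vector covering p
//   HVX_Vector hi = HVX_vmem<1>(p);  // the next aligned vector
//   HVX_Vector v = Q6_V_valign_VVR(hi, lo, reinterpret_cast<uintptr_t>(p));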

template <typename T>
EIGEN_STRONG_INLINE HVX_Vector HVX_load(const T* mem) {
  HVX_Vector v;
  memcpy(&v, reinterpret_cast<const HVX_Vector*>(mem), __HVX_LENGTH__);
  return v;
}

template <typename T>
EIGEN_STRONG_INLINE HVX_Vector HVX_loadu(const T* mem) {
  HVX_Vector v;
  memcpy(&v, mem, __HVX_LENGTH__);
  return v;
}

template <size_t Size, size_t Alignment, typename T>
EIGEN_STRONG_INLINE HVX_Vector HVX_load_partial(const T* mem) {
#if defined(EIGEN_HVX_FAST_PARTIAL_VECTOR_LOAD)
  // Fast partial vector load through aligned vmem loads.
  // The loads may read past the end of the array, but they are aligned, which
  // prevents a memory fault.
  HVX_Vector v0 = HVX_vmem<0>(mem);
  HVX_Vector v1 = v0;
  uintptr_t mem_addr = reinterpret_cast<uintptr_t>(mem);
  EIGEN_IF_CONSTEXPR(Size * sizeof(T) <= Alignment) {
    // Data no larger than the alignment never crosses two aligned vectors.
    v1 = v0;
  }
  else {
    uintptr_t left_off = mem_addr & (__HVX_LENGTH__ - 1);
    if (left_off + Size * sizeof(T) > __HVX_LENGTH__) {
      v1 = HVX_vmem<1>(mem);
    } else {
      v1 = v0;
    }
  }
  return Q6_V_valign_VVR(v1, v0, mem_addr);
#else
  HVX_Vector v;
  memcpy(&v, mem, Size * sizeof(T));
  return v;
#endif
}
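
// Sketch of the fast path above for a load whose data straddles an alignment
// boundary (x marks requested bytes):
//   aligned vector 0: [ ......... x x x ]  -> v0 (leading bytes)
//   aligned vector 1: [ x x ......... ]    -> v1 (trailing bytes)
// valign rotates the v1:v0 pair so the requested bytes start at lane 0; any
// lanes beyond Size elements hold garbage the caller must ignore.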

template <typename T>
EIGEN_STRONG_INLINE void HVX_store(T* mem, HVX_Vector v) {
  memcpy(reinterpret_cast<HVX_Vector*>(mem), &v, __HVX_LENGTH__);
}

template <typename T>
EIGEN_STRONG_INLINE void HVX_storeu(T* mem, HVX_Vector v) {
  memcpy(mem, &v, __HVX_LENGTH__);
}

template <size_t Size, size_t Alignment, typename T>
EIGEN_STRONG_INLINE void HVX_store_partial(T* mem, HVX_Vector v) {
  uintptr_t mem_addr = reinterpret_cast<uintptr_t>(mem);
  HVX_Vector value = Q6_V_vlalign_VVR(v, v, mem_addr);
  uintptr_t left_off = mem_addr & (__HVX_LENGTH__ - 1);
  uintptr_t right_off = left_off + Size * sizeof(T);

  HVX_VectorPred ql_not = Q6_Q_vsetq_R(mem_addr);
  HVX_VectorPred qr = Q6_Q_vsetq2_R(right_off);

  EIGEN_IF_CONSTEXPR(Size * sizeof(T) > Alignment) {
    if (right_off > __HVX_LENGTH__) {
      Q6_vmem_QRIV(qr, mem + __HVX_LENGTH__ / sizeof(T), value);
      qr = Q6_Q_vcmp_eq_VbVb(value, value);
    }
  }

  ql_not = Q6_Q_or_QQn(ql_not, qr);
  Q6_vmem_QnRIV(ql_not, mem, value);
}
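
// Descriptive sketch of the masked store above: value is rotated so its bytes
// line up with the aligned vector containing mem; ql_not flags the bytes
// before mem in that vector and qr the bytes up to the end of the data. If
// the data spills into the next aligned vector, the spill is stored first
// under qr, and qr is then forced all-true so the main store keeps every byte
// from mem to the end of its vector. The final predicated store writes only
// bytes that are both at or after mem and inside the data.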

// Packet definitions.
enum class HVXPacketSize {
  Full,
  Half,
  Quarter,
};

// The Hexagon compiler uses the same HVX_Vector to represent all HVX vector
// types. Wrap each logical vector type (float32, int32, etc.) in its own
// class with an explicit constructor and back-and-forth casting to
// HVX_Vector.
template <HVXPacketSize T>
class HVXPacket {
 public:
  HVXPacket() = default;
  static HVXPacket Create(HVX_Vector v) { return HVXPacket(v); }
  HVX_Vector Get() const { return m_val; }

 private:
  explicit HVXPacket(HVX_Vector v) : m_val(v) {}
  HVX_Vector m_val = Q6_V_vzero();
};
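
// Usage sketch (illustrative only): the wrappers make the three packet sizes
// distinct to the C++ type system even though they share one register type,
// so the specializations below cannot accidentally mix packet sizes.
//   Packet32f p = Packet32f::Create(Q6_V_vzero());  // wrap a raw vector
//   HVX_Vector raw = p.Get();                       // unwrap for intrinsics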

typedef HVXPacket<HVXPacketSize::Full> Packet32f;
typedef HVXPacket<HVXPacketSize::Half> Packet16f;
typedef HVXPacket<HVXPacketSize::Quarter> Packet8f;

// Packet traits.
template <>
struct packet_traits<float> : default_packet_traits {
  typedef Packet32f type;
  typedef Packet16f half;
  enum {
    Vectorizable = 1,
    AlignedOnScalar = 1,
    size = 32,

    HasCmp = 1,
    HasAdd = 1,
    HasSub = 1,
    HasShift = 0,
    HasMul = 1,
    HasNegate = 1,
    HasAbs = 1,
    HasArg = 0,
    HasAbs2 = 0,
    HasAbsDiff = 0,
    HasMin = 1,
    HasMax = 1,
    HasConj = 0,
    HasSetLinear = 0,
    HasBlend = 0,

    HasDiv = 0,

    HasSin = 0,
    HasCos = 0,
    HasACos = 0,
    HasASin = 0,
    HasATan = 0,
    HasATanh = 0,
    HasLog = 0,
    HasExp = 0,
    HasSqrt = 0,
    HasRsqrt = 0,
    HasTanh = 0,
    HasErf = 0,
    HasBessel = 0,
    HasNdtri = 0
  };
};

template <>
struct unpacket_traits<Packet32f> {
  typedef float type;
  typedef Packet16f half;
  enum {
    size = 32,
    alignment = Aligned128,
    vectorizable = true,
    masked_load_available = false,
    masked_store_available = false
  };
};

template <>
struct unpacket_traits<Packet16f> {
  typedef float type;
  typedef Packet8f half;
  enum {
    size = 16,
    // Much code assumes alignment equal to the packet size instead of
    // following this trait, so we do not use Aligned128 to optimize aligned
    // load/store.
    alignment = Aligned64,
    vectorizable = true,
    masked_load_available = false,
    masked_store_available = false
  };
};

template <>
struct unpacket_traits<Packet8f> {
  typedef float type;
  typedef Packet8f half;
  enum {
    size = 8,
    // Much code assumes alignment equal to the packet size instead of
    // following this trait, so we do not use Aligned128 to optimize aligned
    // load/store.
    alignment = Aligned32,
    vectorizable = true,
    masked_load_available = false,
    masked_store_available = false
  };
};

// float32 operations.
template <HVXPacketSize T>
EIGEN_STRONG_INLINE HVXPacket<T> pzero_hvx(const HVXPacket<T>&) {
  return HVXPacket<T>::Create(Q6_V_vzero());
}
template <>
EIGEN_STRONG_INLINE Packet32f pzero<Packet32f>(const Packet32f&) {
  return pzero_hvx(Packet32f());
}
template <>
EIGEN_STRONG_INLINE Packet16f pzero<Packet16f>(const Packet16f&) {
  return pzero_hvx(Packet16f());
}
template <>
EIGEN_STRONG_INLINE Packet8f pzero<Packet8f>(const Packet8f&) {
  return pzero_hvx(Packet8f());
}

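// Rotate the upper half of the packet onto the lower half and add: lane i of
// the result is a[i] + a[i + size/2] for i < size/2, which is exactly the
// half-sized packet Eigen expects from predux_half_dowto4.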
template <HVXPacketSize T>
EIGEN_STRONG_INLINE typename unpacket_traits<HVXPacket<T>>::half predux_half_dowto4_hvx(const HVXPacket<T>& a) {
  const Index packet_size = unpacket_traits<HVXPacket<T>>::size;
  return unpacket_traits<HVXPacket<T>>::half::Create(
      Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_VsfVsf(Q6_V_vror_VR(a.Get(), sizeof(float) * packet_size / 2), a.Get())));
}
template <>
EIGEN_STRONG_INLINE Packet16f predux_half_dowto4(const Packet32f& a) {
  return predux_half_dowto4_hvx(a);
}
template <>
EIGEN_STRONG_INLINE Packet8f predux_half_dowto4(const Packet16f& a) {
  return predux_half_dowto4_hvx(a);
}

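// Splat the bit pattern of the scalar to every 32-bit lane; the union
// reinterprets the float's bits as an int32 for Q6_V_vsplat_R.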
template <HVXPacketSize T>
EIGEN_STRONG_INLINE HVXPacket<T> pset1_hvx(const float& from) {
  union {
    float f;
    int32_t i;
  } u;
  u.f = from;
  return HVXPacket<T>::Create(Q6_V_vsplat_R(u.i));
}
template <>
EIGEN_STRONG_INLINE Packet32f pset1<Packet32f>(const float& from) {
  return pset1_hvx<HVXPacketSize::Full>(from);
}
template <>
EIGEN_STRONG_INLINE Packet16f pset1<Packet16f>(const float& from) {
  return pset1_hvx<HVXPacketSize::Half>(from);
}
template <>
EIGEN_STRONG_INLINE Packet8f pset1<Packet8f>(const float& from) {
  return pset1_hvx<HVXPacketSize::Quarter>(from);
}

template <>
EIGEN_STRONG_INLINE Packet32f pload<Packet32f>(const float* from) {
  return Packet32f::Create(HVX_load(from));
}
template <>
EIGEN_STRONG_INLINE Packet16f pload<Packet16f>(const float* from) {
  return Packet16f::Create(
      HVX_load_partial<unpacket_traits<Packet16f>::size, unpacket_traits<Packet16f>::alignment>(from));
}
template <>
EIGEN_STRONG_INLINE Packet8f pload<Packet8f>(const float* from) {
  return Packet8f::Create(
      HVX_load_partial<unpacket_traits<Packet8f>::size, unpacket_traits<Packet8f>::alignment>(from));
}

template <>
EIGEN_STRONG_INLINE Packet32f ploadu<Packet32f>(const float* from) {
  return Packet32f::Create(HVX_loadu(from));
}
template <>
EIGEN_STRONG_INLINE Packet16f ploadu<Packet16f>(const float* from) {
  return Packet16f::Create(HVX_load_partial<unpacket_traits<Packet16f>::size, 0>(from));
}
template <>
EIGEN_STRONG_INLINE Packet8f ploadu<Packet8f>(const float* from) {
  return Packet8f::Create(HVX_load_partial<unpacket_traits<Packet8f>::size, 0>(from));
}

template <>
EIGEN_STRONG_INLINE void pstore<float>(float* to, const Packet32f& from) {
  HVX_store(to, from.Get());
}
template <>
EIGEN_STRONG_INLINE void pstore<float>(float* to, const Packet16f& from) {
  HVX_store_partial<unpacket_traits<Packet16f>::size, unpacket_traits<Packet16f>::alignment>(to, from.Get());
}
template <>
EIGEN_STRONG_INLINE void pstore<float>(float* to, const Packet8f& from) {
  HVX_store_partial<unpacket_traits<Packet8f>::size, unpacket_traits<Packet8f>::alignment>(to, from.Get());
}

template <>
EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const Packet32f& from) {
  HVX_storeu(to, from.Get());
}
template <>
EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const Packet16f& from) {
  HVX_store_partial<unpacket_traits<Packet16f>::size, 0>(to, from.Get());
}
template <>
EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const Packet8f& from) {
  HVX_store_partial<unpacket_traits<Packet8f>::size, 0>(to, from.Get());
}

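// The arithmetic below runs through Qualcomm's intermediate qfloat format:
// Q6_Vqf32_* intrinsics compute in qf32 and Q6_Vsf_equals_Vqf32 converts the
// result back to standard sf (IEEE bit layout) lanes.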
template <HVXPacketSize T>
EIGEN_STRONG_INLINE HVXPacket<T> pmul_hvx(const HVXPacket<T>& a, const HVXPacket<T>& b) {
  return HVXPacket<T>::Create(Q6_Vsf_equals_Vqf32(Q6_Vqf32_vmpy_VsfVsf(a.Get(), b.Get())));
}
template <>
EIGEN_STRONG_INLINE Packet32f pmul<Packet32f>(const Packet32f& a, const Packet32f& b) {
  return pmul_hvx(a, b);
}
template <>
EIGEN_STRONG_INLINE Packet16f pmul<Packet16f>(const Packet16f& a, const Packet16f& b) {
  return pmul_hvx(a, b);
}
template <>
EIGEN_STRONG_INLINE Packet8f pmul<Packet8f>(const Packet8f& a, const Packet8f& b) {
  return pmul_hvx(a, b);
}

template <HVXPacketSize T>
EIGEN_STRONG_INLINE HVXPacket<T> padd_hvx(const HVXPacket<T>& a, const HVXPacket<T>& b) {
  return HVXPacket<T>::Create(Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_VsfVsf(a.Get(), b.Get())));
}
template <>
EIGEN_STRONG_INLINE Packet32f padd<Packet32f>(const Packet32f& a, const Packet32f& b) {
  return padd_hvx(a, b);
}
template <>
EIGEN_STRONG_INLINE Packet16f padd<Packet16f>(const Packet16f& a, const Packet16f& b) {
  return padd_hvx(a, b);
}
template <>
EIGEN_STRONG_INLINE Packet8f padd<Packet8f>(const Packet8f& a, const Packet8f& b) {
  return padd_hvx(a, b);
}

template <HVXPacketSize T>
EIGEN_STRONG_INLINE HVXPacket<T> psub_hvx(const HVXPacket<T>& a, const HVXPacket<T>& b) {
  return HVXPacket<T>::Create(Q6_Vsf_equals_Vqf32(Q6_Vqf32_vsub_VsfVsf(a.Get(), b.Get())));
}
template <>
EIGEN_STRONG_INLINE Packet32f psub<Packet32f>(const Packet32f& a, const Packet32f& b) {
  return psub_hvx(a, b);
}
template <>
EIGEN_STRONG_INLINE Packet16f psub<Packet16f>(const Packet16f& a, const Packet16f& b) {
  return psub_hvx(a, b);
}
template <>
EIGEN_STRONG_INLINE Packet8f psub<Packet8f>(const Packet8f& a, const Packet8f& b) {
  return psub_hvx(a, b);
}

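// Negation just flips the sign bit with an XOR (pabs further below clears it
// with an AND mask); sf lanes keep the IEEE bit layout even though the
// arithmetic above is not IEEE-compliant.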
template <HVXPacketSize T>
EIGEN_STRONG_INLINE HVXPacket<T> pnegate_hvx(const HVXPacket<T>& a) {
  return HVXPacket<T>::Create(a.Get() ^ Q6_V_vsplat_R(0x80000000));
}
template <>
EIGEN_STRONG_INLINE Packet32f pnegate(const Packet32f& a) {
  return pnegate_hvx(a);
}
template <>
EIGEN_STRONG_INLINE Packet16f pnegate(const Packet16f& a) {
  return pnegate_hvx(a);
}
template <>
EIGEN_STRONG_INLINE Packet8f pnegate(const Packet8f& a) {
  return pnegate_hvx(a);
}

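// Comparisons return the usual Eigen float mask: all-ones bits per lane for
// true, all zeros for false. pcmp_le is computed as !(a > b). With no NaN
// representation on HVX, pcmp_lt_or_nan degenerates to pcmp_lt, which is why
// the two implementations below are identical.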
template <HVXPacketSize T>
EIGEN_STRONG_INLINE HVXPacket<T> pcmp_le_hvx(const HVXPacket<T>& a, const HVXPacket<T>& b) {
  HVX_Vector v_true = Q6_Vb_vsplat_R(0xff);
  HVX_VectorPred pred = Q6_Q_vcmp_gt_VsfVsf(a.Get(), b.Get());
  return HVXPacket<T>::Create(Q6_V_vmux_QVV(pred, Q6_V_vzero(), v_true));
}
template <>
EIGEN_STRONG_INLINE Packet32f pcmp_le(const Packet32f& a, const Packet32f& b) {
  return pcmp_le_hvx(a, b);
}
template <>
EIGEN_STRONG_INLINE Packet16f pcmp_le(const Packet16f& a, const Packet16f& b) {
  return pcmp_le_hvx(a, b);
}
template <>
EIGEN_STRONG_INLINE Packet8f pcmp_le(const Packet8f& a, const Packet8f& b) {
  return pcmp_le_hvx(a, b);
}

template <HVXPacketSize T>
EIGEN_STRONG_INLINE HVXPacket<T> pcmp_eq_hvx(const HVXPacket<T>& a, const HVXPacket<T>& b) {
  HVX_Vector v_true = Q6_Vb_vsplat_R(0xff);
  HVX_VectorPred pred = Q6_Q_vcmp_eq_VwVw(a.Get(), b.Get());
  return HVXPacket<T>::Create(Q6_V_vmux_QVV(pred, v_true, Q6_V_vzero()));
}
template <>
EIGEN_STRONG_INLINE Packet32f pcmp_eq(const Packet32f& a, const Packet32f& b) {
  return pcmp_eq_hvx(a, b);
}
template <>
EIGEN_STRONG_INLINE Packet16f pcmp_eq(const Packet16f& a, const Packet16f& b) {
  return pcmp_eq_hvx(a, b);
}
template <>
EIGEN_STRONG_INLINE Packet8f pcmp_eq(const Packet8f& a, const Packet8f& b) {
  return pcmp_eq_hvx(a, b);
}

template <HVXPacketSize T>
EIGEN_STRONG_INLINE HVXPacket<T> pcmp_lt_hvx(const HVXPacket<T>& a, const HVXPacket<T>& b) {
  HVX_Vector v_true = Q6_Vb_vsplat_R(0xff);
  HVX_VectorPred pred = Q6_Q_vcmp_gt_VsfVsf(b.Get(), a.Get());
  return HVXPacket<T>::Create(Q6_V_vmux_QVV(pred, v_true, Q6_V_vzero()));
}
template <>
EIGEN_STRONG_INLINE Packet32f pcmp_lt(const Packet32f& a, const Packet32f& b) {
  return pcmp_lt_hvx(a, b);
}
template <>
EIGEN_STRONG_INLINE Packet16f pcmp_lt(const Packet16f& a, const Packet16f& b) {
  return pcmp_lt_hvx(a, b);
}
template <>
EIGEN_STRONG_INLINE Packet8f pcmp_lt(const Packet8f& a, const Packet8f& b) {
  return pcmp_lt_hvx(a, b);
}

template <HVXPacketSize T>
EIGEN_STRONG_INLINE HVXPacket<T> pcmp_lt_or_nan_hvx(const HVXPacket<T>& a, const HVXPacket<T>& b) {
  HVX_Vector v_true = Q6_Vb_vsplat_R(0xff);
  HVX_VectorPred pred = Q6_Q_vcmp_gt_VsfVsf(b.Get(), a.Get());
  return HVXPacket<T>::Create(Q6_V_vmux_QVV(pred, v_true, Q6_V_vzero()));
}
template <>
EIGEN_STRONG_INLINE Packet32f pcmp_lt_or_nan(const Packet32f& a, const Packet32f& b) {
  return pcmp_lt_or_nan_hvx(a, b);
}
template <>
EIGEN_STRONG_INLINE Packet16f pcmp_lt_or_nan(const Packet16f& a, const Packet16f& b) {
  return pcmp_lt_or_nan_hvx(a, b);
}
template <>
EIGEN_STRONG_INLINE Packet8f pcmp_lt_or_nan(const Packet8f& a, const Packet8f& b) {
  return pcmp_lt_or_nan_hvx(a, b);
}

template <HVXPacketSize T>
EIGEN_STRONG_INLINE HVXPacket<T> pabs_hvx(const HVXPacket<T>& a) {
  return HVXPacket<T>::Create(a.Get() & Q6_V_vsplat_R(0x7FFFFFFF));
}
template <>
EIGEN_STRONG_INLINE Packet32f pabs(const Packet32f& a) {
  return pabs_hvx(a);
}
template <>
EIGEN_STRONG_INLINE Packet16f pabs(const Packet16f& a) {
  return pabs_hvx(a);
}
template <>
EIGEN_STRONG_INLINE Packet8f pabs(const Packet8f& a) {
  return pabs_hvx(a);
}

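// Extract lane 0 by spilling the vector into overlapping storage through a
// union.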
template <HVXPacketSize T>
EIGEN_STRONG_INLINE float pfirst_hvx(const HVXPacket<T>& a) {
  union {
    float array[1];
    HVX_Vector vector;
  } HVX_and_array;
  HVX_and_array.vector = a.Get();
  return HVX_and_array.array[0];
}
template <>
EIGEN_STRONG_INLINE float pfirst(const Packet32f& a) {
  return pfirst_hvx(a);
}
template <>
EIGEN_STRONG_INLINE float pfirst(const Packet16f& a) {
  return pfirst_hvx(a);
}
template <>
EIGEN_STRONG_INLINE float pfirst(const Packet8f& a) {
  return pfirst_hvx(a);
}

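// The transposes below are butterfly networks built from Q6_W_vshuff_VVR,
// which interleaves two vectors at a given granularity (a negative Rt selects
// the element size in bytes: -4 pairs 32-bit lanes, -8 pairs 64-bit blocks,
// and so on). Each stage doubles the block size, so the full 32x32 float
// transpose needs log2(32) = 5 shuffle stages (-4 through -64). The
// partial-packet variants finish with Q6_V_valign_VVR rotations to slice
// sub-vectors out of the shuffled results.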
EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet32f, 4>& kernel) {
  // Shuffle the 32-bit lanes.
  HVX_VectorPair v_0_1_0 = Q6_W_vshuff_VVR(kernel.packet[1].Get(), kernel.packet[0].Get(), -4);
  HVX_VectorPair v_0_3_2 = Q6_W_vshuff_VVR(kernel.packet[3].Get(), kernel.packet[2].Get(), -4);

  // Shuffle the 64-bit lanes.
  HVX_VectorPair v_1_1_0 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_3_2), HEXAGON_HVX_GET_V0(v_0_1_0), -8);
  HVX_VectorPair v_1_3_2 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_0_3_2), HEXAGON_HVX_GET_V1(v_0_1_0), -8);
  kernel.packet[0] = Packet32f::Create(HEXAGON_HVX_GET_V0(v_1_1_0));
  kernel.packet[1] = Packet32f::Create(HEXAGON_HVX_GET_V1(v_1_1_0));
  kernel.packet[2] = Packet32f::Create(HEXAGON_HVX_GET_V0(v_1_3_2));
  kernel.packet[3] = Packet32f::Create(HEXAGON_HVX_GET_V1(v_1_3_2));
}
EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet16f, 4>& kernel) {
  // Shuffle the 32-bit lanes.
  HVX_VectorPair v_0_1_0 = Q6_W_vshuff_VVR(kernel.packet[1].Get(), kernel.packet[0].Get(), -4);
  HVX_VectorPair v_0_3_2 = Q6_W_vshuff_VVR(kernel.packet[3].Get(), kernel.packet[2].Get(), -4);

  // Shuffle the 64-bit lanes.
  HVX_VectorPair v_1_1_0 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_3_2), HEXAGON_HVX_GET_V0(v_0_1_0), -8);

  kernel.packet[0] = Packet16f::Create(HEXAGON_HVX_GET_V0(v_1_1_0));
  kernel.packet[1] = Packet16f::Create(Q6_V_valign_VVR(HEXAGON_HVX_GET_V0(v_1_1_0), HEXAGON_HVX_GET_V0(v_1_1_0), 64));
  kernel.packet[2] = Packet16f::Create(HEXAGON_HVX_GET_V1(v_1_1_0));
  kernel.packet[3] = Packet16f::Create(Q6_V_valign_VVR(HEXAGON_HVX_GET_V1(v_1_1_0), HEXAGON_HVX_GET_V1(v_1_1_0), 64));
}
EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet8f, 4>& kernel) {
  // Shuffle the 32-bit lanes.
  HVX_VectorPair v_0_1_0 = Q6_W_vshuff_VVR(kernel.packet[1].Get(), kernel.packet[0].Get(), -4);
  HVX_VectorPair v_0_3_2 = Q6_W_vshuff_VVR(kernel.packet[3].Get(), kernel.packet[2].Get(), -4);

  // Shuffle the 64-bit lanes.
  HVX_VectorPair v_1_1_0 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_3_2), HEXAGON_HVX_GET_V0(v_0_1_0), -8);

  kernel.packet[0] = Packet8f::Create(HEXAGON_HVX_GET_V0(v_1_1_0));
  kernel.packet[1] = Packet8f::Create(Q6_V_valign_VVR(HEXAGON_HVX_GET_V0(v_1_1_0), HEXAGON_HVX_GET_V0(v_1_1_0), 32));
  kernel.packet[2] = Packet8f::Create(Q6_V_valign_VVR(HEXAGON_HVX_GET_V0(v_1_1_0), HEXAGON_HVX_GET_V0(v_1_1_0), 64));
  kernel.packet[3] = Packet8f::Create(Q6_V_valign_VVR(HEXAGON_HVX_GET_V0(v_1_1_0), HEXAGON_HVX_GET_V0(v_1_1_0), 96));
}

EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet8f, 8>& kernel) {
  // Shuffle the 32-bit lanes.
  HVX_VectorPair v_0_1_0 = Q6_W_vshuff_VVR(kernel.packet[1].Get(), kernel.packet[0].Get(), -4);
  HVX_VectorPair v_0_3_2 = Q6_W_vshuff_VVR(kernel.packet[3].Get(), kernel.packet[2].Get(), -4);
  HVX_VectorPair v_0_5_4 = Q6_W_vshuff_VVR(kernel.packet[5].Get(), kernel.packet[4].Get(), -4);
  HVX_VectorPair v_0_7_6 = Q6_W_vshuff_VVR(kernel.packet[7].Get(), kernel.packet[6].Get(), -4);

  // Shuffle the 64-bit lanes.
  HVX_VectorPair v_1_1_0 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_3_2), HEXAGON_HVX_GET_V0(v_0_1_0), -8);
  HVX_VectorPair v_1_3_2 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_7_6), HEXAGON_HVX_GET_V0(v_0_5_4), -8);

  // Shuffle the 128-bit lanes.
  v_0_1_0 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_1_3_2), HEXAGON_HVX_GET_V0(v_1_1_0), -16);

  kernel.packet[0] = Packet8f::Create(HEXAGON_HVX_GET_V0(v_0_1_0));
  kernel.packet[1] = Packet8f::Create(Q6_V_valign_VVR(HEXAGON_HVX_GET_V0(v_0_1_0), HEXAGON_HVX_GET_V0(v_0_1_0), 32));
  kernel.packet[2] = Packet8f::Create(Q6_V_valign_VVR(HEXAGON_HVX_GET_V0(v_0_1_0), HEXAGON_HVX_GET_V0(v_0_1_0), 64));
  kernel.packet[3] = Packet8f::Create(Q6_V_valign_VVR(HEXAGON_HVX_GET_V0(v_0_1_0), HEXAGON_HVX_GET_V0(v_0_1_0), 96));
  kernel.packet[4] = Packet8f::Create(HEXAGON_HVX_GET_V1(v_0_1_0));
  kernel.packet[5] = Packet8f::Create(Q6_V_valign_VVR(HEXAGON_HVX_GET_V1(v_0_1_0), HEXAGON_HVX_GET_V1(v_0_1_0), 32));
  kernel.packet[6] = Packet8f::Create(Q6_V_valign_VVR(HEXAGON_HVX_GET_V1(v_0_1_0), HEXAGON_HVX_GET_V1(v_0_1_0), 64));
  kernel.packet[7] = Packet8f::Create(Q6_V_valign_VVR(HEXAGON_HVX_GET_V1(v_0_1_0), HEXAGON_HVX_GET_V1(v_0_1_0), 96));
}
EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet16f, 16>& kernel) {
  // Shuffle the 32-bit lanes.
  HVX_VectorPair v_0_1_0 = Q6_W_vshuff_VVR(kernel.packet[1].Get(), kernel.packet[0].Get(), -4);
  HVX_VectorPair v_0_3_2 = Q6_W_vshuff_VVR(kernel.packet[3].Get(), kernel.packet[2].Get(), -4);
  HVX_VectorPair v_0_5_4 = Q6_W_vshuff_VVR(kernel.packet[5].Get(), kernel.packet[4].Get(), -4);
  HVX_VectorPair v_0_7_6 = Q6_W_vshuff_VVR(kernel.packet[7].Get(), kernel.packet[6].Get(), -4);
  HVX_VectorPair v_0_9_8 = Q6_W_vshuff_VVR(kernel.packet[9].Get(), kernel.packet[8].Get(), -4);
  HVX_VectorPair v_0_11_10 = Q6_W_vshuff_VVR(kernel.packet[11].Get(), kernel.packet[10].Get(), -4);
  HVX_VectorPair v_0_13_12 = Q6_W_vshuff_VVR(kernel.packet[13].Get(), kernel.packet[12].Get(), -4);
  HVX_VectorPair v_0_15_14 = Q6_W_vshuff_VVR(kernel.packet[15].Get(), kernel.packet[14].Get(), -4);

  // Shuffle the 64-bit lanes.
  HVX_VectorPair v_1_1_0 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_3_2), HEXAGON_HVX_GET_V0(v_0_1_0), -8);
  HVX_VectorPair v_1_3_2 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_7_6), HEXAGON_HVX_GET_V0(v_0_5_4), -8);
  HVX_VectorPair v_1_5_4 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_11_10), HEXAGON_HVX_GET_V0(v_0_9_8), -8);
  HVX_VectorPair v_1_7_6 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_15_14), HEXAGON_HVX_GET_V0(v_0_13_12), -8);

  // Shuffle the 128-bit lanes.
  v_0_1_0 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_1_3_2), HEXAGON_HVX_GET_V0(v_1_1_0), -16);
  v_0_3_2 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_1_3_2), HEXAGON_HVX_GET_V1(v_1_1_0), -16);
  v_0_9_8 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_1_7_6), HEXAGON_HVX_GET_V0(v_1_5_4), -16);
  v_0_11_10 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_1_7_6), HEXAGON_HVX_GET_V1(v_1_5_4), -16);

  // Shuffle the 256-bit lanes.
  v_1_1_0 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_9_8), HEXAGON_HVX_GET_V0(v_0_1_0), -32);
  v_1_3_2 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_0_9_8), HEXAGON_HVX_GET_V1(v_0_1_0), -32);
  v_1_5_4 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_11_10), HEXAGON_HVX_GET_V0(v_0_3_2), -32);
  v_1_7_6 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_0_11_10), HEXAGON_HVX_GET_V1(v_0_3_2), -32);

  kernel.packet[0] = Packet16f::Create(HEXAGON_HVX_GET_V0(v_1_1_0));
  kernel.packet[1] = Packet16f::Create(Q6_V_valign_VVR(HEXAGON_HVX_GET_V0(v_1_1_0), HEXAGON_HVX_GET_V0(v_1_1_0), 64));
  kernel.packet[2] = Packet16f::Create(HEXAGON_HVX_GET_V1(v_1_1_0));
  kernel.packet[3] = Packet16f::Create(Q6_V_valign_VVR(HEXAGON_HVX_GET_V1(v_1_1_0), HEXAGON_HVX_GET_V1(v_1_1_0), 64));
  kernel.packet[4] = Packet16f::Create(HEXAGON_HVX_GET_V0(v_1_3_2));
  kernel.packet[5] = Packet16f::Create(Q6_V_valign_VVR(HEXAGON_HVX_GET_V0(v_1_3_2), HEXAGON_HVX_GET_V0(v_1_3_2), 64));
  kernel.packet[6] = Packet16f::Create(HEXAGON_HVX_GET_V1(v_1_3_2));
  kernel.packet[7] = Packet16f::Create(Q6_V_valign_VVR(HEXAGON_HVX_GET_V1(v_1_3_2), HEXAGON_HVX_GET_V1(v_1_3_2), 64));
  kernel.packet[8] = Packet16f::Create(HEXAGON_HVX_GET_V0(v_1_5_4));
  kernel.packet[9] = Packet16f::Create(Q6_V_valign_VVR(HEXAGON_HVX_GET_V0(v_1_5_4), HEXAGON_HVX_GET_V0(v_1_5_4), 64));
  kernel.packet[10] = Packet16f::Create(HEXAGON_HVX_GET_V1(v_1_5_4));
  kernel.packet[11] = Packet16f::Create(Q6_V_valign_VVR(HEXAGON_HVX_GET_V1(v_1_5_4), HEXAGON_HVX_GET_V1(v_1_5_4), 64));
  kernel.packet[12] = Packet16f::Create(HEXAGON_HVX_GET_V0(v_1_7_6));
  kernel.packet[13] = Packet16f::Create(Q6_V_valign_VVR(HEXAGON_HVX_GET_V0(v_1_7_6), HEXAGON_HVX_GET_V0(v_1_7_6), 64));
  kernel.packet[14] = Packet16f::Create(HEXAGON_HVX_GET_V1(v_1_7_6));
  kernel.packet[15] = Packet16f::Create(Q6_V_valign_VVR(HEXAGON_HVX_GET_V1(v_1_7_6), HEXAGON_HVX_GET_V1(v_1_7_6), 64));
}
EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet32f, 32>& kernel) {
  // Shuffle the 32-bit lanes.
  HVX_VectorPair v_0_1_0 = Q6_W_vshuff_VVR(kernel.packet[1].Get(), kernel.packet[0].Get(), -4);
  HVX_VectorPair v_0_3_2 = Q6_W_vshuff_VVR(kernel.packet[3].Get(), kernel.packet[2].Get(), -4);
  HVX_VectorPair v_0_5_4 = Q6_W_vshuff_VVR(kernel.packet[5].Get(), kernel.packet[4].Get(), -4);
  HVX_VectorPair v_0_7_6 = Q6_W_vshuff_VVR(kernel.packet[7].Get(), kernel.packet[6].Get(), -4);
  HVX_VectorPair v_0_9_8 = Q6_W_vshuff_VVR(kernel.packet[9].Get(), kernel.packet[8].Get(), -4);
  HVX_VectorPair v_0_11_10 = Q6_W_vshuff_VVR(kernel.packet[11].Get(), kernel.packet[10].Get(), -4);
  HVX_VectorPair v_0_13_12 = Q6_W_vshuff_VVR(kernel.packet[13].Get(), kernel.packet[12].Get(), -4);
  HVX_VectorPair v_0_15_14 = Q6_W_vshuff_VVR(kernel.packet[15].Get(), kernel.packet[14].Get(), -4);
  HVX_VectorPair v_0_17_16 = Q6_W_vshuff_VVR(kernel.packet[17].Get(), kernel.packet[16].Get(), -4);
  HVX_VectorPair v_0_19_18 = Q6_W_vshuff_VVR(kernel.packet[19].Get(), kernel.packet[18].Get(), -4);
  HVX_VectorPair v_0_21_20 = Q6_W_vshuff_VVR(kernel.packet[21].Get(), kernel.packet[20].Get(), -4);
  HVX_VectorPair v_0_23_22 = Q6_W_vshuff_VVR(kernel.packet[23].Get(), kernel.packet[22].Get(), -4);
  HVX_VectorPair v_0_25_24 = Q6_W_vshuff_VVR(kernel.packet[25].Get(), kernel.packet[24].Get(), -4);
  HVX_VectorPair v_0_27_26 = Q6_W_vshuff_VVR(kernel.packet[27].Get(), kernel.packet[26].Get(), -4);
  HVX_VectorPair v_0_29_28 = Q6_W_vshuff_VVR(kernel.packet[29].Get(), kernel.packet[28].Get(), -4);
  HVX_VectorPair v_0_31_30 = Q6_W_vshuff_VVR(kernel.packet[31].Get(), kernel.packet[30].Get(), -4);

  // Shuffle the 64-bit lanes.
  HVX_VectorPair v_1_1_0 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_3_2), HEXAGON_HVX_GET_V0(v_0_1_0), -8);
  HVX_VectorPair v_1_3_2 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_0_3_2), HEXAGON_HVX_GET_V1(v_0_1_0), -8);
  HVX_VectorPair v_1_5_4 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_7_6), HEXAGON_HVX_GET_V0(v_0_5_4), -8);
  HVX_VectorPair v_1_7_6 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_0_7_6), HEXAGON_HVX_GET_V1(v_0_5_4), -8);
  HVX_VectorPair v_1_9_8 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_11_10), HEXAGON_HVX_GET_V0(v_0_9_8), -8);
  HVX_VectorPair v_1_11_10 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_0_11_10), HEXAGON_HVX_GET_V1(v_0_9_8), -8);
  HVX_VectorPair v_1_13_12 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_15_14), HEXAGON_HVX_GET_V0(v_0_13_12), -8);
  HVX_VectorPair v_1_15_14 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_0_15_14), HEXAGON_HVX_GET_V1(v_0_13_12), -8);
  HVX_VectorPair v_1_17_16 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_19_18), HEXAGON_HVX_GET_V0(v_0_17_16), -8);
  HVX_VectorPair v_1_19_18 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_0_19_18), HEXAGON_HVX_GET_V1(v_0_17_16), -8);
  HVX_VectorPair v_1_21_20 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_23_22), HEXAGON_HVX_GET_V0(v_0_21_20), -8);
  HVX_VectorPair v_1_23_22 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_0_23_22), HEXAGON_HVX_GET_V1(v_0_21_20), -8);
  HVX_VectorPair v_1_25_24 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_27_26), HEXAGON_HVX_GET_V0(v_0_25_24), -8);
  HVX_VectorPair v_1_27_26 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_0_27_26), HEXAGON_HVX_GET_V1(v_0_25_24), -8);
  HVX_VectorPair v_1_29_28 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_31_30), HEXAGON_HVX_GET_V0(v_0_29_28), -8);
  HVX_VectorPair v_1_31_30 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_0_31_30), HEXAGON_HVX_GET_V1(v_0_29_28), -8);

  // Shuffle the 128-bit lanes.
  v_0_1_0 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_1_5_4), HEXAGON_HVX_GET_V0(v_1_1_0), -16);
  v_0_3_2 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_1_5_4), HEXAGON_HVX_GET_V1(v_1_1_0), -16);
  v_0_5_4 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_1_7_6), HEXAGON_HVX_GET_V0(v_1_3_2), -16);
  v_0_7_6 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_1_7_6), HEXAGON_HVX_GET_V1(v_1_3_2), -16);
  v_0_9_8 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_1_13_12), HEXAGON_HVX_GET_V0(v_1_9_8), -16);
  v_0_11_10 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_1_13_12), HEXAGON_HVX_GET_V1(v_1_9_8), -16);
  v_0_13_12 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_1_15_14), HEXAGON_HVX_GET_V0(v_1_11_10), -16);
  v_0_15_14 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_1_15_14), HEXAGON_HVX_GET_V1(v_1_11_10), -16);
  v_0_17_16 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_1_21_20), HEXAGON_HVX_GET_V0(v_1_17_16), -16);
  v_0_19_18 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_1_21_20), HEXAGON_HVX_GET_V1(v_1_17_16), -16);
  v_0_21_20 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_1_23_22), HEXAGON_HVX_GET_V0(v_1_19_18), -16);
  v_0_23_22 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_1_23_22), HEXAGON_HVX_GET_V1(v_1_19_18), -16);
  v_0_25_24 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_1_29_28), HEXAGON_HVX_GET_V0(v_1_25_24), -16);
  v_0_27_26 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_1_29_28), HEXAGON_HVX_GET_V1(v_1_25_24), -16);
  v_0_29_28 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_1_31_30), HEXAGON_HVX_GET_V0(v_1_27_26), -16);
  v_0_31_30 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_1_31_30), HEXAGON_HVX_GET_V1(v_1_27_26), -16);

  // Shuffle the 256-bit lanes.
  v_1_1_0 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_9_8), HEXAGON_HVX_GET_V0(v_0_1_0), -32);
  v_1_3_2 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_0_9_8), HEXAGON_HVX_GET_V1(v_0_1_0), -32);
  v_1_5_4 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_11_10), HEXAGON_HVX_GET_V0(v_0_3_2), -32);
  v_1_7_6 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_0_11_10), HEXAGON_HVX_GET_V1(v_0_3_2), -32);
  v_1_9_8 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_13_12), HEXAGON_HVX_GET_V0(v_0_5_4), -32);
  v_1_11_10 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_0_13_12), HEXAGON_HVX_GET_V1(v_0_5_4), -32);
  v_1_13_12 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_15_14), HEXAGON_HVX_GET_V0(v_0_7_6), -32);
  v_1_15_14 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_0_15_14), HEXAGON_HVX_GET_V1(v_0_7_6), -32);
  v_1_17_16 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_25_24), HEXAGON_HVX_GET_V0(v_0_17_16), -32);
  v_1_19_18 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_0_25_24), HEXAGON_HVX_GET_V1(v_0_17_16), -32);
  v_1_21_20 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_27_26), HEXAGON_HVX_GET_V0(v_0_19_18), -32);
  v_1_23_22 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_0_27_26), HEXAGON_HVX_GET_V1(v_0_19_18), -32);
  v_1_25_24 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_29_28), HEXAGON_HVX_GET_V0(v_0_21_20), -32);
  v_1_27_26 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_0_29_28), HEXAGON_HVX_GET_V1(v_0_21_20), -32);
  v_1_29_28 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_31_30), HEXAGON_HVX_GET_V0(v_0_23_22), -32);
  v_1_31_30 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_0_31_30), HEXAGON_HVX_GET_V1(v_0_23_22), -32);

  // Shuffle the 512-bit lanes.
  v_0_1_0 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_1_17_16), HEXAGON_HVX_GET_V0(v_1_1_0), -64);
  v_0_3_2 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_1_17_16), HEXAGON_HVX_GET_V1(v_1_1_0), -64);
  v_0_5_4 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_1_19_18), HEXAGON_HVX_GET_V0(v_1_3_2), -64);
  v_0_7_6 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_1_19_18), HEXAGON_HVX_GET_V1(v_1_3_2), -64);
  v_0_9_8 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_1_21_20), HEXAGON_HVX_GET_V0(v_1_5_4), -64);
  v_0_11_10 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_1_21_20), HEXAGON_HVX_GET_V1(v_1_5_4), -64);
  v_0_13_12 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_1_23_22), HEXAGON_HVX_GET_V0(v_1_7_6), -64);
  v_0_15_14 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_1_23_22), HEXAGON_HVX_GET_V1(v_1_7_6), -64);
  v_0_17_16 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_1_25_24), HEXAGON_HVX_GET_V0(v_1_9_8), -64);
  v_0_19_18 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_1_25_24), HEXAGON_HVX_GET_V1(v_1_9_8), -64);
  v_0_21_20 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_1_27_26), HEXAGON_HVX_GET_V0(v_1_11_10), -64);
  v_0_23_22 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_1_27_26), HEXAGON_HVX_GET_V1(v_1_11_10), -64);
  v_0_25_24 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_1_29_28), HEXAGON_HVX_GET_V0(v_1_13_12), -64);
  v_0_27_26 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_1_29_28), HEXAGON_HVX_GET_V1(v_1_13_12), -64);
  v_0_29_28 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_1_31_30), HEXAGON_HVX_GET_V0(v_1_15_14), -64);
  v_0_31_30 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_1_31_30), HEXAGON_HVX_GET_V1(v_1_15_14), -64);

  kernel.packet[0] = Packet32f::Create(HEXAGON_HVX_GET_V0(v_0_1_0));
  kernel.packet[1] = Packet32f::Create(HEXAGON_HVX_GET_V1(v_0_1_0));
  kernel.packet[2] = Packet32f::Create(HEXAGON_HVX_GET_V0(v_0_3_2));
  kernel.packet[3] = Packet32f::Create(HEXAGON_HVX_GET_V1(v_0_3_2));
  kernel.packet[4] = Packet32f::Create(HEXAGON_HVX_GET_V0(v_0_5_4));
  kernel.packet[5] = Packet32f::Create(HEXAGON_HVX_GET_V1(v_0_5_4));
  kernel.packet[6] = Packet32f::Create(HEXAGON_HVX_GET_V0(v_0_7_6));
  kernel.packet[7] = Packet32f::Create(HEXAGON_HVX_GET_V1(v_0_7_6));
  kernel.packet[8] = Packet32f::Create(HEXAGON_HVX_GET_V0(v_0_9_8));
  kernel.packet[9] = Packet32f::Create(HEXAGON_HVX_GET_V1(v_0_9_8));
  kernel.packet[10] = Packet32f::Create(HEXAGON_HVX_GET_V0(v_0_11_10));
  kernel.packet[11] = Packet32f::Create(HEXAGON_HVX_GET_V1(v_0_11_10));
  kernel.packet[12] = Packet32f::Create(HEXAGON_HVX_GET_V0(v_0_13_12));
  kernel.packet[13] = Packet32f::Create(HEXAGON_HVX_GET_V1(v_0_13_12));
  kernel.packet[14] = Packet32f::Create(HEXAGON_HVX_GET_V0(v_0_15_14));
  kernel.packet[15] = Packet32f::Create(HEXAGON_HVX_GET_V1(v_0_15_14));
  kernel.packet[16] = Packet32f::Create(HEXAGON_HVX_GET_V0(v_0_17_16));
  kernel.packet[17] = Packet32f::Create(HEXAGON_HVX_GET_V1(v_0_17_16));
  kernel.packet[18] = Packet32f::Create(HEXAGON_HVX_GET_V0(v_0_19_18));
  kernel.packet[19] = Packet32f::Create(HEXAGON_HVX_GET_V1(v_0_19_18));
  kernel.packet[20] = Packet32f::Create(HEXAGON_HVX_GET_V0(v_0_21_20));
  kernel.packet[21] = Packet32f::Create(HEXAGON_HVX_GET_V1(v_0_21_20));
  kernel.packet[22] = Packet32f::Create(HEXAGON_HVX_GET_V0(v_0_23_22));
  kernel.packet[23] = Packet32f::Create(HEXAGON_HVX_GET_V1(v_0_23_22));
  kernel.packet[24] = Packet32f::Create(HEXAGON_HVX_GET_V0(v_0_25_24));
  kernel.packet[25] = Packet32f::Create(HEXAGON_HVX_GET_V1(v_0_25_24));
  kernel.packet[26] = Packet32f::Create(HEXAGON_HVX_GET_V0(v_0_27_26));
  kernel.packet[27] = Packet32f::Create(HEXAGON_HVX_GET_V1(v_0_27_26));
  kernel.packet[28] = Packet32f::Create(HEXAGON_HVX_GET_V0(v_0_29_28));
  kernel.packet[29] = Packet32f::Create(HEXAGON_HVX_GET_V1(v_0_29_28));
  kernel.packet[30] = Packet32f::Create(HEXAGON_HVX_GET_V0(v_0_31_30));
  kernel.packet[31] = Packet32f::Create(HEXAGON_HVX_GET_V1(v_0_31_30));
}

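// Log-step horizontal sum: each iteration rotates the partial sums by i
// floats and adds, so lane 0 holds the total after log2(packet_size)
// additions; the partial sums stay in qf32 until the final conversion.
// For example, with 4 lanes: {a,b,c,d} -> {a+b, b+c, c+d, d+a}
//                                      -> {a+b+c+d, ...}.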
template <HVXPacketSize T>
EIGEN_STRONG_INLINE float predux_hvx(const HVXPacket<T>& a) {
  const Index packet_size = unpacket_traits<HVXPacket<T>>::size;
  HVX_Vector vsum = Q6_Vqf32_vadd_VsfVsf(a.Get(), Q6_V_vror_VR(a.Get(), sizeof(float)));
  for (int i = 2; i < packet_size; i <<= 1) {
    vsum = Q6_Vqf32_vadd_Vqf32Vqf32(vsum, Q6_V_vror_VR(vsum, i * sizeof(float)));
  }
  return pfirst(HVXPacket<T>::Create(Q6_Vsf_equals_Vqf32(vsum)));
}
template <>
EIGEN_STRONG_INLINE float predux<Packet32f>(const Packet32f& a) {
  return predux_hvx(a);
}
template <>
EIGEN_STRONG_INLINE float predux<Packet16f>(const Packet16f& a) {
  return predux_hvx(a);
}
template <>
EIGEN_STRONG_INLINE float predux<Packet8f>(const Packet8f& a) {
  return predux_hvx(a);
}

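// ploaddup loads size/2 floats and duplicates each one ({a,b,...} becomes
// {a,a,b,b,...}) via a self-shuffle at 32-bit granularity; ploadquad below
// adds a second self-shuffle at 64-bit granularity to get four copies of
// each element ({a,a,a,a,b,b,b,b,...}).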
template <HVXPacketSize T>
EIGEN_STRONG_INLINE HVXPacket<T> ploaddup_hvx(const float* from) {
  constexpr Index size = unpacket_traits<HVXPacket<T>>::size / 2;
  HVX_Vector load = HVX_load_partial<size, 0>(from);
  HVX_VectorPair dup = Q6_W_vshuff_VVR(load, load, -4);
  return HVXPacket<T>::Create(HEXAGON_HVX_GET_V0(dup));
}
template <>
EIGEN_STRONG_INLINE Packet32f ploaddup(const float* from) {
  return ploaddup_hvx<HVXPacketSize::Full>(from);
}
template <>
EIGEN_STRONG_INLINE Packet16f ploaddup(const float* from) {
  return ploaddup_hvx<HVXPacketSize::Half>(from);
}
template <>
EIGEN_STRONG_INLINE Packet8f ploaddup(const float* from) {
  return ploaddup_hvx<HVXPacketSize::Quarter>(from);
}

template <HVXPacketSize T>
EIGEN_STRONG_INLINE HVXPacket<T> ploadquad_hvx(const float* from) {
  constexpr Index size = unpacket_traits<HVXPacket<T>>::size / 4;
  HVX_Vector load = HVX_load_partial<size, 0>(from);
  HVX_VectorPair dup = Q6_W_vshuff_VVR(load, load, -4);
  HVX_VectorPair quad = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(dup), HEXAGON_HVX_GET_V0(dup), -8);
  return HVXPacket<T>::Create(HEXAGON_HVX_GET_V0(quad));
}
template <>
EIGEN_STRONG_INLINE Packet32f ploadquad(const float* from) {
  return ploadquad_hvx<HVXPacketSize::Full>(from);
}
template <>
EIGEN_STRONG_INLINE Packet16f ploadquad(const float* from) {
  return ploadquad_hvx<HVXPacketSize::Half>(from);
}
template <>
EIGEN_STRONG_INLINE Packet8f ploadquad(const float* from) {
  return ploadquad_hvx<HVXPacketSize::Quarter>(from);
}

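// Reversing 2^k lanes maps lane i to (2^k - 1) - i, which is an XOR on byte
// addresses, so it fits the vdelta permutation network; the splatted control
// byte is in effect the XOR distance (0x7c reverses 32 words, 0x3c reverses
// the first 16, 0x1c the first 8, leaving the remaining lanes in place).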
template <>
EIGEN_STRONG_INLINE Packet32f preverse(const Packet32f& a) {
  HVX_Vector delta = Q6_Vb_vsplat_R(0x7c);
  return Packet32f::Create(Q6_V_vdelta_VV(a.Get(), delta));
}

template <>
EIGEN_STRONG_INLINE Packet16f preverse(const Packet16f& a) {
  HVX_Vector delta = Q6_Vb_vsplat_R(0x3c);
  return Packet16f::Create(Q6_V_vdelta_VV(a.Get(), delta));
}

template <>
EIGEN_STRONG_INLINE Packet8f preverse(const Packet8f& a) {
  HVX_Vector delta = Q6_Vb_vsplat_R(0x1c);
  return Packet8f::Create(Q6_V_vdelta_VV(a.Get(), delta));
}

template <HVXPacketSize T>
EIGEN_STRONG_INLINE HVXPacket<T> pmin_hvx(const HVXPacket<T>& a, const HVXPacket<T>& b) {
  return HVXPacket<T>::Create(Q6_Vsf_vmin_VsfVsf(a.Get(), b.Get()));
}
template <>
EIGEN_STRONG_INLINE Packet32f pmin(const Packet32f& a, const Packet32f& b) {
  return pmin_hvx(a, b);
}
template <>
EIGEN_STRONG_INLINE Packet16f pmin(const Packet16f& a, const Packet16f& b) {
  return pmin_hvx(a, b);
}
template <>
EIGEN_STRONG_INLINE Packet8f pmin(const Packet8f& a, const Packet8f& b) {
  return pmin_hvx(a, b);
}

template <HVXPacketSize T>
EIGEN_STRONG_INLINE HVXPacket<T> pmax_hvx(const HVXPacket<T>& a, const HVXPacket<T>& b) {
  return HVXPacket<T>::Create(Q6_Vsf_vmax_VsfVsf(a.Get(), b.Get()));
}
template <>
EIGEN_STRONG_INLINE Packet32f pmax(const Packet32f& a, const Packet32f& b) {
  return pmax_hvx(a, b);
}
template <>
EIGEN_STRONG_INLINE Packet16f pmax(const Packet16f& a, const Packet16f& b) {
  return pmax_hvx(a, b);
}
template <>
EIGEN_STRONG_INLINE Packet8f pmax(const Packet8f& a, const Packet8f& b) {
  return pmax_hvx(a, b);
}

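// The bitwise helpers below rely on the compiler's native operators on
// HVX_Vector (&, |, ^, ~), which act on the raw 1024-bit registers.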
template <HVXPacketSize T>
EIGEN_STRONG_INLINE HVXPacket<T> pand_hvx(const HVXPacket<T>& a, const HVXPacket<T>& b) {
  return HVXPacket<T>::Create(a.Get() & b.Get());
}
template <>
EIGEN_STRONG_INLINE Packet32f pand(const Packet32f& a, const Packet32f& b) {
  return pand_hvx(a, b);
}
template <>
EIGEN_STRONG_INLINE Packet16f pand(const Packet16f& a, const Packet16f& b) {
  return pand_hvx(a, b);
}
template <>
EIGEN_STRONG_INLINE Packet8f pand(const Packet8f& a, const Packet8f& b) {
  return pand_hvx(a, b);
}

template <HVXPacketSize T>
EIGEN_STRONG_INLINE HVXPacket<T> por_hvx(const HVXPacket<T>& a, const HVXPacket<T>& b) {
  return HVXPacket<T>::Create(a.Get() | b.Get());
}
template <>
EIGEN_STRONG_INLINE Packet32f por(const Packet32f& a, const Packet32f& b) {
  return por_hvx(a, b);
}
template <>
EIGEN_STRONG_INLINE Packet16f por(const Packet16f& a, const Packet16f& b) {
  return por_hvx(a, b);
}
template <>
EIGEN_STRONG_INLINE Packet8f por(const Packet8f& a, const Packet8f& b) {
  return por_hvx(a, b);
}

template <HVXPacketSize T>
EIGEN_STRONG_INLINE HVXPacket<T> pxor_hvx(const HVXPacket<T>& a, const HVXPacket<T>& b) {
  return HVXPacket<T>::Create(a.Get() ^ b.Get());
}
template <>
EIGEN_STRONG_INLINE Packet32f pxor(const Packet32f& a, const Packet32f& b) {
  return pxor_hvx(a, b);
}
template <>
EIGEN_STRONG_INLINE Packet16f pxor(const Packet16f& a, const Packet16f& b) {
  return pxor_hvx(a, b);
}
template <>
EIGEN_STRONG_INLINE Packet8f pxor(const Packet8f& a, const Packet8f& b) {
  return pxor_hvx(a, b);
}

template <HVXPacketSize T>
EIGEN_STRONG_INLINE HVXPacket<T> pnot_hvx(const HVXPacket<T>& a) {
  return HVXPacket<T>::Create(~a.Get());
}
template <>
EIGEN_STRONG_INLINE Packet32f pnot(const Packet32f& a) {
  return pnot_hvx(a);
}
template <>
EIGEN_STRONG_INLINE Packet16f pnot(const Packet16f& a) {
  return pnot_hvx(a);
}
template <>
EIGEN_STRONG_INLINE Packet8f pnot(const Packet8f& a) {
  return pnot_hvx(a);
}

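// pselect picks b in lanes where the mask is all zeros and a elsewhere,
// matching the mask convention produced by the pcmp_* functions above.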
template <HVXPacketSize T>
EIGEN_STRONG_INLINE HVXPacket<T> pselect_hvx(const HVXPacket<T>& mask, const HVXPacket<T>& a, const HVXPacket<T>& b) {
  HVX_VectorPred pred = Q6_Q_vcmp_eq_VwVw(mask.Get(), Q6_V_vzero());
  return HVXPacket<T>::Create(Q6_V_vmux_QVV(pred, b.Get(), a.Get()));
}
template <>
EIGEN_STRONG_INLINE Packet32f pselect(const Packet32f& mask, const Packet32f& a, const Packet32f& b) {
  return pselect_hvx(mask, a, b);
}
template <>
EIGEN_STRONG_INLINE Packet16f pselect(const Packet16f& mask, const Packet16f& a, const Packet16f& b) {
  return pselect_hvx(mask, a, b);
}
template <>
EIGEN_STRONG_INLINE Packet8f pselect(const Packet8f& mask, const Packet8f& a, const Packet8f& b) {
  return pselect_hvx(mask, a, b);
}

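// Same log-step rotate-and-combine scheme as predux above, generalized over
// the combining op; used for the max, min, and any reductions below.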
template <HVXPacketSize T, typename Op>
EIGEN_STRONG_INLINE float predux_generic(const HVXPacket<T>& a, Op op) {
  const Index packet_size = unpacket_traits<HVXPacket<T>>::size;
  HVXPacket<T> vredux = a;
  for (int i = 1; i < packet_size; i <<= 1) {
    vredux = op(vredux, HVXPacket<T>::Create(Q6_V_vror_VR(vredux.Get(), i * sizeof(float))));
  }
  return pfirst(vredux);
}

template <>
EIGEN_STRONG_INLINE float predux_max(const Packet32f& a) {
  return predux_generic(a, pmax<Packet32f>);
}
template <>
EIGEN_STRONG_INLINE float predux_max(const Packet16f& a) {
  return predux_generic(a, pmax<Packet16f>);
}
template <>
EIGEN_STRONG_INLINE float predux_max(const Packet8f& a) {
  return predux_generic(a, pmax<Packet8f>);
}

template <>
EIGEN_STRONG_INLINE float predux_min(const Packet32f& a) {
  return predux_generic(a, pmin<Packet32f>);
}
template <>
EIGEN_STRONG_INLINE float predux_min(const Packet16f& a) {
  return predux_generic(a, pmin<Packet16f>);
}
template <>
EIGEN_STRONG_INLINE float predux_min(const Packet8f& a) {
  return predux_generic(a, pmin<Packet8f>);
}

template <>
EIGEN_STRONG_INLINE bool predux_any(const Packet32f& a) {
  return predux_generic(a, por<Packet32f>) != 0.0f;
}
template <>
EIGEN_STRONG_INLINE bool predux_any(const Packet16f& a) {
  return predux_generic(a, por<Packet16f>) != 0.0f;
}
template <>
EIGEN_STRONG_INLINE bool predux_any(const Packet8f& a) {
  return predux_generic(a, por<Packet8f>) != 0.0f;
}

static const float index_vsf[32] __attribute__((aligned(__HVX_LENGTH__))) = {
    0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15,
    16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31};

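// plset(a) returns {a, a+1, a+2, ...}: the aligned index table above is
// loaded as a packet and the splatted scalar is added to it.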
template <HVXPacketSize T>
EIGEN_STRONG_INLINE HVXPacket<T> plset_hvx(const float& a) {
  return padd(pload<HVXPacket<T>>(index_vsf), pset1<HVXPacket<T>>(a));
}
template <>
EIGEN_STRONG_INLINE Packet32f plset(const float& a) {
  return plset_hvx<HVXPacketSize::Full>(a);
}
template <>
EIGEN_STRONG_INLINE Packet16f plset(const float& a) {
  return plset_hvx<HVXPacketSize::Half>(a);
}
template <>
EIGEN_STRONG_INLINE Packet8f plset(const float& a) {
  return plset_hvx<HVXPacketSize::Quarter>(a);
}

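// Gather/scatter have no HVX fast path here: both round-trip through an
// HVX-aligned scalar buffer on the stack, one element per loop iteration.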
template <HVXPacketSize T>
EIGEN_STRONG_INLINE void pscatter_hvx(float* to, const HVXPacket<T>& from, Index stride) {
  const Index packet_size = unpacket_traits<HVXPacket<T>>::size;
  float elements[packet_size] __attribute__((aligned(__HVX_LENGTH__)));
  pstore<float>(elements, from);
  for (Index i = 0; i < packet_size; ++i) {
    to[i * stride] = elements[i];
  }
}
template <>
EIGEN_STRONG_INLINE void pscatter<float, Packet32f>(float* to, const Packet32f& from, Index stride) {
  pscatter_hvx(to, from, stride);
}
template <>
EIGEN_STRONG_INLINE void pscatter<float, Packet16f>(float* to, const Packet16f& from, Index stride) {
  pscatter_hvx(to, from, stride);
}
template <>
EIGEN_STRONG_INLINE void pscatter<float, Packet8f>(float* to, const Packet8f& from, Index stride) {
  pscatter_hvx(to, from, stride);
}

template <HVXPacketSize T>
EIGEN_STRONG_INLINE HVXPacket<T> pgather_hvx(const float* from, Index stride) {
  const Index packet_size = unpacket_traits<HVXPacket<T>>::size;
  float elements[packet_size] __attribute__((aligned(__HVX_LENGTH__)));
  for (Index i = 0; i < packet_size; i++) {
    elements[i] = from[i * stride];
  }
  return pload<HVXPacket<T>>(elements);
}
template <>
EIGEN_STRONG_INLINE Packet32f pgather<float, Packet32f>(const float* from, Index stride) {
  return pgather_hvx<HVXPacketSize::Full>(from, stride);
}
template <>
EIGEN_STRONG_INLINE Packet16f pgather<float, Packet16f>(const float* from, Index stride) {
  return pgather_hvx<HVXPacketSize::Half>(from, stride);
}
template <>
EIGEN_STRONG_INLINE Packet8f pgather<float, Packet8f>(const float* from, Index stride) {
  return pgather_hvx<HVXPacketSize::Quarter>(from, stride);
}

}  // end namespace internal
}  // end namespace Eigen

#endif  // __HVX__ && (__HVX_LENGTH__ == 128) && __HVX_ARCH__ >= 68

#endif  // EIGEN_HVX_PACKET_MATH_H