Eigen  3.4.90 (git rev 5a9f66fb35d03a4da9ef8976e67a61b30aa16dcf)
 
Loading...
Searching...
No Matches
GeneralBlockPanelKernel.h
1// IWYU pragma: private
2#include "../../InternalHeaderCheck.h"
3
4namespace Eigen {
5namespace internal {
6
7#if EIGEN_ARCH_ARM && EIGEN_COMP_CLANG
8
9// Clang seems to excessively spill registers in the GEBP kernel on 32-bit arm.
10// Here we specialize gebp_traits to eliminate these register spills.
11// See #2138.
12template <>
13struct gebp_traits<float, float, false, false, Architecture::NEON, GEBPPacketFull>
14 : gebp_traits<float, float, false, false, Architecture::Generic, GEBPPacketFull> {
15 EIGEN_STRONG_INLINE void acc(const AccPacket& c, const ResPacket& alpha, ResPacket& r) const {
16 // This volatile inline ASM both acts as a barrier to prevent reordering,
17 // as well as enforces strict register use.
18 asm volatile("vmla.f32 %q[r], %q[c], %q[alpha]" : [r] "+w"(r) : [c] "w"(c), [alpha] "w"(alpha) :);
19 }
20
21 template <typename LaneIdType>
22 EIGEN_STRONG_INLINE void madd(const Packet4f& a, const Packet4f& b, Packet4f& c, Packet4f&, const LaneIdType&) const {
23 acc(a, b, c);
24 }
25
26 template <typename LaneIdType>
27 EIGEN_STRONG_INLINE void madd(const Packet4f& a, const QuadPacket<Packet4f>& b, Packet4f& c, Packet4f& tmp,
28 const LaneIdType& lane) const {
29 madd(a, b.get(lane), c, tmp, lane);
30 }
31};
32
33#endif // EIGEN_ARCH_ARM && EIGEN_COMP_CLANG
34
35#if EIGEN_ARCH_ARM64
36
// Default value of EIGEN_NEON_GEBP_NR; a definition made before this point takes precedence.
#if !defined(EIGEN_NEON_GEBP_NR)
#define EIGEN_NEON_GEBP_NR 8
#endif
40
41template <>
42struct gebp_traits<float, float, false, false, Architecture::NEON, GEBPPacketFull>
43 : gebp_traits<float, float, false, false, Architecture::Generic, GEBPPacketFull> {
44 typedef float RhsPacket;
45 typedef float32x4_t RhsPacketx4;
46 enum { nr = EIGEN_NEON_GEBP_NR };
47 EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacket& dest) const { dest = *b; }
48
49 EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketx4& dest) const { dest = vld1q_f32(b); }
50
51 EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, RhsPacket& dest) const { dest = *b; }
52
53 EIGEN_STRONG_INLINE void updateRhs(const RhsScalar*, RhsPacketx4&) const {}
54
55 EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, RhsPacket& dest) const { loadRhs(b, dest); }
56
57 EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& /*tmp*/,
58 const FixedInt<0>&) const {
59 c = vfmaq_n_f32(c, a, b);
60 }
61 // NOTE: Template parameter inference failed when compiled with Android NDK:
62 // "candidate template ignored: could not match 'FixedInt<N>' against 'Eigen::internal::FixedInt<0>".
63
64 EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/,
65 const FixedInt<0>&) const {
66 madd_helper<0>(a, b, c);
67 }
68 EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/,
69 const FixedInt<1>&) const {
70 madd_helper<1>(a, b, c);
71 }
72 EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/,
73 const FixedInt<2>&) const {
74 madd_helper<2>(a, b, c);
75 }
76 EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/,
77 const FixedInt<3>&) const {
78 madd_helper<3>(a, b, c);
79 }
80
81 private:
82 template <int LaneID>
83 EIGEN_STRONG_INLINE void madd_helper(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c) const {
84#if EIGEN_GNUC_STRICT_LESS_THAN(9, 0, 0)
85 // 1. workaround gcc issue https://gcc.gnu.org/bugzilla/show_bug.cgi?id=89101
86 // vfmaq_laneq_f32 is implemented through a costly dup, which was fixed in gcc9
87 // 2. workaround the gcc register split problem on arm64-neon
88 if (LaneID == 0)
89 asm("fmla %0.4s, %1.4s, %2.s[0]\n" : "+w"(c) : "w"(a), "w"(b) :);
90 else if (LaneID == 1)
91 asm("fmla %0.4s, %1.4s, %2.s[1]\n" : "+w"(c) : "w"(a), "w"(b) :);
92 else if (LaneID == 2)
93 asm("fmla %0.4s, %1.4s, %2.s[2]\n" : "+w"(c) : "w"(a), "w"(b) :);
94 else if (LaneID == 3)
95 asm("fmla %0.4s, %1.4s, %2.s[3]\n" : "+w"(c) : "w"(a), "w"(b) :);
96#else
97 c = vfmaq_laneq_f32(c, a, b, LaneID);
98#endif
99 }
100};
101
102template <>
103struct gebp_traits<double, double, false, false, Architecture::NEON>
104 : gebp_traits<double, double, false, false, Architecture::Generic> {
105 typedef double RhsPacket;
106 enum { nr = EIGEN_NEON_GEBP_NR };
107 struct RhsPacketx4 {
108 float64x2_t B_0, B_1;
109 };
110
111 EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacket& dest) const { dest = *b; }
112
113 EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketx4& dest) const {
114 dest.B_0 = vld1q_f64(b);
115 dest.B_1 = vld1q_f64(b + 2);
116 }
117
118 EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, RhsPacket& dest) const { loadRhs(b, dest); }
119
120 EIGEN_STRONG_INLINE void updateRhs(const RhsScalar*, RhsPacketx4&) const {}
121
122 EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, RhsPacket& dest) const { loadRhs(b, dest); }
123
124 EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& /*tmp*/,
125 const FixedInt<0>&) const {
126 c = vfmaq_n_f64(c, a, b);
127 }
128
129 // NOTE: Template parameter inference failed when compiled with Android NDK:
130 // "candidate template ignored: could not match 'FixedInt<N>' against 'Eigen::internal::FixedInt<0>".
131
132 EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/,
133 const FixedInt<0>&) const {
134 madd_helper<0>(a, b, c);
135 }
136 EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/,
137 const FixedInt<1>&) const {
138 madd_helper<1>(a, b, c);
139 }
140 EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/,
141 const FixedInt<2>&) const {
142 madd_helper<2>(a, b, c);
143 }
144 EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/,
145 const FixedInt<3>&) const {
146 madd_helper<3>(a, b, c);
147 }
148
149 private:
150 template <int LaneID>
151 EIGEN_STRONG_INLINE void madd_helper(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c) const {
152#if EIGEN_GNUC_STRICT_LESS_THAN(9, 0, 0)
153 // 1. workaround gcc issue https://gcc.gnu.org/bugzilla/show_bug.cgi?id=89101
154 // vfmaq_laneq_f64 is implemented through a costly dup, which was fixed in gcc9
155 // 2. workaround the gcc register split problem on arm64-neon
156 if (LaneID == 0)
157 asm("fmla %0.2d, %1.2d, %2.d[0]\n" : "+w"(c) : "w"(a), "w"(b.B_0) :);
158 else if (LaneID == 1)
159 asm("fmla %0.2d, %1.2d, %2.d[1]\n" : "+w"(c) : "w"(a), "w"(b.B_0) :);
160 else if (LaneID == 2)
161 asm("fmla %0.2d, %1.2d, %2.d[0]\n" : "+w"(c) : "w"(a), "w"(b.B_1) :);
162 else if (LaneID == 3)
163 asm("fmla %0.2d, %1.2d, %2.d[1]\n" : "+w"(c) : "w"(a), "w"(b.B_1) :);
164#else
165 if (LaneID == 0)
166 c = vfmaq_laneq_f64(c, a, b.B_0, 0);
167 else if (LaneID == 1)
168 c = vfmaq_laneq_f64(c, a, b.B_0, 1);
169 else if (LaneID == 2)
170 c = vfmaq_laneq_f64(c, a, b.B_1, 0);
171 else if (LaneID == 3)
172 c = vfmaq_laneq_f64(c, a, b.B_1, 1);
173#endif
174 }
175};
176
// For the half data type, operand 3 of fmla must be one of the registers v0~v15, and the compiler
// may not allocate such a register for the '%2' of inline asm 'fmla %0.8h, %1.8h, %2.h[id]'.
// Inline assembly therefore cannot be used here to avoid the gcc bug where vfmaq_lane_f16 is
// implemented through a costly dup.
181#if EIGEN_HAS_ARM64_FP16_VECTOR_ARITHMETIC && EIGEN_COMP_CLANG
182
183template <>
184struct gebp_traits<half, half, false, false, Architecture::NEON>
185 : gebp_traits<half, half, false, false, Architecture::Generic> {
186 typedef half RhsPacket;
187 typedef float16x4_t RhsPacketx4;
188 typedef float16x4_t PacketHalf;
189 enum { nr = EIGEN_NEON_GEBP_NR };
190
191 EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacket& dest) const { dest = *b; }
192
193 EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketx4& dest) const { dest = vld1_f16((const __fp16*)b); }
194
195 EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, RhsPacket& dest) const { dest = *b; }
196
197 EIGEN_STRONG_INLINE void updateRhs(const RhsScalar*, RhsPacketx4&) const {}
198
199 EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar*, RhsPacket&) const {
200 // If LHS is a Packet8h, we cannot correctly mimic a ploadquad of the RHS
201 // using a single scalar value.
202 eigen_assert(false && "Cannot loadRhsQuad for a scalar RHS.");
203 }
204
205 EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& /*tmp*/,
206 const FixedInt<0>&) const {
207 c = vfmaq_n_f16(c, a, b);
208 }
209 EIGEN_STRONG_INLINE void madd(const PacketHalf& a, const RhsPacket& b, PacketHalf& c, RhsPacket& /*tmp*/,
210 const FixedInt<0>&) const {
211 c = vfma_n_f16(c, a, b);
212 }
213
214 // NOTE: Template parameter inference failed when compiled with Android NDK:
215 // "candidate template ignored: could not match 'FixedInt<N>' against 'Eigen::internal::FixedInt<0>".
216 EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/,
217 const FixedInt<0>&) const {
218 madd_helper<0>(a, b, c);
219 }
220 EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/,
221 const FixedInt<1>&) const {
222 madd_helper<1>(a, b, c);
223 }
224 EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/,
225 const FixedInt<2>&) const {
226 madd_helper<2>(a, b, c);
227 }
228 EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/,
229 const FixedInt<3>&) const {
230 madd_helper<3>(a, b, c);
231 }
232
233 private:
234 template <int LaneID>
235 EIGEN_STRONG_INLINE void madd_helper(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c) const {
236 c = vfmaq_lane_f16(c, a, b, LaneID);
237 }
238};
239#endif // EIGEN_HAS_ARM64_FP16_VECTOR_ARITHMETIC && EIGEN_COMP_CLANG
240#endif // EIGEN_ARCH_ARM64
241
242} // namespace internal
243} // namespace Eigen
Namespace containing all symbols from the Eigen library.
Definition Core:137