Eigen  3.4.90 (git rev 5a9f66fb35d03a4da9ef8976e67a61b30aa16dcf)
 
Loading...
Searching...
No Matches
InverseSize4.h
1// This file is part of Eigen, a lightweight C++ template library
2// for linear algebra.
3//
4// Copyright (C) 2001 Intel Corporation
5// Copyright (C) 2010 Gael Guennebaud <[email protected]>
6// Copyright (C) 2009 Benoit Jacob <[email protected]>
7//
8// This Source Code Form is subject to the terms of the Mozilla
9// Public License v. 2.0. If a copy of the MPL was not distributed
10// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
11//
12// The algorithm below is a reimplementation of former \src\LU\Inverse_SSE.h using PacketMath.
13// inv(M) = M#/|M|, where inv(M), M# and |M| denote the inverse of M,
14// adjugate of M and determinant of M respectively. M# is computed block-wise
15// using specific formulae. For proof, see:
16// https://lxjk.github.io/2017/09/03/Fast-4x4-Matrix-Inverse-with-SSE-SIMD-Explained.html
17// Variable names are adopted from \src\LU\Inverse_SSE.h.
18//
19// The SSE code for the 4x4 float and double matrix inverse in former (deprecated) \src\LU\Inverse_SSE.h
20// comes from the following Intel's library:
21// http://software.intel.com/en-us/articles/optimized-matrix-library-for-use-with-the-intel-pentiumr-4-processors-sse2-instructions/
22//
23// Here is the respective copyright and license statement:
24//
25// Copyright (c) 2001 Intel Corporation.
26//
27// Permition is granted to use, copy, distribute and prepare derivative works
28// of this library for any purpose and without fee, provided, that the above
29// copyright notice and this statement appear in all copies.
30// Intel makes no representations about the suitability of this software for
31// any purpose, and specifically disclaims all warranties.
32// See LEGAL.TXT for all the legal information.
33//
34// TODO: Unify implementations of different data types (i.e. float and double).
35#ifndef EIGEN_INVERSE_SIZE_4_H
36#define EIGEN_INVERSE_SIZE_4_H
37
38// IWYU pragma: private
39#include "../InternalHeaderCheck.h"
40
41#if EIGEN_COMP_GNUC_STRICT
42// These routines requires bit manipulation of the sign, which is not compatible
43// with fastmath.
44#pragma GCC push_options
45#pragma GCC optimize("no-fast-math")
46#endif
47
48namespace Eigen {
49namespace internal {
50template <typename MatrixType, typename ResultType>
51struct compute_inverse_size4<Architecture::Target, float, MatrixType, ResultType> {
52 enum {
53 MatrixAlignment = traits<MatrixType>::Alignment,
54 ResultAlignment = traits<ResultType>::Alignment,
55 StorageOrdersMatch = (MatrixType::Flags & RowMajorBit) == (ResultType::Flags & RowMajorBit)
56 };
57 typedef std::conditional_t<(MatrixType::Flags & LinearAccessBit), MatrixType const &,
58 typename MatrixType::PlainObject>
59 ActualMatrixType;
60
61 static void run(const MatrixType &mat, ResultType &result) {
62 ActualMatrixType matrix(mat);
63
64 const float *data = matrix.data();
65 const Index stride = matrix.innerStride();
66 Packet4f L1 = ploadt<Packet4f, MatrixAlignment>(data);
67 Packet4f L2 = ploadt<Packet4f, MatrixAlignment>(data + stride * 4);
68 Packet4f L3 = ploadt<Packet4f, MatrixAlignment>(data + stride * 8);
69 Packet4f L4 = ploadt<Packet4f, MatrixAlignment>(data + stride * 12);
70
71 // Four 2x2 sub-matrices of the input matrix
72 // input = [[A, B],
73 // [C, D]]
74 Packet4f A, B, C, D;
75
76 if (!StorageOrdersMatch) {
77 A = vec4f_unpacklo(L1, L2);
78 B = vec4f_unpacklo(L3, L4);
79 C = vec4f_unpackhi(L1, L2);
80 D = vec4f_unpackhi(L3, L4);
81 } else {
82 A = vec4f_movelh(L1, L2);
83 B = vec4f_movehl(L2, L1);
84 C = vec4f_movelh(L3, L4);
85 D = vec4f_movehl(L4, L3);
86 }
87
88 Packet4f AB, DC;
89
90 // AB = A# * B, where A# denotes the adjugate of A, and * denotes matrix product.
91 AB = pmul(vec4f_swizzle2(A, A, 3, 3, 0, 0), B);
92 AB = psub(AB, pmul(vec4f_swizzle2(A, A, 1, 1, 2, 2), vec4f_swizzle2(B, B, 2, 3, 0, 1)));
93
94 // DC = D#*C
95 DC = pmul(vec4f_swizzle2(D, D, 3, 3, 0, 0), C);
96 DC = psub(DC, pmul(vec4f_swizzle2(D, D, 1, 1, 2, 2), vec4f_swizzle2(C, C, 2, 3, 0, 1)));
97
98 // determinants of the sub-matrices
99 Packet4f dA, dB, dC, dD;
100
101 dA = pmul(vec4f_swizzle2(A, A, 3, 3, 1, 1), A);
102 dA = psub(dA, vec4f_movehl(dA, dA));
103
104 dB = pmul(vec4f_swizzle2(B, B, 3, 3, 1, 1), B);
105 dB = psub(dB, vec4f_movehl(dB, dB));
106
107 dC = pmul(vec4f_swizzle2(C, C, 3, 3, 1, 1), C);
108 dC = psub(dC, vec4f_movehl(dC, dC));
109
110 dD = pmul(vec4f_swizzle2(D, D, 3, 3, 1, 1), D);
111 dD = psub(dD, vec4f_movehl(dD, dD));
112
113 Packet4f d, d1, d2;
114
115 d = pmul(vec4f_swizzle2(DC, DC, 0, 2, 1, 3), AB);
116 d = padd(d, vec4f_movehl(d, d));
117 d = padd(d, vec4f_swizzle2(d, d, 1, 0, 0, 0));
118 d1 = pmul(dA, dD);
119 d2 = pmul(dB, dC);
120
121 // determinant of the input matrix, det = |A||D| + |B||C| - trace(A#*B*D#*C)
122 Packet4f det = vec4f_duplane(psub(padd(d1, d2), d), 0);
123
124 // reciprocal of the determinant of the input matrix, rd = 1/det
125 Packet4f rd = preciprocal(det);
126
127 // Four sub-matrices of the inverse
128 Packet4f iA, iB, iC, iD;
129
130 // iD = D*|A| - C*A#*B
131 iD = pmul(vec4f_swizzle2(C, C, 0, 0, 2, 2), vec4f_movelh(AB, AB));
132 iD = padd(iD, pmul(vec4f_swizzle2(C, C, 1, 1, 3, 3), vec4f_movehl(AB, AB)));
133 iD = psub(pmul(D, vec4f_duplane(dA, 0)), iD);
134
135 // iA = A*|D| - B*D#*C
136 iA = pmul(vec4f_swizzle2(B, B, 0, 0, 2, 2), vec4f_movelh(DC, DC));
137 iA = padd(iA, pmul(vec4f_swizzle2(B, B, 1, 1, 3, 3), vec4f_movehl(DC, DC)));
138 iA = psub(pmul(A, vec4f_duplane(dD, 0)), iA);
139
140 // iB = C*|B| - D * (A#B)# = C*|B| - D*B#*A
141 iB = pmul(D, vec4f_swizzle2(AB, AB, 3, 0, 3, 0));
142 iB = psub(iB, pmul(vec4f_swizzle2(D, D, 1, 0, 3, 2), vec4f_swizzle2(AB, AB, 2, 1, 2, 1)));
143 iB = psub(pmul(C, vec4f_duplane(dB, 0)), iB);
144
145 // iC = B*|C| - A * (D#C)# = B*|C| - A*C#*D
146 iC = pmul(A, vec4f_swizzle2(DC, DC, 3, 0, 3, 0));
147 iC = psub(iC, pmul(vec4f_swizzle2(A, A, 1, 0, 3, 2), vec4f_swizzle2(DC, DC, 2, 1, 2, 1)));
148 iC = psub(pmul(B, vec4f_duplane(dC, 0)), iC);
149
150 EIGEN_ALIGN_MAX const float sign_mask[4] = {0.0f, -0.0f, -0.0f, 0.0f};
151 const Packet4f p4f_sign_PNNP = pload<Packet4f>(sign_mask);
152 rd = pxor(rd, p4f_sign_PNNP);
153 iA = pmul(iA, rd);
154 iB = pmul(iB, rd);
155 iC = pmul(iC, rd);
156 iD = pmul(iD, rd);
157
158 Index res_stride = result.outerStride();
159 float *res = result.data();
160
161 pstoret<float, Packet4f, ResultAlignment>(res + 0, vec4f_swizzle2(iA, iB, 3, 1, 3, 1));
162 pstoret<float, Packet4f, ResultAlignment>(res + res_stride, vec4f_swizzle2(iA, iB, 2, 0, 2, 0));
163 pstoret<float, Packet4f, ResultAlignment>(res + 2 * res_stride, vec4f_swizzle2(iC, iD, 3, 1, 3, 1));
164 pstoret<float, Packet4f, ResultAlignment>(res + 3 * res_stride, vec4f_swizzle2(iC, iD, 2, 0, 2, 0));
165 }
166};
167
168#if !(defined EIGEN_VECTORIZE_NEON && !(EIGEN_ARCH_ARM64 && !EIGEN_APPLE_DOUBLE_NEON_BUG))
169// same algorithm as above, except that each operand is split into
170// halves for two registers to hold.
171template <typename MatrixType, typename ResultType>
172struct compute_inverse_size4<Architecture::Target, double, MatrixType, ResultType> {
173 enum {
174 MatrixAlignment = traits<MatrixType>::Alignment,
175 ResultAlignment = traits<ResultType>::Alignment,
176 StorageOrdersMatch = (MatrixType::Flags & RowMajorBit) == (ResultType::Flags & RowMajorBit)
177 };
178 typedef std::conditional_t<(MatrixType::Flags & LinearAccessBit), MatrixType const &,
179 typename MatrixType::PlainObject>
180 ActualMatrixType;
181
182 static void run(const MatrixType &mat, ResultType &result) {
183 ActualMatrixType matrix(mat);
184
185 // Four 2x2 sub-matrices of the input matrix, each is further divided into upper and lower
186 // row e.g. A1, upper row of A, A2, lower row of A
187 // input = [[A, B], = [[[A1, [B1,
188 // [C, D]] A2], B2]],
189 // [[C1, [D1,
190 // C2], D2]]]
191
192 Packet2d A1, A2, B1, B2, C1, C2, D1, D2;
193
194 const double *data = matrix.data();
195 const Index stride = matrix.innerStride();
196 if (StorageOrdersMatch) {
197 A1 = ploadt<Packet2d, MatrixAlignment>(data + stride * 0);
198 B1 = ploadt<Packet2d, MatrixAlignment>(data + stride * 2);
199 A2 = ploadt<Packet2d, MatrixAlignment>(data + stride * 4);
200 B2 = ploadt<Packet2d, MatrixAlignment>(data + stride * 6);
201 C1 = ploadt<Packet2d, MatrixAlignment>(data + stride * 8);
202 D1 = ploadt<Packet2d, MatrixAlignment>(data + stride * 10);
203 C2 = ploadt<Packet2d, MatrixAlignment>(data + stride * 12);
204 D2 = ploadt<Packet2d, MatrixAlignment>(data + stride * 14);
205 } else {
206 Packet2d temp;
207 A1 = ploadt<Packet2d, MatrixAlignment>(data + stride * 0);
208 C1 = ploadt<Packet2d, MatrixAlignment>(data + stride * 2);
209 A2 = ploadt<Packet2d, MatrixAlignment>(data + stride * 4);
210 C2 = ploadt<Packet2d, MatrixAlignment>(data + stride * 6);
211 temp = A1;
212 A1 = vec2d_unpacklo(A1, A2);
213 A2 = vec2d_unpackhi(temp, A2);
214
215 temp = C1;
216 C1 = vec2d_unpacklo(C1, C2);
217 C2 = vec2d_unpackhi(temp, C2);
218
219 B1 = ploadt<Packet2d, MatrixAlignment>(data + stride * 8);
220 D1 = ploadt<Packet2d, MatrixAlignment>(data + stride * 10);
221 B2 = ploadt<Packet2d, MatrixAlignment>(data + stride * 12);
222 D2 = ploadt<Packet2d, MatrixAlignment>(data + stride * 14);
223
224 temp = B1;
225 B1 = vec2d_unpacklo(B1, B2);
226 B2 = vec2d_unpackhi(temp, B2);
227
228 temp = D1;
229 D1 = vec2d_unpacklo(D1, D2);
230 D2 = vec2d_unpackhi(temp, D2);
231 }
232
233 // determinants of the sub-matrices
234 Packet2d dA, dB, dC, dD;
235
236 dA = vec2d_swizzle2(A2, A2, 1);
237 dA = pmul(A1, dA);
238 dA = psub(dA, vec2d_duplane(dA, 1));
239
240 dB = vec2d_swizzle2(B2, B2, 1);
241 dB = pmul(B1, dB);
242 dB = psub(dB, vec2d_duplane(dB, 1));
243
244 dC = vec2d_swizzle2(C2, C2, 1);
245 dC = pmul(C1, dC);
246 dC = psub(dC, vec2d_duplane(dC, 1));
247
248 dD = vec2d_swizzle2(D2, D2, 1);
249 dD = pmul(D1, dD);
250 dD = psub(dD, vec2d_duplane(dD, 1));
251
252 Packet2d DC1, DC2, AB1, AB2;
253
254 // AB = A# * B, where A# denotes the adjugate of A, and * denotes matrix product.
255 AB1 = pmul(B1, vec2d_duplane(A2, 1));
256 AB2 = pmul(B2, vec2d_duplane(A1, 0));
257 AB1 = psub(AB1, pmul(B2, vec2d_duplane(A1, 1)));
258 AB2 = psub(AB2, pmul(B1, vec2d_duplane(A2, 0)));
259
260 // DC = D#*C
261 DC1 = pmul(C1, vec2d_duplane(D2, 1));
262 DC2 = pmul(C2, vec2d_duplane(D1, 0));
263 DC1 = psub(DC1, pmul(C2, vec2d_duplane(D1, 1)));
264 DC2 = psub(DC2, pmul(C1, vec2d_duplane(D2, 0)));
265
266 Packet2d d1, d2;
267
268 // determinant of the input matrix, det = |A||D| + |B||C| - trace(A#*B*D#*C)
269 Packet2d det;
270
271 // reciprocal of the determinant of the input matrix, rd = 1/det
272 Packet2d rd;
273
274 d1 = pmul(AB1, vec2d_swizzle2(DC1, DC2, 0));
275 d2 = pmul(AB2, vec2d_swizzle2(DC1, DC2, 3));
276 rd = padd(d1, d2);
277 rd = padd(rd, vec2d_duplane(rd, 1));
278
279 d1 = pmul(dA, dD);
280 d2 = pmul(dB, dC);
281
282 det = padd(d1, d2);
283 det = psub(det, rd);
284 det = vec2d_duplane(det, 0);
285 rd = pdiv(pset1<Packet2d>(1.0), det);
286
287 // rows of four sub-matrices of the inverse
288 Packet2d iA1, iA2, iB1, iB2, iC1, iC2, iD1, iD2;
289
290 // iD = D*|A| - C*A#*B
291 iD1 = pmul(AB1, vec2d_duplane(C1, 0));
292 iD2 = pmul(AB1, vec2d_duplane(C2, 0));
293 iD1 = padd(iD1, pmul(AB2, vec2d_duplane(C1, 1)));
294 iD2 = padd(iD2, pmul(AB2, vec2d_duplane(C2, 1)));
295 dA = vec2d_duplane(dA, 0);
296 iD1 = psub(pmul(D1, dA), iD1);
297 iD2 = psub(pmul(D2, dA), iD2);
298
299 // iA = A*|D| - B*D#*C
300 iA1 = pmul(DC1, vec2d_duplane(B1, 0));
301 iA2 = pmul(DC1, vec2d_duplane(B2, 0));
302 iA1 = padd(iA1, pmul(DC2, vec2d_duplane(B1, 1)));
303 iA2 = padd(iA2, pmul(DC2, vec2d_duplane(B2, 1)));
304 dD = vec2d_duplane(dD, 0);
305 iA1 = psub(pmul(A1, dD), iA1);
306 iA2 = psub(pmul(A2, dD), iA2);
307
308 // iB = C*|B| - D * (A#B)# = C*|B| - D*B#*A
309 iB1 = pmul(D1, vec2d_swizzle2(AB2, AB1, 1));
310 iB2 = pmul(D2, vec2d_swizzle2(AB2, AB1, 1));
311 iB1 = psub(iB1, pmul(vec2d_swizzle2(D1, D1, 1), vec2d_swizzle2(AB2, AB1, 2)));
312 iB2 = psub(iB2, pmul(vec2d_swizzle2(D2, D2, 1), vec2d_swizzle2(AB2, AB1, 2)));
313 dB = vec2d_duplane(dB, 0);
314 iB1 = psub(pmul(C1, dB), iB1);
315 iB2 = psub(pmul(C2, dB), iB2);
316
317 // iC = B*|C| - A * (D#C)# = B*|C| - A*C#*D
318 iC1 = pmul(A1, vec2d_swizzle2(DC2, DC1, 1));
319 iC2 = pmul(A2, vec2d_swizzle2(DC2, DC1, 1));
320 iC1 = psub(iC1, pmul(vec2d_swizzle2(A1, A1, 1), vec2d_swizzle2(DC2, DC1, 2)));
321 iC2 = psub(iC2, pmul(vec2d_swizzle2(A2, A2, 1), vec2d_swizzle2(DC2, DC1, 2)));
322 dC = vec2d_duplane(dC, 0);
323 iC1 = psub(pmul(B1, dC), iC1);
324 iC2 = psub(pmul(B2, dC), iC2);
325
326 EIGEN_ALIGN_MAX const double sign_mask1[2] = {0.0, -0.0};
327 EIGEN_ALIGN_MAX const double sign_mask2[2] = {-0.0, 0.0};
328 const Packet2d sign_PN = pload<Packet2d>(sign_mask1);
329 const Packet2d sign_NP = pload<Packet2d>(sign_mask2);
330 d1 = pxor(rd, sign_PN);
331 d2 = pxor(rd, sign_NP);
332
333 Index res_stride = result.outerStride();
334 double *res = result.data();
335 pstoret<double, Packet2d, ResultAlignment>(res + 0, pmul(vec2d_swizzle2(iA2, iA1, 3), d1));
336 pstoret<double, Packet2d, ResultAlignment>(res + res_stride, pmul(vec2d_swizzle2(iA2, iA1, 0), d2));
337 pstoret<double, Packet2d, ResultAlignment>(res + 2, pmul(vec2d_swizzle2(iB2, iB1, 3), d1));
338 pstoret<double, Packet2d, ResultAlignment>(res + res_stride + 2, pmul(vec2d_swizzle2(iB2, iB1, 0), d2));
339 pstoret<double, Packet2d, ResultAlignment>(res + 2 * res_stride, pmul(vec2d_swizzle2(iC2, iC1, 3), d1));
340 pstoret<double, Packet2d, ResultAlignment>(res + 3 * res_stride, pmul(vec2d_swizzle2(iC2, iC1, 0), d2));
341 pstoret<double, Packet2d, ResultAlignment>(res + 2 * res_stride + 2, pmul(vec2d_swizzle2(iD2, iD1, 3), d1));
342 pstoret<double, Packet2d, ResultAlignment>(res + 3 * res_stride + 2, pmul(vec2d_swizzle2(iD2, iD1, 0), d2));
343 }
344};
345#endif
346} // namespace internal
347} // namespace Eigen
348
349#if EIGEN_COMP_GNUC_STRICT
350#pragma GCC pop_options
351#endif
352
353#endif
const unsigned int LinearAccessBit
Definition Constants.h:133
const unsigned int RowMajorBit
Definition Constants.h:70
Namespace containing all symbols from the Eigen library.
Definition Core:137