Eigen  3.4.90 (git rev 5a9f66fb35d03a4da9ef8976e67a61b30aa16dcf)
 
Loading...
Searching...
No Matches
CoreThreadPoolDevice.h
1// This file is part of Eigen, a lightweight C++ template library
2// for linear algebra.
3//
4// Copyright (C) 2023 Charlie Schlosser <[email protected]>
5//
6// This Source Code Form is subject to the terms of the Mozilla
7// Public License v. 2.0. If a copy of the MPL was not distributed
8// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
9
10#ifndef EIGEN_CORE_THREAD_POOL_DEVICE_H
11#define EIGEN_CORE_THREAD_POOL_DEVICE_H
12
13namespace Eigen {
14
15// CoreThreadPoolDevice provides an easy-to-understand Device for parallelizing Eigen Core expressions with
16// Threadpool. Expressions are recursively split evenly until the evaluation cost is less than the threshold for
17// delegating the task to a thread.
18
19// a
20// / \
21// / \
22// / \
23// / \
24// / \
25// / \
26// / \
27// a e
28// / \ / \
29// / \ / \
30// / \ / \
31// a c e g
32// / \ / \ / \ / \
33// / \ / \ / \ / \
34// a b c d e f g h
35
36// Each task descends the binary tree to the left, delegates the right task to a new thread, and continues to the
37// left. This ensures that work is evenly distributed to the thread pool as quickly as possible and minimizes the number
38// of tasks created during the evaluation. Consider an expression that is divided into 8 chunks. The
39// primary task 'a' creates tasks 'e' 'c' and 'b', and executes its portion of the expression at the bottom of the
40// tree. Likewise, task 'e' creates tasks 'g' and 'f', and executes its portion of the expression.
41
struct CoreThreadPoolDevice {
  using Task = std::function<void()>;

  // pool: thread pool used to execute delegated sub-tasks (held by reference; must outlive this device).
  // threadCostThreshold: non-negative scale factor mapping the total estimated evaluation cost of an
  // expression to an ideal number of threads (see calculateLevels()). Per the m_costFactor comment
  // below, the stored value is the reciprocal of the per-thread delegation cost so that
  // calculateLevels() can multiply instead of divide.
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoreThreadPoolDevice(ThreadPool& pool, float threadCostThreshold = 3e-5f)
      : m_pool(pool) {
    eigen_assert(threadCostThreshold >= 0.0f && "threadCostThreshold must be non-negative");
    m_costFactor = threadCostThreshold;
  }

  // Computes the depth of the binary task tree used to evaluate an expression of `size`
  // coefficients, where `cost` is the estimated cost of one functor invocation (one packet).
  // The returned level L produces 2^L leaf tasks (see the Barrier construction in parallelFor).
  template <int PacketSize>
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int calculateLevels(Index size, float cost) const {
    eigen_assert(cost >= 0.0f && "cost must be non-negative");
    Index numOps = size / PacketSize;
    // Never use more threads than there are packet-sized operations, nor more than the pool has.
    int actualThreads = numOps < m_pool.NumThreads() ? static_cast<int>(numOps) : m_pool.NumThreads();
    float totalCost = static_cast<float>(numOps) * cost;
    float idealThreads = totalCost * m_costFactor;
    if (idealThreads < static_cast<float>(actualThreads)) {
      // The work is cheap relative to the delegation cost: use fewer threads (but at least one).
      idealThreads = numext::maxi(idealThreads, 1.0f);
      actualThreads = numext::mini(actualThreads, static_cast<int>(idealThreads));
    }
    // Express the thread count as a tree depth, rounding up to the next power of two.
    int maxLevel = internal::log2_ceil(actualThreads);
    return maxLevel;
  }

// MSVC does not like inlining parallelForImpl
#if EIGEN_COMP_MSVC && !EIGEN_COMP_CLANG
#define EIGEN_PARALLEL_FOR_INLINE
#else
#define EIGEN_PARALLEL_FOR_INLINE EIGEN_STRONG_INLINE
#endif

  // Recursive worker for the 1-D parallelFor. At each remaining level the right half [mid, end)
  // is scheduled on the pool and this call continues with the left half (see the tree diagram at
  // the top of the file). Each invocation notifies the barrier exactly once, so a call entered at
  // `level` accounts for 2^level notifications in total. Assumes (end - begin) is a multiple of
  // PacketSize.
  template <typename UnaryFunctor, int PacketSize>
  EIGEN_DEVICE_FUNC EIGEN_PARALLEL_FOR_INLINE void parallelForImpl(Index begin, Index end, UnaryFunctor& f,
                                                                   Barrier& barrier, int level) {
    while (level > 0) {
      level--;
      Index size = end - begin;
      eigen_assert(size % PacketSize == 0 && "this function assumes size is a multiple of PacketSize");
      // Split at a packet-aligned midpoint so both halves remain multiples of PacketSize.
      Index mid = begin + numext::round_down(size >> 1, PacketSize);
      Task right = [this, mid, end, &f, &barrier, level]() {
        parallelForImpl<UnaryFunctor, PacketSize>(mid, end, f, barrier, level);
      };
      m_pool.Schedule(std::move(right));
      end = mid;
    }
    // Leaf: evaluate this task's chunk one packet at a time.
    for (Index i = begin; i < end; i += PacketSize) f(i);
    barrier.Notify();
  }

  // Recursive worker for the 2-D (outer/inner) parallelFor. Splits along the outer dimension
  // first; once the outer range is down to a single unit it splits the inner range at
  // packet-aligned midpoints. Barrier accounting matches the unary overload above.
  template <typename BinaryFunctor, int PacketSize>
  EIGEN_DEVICE_FUNC EIGEN_PARALLEL_FOR_INLINE void parallelForImpl(Index outerBegin, Index outerEnd, Index innerBegin,
                                                                   Index innerEnd, BinaryFunctor& f, Barrier& barrier,
                                                                   int level) {
    while (level > 0) {
      level--;
      Index outerSize = outerEnd - outerBegin;
      if (outerSize > 1) {
        Index outerMid = outerBegin + (outerSize >> 1);
        Task right = [this, &f, &barrier, outerMid, outerEnd, innerBegin, innerEnd, level]() {
          parallelForImpl<BinaryFunctor, PacketSize>(outerMid, outerEnd, innerBegin, innerEnd, f, barrier, level);
        };
        m_pool.Schedule(std::move(right));
        outerEnd = outerMid;
      } else {
        Index innerSize = innerEnd - innerBegin;
        eigen_assert(innerSize % PacketSize == 0 && "this function assumes innerSize is a multiple of PacketSize");
        Index innerMid = innerBegin + numext::round_down(innerSize >> 1, PacketSize);
        Task right = [this, &f, &barrier, outerBegin, outerEnd, innerMid, innerEnd, level]() {
          parallelForImpl<BinaryFunctor, PacketSize>(outerBegin, outerEnd, innerMid, innerEnd, f, barrier, level);
        };
        m_pool.Schedule(std::move(right));
        innerEnd = innerMid;
      }
    }
    // Leaf: evaluate this task's tile, stepping the inner dimension by packets.
    for (Index outer = outerBegin; outer < outerEnd; outer++)
      for (Index inner = innerBegin; inner < innerEnd; inner += PacketSize) f(outer, inner);
    barrier.Notify();
  }

#undef EIGEN_PARALLEL_FOR_INLINE

  // Applies f(i) for i in [begin, end) stepping by PacketSize, distributing the work across the
  // pool. `cost` is the estimated cost of one call to f. Blocks until all chunks are done.
  template <typename UnaryFunctor, int PacketSize>
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void parallelFor(Index begin, Index end, UnaryFunctor& f, float cost) {
    Index size = end - begin;
    int maxLevel = calculateLevels<PacketSize>(size, cost);
    // The task tree has exactly 2^maxLevel leaves, each of which notifies the barrier once.
    Barrier barrier(1 << maxLevel);
    parallelForImpl<UnaryFunctor, PacketSize>(begin, end, f, barrier, maxLevel);
    barrier.Wait();
  }

  // 2-D overload: applies f(outer, inner) over [outerBegin, outerEnd) x [innerBegin, innerEnd),
  // the inner index stepping by PacketSize. Blocks until all tiles are done.
  template <typename BinaryFunctor, int PacketSize>
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void parallelFor(Index outerBegin, Index outerEnd, Index innerBegin,
                                                         Index innerEnd, BinaryFunctor& f, float cost) {
    Index outerSize = outerEnd - outerBegin;
    Index innerSize = innerEnd - innerBegin;
    Index size = outerSize * innerSize;
    int maxLevel = calculateLevels<PacketSize>(size, cost);
    Barrier barrier(1 << maxLevel);
    parallelForImpl<BinaryFunctor, PacketSize>(outerBegin, outerEnd, innerBegin, innerEnd, f, barrier, maxLevel);
    barrier.Wait();
  }

  ThreadPool& m_pool;
  // costFactor is the cost of delegating a task to a thread
  // the inverse is used to avoid a floating point division
  float m_costFactor;
};
148
149// specialization of coefficient-wise assignment loops for CoreThreadPoolDevice
150
151namespace internal {
152
// Compile-time estimate of the per-coefficient cost of an assignment kernel: the cost of
// evaluating the source expression plus the cost of writing to the destination. Used by the
// dense_assignment_loop_with_device specializations below to size the parallelFor cost argument.
template <typename Kernel>
struct cost_helper {
  using SrcEvaluatorType = typename Kernel::SrcEvaluatorType;
  using DstEvaluatorType = typename Kernel::DstEvaluatorType;
  using SrcXprType = typename SrcEvaluatorType::XprType;
  using DstXprType = typename DstEvaluatorType::XprType;
  // Combined cost: source evaluation + destination store.
  static constexpr Index Cost = functor_cost<SrcXprType>::Cost + functor_cost<DstXprType>::Cost;
};
161
162template <typename Kernel>
163struct dense_assignment_loop_with_device<Kernel, CoreThreadPoolDevice, DefaultTraversal, NoUnrolling> {
164 static constexpr Index XprEvaluationCost = cost_helper<Kernel>::Cost;
165 struct AssignmentFunctor : public Kernel {
166 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE AssignmentFunctor(Kernel& kernel) : Kernel(kernel) {}
167 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(Index outer, Index inner) {
168 this->assignCoeffByOuterInner(outer, inner);
169 }
170 };
171
172 static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(Kernel& kernel, CoreThreadPoolDevice& device) {
173 const Index innerSize = kernel.innerSize();
174 const Index outerSize = kernel.outerSize();
175 constexpr float cost = static_cast<float>(XprEvaluationCost);
176 AssignmentFunctor functor(kernel);
177 device.template parallelFor<AssignmentFunctor, 1>(0, outerSize, 0, innerSize, functor, cost);
178 }
179};
180
181template <typename Kernel>
182struct dense_assignment_loop_with_device<Kernel, CoreThreadPoolDevice, DefaultTraversal, InnerUnrolling> {
183 using DstXprType = typename Kernel::DstEvaluatorType::XprType;
184 static constexpr Index XprEvaluationCost = cost_helper<Kernel>::Cost, InnerSize = DstXprType::InnerSizeAtCompileTime;
185 struct AssignmentFunctor : public Kernel {
186 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE AssignmentFunctor(Kernel& kernel) : Kernel(kernel) {}
187 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(Index outer) {
188 copy_using_evaluator_DefaultTraversal_InnerUnrolling<Kernel, 0, InnerSize>::run(*this, outer);
189 }
190 };
191 static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(Kernel& kernel, CoreThreadPoolDevice& device) {
192 const Index outerSize = kernel.outerSize();
193 AssignmentFunctor functor(kernel);
194 constexpr float cost = static_cast<float>(XprEvaluationCost) * static_cast<float>(InnerSize);
195 device.template parallelFor<AssignmentFunctor, 1>(0, outerSize, functor, cost);
196 }
197};
198
199template <typename Kernel>
200struct dense_assignment_loop_with_device<Kernel, CoreThreadPoolDevice, InnerVectorizedTraversal, NoUnrolling> {
201 using PacketType = typename Kernel::PacketType;
202 static constexpr Index XprEvaluationCost = cost_helper<Kernel>::Cost, PacketSize = unpacket_traits<PacketType>::size,
203 SrcAlignment = Kernel::AssignmentTraits::SrcAlignment,
204 DstAlignment = Kernel::AssignmentTraits::DstAlignment;
205 struct AssignmentFunctor : public Kernel {
206 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE AssignmentFunctor(Kernel& kernel) : Kernel(kernel) {}
207 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(Index outer, Index inner) {
208 this->template assignPacketByOuterInner<Unaligned, Unaligned, PacketType>(outer, inner);
209 }
210 };
211 static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(Kernel& kernel, CoreThreadPoolDevice& device) {
212 const Index innerSize = kernel.innerSize();
213 const Index outerSize = kernel.outerSize();
214 const float cost = static_cast<float>(XprEvaluationCost) * static_cast<float>(innerSize);
215 AssignmentFunctor functor(kernel);
216 device.template parallelFor<AssignmentFunctor, PacketSize>(0, outerSize, 0, innerSize, functor, cost);
217 }
218};
219
220template <typename Kernel>
221struct dense_assignment_loop_with_device<Kernel, CoreThreadPoolDevice, InnerVectorizedTraversal, InnerUnrolling> {
222 using PacketType = typename Kernel::PacketType;
223 using DstXprType = typename Kernel::DstEvaluatorType::XprType;
224 static constexpr Index XprEvaluationCost = cost_helper<Kernel>::Cost, PacketSize = unpacket_traits<PacketType>::size,
225 SrcAlignment = Kernel::AssignmentTraits::SrcAlignment,
226 DstAlignment = Kernel::AssignmentTraits::DstAlignment,
227 InnerSize = DstXprType::InnerSizeAtCompileTime;
228 struct AssignmentFunctor : public Kernel {
229 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE AssignmentFunctor(Kernel& kernel) : Kernel(kernel) {}
230 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(Index outer) {
231 copy_using_evaluator_innervec_InnerUnrolling<Kernel, 0, InnerSize, SrcAlignment, DstAlignment>::run(*this, outer);
232 }
233 };
234 static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(Kernel& kernel, CoreThreadPoolDevice& device) {
235 const Index outerSize = kernel.outerSize();
236 constexpr float cost = static_cast<float>(XprEvaluationCost) * static_cast<float>(InnerSize);
237 AssignmentFunctor functor(kernel);
238 device.template parallelFor<AssignmentFunctor, PacketSize>(0, outerSize, functor, cost);
239 }
240};
241
242template <typename Kernel>
243struct dense_assignment_loop_with_device<Kernel, CoreThreadPoolDevice, SliceVectorizedTraversal, NoUnrolling> {
244 using Scalar = typename Kernel::Scalar;
245 using PacketType = typename Kernel::PacketType;
246 static constexpr Index XprEvaluationCost = cost_helper<Kernel>::Cost, PacketSize = unpacket_traits<PacketType>::size;
247 struct PacketAssignmentFunctor : public Kernel {
248 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketAssignmentFunctor(Kernel& kernel) : Kernel(kernel) {}
249 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(Index outer, Index inner) {
250 this->template assignPacketByOuterInner<Unaligned, Unaligned, PacketType>(outer, inner);
251 }
252 };
253 struct ScalarAssignmentFunctor : public Kernel {
254 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ScalarAssignmentFunctor(Kernel& kernel) : Kernel(kernel) {}
255 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(Index outer) {
256 const Index innerSize = this->innerSize();
257 const Index packetAccessSize = numext::round_down(innerSize, PacketSize);
258 for (Index inner = packetAccessSize; inner < innerSize; inner++) this->assignCoeffByOuterInner(outer, inner);
259 }
260 };
261 static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(Kernel& kernel, CoreThreadPoolDevice& device) {
262 const Index outerSize = kernel.outerSize();
263 const Index innerSize = kernel.innerSize();
264 const Index packetAccessSize = numext::round_down(innerSize, PacketSize);
265 constexpr float packetCost = static_cast<float>(XprEvaluationCost);
266 const float scalarCost = static_cast<float>(XprEvaluationCost) * static_cast<float>(innerSize - packetAccessSize);
267 PacketAssignmentFunctor packetFunctor(kernel);
268 ScalarAssignmentFunctor scalarFunctor(kernel);
269 device.template parallelFor<PacketAssignmentFunctor, PacketSize>(0, outerSize, 0, packetAccessSize, packetFunctor,
270 packetCost);
271 device.template parallelFor<ScalarAssignmentFunctor, 1>(0, outerSize, scalarFunctor, scalarCost);
272 };
273};
274
275template <typename Kernel>
276struct dense_assignment_loop_with_device<Kernel, CoreThreadPoolDevice, LinearTraversal, NoUnrolling> {
277 static constexpr Index XprEvaluationCost = cost_helper<Kernel>::Cost;
278 struct AssignmentFunctor : public Kernel {
279 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE AssignmentFunctor(Kernel& kernel) : Kernel(kernel) {}
280 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(Index index) { this->assignCoeff(index); }
281 };
282 static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(Kernel& kernel, CoreThreadPoolDevice& device) {
283 const Index size = kernel.size();
284 constexpr float cost = static_cast<float>(XprEvaluationCost);
285 AssignmentFunctor functor(kernel);
286 device.template parallelFor<AssignmentFunctor, 1>(0, size, functor, cost);
287 }
288};
289
290template <typename Kernel>
291struct dense_assignment_loop_with_device<Kernel, CoreThreadPoolDevice, LinearVectorizedTraversal, NoUnrolling> {
292 using Scalar = typename Kernel::Scalar;
293 using PacketType = typename Kernel::PacketType;
294 static constexpr Index XprEvaluationCost = cost_helper<Kernel>::Cost,
295 RequestedAlignment = Kernel::AssignmentTraits::LinearRequiredAlignment,
296 PacketSize = unpacket_traits<PacketType>::size,
297 DstIsAligned = Kernel::AssignmentTraits::DstAlignment >= RequestedAlignment,
298 DstAlignment = packet_traits<Scalar>::AlignedOnScalar ? RequestedAlignment
299 : Kernel::AssignmentTraits::DstAlignment,
300 SrcAlignment = Kernel::AssignmentTraits::JointAlignment;
301 struct AssignmentFunctor : public Kernel {
302 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE AssignmentFunctor(Kernel& kernel) : Kernel(kernel) {}
303 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(Index index) {
304 this->template assignPacket<DstAlignment, SrcAlignment, PacketType>(index);
305 }
306 };
307 static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(Kernel& kernel, CoreThreadPoolDevice& device) {
308 const Index size = kernel.size();
309 const Index alignedStart =
310 DstIsAligned ? 0 : internal::first_aligned<RequestedAlignment>(kernel.dstDataPtr(), size);
311 const Index alignedEnd = alignedStart + numext::round_down(size - alignedStart, PacketSize);
312
313 unaligned_dense_assignment_loop<DstIsAligned != 0>::run(kernel, 0, alignedStart);
314
315 constexpr float cost = static_cast<float>(XprEvaluationCost);
316 AssignmentFunctor functor(kernel);
317 device.template parallelFor<AssignmentFunctor, PacketSize>(alignedStart, alignedEnd, functor, cost);
318
319 unaligned_dense_assignment_loop<>::run(kernel, alignedEnd, size);
320 }
321};
322
323} // namespace internal
324
325} // namespace Eigen
326
327#endif // EIGEN_CORE_THREAD_POOL_DEVICE_H
Namespace containing all symbols from the Eigen library.
Definition Core:137