/**
 * @brief C++ AMP implementation of the matrix product of A and B (overload without an additive term).
 * @note This listing is incomplete: the declarations of x, y and sum, the store into the result view
 *       and the closing braces are on lines that are not visible here.
 * @param A Left operand; its column count must equal the row count of B (asserted).
 * @param B Right operand.
 * @param alpha Scale factor. Its use is not visible in this excerpt; presumably it multiplies the
 *        accumulated sum, as in the overload that also takes C and beta - confirm.
 * @param res Output matrix. When empty it is allocated as A.rows x B.cols of type CV_32FC1;
 *        otherwise its size is asserted to match the product.
 */
14 inline void amp_gemm(
const Mat &A,
const Mat &B,
float alpha, Mat &res)
// The inner dimensions must agree for the product to be defined.
16 DGM_ASSERT(A.cols == B.rows);
// Allocate the output on demand; a caller-supplied matrix must already have the product's size.
17 if (res.empty()) res = Mat(A.rows, B.cols, CV_32FC1);
18 DGM_ASSERT(res.rows == A.rows);
19 DGM_ASSERT(res.cols == B.cols);
// Wrap the raw Mat buffers in 2-D float views (read-only for the inputs, writable for the result).
// NOTE(review): the casts assume float elements stored without row padding; nothing visible here
// checks the type or continuity of A, B or a caller-supplied res - confirm against the callers.
21 concurrency::array_view<const float, 2> a(A.rows, A.cols, reinterpret_cast<float * const>(A.data));
22 concurrency::array_view<const float, 2> b(B.rows, B.cols, reinterpret_cast<float * const>(B.data));
23 concurrency::array_view<float, 2> r(res.rows, res.cols, reinterpret_cast<float *> (res.data));
// Launch the restrict(amp) kernel once per index of the result extent; views are captured by value.
24 concurrency::parallel_for_each(r.extent, [=](concurrency::index<2> idx) restrict(amp) {
// Accumulate over the shared dimension (a.extent[1] equals A.cols by construction of the view).
// x, y and sum are declared on lines not visible here - presumably taken from idx; confirm.
28 for (
int k = 0; k < a.extent[1]; k++)
29 sum += a(y, k) * b(k, x);
/**
 * @brief C++ AMP implementation of res = alpha * A * B + beta * C.
 * @note This listing is incomplete: the declarations of x, y and sum and the closing braces are on
 *       lines that are not visible here.
 * @param A Left operand; its column count must equal the row count of B (asserted).
 * @param B Right operand.
 * @param alpha Factor applied to the accumulated product.
 * @param C Additive term; asserted to have the same size as res.
 * @param beta Factor applied to the elements of C.
 * @param res Output matrix. When empty it is allocated as A.rows x B.cols of type CV_32FC1;
 *        otherwise its size is asserted to match.
 */
35 inline void amp_gemm(
const Mat &A,
const Mat &B,
float alpha,
const Mat &C,
float beta, Mat &res)
// The inner dimensions must agree for the product to be defined.
37 DGM_ASSERT(A.cols == B.rows);
// Allocate the output on demand, then require res to match both the product and C in size.
38 if (res.empty()) res = Mat(A.rows, B.cols, CV_32FC1);
39 DGM_ASSERT(res.rows == A.rows && res.rows == C.rows);
40 DGM_ASSERT(res.cols == B.cols && res.cols == C.cols);
// Wrap the raw Mat buffers in 2-D float views (read-only for A, B and C, writable for the result).
// NOTE(review): the casts assume float elements stored without row padding; nothing visible here
// checks the type or continuity of A, B, C or a caller-supplied res - confirm against the callers.
42 concurrency::array_view<const float, 2> a(A.rows, A.cols, reinterpret_cast<float * const>(A.data));
43 concurrency::array_view<const float, 2> b(B.rows, B.cols, reinterpret_cast<float * const>(B.data));
44 concurrency::array_view<const float, 2> c(C.rows, C.cols, reinterpret_cast<float * const>(C.data));
45 concurrency::array_view<float, 2> r(res.rows, res.cols, reinterpret_cast<float *> (res.data));
// Launch the restrict(amp) kernel once per index of the result extent; views and scalars are captured by value.
46 concurrency::parallel_for_each(r.extent, [=](concurrency::index<2> idx) restrict(amp) {
// Accumulate over the shared dimension (a.extent[1] equals A.cols by construction of the view).
// x, y and sum are declared on lines not visible here - presumably taken from idx; confirm.
50 for (
int k = 0; k < a.extent[1]; k++)
51 sum += a(y, k) * b(k, x);
// Scale the accumulated product and add the scaled element of C at the same index.
52 r[idx] = alpha * sum + beta * c[idx];
/**
 * @brief PPL (CPU, parallel_for) implementation of the scaled matrix product; each result element is
 *        written as alpha * sum.
 * @note This listing is incomplete: the declarations of _B and sum, the body of the innermost loop
 *       and the closing braces are on lines that are not visible here.
 * @param A Left operand; its column count must equal the row count of B (asserted).
 * @param B Right operand.
 * @param alpha Factor applied to the accumulated sum.
 * @param res Output matrix. When empty it is allocated as A.rows x B.cols of type CV_32FC1;
 *        otherwise its size is asserted to match.
 */
58 inline void ppl_gemm(
const Mat &A,
const Mat &B,
float alpha, Mat &res)
// The inner dimensions must agree for the product to be defined.
60 DGM_ASSERT(A.cols == B.rows);
// Allocate the output on demand; a caller-supplied matrix must already have the product's size.
61 if (res.empty()) res = Mat(A.rows, B.cols, CV_32FC1);
62 DGM_ASSERT(res.rows == A.rows);
63 DGM_ASSERT(res.cols == B.cols);
// One parallel iteration per result row; iteration y only writes through pRes, i.e. into row y of res.
66 concurrency::parallel_for(0, res.rows, [&](
int y) {
67 float * pRes = res.ptr<float>(y);
68 const float * pA = A.ptr<float>(y);
69 for (int x = 0; x < res.cols; x++) {
// _B is not declared in the visible lines; since it is indexed by the result column x, it is
// presumably a transposed copy of B so that a column of B can be read as a row - confirm.
70 const float * pB = _B.ptr<float>(x);
// The loop body is not visible here; presumably it accumulates pA[k] * pB[k] into sum - confirm.
72 for (int k = 0; k < A.cols; k++)
74 pRes[x] = alpha * sum;
/**
 * @brief PPL (CPU, parallel_for) implementation of the scaled matrix product with an additive term;
 *        each result element is written as alpha * sum + beta * C(y, x).
 * @note This listing is incomplete: the declarations of _B and sum, the body of the innermost loop
 *       and the closing braces are on lines that are not visible here.
 * @param A Left operand; its column count must equal the row count of B (asserted).
 * @param B Right operand.
 * @param alpha Factor applied to the accumulated sum.
 * @param C Additive term; asserted to have the same size as res.
 * @param beta Factor applied to the elements of C.
 * @param res Output matrix. When empty it is allocated as A.rows x B.cols of type CV_32FC1;
 *        otherwise its size is asserted to match.
 */
79 inline void ppl_gemm(
const Mat &A,
const Mat &B,
float alpha,
const Mat &C,
float beta, Mat &res)
// The inner dimensions must agree for the product to be defined.
81 DGM_ASSERT(A.cols == B.rows);
// Allocate the output on demand, then require res to match both the product and C in size.
82 if (res.empty()) res = Mat(A.rows, B.cols, CV_32FC1);
83 DGM_ASSERT(res.rows == A.rows && res.rows == C.rows);
84 DGM_ASSERT(res.cols == B.cols && res.cols == C.cols);
// One parallel iteration per result row; iteration y only writes through pRes, i.e. into row y of res.
87 concurrency::parallel_for(0, res.rows, [&](
int y) {
88 float * pRes = res.ptr<float>(y);
89 const float * pA = A.ptr<float>(y);
90 const float * pC = C.ptr<float>(y);
91 for (int x = 0; x < res.cols; x++) {
// _B is not declared in the visible lines; since it is indexed by the result column x, it is
// presumably a transposed copy of B so that a column of B can be read as a row - confirm.
92 const float * pB = _B.ptr<float>(x);
// The loop body is not visible here; presumably it accumulates pA[k] * pB[k] into sum - confirm.
94 for (int k = 0; k < A.cols; k++)
96 pRes[x] = alpha * sum + beta * pC[x];
/**
 * @brief Generalized matrix multiplication entry point that forwards to one of three back ends:
 *        the C++ AMP implementation, the PPL implementation, or OpenCV's cv::gemm.
 * @note The three groups of calls below are presumably separated by preprocessor conditionals on
 *       lines that are not visible in this listing - confirm against the full file before editing.
 * @param A Left operand.
 * @param B Right operand.
 * @param alpha Factor forwarded to the selected back end.
 * @param C Additive term; may be empty, in which case the AMP and PPL paths use the overloads
 *        that take neither C nor beta.
 * @param beta Factor for C, forwarded to the selected back end when C is used.
 * @param res Output matrix, forwarded to the selected back end.
 */
112 DllExport
inline void gemm(
const Mat &A,
const Mat &B,
float alpha,
const Mat &C,
float beta, Mat &res)
// C++ AMP path: choose the overload by whether an additive term was supplied.
115 if (C.empty()) impl::amp_gemm(A, B, alpha, res);
116 else impl::amp_gemm(A, B, alpha, C, beta, res);
// PPL path: same dispatch on C.empty().
119 if (C.empty()) impl::ppl_gemm(A, B, alpha, res);
120 else impl::ppl_gemm(A, B, alpha, C, beta, res);
// OpenCV path: C and beta are always forwarded, whether or not C is empty.
122 cv::gemm(A, B, alpha, C, beta, res);
// Helper taking two matrices by non-const reference plus an optional third matrix that defaults to
// EmptyMat. Only the signature is visible in this listing; judging by the call sites, which pass
// two rows of the same matrix, it presumably exchanges the contents of a and b using tmp as
// scratch storage - confirm against the body.
131 inline void Swap(Mat &a, Mat &b, Mat &tmp = EmptyMat)
/**
 * @brief Insertion sort over the rows begin..end (both inclusive) of m, ordered by the value in column x.
 * @details A row is swapped with its predecessor while its key is smaller, so keys end up in
 *          ascending order within the range.
 * @note This listing is incomplete: the declarations of j and tmp, the update of j inside the
 *       while loop and the closing braces are on lines that are not visible here.
 * @tparam T Element type used to read m via m.at<T>().
 * @param m Matrix whose rows are reordered in place.
 * @param x Index of the column that holds the sort key.
 * @param begin Index of the first row of the range.
 * @param end Index of the last row of the range (inclusive).
 */
138 template <
typename T>
139 inline void insertion_sort(Mat &m,
int x,
int begin,
int end)
// Visit every row of the inclusive range; an empty range (end < begin) performs no iterations.
142 for (
int i = begin; i <= end; i++) {
// Move the current row towards begin while its key is smaller than the key of the row before it.
144 while (j > begin && m.at<T>(j, x) < m.at<T>(j - 1, x)) {
// m.row() returns a Mat by value; lvalue_cast presumably turns that temporary into an lvalue so it
// can bind to Swap's non-const reference parameters - confirm against its definition.
145 Swap(lvalue_cast(m.row(j)), lvalue_cast(m.row(j - 1)), tmp);
/**
 * @brief Single-threaded recursive quick sort of the rows begin..end of m by the value in column x.
 * @note This listing is incomplete: the else-branch structure after the threshold test, the
 *       declarations and initial values of _begin and _end, their updates after a swap and the
 *       closing braces are on lines that are not visible here.
 * @tparam T Element type used to read m via m.at<T>().
 * @param m Matrix whose rows are reordered in place.
 * @param x Index of the column that holds the sort key.
 * @param begin Index of the first row of the range.
 * @param end Index of the last row of the range (inclusive, as passed on to insertion_sort).
 * @param threshold Ranges with end - begin below this value are handed to insertion_sort.
 */
151 template <
typename T>
152 inline void sequential_quick_sort(Mat &m,
int x,
int begin,
int end,
int threshold)
// Small ranges are sorted by insertion sort instead of being partitioned further.
154 if (end - begin < threshold) insertion_sort<T>(m, x, begin, end);
// The pivot is the key of the row in the middle of the range.
158 T pivot = m.at<T>((begin + end) / 2, x);
// Partition: the two cursors move towards each other, skipping rows already on the correct side
// of the pivot, and swap the pair of rows they stop at.
161 while (_begin <= _end) {
162 while (m.at<T>(_begin, x) < pivot) _begin++;
163 while (m.at<T>(_end, x) > pivot) _end--;
164 if (_begin <= _end) {
// No scratch matrix is passed here, so Swap falls back to its default third argument.
165 Swap(lvalue_cast(m.row(_begin)), lvalue_cast(m.row(_end)));
// Recurse into each side of the partition that is non-trivial.
172 if (begin < _end) sequential_quick_sort<T>(m, x, begin, _end, threshold);
173 if (_begin < end) sequential_quick_sort<T>(m, x, _begin, end, threshold);
/**
 * @brief Recursive quick sort of the rows begin..end of m by the value in column x that runs the two
 *        halves as parallel tasks while depthRemaining is positive.
 * @note This listing is incomplete: the else-branch structure, the declarations and initial values
 *       of _begin and _end, their updates after a swap, the end of the parallel_invoke call and the
 *       closing braces are on lines that are not visible here.
 * @tparam T Element type used to read m via m.at<T>().
 * @param m Matrix whose rows are reordered in place.
 * @param x Index of the column that holds the sort key.
 * @param begin Index of the first row of the range.
 * @param end Index of the last row of the range (inclusive, as passed on to insertion_sort).
 * @param threshold Ranges with end - begin below this value are handed to insertion_sort.
 * @param depthRemaining Number of further recursion levels allowed to spawn parallel tasks.
 */
178 template <
typename T>
179 inline void parallel_quick_sort(Mat &m,
int x,
int begin,
int end,
int threshold,
int depthRemaining)
// Small ranges are sorted by insertion sort instead of being partitioned further.
181 if (end - begin < threshold) insertion_sort<T>(m, x, begin, end);
// The pivot is the key of the row in the middle of the range.
185 T pivot = m.at<T>((begin + end) / 2, x);
// Partition: the two cursors move towards each other, skipping rows already on the correct side
// of the pivot, and swap the pair of rows they stop at.
188 while (_begin <= _end) {
189 while (m.at<T>(_begin, x) < pivot) _begin++;
190 while (m.at<T>(_end, x) > pivot) _end--;
191 if (_begin <= _end) {
// NOTE(review): unlike sequential_quick_sort, the temporaries returned by m.row() are passed
// straight to Swap's non-const reference parameters without lvalue_cast. Standard C++ does not
// allow that binding, so this presumably relies on a compiler extension - confirm, and consider
// aligning it with the sequential version.
192 Swap(m.row(_begin), m.row(_end));
// While the depth budget lasts, sort the two sides of the partition as two concurrent tasks.
// The range bounds are captured by value; m, threshold and depthRemaining are captured by reference.
199 if (depthRemaining > 0)
200 concurrency::parallel_invoke(
201 [&, x, begin, _end] {
if (begin < _end) parallel_quick_sort<T>(m, x, begin, _end, threshold, depthRemaining - 1); },
202 [&, x, end, _begin] {
if (_begin < end) parallel_quick_sort<T>(m, x, _begin, end, threshold, depthRemaining - 1); }
// Sequential recursion; presumably the alternative taken once the depth budget is used up
// (the connecting else is not visible here).
205 if (begin < _end) sequential_quick_sort<T>(m, x, begin, _end, threshold);
206 if (_begin < end) sequential_quick_sort<T>(m, x, _begin, end, threshold);
// Fragment of a function template that sorts all rows of m by column x. Its signature line is not
// visible in this listing; the documentation index at the end of the listing suggests it is
// sortRows(Mat &m, int x) - confirm.
// The parallel and the sequential call below are presumably alternatives selected by a
// preprocessor conditional that is not visible here.
221 template <
typename T>
// The key column must exist in the matrix.
224 DGM_ASSERT(x < m.cols);
// Parallel variant: query the scheduler's virtual processor count, clamped to at least 1.
226 const int nCores = MAX(1, concurrency::CurrentScheduler::Get()->GetNumberOfVirtualProcessors());
// Sort rows 0..m.rows-1 with an insertion-sort threshold of 200; the last argument is the
// recursion depth allowed to spawn parallel tasks, derived from log2 of the processor count plus 4.
227 parallel_quick_sort<T>(m, x, 0, m.rows - 1, 200,
static_cast<int>(log2f(
float(nCores))) + 4);
// Sequential variant with the same range and threshold.
229 sequential_quick_sort<T>(m, x, 0, m.rows - 1, 200);
/**
 * @brief Sorts the rows begin..end of m by column depth, then recurses with column depth + 1 into
 *        the runs of rows whose value in column depth is equal.
 * @note This listing is incomplete: the declaration and update of ref_pos, the update of ref_val,
 *       any handling of the last run after the loop and the closing braces are on lines that are
 *       not visible here. The parallel and the sequential sort calls are presumably alternatives
 *       selected by a preprocessor conditional that is also not visible.
 * @tparam T Element type used to read m via m.at<T>().
 * @param m Matrix whose rows are reordered in place.
 * @param depth Index of the column used as the sort key at this recursion level.
 * @param begin Index of the first row of the range.
 * @param end Index of the last row of the range (inclusive).
 */
234 template <
typename T>
235 inline void deepSort(Mat &m,
int depth,
int begin,
int end)
// Stop when there are no columns left to sort by.
237 if (depth == m.cols)
return;
// A single-row range needs no sorting.
238 if (begin == end)
return;
// Parallel variant: query the scheduler's virtual processor count, clamped to at least 1.
241 const int nCores = MAX(1, concurrency::CurrentScheduler::Get()->GetNumberOfVirtualProcessors());
// Sort the range by the current column with an insertion-sort threshold of 200; the last argument
// is the recursion depth allowed to spawn parallel tasks.
242 parallel_quick_sort<T>(m, depth, begin, end, 200,
static_cast<int>(log2f(
float(nCores))) + 4);
// Sequential variant with the same range and threshold.
244 sequential_quick_sort<T>(m, depth, begin, end, 200);
// Walk the sorted range and compare each key with the key that started the current run.
248 T ref_val = m.at<T>(begin, depth);
249 for (
int y = begin + 1; y <= end; y++) {
250 T val = m.at<T>(y, depth);
251 if (val != ref_val) {
// The key changed at row y: the run ending at row y - 1 is sorted by the next column.
// ref_pos is presumably the first row of that run - confirm against the lines not visible here.
252 deepSort<T>(m, depth + 1, ref_pos, y - 1);
// Fragment of a function template whose signature line is not visible in this listing. The visible
// statement starts deepSort at column 0 over all rows of m (rows 0..m.rows-1).
// NOTE(review): for a matrix without rows the range passed is 0..-1; whether that case is guarded
// on a line not visible here should be confirmed.
267 template <
typename T>
270 deepSort<T>(m, 0, 0, m.rows - 1);
// Fragment of a function that randomly reorders the rows of m. Its signature line is not visible in
// this listing; the documentation index at the end of the listing suggests it is
// shuffleRows(Mat &m) - confirm.
// The parallel and the sequential loops below are presumably alternatives selected by a
// preprocessor conditional that is not visible here; the declaration of tmp is not visible either.
// Parallel variant: split the rows into chunks of `step` rows (at least 2), sized so that there are
// roughly ten chunks per virtual processor.
286 int nCores = MAX(1, concurrency::CurrentScheduler::Get()->GetNumberOfVirtualProcessors());
287 int step = MAX(2, m.rows / (nCores * 10));
// Each parallel iteration handles the chunk of rows S .. last - 1.
288 concurrency::parallel_for(0, m.rows, step, [step, &m](
int S) {
// The final chunk is clipped to the number of rows.
290 int last = MIN(S + step, m.rows);
// Walk the chunk from its last row down to S + 1 and swap each row with a randomly drawn one.
// Assuming random::u(S, s) returns a value within [S, s], rows are only exchanged within their
// own chunk and never move between chunks - confirm that this is intended.
291 for (int s = last - 1; s > S; s--) {
292 dword r = DirectGraphicalModels::random::u<dword>(S, s);
// NOTE(review): r is a dword while s is an int, so this comparison mixes the two types, and the
// m.row() temporaries are passed to Swap without lvalue_cast, unlike in the sequential loop below.
293 if (r != s) Swap(m.row(s), m.row(r), tmp);
// Sequential variant: walk from the last row down to row 1 and swap each row with a row drawn by
// random::u<int>(0, s), presumably a uniform draw from [0, s] - confirm against its definition.
298 for (
int s = m.rows - 1; s > 0; s--) {
299 int r = random::u<int>(0, s);
300 if (r != s) Swap(lvalue_cast(m.row(s)), lvalue_cast(m.row(r)), tmp);
void shuffleRows(Mat &m)
Randomly shuffles the rows of the input matrix.
void gemm(const Mat &A, const Mat &B, float alpha, const Mat &C, float beta, Mat &res)
Fast generalized matrix multiplication.
void sortRows(Mat &m, int x)
Sorts the rows of the input matrix by the given dimension.