Direct Graphical Models  v.1.7.0
parallel.h
// Written by Sergey Kosov in 2016 - 2017 for Project X
#pragma once

#include "types.h"
#include "macroses.h"
#include "random.h"

namespace DirectGraphicalModels { namespace parallel {
    // ------------------------------------------- GEMM ------------------------------------------
    // --------------- fast generalized matrix multiplication with AMP / PPL ---------------------
    namespace impl {
#ifdef ENABLE_AMP
        // res = alpha * A * B, computed on the GPU with C++ AMP
        inline void amp_gemm(const Mat &A, const Mat &B, float alpha, Mat &res)
        {
            DGM_ASSERT(A.cols == B.rows);
            if (res.empty()) res = Mat(A.rows, B.cols, CV_32FC1);
            DGM_ASSERT(res.rows == A.rows);
            DGM_ASSERT(res.cols == B.cols);

            concurrency::array_view<const float, 2> a(A.rows, A.cols, reinterpret_cast<float * const>(A.data));
            concurrency::array_view<const float, 2> b(B.rows, B.cols, reinterpret_cast<float * const>(B.data));
            concurrency::array_view<float, 2>       r(res.rows, res.cols, reinterpret_cast<float *>(res.data));
            concurrency::parallel_for_each(r.extent, [=](concurrency::index<2> idx) restrict(amp) {
                int y = idx[0];
                int x = idx[1];
                float sum = 0.0f;
                for (int k = 0; k < a.extent[1]; k++)
                    sum += a(y, k) * b(k, x);
                r[idx] = alpha * sum;
            });
            r.synchronize();
        }

        // res = alpha * A * B + beta * C, computed on the GPU with C++ AMP
        inline void amp_gemm(const Mat &A, const Mat &B, float alpha, const Mat &C, float beta, Mat &res)
        {
            DGM_ASSERT(A.cols == B.rows);
            if (res.empty()) res = Mat(A.rows, B.cols, CV_32FC1);
            DGM_ASSERT(res.rows == A.rows && res.rows == C.rows);
            DGM_ASSERT(res.cols == B.cols && res.cols == C.cols);

            concurrency::array_view<const float, 2> a(A.rows, A.cols, reinterpret_cast<float * const>(A.data));
            concurrency::array_view<const float, 2> b(B.rows, B.cols, reinterpret_cast<float * const>(B.data));
            concurrency::array_view<const float, 2> c(C.rows, C.cols, reinterpret_cast<float * const>(C.data));
            concurrency::array_view<float, 2>       r(res.rows, res.cols, reinterpret_cast<float *>(res.data));
            concurrency::parallel_for_each(r.extent, [=](concurrency::index<2> idx) restrict(amp) {
                int y = idx[0];
                int x = idx[1];
                float sum = 0.0f;
                for (int k = 0; k < a.extent[1]; k++)
                    sum += a(y, k) * b(k, x);
                r[idx] = alpha * sum + beta * c[idx];
            });
            r.synchronize();
        }
#endif
#ifdef ENABLE_PPL
        // res = alpha * A * B, computed on the CPU with PPL;
        // B is transposed up front so that the inner loop reads both operands row-wise
        inline void ppl_gemm(const Mat &A, const Mat &B, float alpha, Mat &res)
        {
            DGM_ASSERT(A.cols == B.rows);
            if (res.empty()) res = Mat(A.rows, B.cols, CV_32FC1);
            DGM_ASSERT(res.rows == A.rows);
            DGM_ASSERT(res.cols == B.cols);

            const Mat _B = B.t();
            concurrency::parallel_for(0, res.rows, [&](int y) {
                float       * pRes = res.ptr<float>(y);
                const float * pA   = A.ptr<float>(y);
                for (int x = 0; x < res.cols; x++) {
                    const float * pB = _B.ptr<float>(x);
                    float sum = 0.0f;
                    for (int k = 0; k < A.cols; k++)
                        sum += pA[k] * pB[k];
                    pRes[x] = alpha * sum;
                }
            });
        }

        // res = alpha * A * B + beta * C, computed on the CPU with PPL
        inline void ppl_gemm(const Mat &A, const Mat &B, float alpha, const Mat &C, float beta, Mat &res)
        {
            DGM_ASSERT(A.cols == B.rows);
            if (res.empty()) res = Mat(A.rows, B.cols, CV_32FC1);
            DGM_ASSERT(res.rows == A.rows && res.rows == C.rows);
            DGM_ASSERT(res.cols == B.cols && res.cols == C.cols);

            const Mat _B = B.t();
            concurrency::parallel_for(0, res.rows, [&](int y) {
                float       * pRes = res.ptr<float>(y);
                const float * pA   = A.ptr<float>(y);
                const float * pC   = C.ptr<float>(y);
                for (int x = 0; x < res.cols; x++) {
                    const float * pB = _B.ptr<float>(x);
                    float sum = 0.0f;
                    for (int k = 0; k < A.cols; k++)
                        sum += pA[k] * pB[k];
                    pRes[x] = alpha * sum + beta * pC[x];
                }
            });
        }
#endif
    }

    /**
    * @brief Fast generalized matrix multiplication: res = alpha * A * B + beta * C
    * Dispatches to the AMP (GPU) or PPL (CPU) implementation when available,
    * and falls back to cv::gemm() otherwise.
    */
    DllExport inline void gemm(const Mat &A, const Mat &B, float alpha, const Mat &C, float beta, Mat &res)
    {
#ifdef ENABLE_AMP
        if (C.empty()) impl::amp_gemm(A, B, alpha, res);
        else           impl::amp_gemm(A, B, alpha, C, beta, res);
#else
#ifdef ENABLE_PPL
        if (C.empty()) impl::ppl_gemm(A, B, alpha, res);
        else           impl::ppl_gemm(A, B, alpha, C, beta, res);
#else
        cv::gemm(A, B, alpha, C, beta, res);
#endif
#endif
    }
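
    // Usage sketch (illustration only, not part of the original header); assumes
    // cv::randu() from OpenCV and calls the wrapper above from outside the namespace:
    //     Mat A(64, 128, CV_32FC1), B(128, 32, CV_32FC1), C(64, 32, CV_32FC1);
    //     cv::randu(A, 0.0, 1.0); cv::randu(B, 0.0, 1.0); cv::randu(C, 0.0, 1.0);
    //     Mat res;
    //     DirectGraphicalModels::parallel::gemm(A, B, 0.5f, C, 2.0f, res);   // res = 0.5*A*B + 2*C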

    // -------------------------------------------- SORT -------------------------------------------
    // --------------------------- fast sorting of Mat elements with PPL ---------------------------
    namespace {
        // Exchanges rows a and b through the temporary buffer tmp
        inline void Swap(Mat &a, Mat &b, Mat &tmp = EmptyMat)
        {
            a.copyTo(tmp);
            b.copyTo(a);
            tmp.copyTo(b);
        }

        // Insertion sort of the rows [begin; end] by the values in column x
        template <typename T>
        inline void insertion_sort(Mat &m, int x, int begin, int end)
        {
            Mat tmp;
            for (int i = begin; i <= end; i++) {
                int j = i;
                while (j > begin && m.at<T>(j, x) < m.at<T>(j - 1, x)) {
                    Swap(lvalue_cast(m.row(j)), lvalue_cast(m.row(j - 1)), tmp);
                    j--;
                }
            }
        }

        // Sequential quick sort of the rows [begin; end] by column x; falls back
        // to insertion sort for ranges shorter than threshold
        template <typename T>
        inline void sequential_quick_sort(Mat &m, int x, int begin, int end, int threshold)
        {
            if (end - begin < threshold) insertion_sort<T>(m, x, begin, end);
            else {
                int _begin = begin;
                int _end   = end;
                T   pivot  = m.at<T>((begin + end) / 2, x);

                // partition
                while (_begin <= _end) {
                    while (m.at<T>(_begin, x) < pivot) _begin++;
                    while (m.at<T>(_end, x) > pivot) _end--;
                    if (_begin <= _end) {
                        Swap(lvalue_cast(m.row(_begin)), lvalue_cast(m.row(_end)));
                        _begin++;
                        _end--;
                    }
                }

                // recursion
                if (begin < _end) sequential_quick_sort<T>(m, x, begin, _end, threshold);
                if (_begin < end) sequential_quick_sort<T>(m, x, _begin, end, threshold);
            }
        }

#ifdef ENABLE_PPL
        // Parallel quick sort: the two recursive calls run concurrently until
        // depthRemaining is exhausted, then the work continues sequentially
        template <typename T>
        inline void parallel_quick_sort(Mat &m, int x, int begin, int end, int threshold, int depthRemaining)
        {
            if (end - begin < threshold) insertion_sort<T>(m, x, begin, end);
            else {
                int _begin = begin;
                int _end   = end;
                T   pivot  = m.at<T>((begin + end) / 2, x);

                // partition
                while (_begin <= _end) {
                    while (m.at<T>(_begin, x) < pivot) _begin++;
                    while (m.at<T>(_end, x) > pivot) _end--;
                    if (_begin <= _end) {
                        Swap(lvalue_cast(m.row(_begin)), lvalue_cast(m.row(_end)));
                        _begin++;
                        _end--;
                    }
                }

                // recursion
                if (depthRemaining > 0)
                    concurrency::parallel_invoke(
                        [&, x, begin, _end] { if (begin < _end) parallel_quick_sort<T>(m, x, begin, _end, threshold, depthRemaining - 1); },
                        [&, x, end, _begin] { if (_begin < end) parallel_quick_sort<T>(m, x, _begin, end, threshold, depthRemaining - 1); }
                    );
                else {
                    if (begin < _end) sequential_quick_sort<T>(m, x, begin, _end, threshold);
                    if (_begin < end) sequential_quick_sort<T>(m, x, _begin, end, threshold);
                }
            }
        }
#endif
    }

    /**
    * @brief Sorts the rows of the input matrix by the given dimension
    * @param m The input matrix
    * @param x The column (dimension) to sort by
    */
    template <typename T>
    DllExport inline void sortRows(Mat &m, int x)
    {
        DGM_ASSERT(x < m.cols);
#ifdef ENABLE_PPL
        const int nCores = MAX(1, concurrency::CurrentScheduler::Get()->GetNumberOfVirtualProcessors());
        parallel_quick_sort<T>(m, x, 0, m.rows - 1, 200, static_cast<int>(log2f(float(nCores))) + 4);
#else
        sequential_quick_sort<T>(m, x, 0, m.rows - 1, 200);
#endif
    }
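
    // Usage sketch (illustration only): sorting a random float matrix by its
    // first column; assumes cv::randu() from OpenCV:
    //     Mat m(1000, 3, CV_32FC1);
    //     cv::randu(m, 0.0, 1.0);
    //     DirectGraphicalModels::parallel::sortRows<float>(m, 0);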

    namespace {
        // Lexicographic sort: sorts the rows [begin; end] by column depth, then
        // recurses into every run of equal values with the next column as the key
        template <typename T>
        inline void deepSort(Mat &m, int depth, int begin, int end)
        {
            if (depth == m.cols) return;        // we are too deep
            if (begin == end) return;           // do not sort one element

#ifdef ENABLE_PPL
            const int nCores = MAX(1, concurrency::CurrentScheduler::Get()->GetNumberOfVirtualProcessors());
            parallel_quick_sort<T>(m, depth, begin, end, 200, static_cast<int>(log2f(float(nCores))) + 4);
#else
            sequential_quick_sort<T>(m, depth, begin, end, 200);
#endif

            int ref_pos = begin;
            T   ref_val = m.at<T>(begin, depth);
            for (int y = begin + 1; y <= end; y++) {
                T val = m.at<T>(y, depth);
                if (val != ref_val) {
                    deepSort<T>(m, depth + 1, ref_pos, y - 1);
                    ref_pos = y;
                    ref_val = val;
                }
            } // y
            deepSort<T>(m, depth + 1, ref_pos, end);    // do not forget the last run of equal values
        }
    }

    /**
    * @brief Sorts the rows of the input matrix lexicographically: by the first
    * column, then within equal values by the second column, and so on
    */
    template <typename T>
    DllExport inline void sortRows(Mat &m)
    {
        deepSort<T>(m, 0, 0, m.rows - 1);
    }
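
    // Usage sketch (illustration only): lexicographic sort of a small byte-valued
    // feature matrix, e.g. to group identical feature vectors together; assumes
    // the byte typedef from "types.h" and cv::randu() from OpenCV:
    //     Mat features(1000, 4, CV_8UC1);
    //     cv::randu(features, 0, 8);
    //     DirectGraphicalModels::parallel::sortRows<byte>(features);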

    // ------------------------------------------ SHUFFLE ------------------------------------------
    // ----------------------- fast random shuffle of Mat elements with PPL ------------------------
    /**
    * @brief Randomly shuffles the rows of the input matrix
    */
    DllExport inline void shuffleRows(Mat &m)
    {
#ifdef ENABLE_PPL
        // Parallel version: the rows are split into blocks of step rows and every
        // block is Fisher-Yates-shuffled independently (rows do not cross blocks)
        // int nCores = MAX(1, std::thread::hardware_concurrency());
        int nCores = MAX(1, concurrency::CurrentScheduler::Get()->GetNumberOfVirtualProcessors());
        int step   = MAX(2, m.rows / (nCores * 10));
        concurrency::parallel_for(0, m.rows, step, [step, &m](int S) {
            Mat tmp;
            int last = MIN(S + step, m.rows);
            for (int s = last - 1; s > S; s--) {                            // s = [last - 1; S + 1]
                dword r = DirectGraphicalModels::random::u<dword>(S, s);    // r = [S; s]
                if (r != s) Swap(lvalue_cast(m.row(s)), lvalue_cast(m.row(r)), tmp);
            }
        });
#else
        // Sequential version: the classic Fisher-Yates shuffle over all rows
        Mat tmp;
        for (int s = m.rows - 1; s > 0; s--) {      // s = [n-1; 1]
            int r = random::u<int>(0, s);           // r = [0; s]
            if (r != s) Swap(lvalue_cast(m.row(s)), lvalue_cast(m.row(r)), tmp);
        }
#endif
    }
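
    // Usage sketch (illustration only): randomly permuting the rows of a matrix
    // of training samples, e.g. before splitting it into batches:
    //     Mat samples(10000, 16, CV_32FC1);
    //     cv::randu(samples, 0.0, 1.0);
    //     DirectGraphicalModels::parallel::shuffleRows(samples);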

} } // namespace DirectGraphicalModels::parallel