MorphoGraphX  2.0-1-227
DistMatrixCuda.hpp
Go to the documentation of this file.
1 //
2 // This file is part of MorphoGraphX - http://www.MorphoGraphX.org
3 // Copyright (C) 2012-2016 Richard S. Smith and collaborators.
4 //
5 // If you use MorphoGraphX in your work, please cite:
6 // http://dx.doi.org/10.7554/eLife.05864
7 //
8 // MorphoGraphX is free software, and is licensed under under the terms of the
9 // GNU General (GPL) Public License version 2.0, http://www.gnu.org/licenses.
10 //
11 #ifndef DIST_MATRIX_CUDA_HPP
12 #define DIST_MATRIX_CUDA_HPP
13 
14 #include <Cuda.hpp>
15 #include <ThrustTypes.hpp>
16 #include <CachedAlloc.hpp>
17 
18 #include <thrust/inner_product.h>
19 #include <thrust/extrema.h>
20 
21 namespace mgx
22 {
// Scalar multiply functor: scales one vector (or matrix) element by a constant.
template<typename T>
struct multVSOP
{
  // Scale factor applied to every element
  const double a;

  multVSOP(double _a)
    : a(_a)
  {
  }

  // Returns x multiplied by the stored factor
  __host__ __device__
  T operator()(const T &x) const
  {
    T scaled = a * x;
    return scaled;
  }
};
37 
// Matrix-vector multiply functor: computes one entry of r per vertex.
// The pointers are obtained through devP() from the device vectors handed to
// the constructor (presumably raw device pointers -- confirm against Cuda.hpp).
template<typename MatT, typename VecT>
struct multMVOP
{
  uint *Nb;        // neighbor index table, nbs consecutive slots per vertex
  MatT *Mv, *Me;   // Mv: one entry per vertex, Me: one entry per neighbor slot
  VecT *v;         // input vector
  VecT *r;         // result vector
  uint nbs;        // number of neighbor slots per vertex

  // Types of device vectors
  typedef typename thrust::device_vector<MatT> DevMatV;
  typedef typename thrust::device_vector<VecT> DevVecV;

  multMVOP(DeviceVu *_Nb, DevMatV *_Mv, DevMatV *_Me, DevVecV *_v, DevVecV *_r, uint _nbs)
    : Nb(devP(_Nb)), Mv(devP(_Mv)), Me(devP(_Me)), v(devP(_v)), r(devP(_r)), nbs(_nbs) {}

  // Computes r[vtx] = Mv[vtx] * v[vtx] + sum over slots of Me[slot] * v[Nb[slot]]
  __host__ __device__
  void operator()(const uint vtx) const
  {
    // First neighbor slot belonging to this vertex
    uint edg = vtx * nbs;
    VecT sum = Mv[vtx] * v[vtx];

    // Sum over neighbors; slots whose index equals vtx itself are skipped
    for(uint i = 0; i < nbs; i++, edg++)
      if(Nb[edg] != vtx)
        sum += Me[edg] * v[Nb[edg]];

    r[vtx] = sum;
  }
};
69 
// Add-to-diagonal functor: adds a scalar to every diagonal entry of one matrix
// element of the device vector.
template <typename T>
struct addToDiagOP
{
  T *Mv;      // matrix entries, obtained through devP()
  double a;   // value added to each diagonal entry
  int sz;     // not read anywhere in this struct; kept for compatibility

  typedef typename thrust::device_vector<T> DevMatV;

  // sz is zero-initialized so that copying the functor never reads an
  // indeterminate value.
  addToDiagOP(DevMatV *_Mv, double _a)
    : Mv(devP(_Mv)), a(_a), sz(0) {}

  // Adds a to m[i][i] for i in [0, T::numrows) of the matrix stored at Mv[vtx]
  __host__ __device__
  void operator()(const uint vtx) const
  {
    T &m = Mv[vtx];

    for(int i = 0; i < T::numrows; ++i)
      m[i][i] += a;
  }
};
92 
// Jacobi pre-conditioner functor: stores the inverse of Av[vtx] in Mv[vtx].
template<typename T>
struct jacobiPreCondOP
{
  uint *Nb;      // neighbor index table (stored, not read by operator())
  T *Mv, *Me;    // output pre-conditioner entries; only Mv is written
  T *Av, *Ae;    // input matrix entries; only Av is read
  uint nbs;      // neighbor slots per vertex (stored, not read by operator())

  jacobiPreCondOP(DeviceVu *_Nb, thrust::device_vector<T> *_Mv, thrust::device_vector<T> *_Me,
                  thrust::device_vector<T> *_Av, thrust::device_vector<T> *_Ae, uint _nbs)
    : Nb(devP(_Nb)), Mv(devP(_Mv)), Me(devP(_Me)), Av(devP(_Av)), Ae(devP(_Ae)), nbs(_nbs) {}

  // Mv[vtx] = inverse(Av[vtx])
  __host__ __device__
  void operator()(const uint vtx) const
  {
    T &m = Mv[vtx];
    const T &a = Av[vtx];
    m = inverse(a);
  }
};
114 
// Saxpy functor: combines two elements as a * x + y.
template<typename T>
struct saxpyOP
{
  // Scalar multiplier applied to the first operand
  const double a;

  saxpyOP(double _a)
    : a(_a)
  {
  }

  // Returns a * x + y
  __host__ __device__
  T operator()(const T &x, const T &y) const
  {
    T result = a * x + y;
    return result;
  }
};
128 
// Sadly numeric_limits complains about calling host functions, so this is a
// small host/device replacement, specialized for float and double only.
// Note: min() returns FLT_MIN / DBL_MIN, i.e. the smallest positive
// normalized value, not the most negative one.
template<typename T>
struct limits
{
  __device__ __host__
  static T min();
  __device__ __host__
  static T max();
};
// The specializations live in a header, so they are inline to avoid multiple
// definitions across translation units, and they repeat the execution space
// qualifiers so that they match the declarations above.
template<> __device__ __host__ inline double limits<double>::min() { return DBL_MIN; }
template<> __device__ __host__ inline float limits<float>::min() { return FLT_MIN; }
template<> __device__ __host__ inline double limits<double>::max() { return DBL_MAX; }
template<> __device__ __host__ inline float limits<float>::max() { return FLT_MAX; }
142 
// Returns the smallest scalar component of one vector/matrix element.
// The element is scanned as sizeof(T)/sizeof(ValType) consecutive scalars
// starting at a.data().
template<typename T>
struct minElementOP : public thrust::unary_function<T, typename T::value_type>
{
  typedef typename T::value_type ValType;

  __host__ __device__
  ValType operator()(T &a)
  {
    // Start from the largest representable value so any component replaces it.
    // NOTE(review): this initialization was reconstructed -- confirm against
    // the upstream source.
    ValType min = limits<ValType>::max();
    uint count = sizeof(T)/sizeof(ValType);
    ValType *p = a.data();
    for(uint i = 0; i < count; i++) {
      if(min > *p)
        min = *p;
      ++p;
    }

    return min;
  }
};
163 
// Returns the largest scalar component of one vector/matrix element.
// The element is scanned as sizeof(T)/sizeof(ValType) consecutive scalars
// starting at a.data().
template<typename T>
struct maxElementOP : public thrust::unary_function<T, typename T::value_type>
{
  typedef typename T::value_type ValType;

  __host__ __device__
  ValType operator()(T &a)
  {
    // Start from the most negative finite value. limits<ValType>::min() is
    // FLT_MIN / DBL_MIN (a small positive number) and would give a wrong
    // result for elements whose components are all negative.
    // NOTE(review): this initialization was reconstructed -- confirm against
    // the upstream source.
    ValType max = -limits<ValType>::max();
    uint count = sizeof(T)/sizeof(ValType);
    ValType *p = a.data();
    for(uint i = 0; i < count; i++) {
      if(max < *p)
        max = *p;
      ++p;
    }

    return max;
  }
};
184 
// Component-wise multiply functor: result[k] = x[k] * y[k] for the
// T::numElems components of an element.
template<typename T>
struct multCompOP: public thrust::binary_function<T, T, T>
{
  __host__ __device__
  T operator()(const T &x,const T &y)
  {
    T prod;
    uint k = 0;
    while(k < T::numElems) {
      prod[k] = x[k] * y[k];
      ++k;
    }

    return prod;
  }
};
200 
// Addition functor: returns x + y using T's own operator+.
template<typename T>
struct addOP : public thrust::binary_function<T, T, T>
{
  __host__ __device__
  T operator()(const T &x,const T &y)
  {
    return x + y;
  }
};
213 
214  //
215  // Distributed matrix GPU calls
216  //
// Vector (or matrix)-scalar multiply: writes a * v[i] into r[i].
// Returns 1 if either pointer is null, 0 otherwise.
template <typename T>
int multGPU(T *v, typename T::value_type::value_type a, T *r)
{
  typedef typename T::value_type ElemT;

  if(v == 0 || r == 0)
    return 1;

  multVSOP<ElemT> scale(a);
  thrust::transform(v->begin(), v->end(), r->begin(), scale);

  return 0;
}
227 
// Matrix or vector inner product.
// Multiplies v1 and v2 component-wise, sums the products per component across
// all elements, then adds the components together into the scalar r.
// Returns 1 if either input pointer is null, 0 otherwise.
template<typename T>
int multGPU(T *v1, T *v2, typename T::value_type::value_type &r)
{
  if(!v1 or !v2)
    return 1;

  typedef typename T::value_type ElemT;

  // Zero element used as the starting value of the reduction
  ElemT init;
  for(uint i = 0; i < ElemT::numElems; ++i)
    init[i] = 0;

// Breaks in Cuda >7
// NOTE(review): the trailing arguments of both inner_product calls were
// reconstructed (second range, init, addOP, multCompOP) -- confirm against
// the upstream source.
#ifdef THRUST_BACKEND_CUDA
  ElemT res = thrust::inner_product(thrust::cuda::par(mgx::cachedAlloc), v1->begin(), v1->end(),
                                    v2->begin(), init, addOP<ElemT>(), multCompOP<ElemT>());
#else
  ElemT res = thrust::inner_product(v1->begin(), v1->end(),
                                    v2->begin(), init, addOP<ElemT>(), multCompOP<ElemT>());
#endif

  // Collapse the per-component sums into a single scalar
  r = 0;
  for(uint i = 0; i < ElemT::numElems; ++i)
    r += res[i];

  return(0);
}
253 
// Matrix-vector multiply: runs multMVOP once per vertex.
// nb is the neighbor index table, mv/me the per-vertex and per-neighbor-slot
// matrix entries, v the input vector and r the result.
// Returns 1 on a null pointer or when the neighbor layout cannot be derived
// from the sizes, 0 otherwise (including the empty case).
template <typename TM, typename TV>
int multGPU(DeviceVu *nb, TM *mv, TM *me, TV *v, TV *r)
{
  if(!nb or !r or !mv or !me or !v)
    return 1;

  // Nothing to multiply; also avoids dividing by zero below
  if(mv->size() == 0)
    return 0;

  // Neighbor slots per vertex, derived from the two matrix array sizes
  uint nbs = me->size()/mv->size();
  if(nbs == 0)
    return 1;   // me is smaller than mv: the vertex count cannot be derived

  // Number of vertices covered by the neighbor table
  uint count = nb->size()/nbs;

  thrust::counting_iterator<int, thrust::device_system_tag> first(0);
  thrust::counting_iterator<int, thrust::device_system_tag> last(count);
  thrust::for_each(first, last, multMVOP<typename TM::value_type, typename TV::value_type>(nb, mv, me, v, r, nbs));

  return(0);
}
271 
// Add the scalar a to the diagonal of every matrix stored in r.
// Returns 1 if r is null, 0 otherwise.
template<typename T>
int addToDiagGPU(typename T::value_type::value_type a, T *r)
{
  typedef thrust::counting_iterator<int, thrust::device_system_tag> IndexIter;

  if(r == 0)
    return 1;

  // One functor invocation per element index in [0, r->size())
  IndexIter first(0);
  IndexIter last(r->size());
  addToDiagOP<typename T::value_type> op(r, a);
  thrust::for_each(first, last, op);

  return 0;
}
285 
// Matrix or vector addition: r[i] = v1[i] + v2[i].
// Returns 1 if any pointer is null, 0 otherwise.
template<typename T>
int addGPU(T *v1, T *v2, T *r)
{
  if(v1 == 0 || v2 == 0 || r == 0)
    return 1;

  thrust::plus<typename T::value_type> sum;
  thrust::transform(v1->begin(), v1->end(), v2->begin(), r->begin(), sum);

  return 0;
}
297 
// Matrix or vector subtraction: r[i] = v1[i] - v2[i].
// Returns 1 if any pointer is null, 0 otherwise.
template<typename T>
int subtractGPU(T *v1, T *v2, T *r)
{
  if(v1 == 0 || v2 == 0 || r == 0)
    return 1;

  thrust::minus<typename T::value_type> difference;
  thrust::transform(v1->begin(), v1->end(), v2->begin(), r->begin(), difference);

  return 0;
}
309 
// Perform saxpy r = a * v1 + v2
// Returns 1 if any pointer (including the output r) is null, 0 otherwise.
template<typename T>
int saxpyGPU(T *v1, T *v2, typename T::value_type::value_type a, T *r)
{
  // r is dereferenced below, so it is checked like the inputs
  if(!r or !v1 or !v2)
    return 1;

  thrust::transform(v1->begin(), v1->end(), v2->begin(), r->begin(), saxpyOP<typename T::value_type>(a));

  return(0);
}
321 
// Fill matrix or vector with a scalar: every element of r is set to an
// element constructed from a.
// Returns 1 if r is null, 0 otherwise.
template<typename T>
int fillGPU(typename T::value_type::value_type a, T *r)
{
  if(r == 0)
    return 1;

  typename T::value_type value(a);
  thrust::fill(r->begin(), r->end(), value);

  return 0;
}
335 
// Find min value: the smallest scalar component over all elements of v is
// written to r.
// Returns 1 if v is null or empty (r is left untouched), 0 otherwise.
template<typename T>
int minGPU(T *v, typename T::value_type::value_type &r)
{
  if(!v)
    return 1;

  // min_element on an empty range returns end(), which must not be dereferenced
  if(v->size() == 0)
    return 1;

  // Reduce every element to its smallest component first
  thrust::device_vector<typename T::value_type::value_type> s(v->size());
  thrust::transform(v->begin(), v->end(), s.begin(), minElementOP<typename T::value_type>());

#ifdef THRUST_BACKEND_CUDA
  r = *thrust::min_element(thrust::cuda::par(mgx::cachedAlloc), s.begin(), s.end());
#else
  r = *thrust::min_element(s.begin(), s.end());
#endif

  return(0);
}
354 
// Find max value: the largest scalar component over all elements of v is
// written to r.
// Returns 1 if v is null or empty (r is left untouched), 0 otherwise.
template<typename T>
int maxGPU(T *v, typename T::value_type::value_type &r)
{
  if(!v)
    return 1;

  // max_element on an empty range returns end(), which must not be dereferenced
  if(v->size() == 0)
    return 1;

  // Reduce every element to its largest component first
  thrust::device_vector<typename T::value_type::value_type> s(v->size());
  thrust::transform(v->begin(), v->end(), s.begin(), maxElementOP<typename T::value_type>());

#ifdef THRUST_BACKEND_CUDA
  r = *thrust::max_element(thrust::cuda::par(mgx::cachedAlloc), s.begin(), s.end());
#else
  r = *thrust::max_element(s.begin(), s.end());
#endif

  return(0);
}
373 
// Jacobi preconditioner: runs jacobiPreCondOP once per vertex, which stores
// the inverse of each av entry in mv.
// Returns 1 on a null pointer or when the neighbor layout cannot be derived
// from the sizes, 0 otherwise (including the empty case).
template<typename T>
int jacobiPreCondGPU(DeviceVu *nb, T *mv, T *me, T *av, T *ae)
{
  if(!nb or !mv or !me or !av or !ae)
    return 1;

  // Nothing to precondition; also avoids dividing by zero below
  if(mv->size() == 0)
    return 0;

  // Neighbor slots per vertex, derived from the two matrix array sizes
  uint nbs = me->size()/mv->size();
  if(nbs == 0)
    return 1;   // me is smaller than mv: the vertex count cannot be derived

  // Number of vertices covered by the neighbor table
  uint count = nb->size()/nbs;

  thrust::counting_iterator<int, thrust::device_system_tag> first(0);
  thrust::counting_iterator<int, thrust::device_system_tag> last(count);
  thrust::for_each(first, last, jacobiPreCondOP<typename T::value_type>(nb, mv, me, av, ae, nbs));

  return(0);
}
391 }
392 #endif
mgx::addOP
Definition: DistMatrixCuda.hpp:202
mgx::jacobiPreCondOP::Mv
T * Mv
Definition: DistMatrixCuda.hpp:98
mgx::uint
unsigned int uint
Definition: Geometry.hpp:41
mgx::DeviceVu
thrust::device_vector< uint > DeviceVu
Definition: ThrustTypes.hpp:58
mgx::multMVOP::DevMatV
thrust::device_vector< MatT > DevMatV
Definition: DistMatrixCuda.hpp:49
mgx::jacobiPreCondOP
Definition: DistMatrixCuda.hpp:95
mgx::minElementOP::operator()
__host__ __device__ ValType operator()(T &a)
Definition: DistMatrixCuda.hpp:149
mgx::addOP::operator()
__host__ __device__ T operator()(const T &x, const T &y)
Definition: DistMatrixCuda.hpp:206
mgx::addToDiagOP::addToDiagOP
addToDiagOP(DevMatV *_Mv, double _a)
Definition: DistMatrixCuda.hpp:80
mgx::saxpyOP
Definition: DistMatrixCuda.hpp:117
mgx::multVSOP::multVSOP
multVSOP(double _a)
Definition: DistMatrixCuda.hpp:29
mgx::maxGPU
int maxGPU(T *v, typename T::value_type::value_type &r)
Definition: DistMatrixCuda.hpp:357
mgx::multMVOP
Definition: DistMatrixCuda.hpp:40
mgx::multVSOP
Definition: DistMatrixCuda.hpp:25
mgx::addToDiagOP
Definition: DistMatrixCuda.hpp:72
mgx::jacobiPreCondOP::Av
T * Av
Definition: DistMatrixCuda.hpp:99
mgx::limits::max
__device__ __host__ static T max()
mgx::addToDiagOP::DevMatV
thrust::device_vector< T > DevMatV
Definition: DistMatrixCuda.hpp:78
mgx::addToDiagGPU
int addToDiagGPU(typename T::value_type::value_type a, T *r)
Definition: DistMatrixCuda.hpp:274
n
#define n
Definition: Eigenvalues.hpp:36
mgx::maxElementOP::operator()
__host__ __device__ ValType operator()(T &a)
Definition: DistMatrixCuda.hpp:170
mgx::multVSOP::a
const double a
Definition: DistMatrixCuda.hpp:27
mgx::saxpyGPU
int saxpyGPU(T *v1, T *v2, typename T::value_type::value_type a, T *r)
Definition: DistMatrixCuda.hpp:312
mgx::jacobiPreCondOP::nbs
uint nbs
Definition: DistMatrixCuda.hpp:100
mgx::saxpyOP::a
const double a
Definition: DistMatrixCuda.hpp:119
mgx::multCompOP::operator()
__host__ __device__ T operator()(const T &x, const T &y)
Definition: DistMatrixCuda.hpp:191
mgx::minElementOP
Definition: DistMatrixCuda.hpp:144
mgx::multVSOP::operator()
__host__ __device__ T operator()(const T &x) const
Definition: DistMatrixCuda.hpp:32
mgx::multMVOP::operator()
__host__ __device__ void operator()(const uint vtx) const
Definition: DistMatrixCuda.hpp:56
mgx::inverse
CU_HOST_DEVICE Matrix< 1, 1, T > inverse(const Matrix< 1, 1, T > &mat)
Definition: Matrix.hpp:1001
ThrustTypes.hpp
mgx::multMVOP::Nb
uint * Nb
Definition: DistMatrixCuda.hpp:42
mgx::Information::init
mgx_EXPORT void init(QMainWindow *wnd)
mgx
Distributed matrix library.
Definition: Assert.hpp:26
mgx::multMVOP::multMVOP
multMVOP(DeviceVu *_Nb, DevMatV *_Mv, DevMatV *_Me, DevVecV *_v, DevVecV *_r, uint _nbs)
Definition: DistMatrixCuda.hpp:52
mgx::minElementOP::ValType
T::value_type ValType
Definition: DistMatrixCuda.hpp:146
mgx::saxpyOP::operator()
__host__ __device__ T operator()(const T &x, const T &y) const
Definition: DistMatrixCuda.hpp:123
mgx::multMVOP::r
VecT * r
Definition: DistMatrixCuda.hpp:45
mgx::maxElementOP::ValType
T::value_type ValType
Definition: DistMatrixCuda.hpp:167
Cuda.hpp
mgx::minGPU
int minGPU(T *v, typename T::value_type::value_type &r)
Definition: DistMatrixCuda.hpp:338
mgx::devP
T * devP(thrust::device_vector< T > &DVec)
Definition: Cuda.hpp:81
mgx::max
T CU_HOST_DEVICE max(const T a, const T b)
Definition: Util.hpp:34
mgx::multMVOP::Mv
MatT * Mv
Definition: DistMatrixCuda.hpp:43
mgx::multMVOP::DevVecV
thrust::device_vector< VecT > DevVecV
Definition: DistMatrixCuda.hpp:50
mgx::limits
Definition: DistMatrixCuda.hpp:131
mgx::jacobiPreCondOP::operator()
__host__ __device__ void operator()(const uint vtx) const
Definition: DistMatrixCuda.hpp:107
mgx::limits::min
__device__ __host__ static T min()
mgx::multGPU
int multGPU(T *v, typename T::value_type::value_type a, T *r)
Definition: DistMatrixCuda.hpp:218
mgx::jacobiPreCondGPU
int jacobiPreCondGPU(DeviceVu *nb, T *mv, T *me, T *av, T *ae)
Definition: DistMatrixCuda.hpp:376
mgx::addToDiagOP::a
double a
Definition: DistMatrixCuda.hpp:75
mgx::fillGPU
int fillGPU(typename T::value_type::value_type a, T *r)
Definition: DistMatrixCuda.hpp:324
mgx::addToDiagOP::Mv
T * Mv
Definition: DistMatrixCuda.hpp:74
mgx::addToDiagOP::sz
int sz
Definition: DistMatrixCuda.hpp:76
mgx::jacobiPreCondOP::Me
T * Me
Definition: DistMatrixCuda.hpp:98
mgx::maxElementOP
Definition: DistMatrixCuda.hpp:165
mgx::min
CU_HOST_DEVICE T min(const T a, const T b)
Definition: Util.hpp:26
mgx::multCompOP
Definition: DistMatrixCuda.hpp:187
mgx::multMVOP::nbs
uint nbs
Definition: DistMatrixCuda.hpp:46
mgx::jacobiPreCondOP::jacobiPreCondOP
jacobiPreCondOP(DeviceVu *_Nb, thrust::device_vector< T > *_Mv, thrust::device_vector< T > *_Me, thrust::device_vector< T > *_Av, thrust::device_vector< T > *_Ae, uint _nbs)
Definition: DistMatrixCuda.hpp:102
mgx::jacobiPreCondOP::Nb
uint * Nb
Definition: DistMatrixCuda.hpp:97
mgx::addGPU
int addGPU(T *v1, T *v2, T *r)
Definition: DistMatrixCuda.hpp:288
mgx::multMVOP::Me
MatT * Me
Definition: DistMatrixCuda.hpp:43
mgx::saxpyOP::saxpyOP
saxpyOP(double _a)
Definition: DistMatrixCuda.hpp:120
mgx::addToDiagOP::operator()
__host__ __device__ void operator()(const uint vtx) const
Definition: DistMatrixCuda.hpp:84
mgx::jacobiPreCondOP::Ae
T * Ae
Definition: DistMatrixCuda.hpp:99
mgx::multMVOP::v
VecT * v
Definition: DistMatrixCuda.hpp:44
CachedAlloc.hpp
mgx::subtractGPU
int subtractGPU(T *v1, T *v2, T *r)
Definition: DistMatrixCuda.hpp:300