MorphoGraphX  2.0-1-227
ImageCuda.hpp
Go to the documentation of this file.
1 //
2 // This file is part of MorphoGraphX - http://www.MorphoGraphX.org
3 // Copyright (C) 2012-2016 Richard S. Smith and collaborators.
4 //
5 // If you use MorphoGraphX in your work, please cite:
6 // http://dx.doi.org/10.7554/eLife.05864
7 //
8 // MorphoGraphX is free software, and is licensed under under the terms of the
9 // GNU General (GPL) Public License version 2.0, http://www.gnu.org/licenses.
10 //
11 #ifndef IMAGE_CUDA_HPP
12 #define IMAGE_CUDA_HPP
13 
14 #include <Cuda.hpp>
15 
16 namespace mgx
17 {
// Build a 1D Gaussian mask sampled at spacing `step` with standard
// deviation `sigma`; the mask is normalized so its entries sum to 1.
// On success `mask` holds 2*radius+1 weights and `radius` is the
// half-width (3 sigma, in samples).  Returns false when the radius
// rounds to zero or the weights sum to zero (degenerate sigma/step).
inline bool getGaussianMask1D(const float step, const float sigma, HostVf& mask, int& radius)
{
  // Half-width of the mask: 3 sigma expressed in samples, rounded.
  float sigmaSqr = sigma * sigma;
  float rad = .5f + 3.0f * sigma / step;
  radius = (rad <= 0 ? 0 : int(rad));
  if(radius <= 0)
    return false;

  float normalize = 0;
  mask.resize(2 * radius + 1);
  int j = 0;
  for(int u = -radius; u <= radius; ++u) {
    float x = u * step;
    // Unnormalized Gaussian.  Any constant amplitude cancels in the
    // normalization below; the original 1/(2*pi*sigma) factor (which was
    // not the correct PDF amplitude anyway) was redundant and is omitted.
    float val = float(exp(-(x * x) / (2.0 * sigmaSqr)));
    mask[j++] = val;
    normalize += val;
  }
  if(normalize <= 0)
    return false;
  // Normalize so the weights sum to 1.
  for(size_t i = 0; i < mask.size(); ++i)
    mask[i] /= normalize;

  return true;
}
45 
// Build a binary mask for an ellipsoid neighborhood with the given radii.
// The mask has dimensions (2*radius+1) per axis; entries inside the
// ellipsoid are 1, outside are 0.  A zero radius on an axis is handled by
// treating that axis as contributing no distance (the original division
// by radius produced NaN at the center plane in that case).
inline bool getRoundMask3D(const Point3i &radius, HostVf &mask)
{
  Point3i size = radius * 2 + Point3i(1,1,1);
  mask.resize(size.x() * size.y() * size.z());

  // Setup mask (we could antialias here)
  for(int z = 0; z < size.z(); ++z)
    for(int y = 0; y < size.y(); ++y)
      for(int x = 0; x < size.x(); ++x) {
        // Normalized distance from the center per axis; a zero radius
        // pins that axis's contribution to 0 instead of dividing by 0.
        float xdist = radius.x() ? float(x - radius.x())/float(radius.x()) : 0.0f;
        float ydist = radius.y() ? float(y - radius.y())/float(radius.y()) : 0.0f;
        float zdist = radius.z() ? float(z - radius.z())/float(radius.z()) : 0.0f;
        int offset = getOffset(x, y, z, size);
        // Squared-distance comparison is exactly equivalent to
        // sqrt(...) <= 1 for non-negative operands, without the sqrt.
        if(xdist*xdist + ydist*ydist + zdist*zdist <= 1.0f)
          mask[offset] = 1;
        else
          mask[offset] = 0;
      }
  return true;
}
67 
// Normalize a mask in place so its entries sum to 1.  Empty masks and
// masks whose sum is not positive are left unchanged.
inline void normalizeMaskScale(HostVf& mask)
{
  if(mask.empty())
    return;

  // Total weight of all entries.
  float total = 0;
  for(size_t i = 0; i < mask.size(); ++i)
    total += mask[i];

  if(total <= 0)
    return;

  for(size_t i = 0; i < mask.size(); ++i)
    mask[i] /= total;
}
82 
83 // // Go from image coordinates to world coordinates (centers of voxels)
84 // __device__ __host__ inline
85 // Point3f imageToWorld(const Point3i& img, const Point3f& step, const Point3f& shift)
86 // {
87 // return multiply(Point3f(img) + Point3f(.5f, .5f, .5f), step) + shift;
88 // }
89 //
90 // // Go from world coordinates to image coordinates
91 // __device__ __host__ inline
92 // Point3i worldToImage(const Point3f& wrld, const Point3f& step, const Point3f& shift)
93 // {
94 // return Point3i((wrld - shift) / step - Point3f(.5f, .5f, .5f));
95 // }
96 //
97 // // Compute offset in stack
98 // __device__ __host__ inline size_t offset(uint x, uint y, uint z, uint xsz, uint ysz)
99 // {
100 // return ((size_t(z) * ysz + y) * xsz + x);
101 // }
102 //
103 // __device__ __host__ inline
104 // size_t getOffset(const int x, const int y, const int z, const Point3i& size)
105 // {
106 // return ((size_t(z) * size.y() + y) * size.x() + x);
107 // }
108 //
109 // __device__ __host__ inline size_t getOffset(const Point3i p, const Point3i& size)
110 // {
111 // return ((size_t(p.z()) * size.y() + p.y()) * size.x() + p.x());
112 // }
// Split a flat XY-plane thread index into (x, y) coordinates.
__device__ __host__ inline void getXY(uint idx, const Point3i& size, int& x, int& y)
{
  const int w = size.x();
  x = idx % w;
  y = idx / w;
}
118 
// Split a flat XZ-plane thread index into (x, z) coordinates.
__device__ __host__ inline void getXZ(uint idx, const Point3i& size, int& x, int& z)
{
  const int w = size.x();
  x = idx % w;
  z = idx / w;
}
124 
// Split a flat YZ-plane thread index into (y, z) coordinates.
__device__ __host__ inline void getYZ(uint idx, const Point3i& size, int& y, int& z)
{
  const int h = size.y();
  y = idx % h;
  z = idx / h;
}
130 
// Split a flat volume thread index into (x, y, z) coordinates.
__device__ __host__ inline void getXYZ(uint idx, const Point3i& size, int& x, int& y, int& z)
{
  const uint w = size.x(), h = size.y();
  x = idx % w;
  y = (idx / w) % h;
  z = idx / (w * h);   // == (idx / w) / h for unsigned arithmetic
}
137 
// Element stride for stepping one unit along the x, y and z axes of an
// image stored in x-fastest order.
__device__ __host__ inline Point3i getStride(const Point3i& size)
{
  const int sx = size.x();
  return Point3i(1, sx, sx * size.y());
}
143 
// Test a world-space point against up to three pairs of clipping planes.
// clipDo[i] enables clip test i; pn holds two Point4f plane equations per
// enabled test.  Returns true when the point should be clipped.
// NOTE(review): assumes Point4f operator* is a dot product, so a negative
// result means the point is on the outside of the plane — confirm.
__device__ __host__ inline
bool clipPoint(const Point3f& wp, const Point3u& clipDo, const Point4f* pn)
{
  bool clip = false;
  // Homogeneous coordinates so each plane test is a single product.
  Point4f p4 = Point4f(wp.x(), wp.y(), wp.z(), 1.0);
  for(int i = 0; i < 3; i++)
    if(clipDo[i]) {
      // Test if inside clipping planes
      for(int k = 0; k < 2; k++)
        if(pn[i * 2 + k] * p4 < 0)
          clip = true;
    }
  return (clip);
}
159 
160  // --- Cuda kernels ---
161 
// Average filter kernel: accumulates neighborhood samples and returns
// their integer mean.
struct AverageKernel   // restored: declaration line dropped by doc extraction
{
  int acc;    // running sum of samples
  int count;  // number of samples seen

  __device__ __host__ void init() { acc = count = 0; }

  // Mask position (second parameter) is ignored for a plain average.
  __device__ __host__ void data(int val, int) { acc += val; count++; }

  // Integer mean; assumes data() was called at least once (count > 0).
  __device__ __host__ ushort result() { return ushort(acc / count); }
};
174 
// Median filter kernel: collects neighborhood samples and returns their
// median via quickselect.  The neighborhood may not exceed MaxMedianMask
// (7x7x7) samples.
struct MedianKernel   // restored: declaration line dropped by doc extraction
{
  static const int MaxMedianMask = 7*7*7;
  int array[MaxMedianMask];  // restored: member dropped by doc extraction
  int count;                 // number of samples collected

  __device__ __host__ void init() { count = 0; }

  // Mask position (second parameter) is ignored; samples are gathered as-is.
  __device__ __host__ void data(int val, int) { array[count++] = val; }

  // quickselect: return the element with the given 0-based rank as if the
  // array were sorted.  Partially reorders `array` in place.
  __device__ __host__ ushort select(ushort rank)
  {
    ushort low = 0, high = count - 1;
    int tmpVal;
#define SWAP(x,y) { tmpVal = (x); (x) = (y); (y) = tmpVal; }
    while(true)
    {
      if(high <= low) return array[rank]; // catches rank == pivot (below)
      if(high == low + 1)
      {
        if(array[low] > array[high]) SWAP(array[low],array[high]);
        return array[rank];
      }

      // Middle is halfway between low and high
      // Sort the three elements in low, middle, and high
      // Put the medium one out of the way in low and the lowest in low+1
      ushort middle = (low + high) / 2;
      if(array[middle] > array[high]) SWAP(array[middle],array[high]);
      if(array[low] > array[high]) SWAP(array[low],array[high]);
      if(array[middle] > array[low]) SWAP(array[low],array[middle]);
      SWAP(array[middle],array[low+1]);

      // Pivot around median element (currently in low)
      ushort ll = low+1, hh = high;
      while(true)
      {
        do ll++; while (array[low] > array[ll]);
        do hh--; while (array[hh] > array[low]);
        if(hh < ll) break;
        SWAP(array[ll],array[hh]);
      }

      // Swap median back to the correct position
      SWAP(array[low],array[hh]);

      // Shift low and high as needed
      if(rank <= hh) high = hh-1;
      if(rank >= hh) low = ll;
    }
#undef SWAP
  }

  __device__ __host__ ushort result() { return select(count / 2); }
};
232 
// Dilate kernel: returns the maximum value found in the neighborhood.
struct DilateKernel   // restored: declaration line dropped by doc extraction
{
  int acc;  // running maximum (starts at 0; values are assumed non-negative)

  __device__ __host__ void init() { acc = 0; }

  // Mask position (second parameter) is ignored.
  __device__ __host__ void data(int val, int)
  {
    if(val > acc)
      acc = val;
  }

  __device__ __host__ ushort result() { return ushort(acc); }
};
248 
// Dilate with a generic mask: maximum over positions where mask > 0.
struct DilateMaskKernel   // restored: declaration line dropped by doc extraction
{
  float acc;          // running maximum
  const float* mask;  // device pointer to the mask weights

  DilateMaskKernel(const float* _mask) : mask(_mask) {}

  __device__ __host__ void init() { acc = 0; }

  // Only samples whose mask entry is positive participate.
  __device__ __host__ void data(int val, int pos)
  {
    if(mask[pos] > 0 and val > acc)
      acc = val;
  }

  __device__ __host__ ushort result() { return ushort(acc); }
};
267 
// Erode labels: a voxel keeps its value only when the whole neighborhood
// carries that same value; any disagreement erodes it to 0 (background).
struct ErodeLabelsKernel   // restored: declaration line dropped by doc extraction
{
  int acc;  // -1 = no sample yet, otherwise the candidate label (0 once labels differ)

  __device__ __host__ void init() { acc = -1; }

  // Mask position (second parameter) is ignored.
  __device__ __host__ void data(int val, int)
  {
    if(acc < 0)
      acc = val;   // first sample fixes the candidate label
    else if(val != acc)
      acc = 0;     // mixed labels -> erode to background
  }

  __device__ __host__ ushort result() { return ushort(acc); }
};
285 
// Erode kernel: returns the minimum value found in the neighborhood.
struct ErodeKernel
{
  int acc;  // running minimum

  // Start one above the maximum 16-bit value so any sample replaces it.
  __device__ __host__ void init() { acc = 65536; }

  // Mask position (second parameter) is ignored.
  __device__ __host__ void data(int val, int)
  {
    acc = (val < acc) ? val : acc;
  }

  __device__ __host__ ushort result() { return ushort(acc); }
};
301 
// Erode with a generic mask: minimum over positions where mask > 0.
struct ErodeMaskKernel   // restored: declaration line dropped by doc extraction
{
  float acc;          // running minimum
  const float* mask;  // device pointer to the mask weights

  ErodeMaskKernel(const float* _mask) : mask(_mask) {}

  // Start one above the maximum 16-bit value so any sample replaces it.
  __device__ __host__ void init() { acc = 65536; }

  // Only samples whose mask entry is positive participate.
  __device__ __host__ void data(int val, int pos)
  {
    if(mask[pos] > 0 and val < acc)
      acc = val;
  }

  __device__ __host__ ushort result() { return ushort(acc); }
};
320 
// Weighted-sum kernel with a generic mask (e.g. Gaussian smoothing over a
// spherical neighborhood).  The original "Erode with mask" comment was
// misleading: this accumulates val * mask[pos], it does not erode.
template <typename T>
struct MaskKernel
{
  float acc;          // weighted accumulator
  const float* mask;  // device pointer to the mask weights

  MaskKernel(const float* _mask) : mask(_mask) {}

  __device__ __host__ void init() { acc = 0; }

  __device__ __host__ void data(T val, int pos) { acc += val * mask[pos]; }

  // Clamp the accumulator into T's representable range, rounding for
  // integer types.
  __device__ __host__ T result()
  {
    // Trim result
    if(acc <= std::numeric_limits<T>::lowest())
      return std::numeric_limits<T>::lowest();
    else if(acc >= std::numeric_limits<T>::max())
      return std::numeric_limits<T>::max();   // restored: line dropped by doc extraction
    else if(std::numeric_limits<T>::is_integer)
      return T(acc + 0.5); // Round integer types
    else
      return T(acc);
  }
};
347 
// Separable operation along the X axis.  One thread handles one (x, y)
// pair and sweeps the whole z column, feeding pixOP the values in the
// x-range [x-radius, x+radius] clamped to the image bounds.
template <typename opT, typename T> struct sepIterX
{
  opT pixOP;
  int radius;

  Point3i size;    // restored: member dropped by doc extraction
  Point3i stride;  // restored: member dropped by doc extraction
  T* Src;
  T* Dst;

  sepIterX(opT _pixOP, int _radius,
           thrust::device_vector<T> &_Src, thrust::device_vector<T> &_Dst)
    : pixOP(_pixOP), radius(_radius), Src(devP(_Src)), Dst(devP(_Dst)) {}

  // First argument (block base) is unused for separable passes.
  void setSizes(const Point3i&, const Point3i& _size)
  {
    size = _size;
    stride = getStride(size);
  }

  __device__ __host__ void operator()(uint idx)
  {
    opT pxOP = pixOP;  // private copy: per-thread operator state
    int x, y;
    getXY(idx, size, x, y);

    // Clamp the x neighborhood to the image bounds.
    int xs = x - radius, xe = x + radius + 1;
    if(xs < 0)
      xs = 0;
    if(xe > size.x())
      xe = size.x();
    size_t zIdx = getOffset(x, y, 0, size);
    int maskinit = radius - (x - xs); // Calculate index into mask

    for(int z = 0; z < size.z(); ++z, zIdx += stride.z()) {
      pxOP.init();
      size_t xIdx = getOffset(xs, y, z, size);
      int maskpos = maskinit;
      for(int xi = xs; xi < xe; ++xi, xIdx += stride.x())
        pxOP.data(Src[xIdx], maskpos++);
      Dst[zIdx] = pxOP.result();
    }
  }
};
393 
// Separable operation along the Y axis.  One thread handles one (x, y)
// pair and sweeps the whole z column, feeding pixOP the values in the
// y-range [y-radius, y+radius] clamped to the image bounds.
template <typename opT, typename T> struct sepIterY
{
  opT pixOP;
  int radius;

  Point3i size;    // restored: member dropped by doc extraction
  Point3i stride;  // restored: member dropped by doc extraction
  T* Src;
  T* Dst;

  sepIterY(opT _pixOP, int _radius,
           thrust::device_vector<T>& _Src, thrust::device_vector<T>& _Dst)
    : pixOP(_pixOP), radius(_radius), Src(devP(_Src)), Dst(devP(_Dst)) {}

  // First argument (block base) is unused for separable passes.
  void setSizes(const Point3i&, const Point3i& _size)
  {
    size = _size;
    stride = getStride(size);
  }

  __device__ __host__ void operator()(uint idx)
  {
    opT pxOP = pixOP;  // private copy: per-thread operator state
    int x, y;
    getXY(idx, size, x, y);

    // Clamp the y neighborhood to the image bounds.
    int ys = y - radius, ye = y + radius + 1;
    if(ys < 0)
      ys = 0;
    if(ye > size.y())
      ye = size.y();
    size_t zIdx = getOffset(x, y, 0, size);
    int maskinit = radius - (y - ys); // Calculate index into mask

    for(int z = 0; z < size.z(); ++z, zIdx += stride.z()) {
      pxOP.init();
      int maskpos = maskinit;
      size_t yIdx = getOffset(x, ys, z, size);
      for(int yi = ys; yi < ye; ++yi, yIdx += stride.y())
        pxOP.data(Src[yIdx], maskpos++);
      Dst[zIdx] = pxOP.result();
    }
  }
};
439 
// Separable operation along the Z axis, launched over XY slices.  One
// thread handles one (x, y) pair; for each z it feeds pixOP the values in
// the z-range [z-radius, z+radius] clamped to the image bounds.
template <typename opT, typename T> struct sepIterZ
{
  opT pixOP;
  int radius;

  Point3i size;    // restored: member dropped by doc extraction
  Point3i stride;  // restored: member dropped by doc extraction
  T* Src;
  T* Dst;

  sepIterZ(opT _pixOP, int _radius,
           thrust::device_vector<T>& _Src, thrust::device_vector<T>& _Dst)
    : pixOP(_pixOP), radius(_radius), Src(devP(_Src)), Dst(devP(_Dst)) {}

  // First argument (block base) is unused for separable passes.
  void setSizes(const Point3i&, const Point3i& _size)
  {
    size = _size;
    stride = getStride(size);
  }

  __device__ __host__ void operator()(uint idx)
  {
    opT pxOP = pixOP;  // private copy: per-thread operator state
    int x, y;
    getXY(idx, size, x, y);

    size_t zOff = getOffset(x, y, 0, size);
    for(int z = 0; z < size.z(); ++z, zOff += stride.z()) {
      // Clamp the z neighborhood to the image bounds.
      int zs = z - radius, ze = z + radius + 1;
      if(zs < 0)
        zs = 0;
      if(ze > size.z())
        ze = size.z();

      pxOP.init();
      int maskpos = radius - (z - zs); // Calculate index into mask

      size_t zIdx = getOffset(x, y, zs, size);
      for(int zi = zs; zi < ze; ++zi, zIdx += stride.z())
        pxOP.data(Src[zIdx], maskpos++);
      Dst[zOff] = pxOP.result();
    }
  }
};
485 
// Transpose an image by swapping a pair of axes selected by `dim`
// (XY, YZ or XZ).  flipImgSize receives the swapped dimensions; dst must
// already be sized to hold the same number of voxels as src.
template <typename T> void flipImage(DimEnum dim, Point3i imgSize,
                                     Point3i& flipImgSize, const T& src, T& dst)
{
  if(dim == XY) {
    // Swap x and y axes
    flipImgSize = Point3i(imgSize.y(), imgSize.x(), imgSize.z());
    size_t idx = 0;
    // idx walks src in storage order; the destination offset swaps axes
    for(int z = 0; z < imgSize.z(); ++z)
      for(int y = 0; y < imgSize.y(); ++y)
        for(int x = 0; x < imgSize.x(); ++x)
          dst[getOffset(y, x, z, flipImgSize)] = src[idx++];
  } else if(dim == YZ) {
    // Swap y and z axes
    flipImgSize = Point3i(imgSize.x(), imgSize.z(), imgSize.y());
    size_t idx = 0;
    for(int z = 0; z < imgSize.z(); ++z)
      for(int y = 0; y < imgSize.y(); ++y)
        for(int x = 0; x < imgSize.x(); ++x)
          dst[getOffset(x, z, y, flipImgSize)] = src[idx++];
  } else if(dim == XZ) {
    // Swap x and z axes
    flipImgSize = Point3i(imgSize.z(), imgSize.y(), imgSize.x());
    size_t idx = 0;
    for(int z = 0; z < imgSize.z(); ++z)
      for(int y = 0; y < imgSize.y(); ++y)
        for(int x = 0; x < imgSize.x(); ++x)
          dst[getOffset(z, y, x, flipImgSize)] = src[idx++];
  } else
    std::cout << "Error in flipImage: unknown dimension" << std::endl;
}
514 
// Number of threads needed to cover `size` for the given iteration
// dimension.  Throws when a full-volume (XYZ) launch would exceed the
// capacity of a 32-bit thread count; returns 0 for an unknown dimension.
inline uint getTotalThreads(DimEnum dim, const Point3i size)
{
  switch(dim) {
  case X:
    return size.x();
  case Y:
    return size.y();
  case Z:
    return size.z();
  case XY:
    return size.x() * size.y();
  case XZ:
    return size.x() * size.z();
  case YZ:
    return size.y() * size.z();
  case XYZ: {
    // Full-volume counts can overflow 32 bits; compute in size_t and check.
    size_t threads = size_t(size.x()) * size.y() * size.z();
    if(threads > UINT_MAX)
      throw(std::string("Max threads greater than unsigned integer capacity, reduce holdmem"));
    return threads;
  }
  }
  return 0;
}
539 
// Validate that the block [base, base+size) lies inside an image of
// dimensions imgSize.  Returns 0 when valid; otherwise reports the
// problem through errMsg and returns its result.
inline int checkBounds(const Point3i& imgSize, const Point3i& base, const Point3i& size)
{
  if(base.x() < 0 || base.y() < 0 || base.z() < 0)
    return errMsg("checkbounds:base < 0");
  if(size.x() <= 0 || size.y() <= 0 || size.z() <= 0)
    return errMsg("checkbounds:size = 0");
  const bool fits = base.x() + size.x() <= imgSize.x()
                 && base.y() + size.y() <= imgSize.y()
                 && base.z() + size.z() <= imgSize.z();
  if(!fits)
    return errMsg("checkbounds:size > image size");
  return 0;
}
553 
// Number of slices (of `stride` elements, `szof` bytes each) that fit in
// the `mem` byte budget, starting at slice `pos` of `maxpos`.  Returns 0
// once pos reaches maxpos; throws when the budget cannot hold one slice
// or the slice size is zero.
inline uint getStep(uint pos, uint maxpos, size_t stride, size_t szof, size_t mem)
{
  if(pos >= maxpos)
    return 0;

  // Bytes needed per slice; a zero value would divide by zero below.
  const size_t sliceBytes = stride * szof;
  if(sliceBytes == 0)
    throw(std::string("getStep: Bad stride (=0)"));

  // How many slices fit in the budget; processing proceeds in chunks.
  uint step = mem / sliceBytes;
  if(step == 0)
    throw(std::string("getStep: Can't process enough data, try increasing CudaHoldMem"));

  // Don't run past the end of the stack.
  if(pos + step > maxpos)
    step = maxpos - pos;

  return step;
}
574 
// Process a generic command over the stack in blocks of XY slices so the
// device memory footprint stays within the user-configured budget
// (userMem()).  Blocks overlap by `pad` slices in z so neighborhood
// operators see valid data at block boundaries.  Typical use:
//   do { proc.write(...); proc.launch(op, dim); proc.read(...); }
//   while(proc.next());
class processOP
{
public:
  // _imgSize: full image dimensions; _pad: z overlap in slices (operator
  // radius); _sizeOf: device bytes needed per voxel (callers in this file
  // pass sizeof(T) * 2 to cover a source and a destination buffer).
  processOP(Point3i _imgSize, int _pad, size_t _sizeOf)
    : sizeOf(_sizeOf), imgSize(_imgSize), pad(_pad), baseZ(0)
  {
    mem = userMem(); // Set at start and don't change
    calcSize();
    passesReq = ceil(float(imgSize.z()) / sizeZ);
    if(passesReq > 1)
      std::cout << "Blocking data to GPU, "
                << int(1.1 * sizeOf * imgSize.x() * imgSize.y() * imgSize.z() / (1024 * 1024))
                << " MB required, " << mem / MByte << " MB available ("
                << sizeZ << " slices)" << std::endl;
  }

  // Resize the device vector to the current padded block size
  // (no-op when it is already the right size).
  template <typename T> void alloc(T& Dvec)
  {
    // Setup device storage
    size_t size = size_t(padSizeZ) * imgSize.x() * imgSize.y();
    if(Dvec.size() != size) {
      Dvec.resize(size);
      checkCudaError("processOp.alloc:Error allocating device vector.");
    }
  }

  // Copy the current padded block from host vector Hvec to device vector Dvec.
  template <typename HT, typename DT> void write(const HT& Hvec, DT& Dvec)
  {
    // Copy source data to device
    alloc(Dvec);
    thrust::copy(Hvec.begin() + size_t(padBaseZ) * imgSize.x() * imgSize.y(),
                 Hvec.begin() + size_t(padBaseZ + padSizeZ) * imgSize.x() * imgSize.y(), Dvec.begin());
    checkCudaError("processOp.write:Error copying data to device.");
  }

  // Launch threads to process a command that has the device vectors already
  // allocated.  Returns false on success; on user cancel returns the result
  // of errMsg converted to bool (NOTE(review): presumably nonzero/true on
  // error — confirm against errMsg's contract).
  template <typename T> bool launch(T op, DimEnum dim)
  {
    // Set up operator pointers to data
    Point3i base(0, 0, padBaseZ);
    op.setSizes(base, Point3i(imgSize.x(), imgSize.y(), padSizeZ));

    uint maxThreads = 0;
    uint totThreads = getTotalThreads(dim, Point3i(imgSize.x(), imgSize.y(), padSizeZ));
    for(uint threadPos = 0; threadPos < totThreads; ) {
      // Calculate how many threads to launch in this chunk
      uint threads = getThreadCount(threadPos, totThreads);
      if(maxThreads < threads)
        maxThreads = threads;
      // Launch threads
      DCountIter first(threadPos), last(threadPos + threads);
      thrust::for_each(first, last, op);
      checkCudaError("processOp.launchDeviceOP:launch failure.");
      if(!progressAdvance())
        return errMsg("Operation canceled");
      threadPos += threads;
    }
    return false;
  }

  // Copy the unpadded part of the current block back to the host.
  template <typename HT, typename DT> void read(const DT& Dvec, HT& Hvec)
  {
    // Copy results back to host, only works if x and y are not padded.
    thrust::copy(Dvec.begin() + size_t(baseZ - padBaseZ) * imgSize.x() * imgSize.y(),
                 Dvec.begin() + size_t(baseZ - padBaseZ + sizeZ) * imgSize.x() * imgSize.y(),
                 Hvec.begin() + size_t(baseZ) * imgSize.x() * imgSize.y());
    checkCudaError("processOp.read:Error copying data to host.");
  }

  // Advance to the next block; returns the number of slices remaining
  // (0 when finished, terminating the caller's do/while loop).
  int next()
  {
    baseZ += sizeZ;
    calcSize();
    return imgSize.z() - baseZ;
  }

  // Restart blocking from the first slice.
  void rewind()
  {
    baseZ = 0;
    calcSize();
  }

  // Total number of passes needed to cover the full stack.
  int passes() { return passesReq; }

private:
  // Compute the padded base/size of the current block; returns the number
  // of unpadded slices this block contributes (0 when finished).
  int calcSize()
  {
    // Finished, nothing to do
    if(baseZ >= imgSize.z())
      return 0;

    // Calculate padding at the base
    padBaseZ = baseZ;
    padBaseZ -= pad;
    if(padBaseZ < 0)
      padBaseZ = 0;

    // Calculate how many slices fit in the memory budget.
    padSizeZ = getStep(padBaseZ, imgSize.z(), imgSize.x() * imgSize.y(), sizeOf, mem);

    // Adjust size for padding at the top (only when not the last block)
    sizeZ = padSizeZ;
    if(padBaseZ + padSizeZ < imgSize.z())
      sizeZ -= pad;
    sizeZ -= baseZ - padBaseZ;

    // Not enough memory for one pass
    if(sizeZ <= 0)
      throw(std::string("processOP.calcSize: Can't process enough data, "
                        "try increasing cudaHoldMem"));

    return sizeZ;
  }

  size_t sizeOf;    // device bytes needed per voxel
  size_t mem;       // memory budget, fixed at construction
  Point3i imgSize;  // full image dimensions
  int pad;          // z overlap in slices
  int baseZ;        // first unpadded slice of the current block
  int sizeZ;        // unpadded slices in the current block
  int padBaseZ;     // first slice resident on the device (including padding)
  int padSizeZ;     // resident slices including padding
  int passesReq;    // total passes needed
};
700 
// Run up to three separable operator passes (x, then y, then z) over the
// stack, ping-ponging between two device buffers and blocking in z via
// processOP.  A pass is skipped when its radius is 0.  Returns 0 on
// success, 1 when a launch is canceled; throws when all radii are negative.
template <typename opT, typename T>
int processSepOP(opT opx, opT opy, opT opz, const Point3i& imgSize, const Point3i& radius,
                 const thrust::host_vector<T>& srcData, thrust::host_vector<T>& dstData)
{
  if(radius.x() < 0 && radius.y() < 0 && radius.z() < 0)
    throw(std::string("processSepOP: At least one radius must be >= 0"));

  // All radii zero: the operation is the identity.
  if(radius.x() == 0 && radius.y() == 0 && radius.z() == 0) {
    dstData = srcData;
    return 0;
  }
  // Host-side working copy of the source data.
  thrust::host_vector<T> Hsrc(srcData);

  // Loop over sections of XY slices; sizeof(T) * 2 accounts for the two
  // device buffers (source + destination) per voxel.
  processOP proc(imgSize, radius.z(), sizeof(T) * 2);
  thrust::device_vector<T> DVec1, DVec2;
  thrust::device_vector<T> *Dsrc = &DVec1, *Ddst = &DVec2;
  do {
    proc.write(Hsrc, *Dsrc);
    proc.alloc(DVec2);

    if(radius.x() > 0) {
      if(proc.launch(sepIterX<opT, T>(opx, radius.x(), *Dsrc, *Ddst), XY))
        return 1;
      std::swap(Dsrc, Ddst);  // result becomes the next pass's source
    }
    if(radius.y() > 0) {
      if(proc.launch(sepIterY<opT, T>(opy, radius.y(), *Dsrc, *Ddst), XY))
        return 1;
      std::swap(Dsrc, Ddst);
    }
    if(radius.z() > 0) {
      if(proc.launch(sepIterZ<opT, T>(opz, radius.z(), *Dsrc, *Ddst), XY))
        return 1;
      std::swap(Dsrc, Ddst);
    }

    // After the swaps, Dsrc points at the final result of this block.
    proc.read(*Dsrc, dstData);
  } while(proc.next());

  return (0);
}
744 
// Non-separable operation: one thread computes a full z column of voxels,
// feeding pixOP every sample of each voxel's clamped 3D neighborhood
// together with its position in the operator's mask.
template <typename opT, typename T>
struct nonSepIter
{
  opT pixOP;
  Point3i radius;   // restored: member dropped by doc extraction
  Point3i imgSize;  // restored: member dropped by doc extraction
  T *Src;
  T *Dst;

  nonSepIter(opT _pixOP, Point3i _radius,
             thrust::device_vector<T> &_Src, thrust::device_vector<T> &_Dst)
    : pixOP(_pixOP), radius(_radius), Src(devP(_Src)), Dst(devP(_Dst)) {}

  // The block base is unused; only the (padded) image size matters.
  void setSizes(const Point3i _base, const Point3i &_imgSize)
  {
    imgSize = _imgSize;
  }

  __device__ __host__
  void operator()(uint idx)
  {
    opT pxOP = pixOP;  // private copy: per-thread operator state
    int x, y;
    getXY(idx, imgSize, x, y);

    // Calculate x and y neighborhood ranges, clamped to the image
    int xs = x - radius.x(), xe = x + radius.x() + 1;
    if(xs < 0) xs = 0;
    if(xe > imgSize.x()) xe = imgSize.x();

    int ys = y - radius.y(), ye = y + radius.y() + 1;
    if(ys < 0) ys = 0;
    if(ye > imgSize.y()) ye = imgSize.y();

    // Mask dimensions, used to index into the operator's mask
    Point3i maskSize = radius * 2 + Point3i(1,1,1);

    // Do a column of voxels in z
    for(int z = 0; z < imgSize.z(); ++z) {
      pxOP.init();

      // Calculate the z neighborhood range, clamped to the image
      int zs = z - radius.z(), ze = z + radius.z() + 1;
      if(zs < 0) zs = 0;
      if(ze > imgSize.z()) ze = imgSize.z();

      for(int zi = zs; zi < ze; ++zi)
        for(int yi = ys; yi < ye; ++yi)
          for(int xi = xs; xi < xe; ++xi)
            pxOP.data(Src[getOffset(xi,yi,zi,imgSize)], getOffset(xi-xs,yi-ys,zi-zs,maskSize));

      Dst[getOffset(x,y,z,imgSize)] = pxOP.result();
    }
  }
};
801 
// Run a non-separable 3D neighborhood operator over the stack, blocking
// in z via processOP.  Returns 0 on success, 1 when a launch is canceled;
// throws when all radii are negative.
template <typename opT, typename T>
int processNonSepOP(opT op, const Point3i &imgSize, const Point3i &radius,
                    const thrust::host_vector<T> &srcData, thrust::host_vector<T> &dstData)
{
  if(radius.x() < 0 && radius.y() < 0 && radius.z() < 0)
    // Fixed: the message previously said "processSepOP" (copy-paste error).
    throw(std::string("processNonSepOP: At least one radius must be >= 0"));

  // All radii zero: the operation is the identity.
  if(radius.x() == 0 && radius.y() == 0 && radius.z() == 0) {
    dstData = srcData;
    return 0;
  }
  // Host-side working copy of the source data.
  thrust::host_vector<T> Hsrc(srcData);

  // Loop over sections of XY slices; sizeof(T) * 2 accounts for the two
  // device buffers (source + destination) per voxel.
  processOP proc(imgSize, radius.z(), sizeof(T) * 2);
  thrust::device_vector<T> DVec1, DVec2;
  thrust::device_vector<T>* Dsrc = &DVec1, *Ddst = &DVec2;
  do {
    proc.write(Hsrc, *Dsrc);
    proc.alloc(DVec2);

    if(proc.launch(nonSepIter<opT, T>(op, radius, *Dsrc, *Ddst), XY))
      return 1;
    std::swap(Dsrc, Ddst);  // Dsrc now points at the result buffer

    proc.read(*Dsrc, dstData);
  } while(proc.next());

  return (0);
}
833 }
834 #endif
mgx::ErodeLabelsKernel::data
__device__ __host__ void data(int val, int)
Definition: ImageCuda.hpp:275
mgx::AverageKernel::data
__device__ __host__ void data(int val, int)
Definition: ImageCuda.hpp:170
mgx::MaskKernel::MaskKernel
MaskKernel(const float *_mask)
Definition: ImageCuda.hpp:328
mgx::DilateKernel::data
__device__ __host__ void data(int val, int)
Definition: ImageCuda.hpp:240
mgx::MaskKernel
Definition: ImageCuda.hpp:323
mgx::getRoundMask3D
bool getRoundMask3D(const Point3i &radius, HostVf &mask)
Definition: ImageCuda.hpp:47
mgx::ErodeKernel::init
__device__ __host__ void init()
Definition: ImageCuda.hpp:291
mgx::Y
@ Y
Definition: Cuda.hpp:71
mgx::uint
unsigned int uint
Definition: Geometry.hpp:41
mgx::nonSepIter::Src
T * Src
Definition: ImageCuda.hpp:752
mgx::sepIterY::setSizes
void setSizes(const Point3i &, const Point3i &_size)
Definition: ImageCuda.hpp:409
mgx::sepIterX::setSizes
void setSizes(const Point3i &, const Point3i &_size)
Definition: ImageCuda.hpp:363
mgx::YZ
@ YZ
Definition: Cuda.hpp:71
mgx::clipPoint
__device__ __host__ bool clipPoint(const Point3f &wp, const Point3u &clipDo, const Point4f *pn)
Definition: ImageCuda.hpp:146
mgx::checkBounds
int checkBounds(const Point3i &imgSize, const Point3i &base, const Point3i &size)
Definition: ImageCuda.hpp:541
mgx::sepIterX::operator()
__device__ __host__ void operator()(uint idx)
Definition: ImageCuda.hpp:369
mgx::ErodeKernel::acc
int acc
Definition: ImageCuda.hpp:289
mgx::MedianKernel::result
__device__ __host__ ushort result()
Definition: ImageCuda.hpp:230
mgx::MedianKernel::select
__device__ __host__ ushort select(ushort rank)
Definition: ImageCuda.hpp:187
mgx::MedianKernel
Definition: ImageCuda.hpp:176
mgx::getThreadCount
cuda_EXPORT uint getThreadCount(uint threadPos, uint totThreads)
mgx::MaskKernel::acc
float acc
Definition: ImageCuda.hpp:325
mgx::DilateMaskKernel::mask
const float * mask
Definition: ImageCuda.hpp:253
mgx::sepIterY::stride
Point3i stride
Definition: ImageCuda.hpp:401
mgx::DilateMaskKernel::acc
float acc
Definition: ImageCuda.hpp:252
MByte
#define MByte
Definition: Cuda.hpp:78
mgx::sepIterX::Dst
T * Dst
Definition: ImageCuda.hpp:357
mgx::DilateKernel::acc
int acc
Definition: ImageCuda.hpp:236
mgx::nonSepIter::setSizes
void setSizes(const Point3i _base, const Point3i &_imgSize)
Definition: ImageCuda.hpp:759
mgx::sepIterY::pixOP
opT pixOP
Definition: ImageCuda.hpp:397
mgx::sepIterY::Dst
T * Dst
Definition: ImageCuda.hpp:403
mgx::Point4f
Vector< 4, float > Point4f
Definition: ColorMap.hpp:20
mgx::DilateKernel
Definition: ImageCuda.hpp:234
mgx::AverageKernel
Definition: ImageCuda.hpp:163
mgx::sepIterX::radius
int radius
Definition: ImageCuda.hpp:352
mgx::sepIterY::size
Point3i size
Definition: ImageCuda.hpp:400
mgx::Vector::z
CU_HOST_DEVICE void z(const T &v)
Short access to the third element.
Definition: Vector.hpp:739
mgx::nonSepIter::operator()
__device__ __host__ void operator()(uint idx)
Definition: ImageCuda.hpp:765
mgx::nonSepIter::Dst
T * Dst
Definition: ImageCuda.hpp:753
mgx::sepIterZ::setSizes
void setSizes(const Point3i &, const Point3i &_size)
Definition: ImageCuda.hpp:455
mgx::getXY
__device__ __host__ void getXY(uint idx, const Point3i &size, int &x, int &y)
Definition: ImageCuda.hpp:113
mgx::getXZ
__device__ __host__ void getXZ(uint idx, const Point3i &size, int &x, int &z)
Definition: ImageCuda.hpp:119
mgx::processNonSepOP
int processNonSepOP(opT op, const Point3i &imgSize, const Point3i &radius, const thrust::host_vector< T > &srcData, thrust::host_vector< T > &dstData)
Definition: ImageCuda.hpp:804
mgx::sepIterX::Src
T * Src
Definition: ImageCuda.hpp:356
mgx::ushort
unsigned short ushort
Simpler names for the various containers and iterators.
Definition: Geometry.hpp:42
mgx::sepIterZ::pixOP
opT pixOP
Definition: ImageCuda.hpp:443
mgx::getTotalThreads
uint getTotalThreads(DimEnum dim, const Point3i size)
Definition: ImageCuda.hpp:516
mgx::ErodeLabelsKernel
Definition: ImageCuda.hpp:269
mgx::errMsg
cuda_EXPORT int errMsg(const std::string &s)
mgx::processSepOP
int processSepOP(opT opx, opT opy, opT opz, const Point3i &imgSize, const Point3i &radius, const thrust::host_vector< T > &srcData, thrust::host_vector< T > &dstData)
Definition: ImageCuda.hpp:703
mgx::ErodeMaskKernel::result
__device__ __host__ ushort result()
Definition: ImageCuda.hpp:318
mgx::AverageKernel::result
__device__ __host__ ushort result()
Definition: ImageCuda.hpp:172
mgx::AverageKernel::init
__device__ __host__ void init()
Definition: ImageCuda.hpp:168
mgx::HostVf
thrust::host_vector< float > HostVf
Definition: ThrustTypes.hpp:35
mgx::XY
@ XY
Definition: Cuda.hpp:71
mgx::sepIterY::Src
T * Src
Definition: ImageCuda.hpp:402
mgx::getOffset
CU_HOST_DEVICE size_t getOffset(const int x, const int y, const int z, const Point3i &size)
Definition: Geometry.hpp:437
mgx
Distributed matrix library.
Definition: Assert.hpp:26
mgx::nonSepIter::pixOP
opT pixOP
Definition: ImageCuda.hpp:749
mgx::DilateMaskKernel::data
__device__ __host__ void data(int val, int pos)
Definition: ImageCuda.hpp:259
mgx::nonSepIter
Definition: ImageCuda.hpp:747
mgx::flipImage
void flipImage(DimEnum dim, Point3i imgSize, Point3i &flipImgSize, const T &src, T &dst)
Definition: ImageCuda.hpp:487
mgx::DilateMaskKernel::init
__device__ __host__ void init()
Definition: ImageCuda.hpp:257
mgx::MaskKernel::init
__device__ __host__ void init()
Definition: ImageCuda.hpp:330
mgx::swap
void swap(multiset_vector< Key, Compare, Allocator > &v1, multiset_vector< Key, Compare, Allocator > &v2)
Definition: SetVector.hpp:543
mgx::processOP::launch
bool launch(T op, DimEnum dim)
Definition: ImageCuda.hpp:612
mgx::DilateMaskKernel::result
__device__ __host__ ushort result()
Definition: ImageCuda.hpp:265
mgx::processOP::next
int next()
Definition: ImageCuda.hpp:645
mgx::getStride
__device__ __host__ Point3i getStride(const Point3i &size)
Definition: ImageCuda.hpp:139
mgx::ErodeLabelsKernel::init
__device__ __host__ void init()
Definition: ImageCuda.hpp:273
mgx::DilateMaskKernel
Definition: ImageCuda.hpp:250
mgx::sepIterZ::sepIterZ
sepIterZ(opT _pixOP, int _radius, thrust::device_vector< T > &_Src, thrust::device_vector< T > &_Dst)
Definition: ImageCuda.hpp:451
mgx::normalizeMaskScale
void normalizeMaskScale(HostVf &mask)
Definition: ImageCuda.hpp:69
mgx::MaskKernel::result
__device__ __host__ T result()
Definition: ImageCuda.hpp:334
mgx::ErodeMaskKernel
Definition: ImageCuda.hpp:303
mgx::processOP
Definition: ImageCuda.hpp:576
Cuda.hpp
mgx::getYZ
__device__ __host__ void getYZ(uint idx, const Point3i &size, int &y, int &z)
Definition: ImageCuda.hpp:125
mgx::MaskKernel::data
__device__ __host__ void data(T val, int pos)
Definition: ImageCuda.hpp:332
mgx::devP
T * devP(thrust::device_vector< T > &DVec)
Definition: Cuda.hpp:81
mgx::ErodeLabelsKernel::acc
int acc
Definition: ImageCuda.hpp:271
mgx::processOP::read
void read(const DT &Dvec, HT &Hvec)
Definition: ImageCuda.hpp:636
mgx::max
T CU_HOST_DEVICE max(const T a, const T b)
Definition: Util.hpp:34
mgx::DilateKernel::init
__device__ __host__ void init()
Definition: ImageCuda.hpp:238
mgx::sepIterZ::stride
Point3i stride
Definition: ImageCuda.hpp:447
mgx::MaskKernel::mask
const float * mask
Definition: ImageCuda.hpp:326
mgx::sepIterX
Definition: ImageCuda.hpp:349
mgx::MedianKernel::MaxMedianMask
static const int MaxMedianMask
Definition: ImageCuda.hpp:178
mgx::Z
@ Z
Definition: Cuda.hpp:71
mgx::ErodeLabelsKernel::result
__device__ __host__ ushort result()
Definition: ImageCuda.hpp:283
mgx::XZ
@ XZ
Definition: Cuda.hpp:71
mgx::nonSepIter::nonSepIter
nonSepIter(opT _pixOP, Point3i _radius, thrust::device_vector< T > &_Src, thrust::device_vector< T > &_Dst)
Definition: ImageCuda.hpp:755
mgx::Vector::y
CU_HOST_DEVICE void y(const T &v)
Short access to the second element.
Definition: Vector.hpp:730
mgx::nonSepIter::imgSize
Point3i imgSize
Definition: ImageCuda.hpp:751
mgx::ErodeKernel::result
__device__ __host__ ushort result()
Definition: ImageCuda.hpp:299
mgx::MedianKernel::data
__device__ __host__ void data(int val, int)
Definition: ImageCuda.hpp:184
mgx::ErodeMaskKernel::acc
float acc
Definition: ImageCuda.hpp:305
mgx::MedianKernel::count
int count
Definition: ImageCuda.hpp:180
mgx::sepIterZ::Src
T * Src
Definition: ImageCuda.hpp:448
mgx::processOP::processOP
processOP(Point3i _imgSize, int _pad, size_t _sizeOf)
Definition: ImageCuda.hpp:579
mgx::XYZ
@ XYZ
Definition: Cuda.hpp:71
mgx::nonSepIter::radius
Point3i radius
Definition: ImageCuda.hpp:750
mgx::sepIterX::stride
Point3i stride
Definition: ImageCuda.hpp:355
mgx::ErodeMaskKernel::init
__device__ __host__ void init()
Definition: ImageCuda.hpp:310
mgx::X
@ X
Definition: Cuda.hpp:71
mgx::progressAdvance
bool progressAdvance()
mgx::processOP::write
void write(const HT &Hvec, DT &Dvec)
Definition: ImageCuda.hpp:602
mgx::getXYZ
__device__ __host__ void getXYZ(uint idx, const Point3i &size, int &x, int &y, int &z)
Definition: ImageCuda.hpp:131
SWAP
#define SWAP(x, y)
mgx::sepIterY
Definition: ImageCuda.hpp:395
mgx::Vector< 3, int >
mgx::ErodeKernel::data
__device__ __host__ void data(int val, int)
Definition: ImageCuda.hpp:293
mgx::ErodeKernel
Definition: ImageCuda.hpp:287
mgx::DilateKernel::result
__device__ __host__ ushort result()
Definition: ImageCuda.hpp:246
mgx::processOP::passes
int passes()
Definition: ImageCuda.hpp:658
mgx::sepIterY::sepIterY
sepIterY(opT _pixOP, int _radius, thrust::device_vector< T > &_Src, thrust::device_vector< T > &_Dst)
Definition: ImageCuda.hpp:405
mgx::sepIterX::sepIterX
sepIterX(opT _pixOP, int _radius, thrust::device_vector< T > &_Src, thrust::device_vector< T > &_Dst)
Definition: ImageCuda.hpp:359
mgx::Point3i
Vector< 3, int > Point3i
Definition: Geometry.hpp:68
mgx::userMem
cuda_EXPORT size_t userMem()
mgx::getGaussianMask1D
bool getGaussianMask1D(const float step, const float sigma, HostVf &mask, int &radius)
Definition: ImageCuda.hpp:19
mgx::sepIterZ::Dst
T * Dst
Definition: ImageCuda.hpp:449
mgx::processOP::alloc
void alloc(T &Dvec)
Definition: ImageCuda.hpp:592
mgx::DCountIter
thrust::counting_iterator< int, thrust::device_space_tag > DCountIter
Definition: Cuda.hpp:67
mgx::sepIterX::size
Point3i size
Definition: ImageCuda.hpp:354
mgx::processOP::rewind
void rewind()
Definition: ImageCuda.hpp:652
mgx::ErodeMaskKernel::ErodeMaskKernel
ErodeMaskKernel(const float *_mask)
Definition: ImageCuda.hpp:308
mgx::Vector::x
CU_HOST_DEVICE void x(const T &v)
Short access to the first element.
Definition: Vector.hpp:721
mgx::MedianKernel::init
__device__ __host__ void init()
Definition: ImageCuda.hpp:182
mgx::DimEnum
DimEnum
Definition: Cuda.hpp:71
mgx::offset
CU_HOST_DEVICE size_t offset(uint x, uint y, uint z, uint xsz, uint ysz)
Definition: Geometry.hpp:431
mgx::ErodeMaskKernel::data
__device__ __host__ void data(int val, int pos)
Definition: ImageCuda.hpp:312
mgx::AverageKernel::acc
int acc
Definition: ImageCuda.hpp:165
mgx::sepIterZ::size
Point3i size
Definition: ImageCuda.hpp:446
mgx::sepIterY::operator()
__device__ __host__ void operator()(uint idx)
Definition: ImageCuda.hpp:415
mgx::AverageKernel::count
int count
Definition: ImageCuda.hpp:166
mgx::MedianKernel::array
int array[MaxMedianMask]
Definition: ImageCuda.hpp:179
mgx::getStep
uint getStep(uint pos, uint maxpos, size_t stride, size_t szof, size_t mem)
Definition: ImageCuda.hpp:555
mgx::sepIterZ::operator()
__device__ __host__ void operator()(uint idx)
Definition: ImageCuda.hpp:461
mgx::ErodeMaskKernel::mask
const float * mask
Definition: ImageCuda.hpp:306
mgx::sepIterZ
Definition: ImageCuda.hpp:441
mgx::DilateMaskKernel::DilateMaskKernel
DilateMaskKernel(const float *_mask)
Definition: ImageCuda.hpp:255
mgx::sepIterZ::radius
int radius
Definition: ImageCuda.hpp:444
mgx::sepIterY::radius
int radius
Definition: ImageCuda.hpp:398
mgx::checkCudaError
cuda_EXPORT int checkCudaError(const std::string &msg)
mgx::sepIterX::pixOP
opT pixOP
Definition: ImageCuda.hpp:351