Go to the documentation of this file.
11 #ifndef IMAGE_CUDA_HPP
12 #define IMAGE_CUDA_HPP
22 float sigmaSqr = sigma * sigma;
23 float rad = .5 + 3.0 * sigma / step;
24 radius = (rad <= 0 ? 0 : int(rad));
29 mask.resize(2 * radius + 1);
31 for(
int u = -radius; u <= radius; ++u) {
33 double alpha = exp(-(x * x) / (2.0 * sigmaSqr));
34 float val = alpha / (2.0 * M_PI * sigma);
40 for(
size_t i = 0; i < mask.size(); ++i)
50 mask.resize(size.
x() * size.
y() * size.
z());
53 for(
int z = 0; z < size.
z(); ++z)
54 for(
int y = 0; y < size.
y(); ++y)
55 for(
int x = 0; x < size.
x(); ++x) {
56 float xdist = float(x - radius.
x())/float(radius.
x());
57 float ydist = float(y - radius.
y())/float(radius.
y());
58 float zdist = float(z - radius.
z())/float(radius.
z());
60 if(sqrt(xdist*xdist + ydist*ydist + zdist*zdist) <= 1.0)
75 for(
size_t i = 0; i < mask.size(); ++i)
79 for(
size_t i = 0; i < mask.size(); ++i)
134 y = idx / size.
x() % size.
y();
135 z = idx / size.
x() / size.
y();
141 return (
Point3i(1, size.
x(), size.
x() * size.
y()));
145 __device__ __host__
inline
150 for(
int i = 0; i < 3; i++)
153 for(
int k = 0; k < 2; k++)
154 if(pn[i * 2 + k] * p4 < 0)
191 #define SWAP(x,y) { tmpVal = (x); (x) = (y); (y) = tmpVal; }
194 if(high <= low)
return array[rank];
204 ushort middle = (low + high) / 2;
211 ushort ll = low+1, hh = high;
224 if(rank <= hh) high = hh-1;
225 if(rank >= hh) low = ll;
240 __device__ __host__
void data(
int val,
int)
259 __device__ __host__
void data(
int val,
int pos)
261 if(
mask[pos] > 0 and val >
acc)
275 __device__ __host__
void data(
int val,
int)
291 __device__ __host__
void init() {
acc = 65536; }
293 __device__ __host__
void data(
int val,
int)
310 __device__ __host__
void init() {
acc = 65536; }
312 __device__ __host__
void data(
int val,
int pos)
314 if(
mask[pos] > 0 and val <
acc)
322 template <
typename T>
332 __device__ __host__
void data(T val,
int pos) {
acc += val *
mask[pos]; }
337 if(
acc <= std::numeric_limits<T>::lowest())
338 return std::numeric_limits<T>::lowest();
341 else if(std::numeric_limits<T>::is_integer)
349 template <
typename opT,
typename T>
struct sepIterX
360 thrust::device_vector<T> &_Src, thrust::device_vector<T> &_Dst)
381 int maskinit =
radius - (x - xs);
386 int maskpos = maskinit;
387 for(
int xi = xs; xi < xe; ++xi, xIdx +=
stride.
x())
388 pxOP.data(
Src[xIdx], maskpos++);
389 Dst[zIdx] = pxOP.result();
395 template <
typename opT,
typename T>
struct sepIterY
406 thrust::device_vector<T>& _Src, thrust::device_vector<T>& _Dst)
427 int maskinit =
radius - (y - ys);
431 int maskpos = maskinit;
433 for(
int yi = ys; yi < ye; ++yi, yIdx +=
stride.
y())
434 pxOP.data(
Src[yIdx], maskpos++);
435 Dst[zIdx] = pxOP.result();
441 template <
typename opT,
typename T>
struct sepIterZ
452 thrust::device_vector<T>& _Src, thrust::device_vector<T>& _Dst)
476 int maskpos =
radius - (z - zs);
479 for(
int zi = zs; zi < ze; ++zi, zIdx +=
stride.
z())
480 pxOP.data(
Src[zIdx], maskpos++);
481 Dst[zOff] = pxOP.result();
488 Point3i& flipImgSize,
const T& src, T& dst)
491 flipImgSize =
Point3i(imgSize.
y(), imgSize.
x(), imgSize.
z());
493 for(
int z = 0; z < imgSize.
z(); ++z)
494 for(
int y = 0; y < imgSize.
y(); ++y)
495 for(
int x = 0; x < imgSize.
x(); ++x)
496 dst[
getOffset(y, x, z, flipImgSize)] = src[idx++];
497 }
else if(dim ==
YZ) {
498 flipImgSize =
Point3i(imgSize.
x(), imgSize.
z(), imgSize.
y());
500 for(
int z = 0; z < imgSize.
z(); ++z)
501 for(
int y = 0; y < imgSize.
y(); ++y)
502 for(
int x = 0; x < imgSize.
x(); ++x)
503 dst[
getOffset(x, z, y, flipImgSize)] = src[idx++];
504 }
else if(dim ==
XZ) {
505 flipImgSize =
Point3i(imgSize.
z(), imgSize.
y(), imgSize.
x());
507 for(
int z = 0; z < imgSize.
z(); ++z)
508 for(
int y = 0; y < imgSize.
y(); ++y)
509 for(
int x = 0; x < imgSize.
x(); ++x)
510 dst[
getOffset(z, y, x, flipImgSize)] = src[idx++];
512 std::cout <<
"Error in flipImage: unknown dimension" << std::endl;
526 return size.
x() * size.
y();
528 return size.
x() * size.
z();
530 return size.
y() * size.
z();
532 size_t threads = size_t(size.
x()) * size.
y() * size.
z();
533 if(threads > UINT_MAX)
534 throw(std::string(
"Max threads greater than unsigned integer capacity, reduce holdmem"));
543 if(base.
x() < 0 || base.
y() < 0 || base.
z() < 0)
544 return errMsg(
"checkbounds:base < 0");
545 if(size.
x() <= 0 || size.
y() <= 0 || size.
z() <= 0)
546 return errMsg(
"checkbounds:size = 0");
547 if(base.
x() + size.
x() > imgSize.
x() || base.
y() + size.
y() > imgSize.
y() ||
548 base.
z() + size.
z() > imgSize.
z())
549 return errMsg(
"checkbounds:size > image size");
561 if(stride * szof == 0)
562 throw(std::string(
"getStep: Bad stride (=0)"));
565 uint step = mem / (stride * szof);
567 throw(std::string(
"getStep: Can't process enough data, try increasing CudaHoldMem"));
569 if(pos + step > maxpos)
580 : sizeOf(_sizeOf), imgSize(_imgSize), pad(_pad), baseZ(0)
584 passesReq = ceil(
float(imgSize.
z()) / sizeZ);
586 std::cout <<
"Blocking data to GPU, "
587 << int(1.1 * sizeOf * imgSize.
x() * imgSize.
y() * imgSize.
z() / (1024 * 1024))
588 <<
" MB required, " << mem /
MByte <<
" MB available ("
589 << sizeZ <<
" slices)" << std::endl;
592 template <
typename T>
void alloc(T& Dvec)
595 size_t size = size_t(padSizeZ) * imgSize.
x() * imgSize.
y();
596 if(Dvec.size() != size) {
598 checkCudaError(
"processOp.alloc:Error allocating device vector.");
602 template <
typename HT,
typename DT>
void write(
const HT& Hvec, DT& Dvec)
606 thrust::copy(Hvec.begin() +
size_t(padBaseZ) * imgSize.
x() * imgSize.
y(),
607 Hvec.begin() +
size_t(padBaseZ + padSizeZ) * imgSize.
x() * imgSize.
y(), Dvec.begin());
616 op.setSizes(base,
Point3i(imgSize.
x(), imgSize.
y(), padSizeZ));
620 for(
uint threadPos = 0; threadPos < totThreads; ) {
623 if(maxThreads < threads)
624 maxThreads = threads;
626 DCountIter first(threadPos), last(threadPos + threads);
627 thrust::for_each(first, last, op);
630 return errMsg(
"Operation canceled");
631 threadPos += threads;
636 template <
typename HT,
typename DT>
void read(
const DT& Dvec, HT& Hvec)
639 thrust::copy(Dvec.begin() +
size_t(baseZ - padBaseZ) * imgSize.
x() * imgSize.
y(),
640 Dvec.begin() +
size_t(baseZ - padBaseZ + sizeZ) * imgSize.
x() * imgSize.
y(),
641 Hvec.begin() +
size_t(baseZ) * imgSize.
x() * imgSize.
y());
649 return imgSize.
z() - baseZ;
664 if(baseZ >= imgSize.
z())
674 padSizeZ =
getStep(padBaseZ, imgSize.
z(), imgSize.
x() * imgSize.
y(), sizeOf, mem);
678 if(padBaseZ + padSizeZ < imgSize.
z())
680 sizeZ -= baseZ - padBaseZ;
684 throw(std::string(
"processOP.calcSize: Can't process enough data, "
685 "try increasing cudaHoldMem"));
702 template <
typename opT,
typename T>
704 const thrust::host_vector<T>& srcData, thrust::host_vector<T>& dstData)
706 if(radius.
x() < 0 && radius.
y() < 0 && radius.
z() < 0)
707 throw(std::string(
"processSepOP: At least one radius must be >= 0"));
709 if(radius.
x() == 0 && radius.
y() == 0 && radius.
z() == 0) {
713 thrust::host_vector<T> Hsrc(srcData);
716 processOP proc(imgSize, radius.
z(),
sizeof(T) * 2);
717 thrust::device_vector<T> DVec1, DVec2;
718 thrust::device_vector<T> *Dsrc = &DVec1, *Ddst = &DVec2;
720 proc.
write(Hsrc, *Dsrc);
739 proc.
read(*Dsrc, dstData);
740 }
while(proc.
next());
746 template <
typename opT,
typename T>
756 thrust::device_vector<T> &_Src, thrust::device_vector<T> &_Dst)
784 for(
int z = 0; z <
imgSize.
z(); ++z) {
792 for(
int zi = zs; zi < ze; ++zi)
793 for(
int yi = ys; yi < ye; ++yi)
794 for(
int xi = xs; xi < xe; ++xi)
803 template <
typename opT,
typename T>
805 const thrust::host_vector<T> &srcData, thrust::host_vector<T> &dstData)
807 if(radius.
x() < 0 && radius.
y() < 0 && radius.
z() < 0)
808 throw(std::string(
"processSepOP: At least one radius must be >= 0"));
810 if(radius.
x() == 0 && radius.
y() == 0 && radius.
z() == 0) {
814 thrust::host_vector<T> Hsrc(srcData);
817 processOP proc(imgSize, radius.
z(),
sizeof(T) * 2);
818 thrust::device_vector<T> DVec1, DVec2;
819 thrust::device_vector<T>* Dsrc = &DVec1, *Ddst = &DVec2;
821 proc.
write(Hsrc, *Dsrc);
828 proc.
read(*Dsrc, dstData);
829 }
while(proc.
next());
__device__ __host__ void data(int val, int)
__device__ __host__ void data(int val, int)
MaskKernel(const float *_mask)
__device__ __host__ void data(int val, int)
bool getRoundMask3D(const Point3i &radius, HostVf &mask)
__device__ __host__ void init()
void setSizes(const Point3i &, const Point3i &_size)
void setSizes(const Point3i &, const Point3i &_size)
__device__ __host__ bool clipPoint(const Point3f &wp, const Point3u &clipDo, const Point4f *pn)
int checkBounds(const Point3i &imgSize, const Point3i &base, const Point3i &size)
__device__ __host__ void operator()(uint idx)
cuda_EXPORT uint getThreadCount(uint threadPos, uint totThreads)
void setSizes(const Point3i _base, const Point3i &_imgSize)
Vector< 4, float > Point4f
CU_HOST_DEVICE void z(const T &v)
Short access to the third element.
__device__ __host__ void operator()(uint idx)
void setSizes(const Point3i &, const Point3i &_size)
__device__ __host__ void getXY(uint idx, const Point3i &size, int &x, int &y)
__device__ __host__ void getXZ(uint idx, const Point3i &size, int &x, int &z)
int processNonSepOP(opT op, const Point3i &imgSize, const Point3i &radius, const thrust::host_vector< T > &srcData, thrust::host_vector< T > &dstData)
unsigned short ushort
Simpler names for the various containers and iterators.
uint getTotalThreads(DimEnum dim, const Point3i size)
cuda_EXPORT int errMsg(const std::string &s)
int processSepOP(opT opx, opT opy, opT opz, const Point3i &imgSize, const Point3i &radius, const thrust::host_vector< T > &srcData, thrust::host_vector< T > &dstData)
__device__ __host__ ushort result()
__device__ __host__ ushort result()
__device__ __host__ void init()
thrust::host_vector< float > HostVf
CU_HOST_DEVICE size_t getOffset(const int x, const int y, const int z, const Point3i &size)
Distributed matrix library.
__device__ __host__ void data(int val, int pos)
void flipImage(DimEnum dim, Point3i imgSize, Point3i &flipImgSize, const T &src, T &dst)
__device__ __host__ void init()
__device__ __host__ void init()
void swap(multiset_vector< Key, Compare, Allocator > &v1, multiset_vector< Key, Compare, Allocator > &v2)
bool launch(T op, DimEnum dim)
__device__ __host__ ushort result()
__device__ __host__ Point3i getStride(const Point3i &size)
__device__ __host__ void init()
sepIterZ(opT _pixOP, int _radius, thrust::device_vector< T > &_Src, thrust::device_vector< T > &_Dst)
void normalizeMaskScale(HostVf &mask)
__device__ __host__ T result()
__device__ __host__ void getYZ(uint idx, const Point3i &size, int &y, int &z)
__device__ __host__ void data(T val, int pos)
T * devP(thrust::device_vector< T > &DVec)
void read(const DT &Dvec, HT &Hvec)
T CU_HOST_DEVICE max(const T a, const T b)
__device__ __host__ void init()
__device__ __host__ ushort result()
nonSepIter(opT _pixOP, Point3i _radius, thrust::device_vector< T > &_Src, thrust::device_vector< T > &_Dst)
CU_HOST_DEVICE void y(const T &v)
Short access to the second element.
__device__ __host__ ushort result()
processOP(Point3i _imgSize, int _pad, size_t _sizeOf)
__device__ __host__ void init()
void write(const HT &Hvec, DT &Dvec)
__device__ __host__ void getXYZ(uint idx, const Point3i &size, int &x, int &y, int &z)
__device__ __host__ void data(int val, int)
__device__ __host__ ushort result()
sepIterY(opT _pixOP, int _radius, thrust::device_vector< T > &_Src, thrust::device_vector< T > &_Dst)
sepIterX(opT _pixOP, int _radius, thrust::device_vector< T > &_Src, thrust::device_vector< T > &_Dst)
cuda_EXPORT size_t userMem()
bool getGaussianMask1D(const float step, const float sigma, HostVf &mask, int &radius)
thrust::counting_iterator< int, thrust::device_space_tag > DCountIter
ErodeMaskKernel(const float *_mask)
CU_HOST_DEVICE void x(const T &v)
Short access to the first element.
CU_HOST_DEVICE size_t offset(uint x, uint y, uint z, uint xsz, uint ysz)
__device__ __host__ void data(int val, int pos)
__device__ __host__ void operator()(uint idx)
uint getStep(uint pos, uint maxpos, size_t stride, size_t szof, size_t mem)
__device__ __host__ void operator()(uint idx)
DilateMaskKernel(const float *_mask)
cuda_EXPORT int checkCudaError(const std::string &msg)