#include "DivisionAnalysisCuda.hpp"

#include <thrust/host_vector.h>
#include <thrust/device_vector.h>

#include <thrust/copy.h>
#include <thrust/fill.h>
#include <thrust/sequence.h>
#include <thrust/sort.h>

#include <cstdio>
#include <cstdlib>
#include <cuda_runtime.h>
#include <cublas_v2.h>

namespace mgx {

namespace {

// Abort the process if a CUDA runtime call failed.
// Uses the same failure policy as the singular-matrix paths below:
// print to stderr, reset the device, exit(EXIT_FAILURE).
void checkCudaOrDie(cudaError_t status, const char* what)
{
    if(status != cudaSuccess)
    {
        fprintf(stderr, "CUDA error in %s: %s\n", what, cudaGetErrorString(status));
        cudaDeviceReset();
        exit(EXIT_FAILURE);
    }
}

// Abort the process if a cuBLAS call failed (same policy as checkCudaOrDie).
void checkCublasOrDie(cublasStatus_t status, const char* what)
{
    if(status != CUBLAS_STATUS_SUCCESS)
    {
        fprintf(stderr, "cuBLAS error in %s (status %d)\n", what, static_cast<int>(status));
        cudaDeviceReset();
        exit(EXIT_FAILURE);
    }
}

} // anonymous namespace

// Invert a single n x n float matrix that already lives in device memory.
//
//   src_d  device pointer to the n*n input matrix. It is overwritten with its
//          LU factorization (cublasSgetrfBatched works in place).
//   dst_d  device pointer to n*n floats receiving the inverse; must not alias
//          src_d.
//   n      matrix dimension; n <= 0 is a no-op.
//
// cuBLAS interprets the buffers as column-major. Since the inverse of a
// transpose is the transpose of the inverse, a row-major input also yields a
// row-major inverse, so no explicit transposition is needed.
//
// On any CUDA/cuBLAS error, or if the matrix is singular, a message is written
// to stderr, the device is reset and the process exits with EXIT_FAILURE.
void invert_device(float* src_d, float* dst_d, int n)
{
    if(n <= 0)
        return;

    const int batchSize = 1;
    const int lda = n;

    cublasHandle_t handle;
    checkCublasOrDie(cublasCreate(&handle), "cublasCreate");

    // Pivot indices (n per matrix) and one status word per matrix.
    int* P = NULL;
    int* INFO = NULL;
    checkCudaOrDie(cudaMalloc<int>(&P, static_cast<size_t>(n) * batchSize * sizeof(int)),
                   "cudaMalloc(P)");
    checkCudaOrDie(cudaMalloc<int>(&INFO, batchSize * sizeof(int)), "cudaMalloc(INFO)");

    // The batched API takes a *device* array of matrix pointers, so the
    // one-element host pointer tables have to be uploaded first.
    float* A[] = { src_d };
    float** A_d = NULL;
    checkCudaOrDie(cudaMalloc<float*>(&A_d, sizeof(A)), "cudaMalloc(A_d)");
    checkCudaOrDie(cudaMemcpy(A_d, A, sizeof(A), cudaMemcpyHostToDevice), "cudaMemcpy(A_d)");

    float* C[] = { dst_d };
    float** C_d = NULL;
    checkCudaOrDie(cudaMalloc<float*>(&C_d, sizeof(C)), "cudaMalloc(C_d)");
    checkCudaOrDie(cudaMemcpy(C_d, C, sizeof(C), cudaMemcpyHostToDevice), "cudaMemcpy(C_d)");

    // Step 1: LU factorization with partial pivoting (in place in src_d).
    checkCublasOrDie(cublasSgetrfBatched(handle, n, A_d, lda, P, INFO, batchSize),
                     "cublasSgetrfBatched");

    // The blocking copy also waits for the factorization to finish.
    int INFOh = 0;
    checkCudaOrDie(cudaMemcpy(&INFOh, INFO, sizeof(int), cudaMemcpyDeviceToHost),
                   "cudaMemcpy(INFO)");

    // Any non-zero info is a failure: info = i > 0 means U(i,i) is exactly
    // zero (not only i == n), and info < 0 flags an illegal argument.
    if(INFOh != 0)
    {
        fprintf(stderr, "Factorization Failed: Matrix is singular\n");
        cudaDeviceReset();
        exit(EXIT_FAILURE);
    }

    // Step 2: build the inverse from the LU factors.
#if CUDA_VERSION >= 6050
    checkCublasOrDie(cublasSgetriBatched(handle, n, (const float**)A_d, lda, P, C_d, lda, INFO, batchSize),
                     "cublasSgetriBatched");
#else // CUDA_VERSION < 6050
    checkCublasOrDie(cublasSgetriBatched(handle, n, A_d, lda, P, C_d, lda, INFO, batchSize),
                     "cublasSgetriBatched");
#endif // CUDA_VERSION

    checkCudaOrDie(cudaMemcpy(&INFOh, INFO, sizeof(int), cudaMemcpyDeviceToHost),
                   "cudaMemcpy(INFO)");

    if(INFOh != 0)
    {
        fprintf(stderr, "Inversion Failed: Matrix is singular\n");
        cudaDeviceReset();
        exit(EXIT_FAILURE);
    }

    // Release everything allocated above, including the pointer tables.
    cudaFree(A_d);
    cudaFree(C_d);
    cudaFree(P);
    cudaFree(INFO);
    cublasDestroy(handle);
}

// Invert an n x n float matrix held in host memory.
//
//   src  host pointer to the n*n input matrix (left unmodified).
//   dst  host pointer to n*n floats receiving the inverse.
//   n    matrix dimension; n <= 0 is a no-op.
//
// Copies the matrix to the device, runs invert_device() and copies the result
// back. Failure handling is the same as invert_device(): the process exits.
void invert(float* src, float* dst, int n)
{
    if(n <= 0)
        return;

    // Widen before multiplying so that n*n cannot overflow int.
    const size_t bytes = static_cast<size_t>(n) * n * sizeof(float);

    float* src_d = NULL;
    float* dst_d = NULL;

    checkCudaOrDie(cudaMalloc<float>(&src_d, bytes), "cudaMalloc(src_d)");
    checkCudaOrDie(cudaMalloc<float>(&dst_d, bytes), "cudaMalloc(dst_d)");
    checkCudaOrDie(cudaMemcpy(src_d, src, bytes, cudaMemcpyHostToDevice), "cudaMemcpy(src_d)");

    invert_device(src_d, dst_d, n);

    checkCudaOrDie(cudaMemcpy(dst, dst_d, bytes, cudaMemcpyDeviceToHost), "cudaMemcpy(dst)");

    cudaFree(src_d);
    cudaFree(dst_d);
}


}

using namespace mgx;


//--------------------------------------------------------------------------
// Externally exported functions; must be defined the same as in LyonCudaExport.hpp
extern "C" 
{
// Invert a square matrix on the GPU.
//
//   matrix   n x n input, where n = matrix.size(). Every row is expected to
//            hold at least n entries; only the leading n are read.
//   inverse  receives the n x n inverse. It is grown to n x n if it is smaller;
//            larger containers keep their extra rows/columns untouched.
//
// The computation runs in single precision: values are narrowed to float
// before the inversion and widened back to double afterwards.
// An empty matrix is a no-op. Failure handling is whatever invert() does.
cuda_EXPORT
void matrixInverseGPU(std::vector<std::vector<double> >& matrix, std::vector<std::vector<double> >& inverse)
{
    const int n = static_cast<int>(matrix.size());
    if(n == 0)
        return;   // nothing to invert; avoid touching the GPU at all

    // Flat row-major staging buffers; std::vector releases them automatically
    // (the previous malloc'd buffers were never freed).
    const size_t count = static_cast<size_t>(n) * n;
    std::vector<float> mat(count);
    std::vector<float> inv(count);

    for(int i = 0; i < n; i++){
      const std::vector<double>& row = matrix[i];
      for(int j = 0; j < n; j++){
        mat[static_cast<size_t>(i) * n + j] = static_cast<float>(row[j]);
      }
    }

    invert(&mat[0], &inv[0], n);

    // Make sure the output can hold the result instead of writing out of
    // bounds when the caller passed an unsized container.
    if(inverse.size() < static_cast<size_t>(n))
      inverse.resize(n);

    for(int i = 0; i < n; i++){
      std::vector<double>& row = inverse[i];
      if(row.size() < static_cast<size_t>(n))
        row.resize(n);
      for(int j = 0; j < n; j++){
        row[j] = inv[static_cast<size_t>(i) * n + j];
      }
    }

}

// Sort a vector of floats in ascending order on the GPU, in place.
//
//   vec  values to sort; on return it holds the same values in sorted order.
//
// The data is moved to the device and back with one bulk transfer each way.
// Indexing a thrust::device_vector element from the host triggers a separate
// transfer per element, which is what the previous per-element loops did.
cuda_EXPORT
void sortVectorGPU(std::vector<float>& vec)
{
  // Zero or one element is already sorted; skip the device round trip.
  if(vec.size() < 2)
    return;

  // Single host -> device upload of the whole range.
  thrust::device_vector<float> d_vec(vec.begin(), vec.end());

  thrust::sort(d_vec.begin(), d_vec.end());

  // Single device -> host download back into the caller's storage.
  thrust::copy(d_vec.begin(), d_vec.end(), vec.begin());

}

	
};