| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| |
|
| | #ifndef COLMAP_SRC_MVS_GPU_MAT_H_ |
| | #define COLMAP_SRC_MVS_GPU_MAT_H_ |
| |
|
| | #include <fstream> |
| | #include <iterator> |
| | #include <memory> |
| | #include <string> |
| |
|
| | #include <cuda_runtime.h> |
| | #include <curand_kernel.h> |
| |
|
| | #include "mvs/cuda_flip.h" |
| | #include "mvs/cuda_rotate.h" |
| | #include "mvs/cuda_transpose.h" |
| | #include "mvs/mat.h" |
| | #include "util/cuda.h" |
| | #include "util/cudacc.h" |
| | #include "util/endian.h" |
| |
|
| | namespace colmap { |
| | namespace mvs { |
| |
|
| | template <typename T> |
| | class GpuMat { |
| | public: |
| | GpuMat(const size_t width, const size_t height, const size_t depth = 1); |
| | ~GpuMat(); |
| |
|
| | __host__ __device__ const T* GetPtr() const; |
| | __host__ __device__ T* GetPtr(); |
| |
|
| | __host__ __device__ size_t GetPitch() const; |
| | __host__ __device__ size_t GetWidth() const; |
| | __host__ __device__ size_t GetHeight() const; |
| | __host__ __device__ size_t GetDepth() const; |
| |
|
| | __device__ T Get(const size_t row, const size_t col, |
| | const size_t slice = 0) const; |
| | __device__ void GetSlice(const size_t row, const size_t col, T* values) const; |
| |
|
| | __device__ T& GetRef(const size_t row, const size_t col); |
| | __device__ T& GetRef(const size_t row, const size_t col, const size_t slice); |
| |
|
| | __device__ void Set(const size_t row, const size_t col, const T value); |
| | __device__ void Set(const size_t row, const size_t col, const size_t slice, |
| | const T value); |
| | __device__ void SetSlice(const size_t row, const size_t col, const T* values); |
| |
|
| | void FillWithScalar(const T value); |
| | void FillWithVector(const T* values); |
| | void FillWithRandomNumbers(const T min_value, const T max_value, |
| | GpuMat<curandState> random_state); |
| |
|
| | void CopyToDevice(const T* data, const size_t pitch); |
| | void CopyToHost(T* data, const size_t pitch) const; |
| | Mat<T> CopyToMat() const; |
| |
|
| | |
| | void Transpose(GpuMat<T>* output); |
| |
|
| | |
| | void FlipHorizontal(GpuMat<T>* output); |
| |
|
| | |
| | void Rotate(GpuMat<T>* output); |
| |
|
| | void Read(const std::string& path); |
| | void Write(const std::string& path); |
| | void Write(const std::string& path, const size_t slice); |
| |
|
| | protected: |
| | void ComputeCudaConfig(); |
| |
|
| | const static size_t kBlockDimX = 32; |
| | const static size_t kBlockDimY = 16; |
| |
|
| | std::shared_ptr<T> array_; |
| | T* array_ptr_; |
| |
|
| | size_t pitch_; |
| | size_t width_; |
| | size_t height_; |
| | size_t depth_; |
| |
|
| | dim3 blockSize_; |
| | dim3 gridSize_; |
| | }; |
| |
|
| | |
| | |
| | |
| |
|
| | #ifdef __CUDACC__ |
| |
|
| | namespace internal { |
| |
|
| | template <typename T> |
| | __global__ void FillWithScalarKernel(GpuMat<T> output, const T value) { |
| | const size_t row = blockIdx.y * blockDim.y + threadIdx.y; |
| | const size_t col = blockIdx.x * blockDim.x + threadIdx.x; |
| | if (row < output.GetHeight() && col < output.GetWidth()) { |
| | for (size_t slice = 0; slice < output.GetDepth(); ++slice) { |
| | output.Set(row, col, slice, value); |
| | } |
| | } |
| | } |
| |
|
| | template <typename T> |
| | __global__ void FillWithVectorKernel(const T* values, GpuMat<T> output) { |
| | const size_t row = blockIdx.y * blockDim.y + threadIdx.y; |
| | const size_t col = blockIdx.x * blockDim.x + threadIdx.x; |
| | if (row < output.GetHeight() && col < output.GetWidth()) { |
| | for (size_t slice = 0; slice < output.GetDepth(); ++slice) { |
| | output.Set(row, col, slice, values[slice]); |
| | } |
| | } |
| | } |
| |
|
| | template <typename T> |
| | __global__ void FillWithRandomNumbersKernel(GpuMat<T> output, |
| | GpuMat<curandState> random_state, |
| | const T min_value, |
| | const T max_value) { |
| | const size_t row = blockIdx.y * blockDim.y + threadIdx.y; |
| | const size_t col = blockIdx.x * blockDim.x + threadIdx.x; |
| | if (row < output.GetHeight() && col < output.GetWidth()) { |
| | curandState local_state = random_state.Get(row, col); |
| | for (size_t slice = 0; slice < output.GetDepth(); ++slice) { |
| | const T random_value = |
| | curand_uniform(&local_state) * (max_value - min_value) + min_value; |
| | output.Set(row, col, slice, random_value); |
| | } |
| | random_state.Set(row, col, local_state); |
| | } |
| | } |
| |
|
| | } |
| |
|
| | template <typename T> |
| | GpuMat<T>::GpuMat(const size_t width, const size_t height, const size_t depth) |
| | : array_(nullptr), |
| | array_ptr_(nullptr), |
| | width_(width), |
| | height_(height), |
| | depth_(depth) { |
| | CUDA_SAFE_CALL(cudaMallocPitch((void**)&array_ptr_, &pitch_, |
| | width_ * sizeof(T), height_ * depth_)); |
| |
|
| | array_ = std::shared_ptr<T>(array_ptr_, cudaFree); |
| |
|
| | ComputeCudaConfig(); |
| | } |
| |
|
| | template <typename T> |
| | GpuMat<T>::~GpuMat() { |
| | array_.reset(); |
| | array_ptr_ = nullptr; |
| | pitch_ = 0; |
| | width_ = 0; |
| | height_ = 0; |
| | depth_ = 0; |
| | } |
| |
|
| | template <typename T> |
| | __host__ __device__ const T* GpuMat<T>::GetPtr() const { |
| | return array_ptr_; |
| | } |
| |
|
| | template <typename T> |
| | __host__ __device__ T* GpuMat<T>::GetPtr() { |
| | return array_ptr_; |
| | } |
| |
|
| | template <typename T> |
| | __host__ __device__ size_t GpuMat<T>::GetPitch() const { |
| | return pitch_; |
| | } |
| |
|
| | template <typename T> |
| | __host__ __device__ size_t GpuMat<T>::GetWidth() const { |
| | return width_; |
| | } |
| |
|
| | template <typename T> |
| | __host__ __device__ size_t GpuMat<T>::GetHeight() const { |
| | return height_; |
| | } |
| |
|
| | template <typename T> |
| | __host__ __device__ size_t GpuMat<T>::GetDepth() const { |
| | return depth_; |
| | } |
| |
|
| | template <typename T> |
| | __device__ T GpuMat<T>::Get(const size_t row, const size_t col, |
| | const size_t slice) const { |
| | return *((T*)((char*)array_ptr_ + pitch_ * (slice * height_ + row)) + col); |
| | } |
| |
|
| | template <typename T> |
| | __device__ void GpuMat<T>::GetSlice(const size_t row, const size_t col, |
| | T* values) const { |
| | for (size_t slice = 0; slice < depth_; ++slice) { |
| | values[slice] = Get(row, col, slice); |
| | } |
| | } |
| |
|
| | template <typename T> |
| | __device__ T& GpuMat<T>::GetRef(const size_t row, const size_t col) { |
| | return GetRef(row, col, 0); |
| | } |
| |
|
| | template <typename T> |
| | __device__ T& GpuMat<T>::GetRef(const size_t row, const size_t col, |
| | const size_t slice) { |
| | return *((T*)((char*)array_ptr_ + pitch_ * (slice * height_ + row)) + col); |
| | } |
| |
|
| | template <typename T> |
| | __device__ void GpuMat<T>::Set(const size_t row, const size_t col, |
| | const T value) { |
| | Set(row, col, 0, value); |
| | } |
| |
|
| | template <typename T> |
| | __device__ void GpuMat<T>::Set(const size_t row, const size_t col, |
| | const size_t slice, const T value) { |
| | *((T*)((char*)array_ptr_ + pitch_ * (slice * height_ + row)) + col) = value; |
| | } |
| |
|
| | template <typename T> |
| | __device__ void GpuMat<T>::SetSlice(const size_t row, const size_t col, |
| | const T* values) { |
| | for (size_t slice = 0; slice < depth_; ++slice) { |
| | Set(row, col, slice, values[slice]); |
| | } |
| | } |
| |
|
| | template <typename T> |
| | void GpuMat<T>::FillWithScalar(const T value) { |
| | internal::FillWithScalarKernel<T><<<gridSize_, blockSize_>>>(*this, value); |
| | CUDA_SYNC_AND_CHECK(); |
| | } |
| |
|
| | template <typename T> |
| | void GpuMat<T>::FillWithVector(const T* values) { |
| | T* values_device; |
| | CUDA_SAFE_CALL(cudaMalloc((void**)&values_device, depth_ * sizeof(T))); |
| | CUDA_SAFE_CALL(cudaMemcpy(values_device, values, depth_ * sizeof(T), |
| | cudaMemcpyHostToDevice)); |
| | internal::FillWithVectorKernel<T> |
| | <<<gridSize_, blockSize_>>>(values_device, *this); |
| | CUDA_SYNC_AND_CHECK(); |
| | CUDA_SAFE_CALL(cudaFree(values_device)); |
| | } |
| |
|
| | template <typename T> |
| | void GpuMat<T>::FillWithRandomNumbers(const T min_value, const T max_value, |
| | const GpuMat<curandState> random_state) { |
| | internal::FillWithRandomNumbersKernel<T> |
| | <<<gridSize_, blockSize_>>>(*this, random_state, min_value, max_value); |
| | CUDA_SYNC_AND_CHECK(); |
| | } |
| |
|
| | template <typename T> |
| | void GpuMat<T>::CopyToDevice(const T* data, const size_t pitch) { |
| | CUDA_SAFE_CALL(cudaMemcpy2D((void*)array_ptr_, (size_t)pitch_, (void*)data, |
| | pitch, width_ * sizeof(T), height_ * depth_, |
| | cudaMemcpyHostToDevice)); |
| | } |
| |
|
| | template <typename T> |
| | void GpuMat<T>::CopyToHost(T* data, const size_t pitch) const { |
| | CUDA_SAFE_CALL(cudaMemcpy2D((void*)data, pitch, (void*)array_ptr_, |
| | (size_t)pitch_, width_ * sizeof(T), |
| | height_ * depth_, cudaMemcpyDeviceToHost)); |
| | } |
| |
|
| | template <typename T> |
| | Mat<T> GpuMat<T>::CopyToMat() const { |
| | Mat<T> mat(width_, height_, depth_); |
| | CopyToHost(mat.GetPtr(), mat.GetWidth() * sizeof(T)); |
| | return mat; |
| | } |
| |
|
| | template <typename T> |
| | void GpuMat<T>::Transpose(GpuMat<T>* output) { |
| | for (size_t slice = 0; slice < depth_; ++slice) { |
| | CudaTranspose(array_ptr_ + slice * pitch_ / sizeof(T) * GetHeight(), |
| | output->GetPtr() + |
| | slice * output->pitch_ / sizeof(T) * output->GetHeight(), |
| | width_, height_, pitch_, output->pitch_); |
| | } |
| | CUDA_SYNC_AND_CHECK(); |
| | } |
| |
|
| | template <typename T> |
| | void GpuMat<T>::FlipHorizontal(GpuMat<T>* output) { |
| | for (size_t slice = 0; slice < depth_; ++slice) { |
| | CudaFlipHorizontal(array_ptr_ + slice * pitch_ / sizeof(T) * GetHeight(), |
| | output->GetPtr() + slice * output->pitch_ / sizeof(T) * |
| | output->GetHeight(), |
| | width_, height_, pitch_, output->pitch_); |
| | } |
| | CUDA_SYNC_AND_CHECK(); |
| | } |
| |
|
| | template <typename T> |
| | void GpuMat<T>::Rotate(GpuMat<T>* output) { |
| | for (size_t slice = 0; slice < depth_; ++slice) { |
| | CudaRotate((T*)((char*)array_ptr_ + slice * pitch_ * GetHeight()), |
| | (T*)((char*)output->GetPtr() + |
| | slice * output->pitch_ * output->GetHeight()), |
| | width_, height_, pitch_, output->pitch_); |
| | } |
| | CUDA_SYNC_AND_CHECK(); |
| | |
| | |
| | |
| | |
| | } |
| |
|
| | template <typename T> |
| | void GpuMat<T>::Read(const std::string& path) { |
| | std::fstream text_file(path, std::ios::in | std::ios::binary); |
| | CHECK(text_file.is_open()) << path; |
| |
|
| | size_t width; |
| | size_t height; |
| | size_t depth; |
| | char unused_char; |
| | text_file >> width >> unused_char >> height >> unused_char >> depth >> |
| | unused_char; |
| | std::streampos pos = text_file.tellg(); |
| | text_file.close(); |
| |
|
| | std::fstream binary_file(path, std::ios::in | std::ios::binary); |
| | binary_file.seekg(pos); |
| |
|
| | std::vector<T> source(width_ * height_ * depth_); |
| | ReadBinaryLittleEndian<T>(&binary_file, &source); |
| | binary_file.close(); |
| |
|
| | CopyToDevice(source.data(), width_ * sizeof(T)); |
| | } |
| |
|
| | template <typename T> |
| | void GpuMat<T>::Write(const std::string& path) { |
| | std::vector<T> dest(width_ * height_ * depth_); |
| | CopyToHost(dest.data(), width_ * sizeof(T)); |
| |
|
| | std::fstream text_file(path, std::ios::out); |
| | text_file << width_ << "&" << height_ << "&" << depth_ << "&"; |
| | text_file.close(); |
| |
|
| | std::fstream binary_file(path, |
| | std::ios::out | std::ios::binary | std::ios::app); |
| | WriteBinaryLittleEndian<T>(&binary_file, dest); |
| | binary_file.close(); |
| | } |
| |
|
| | template <typename T> |
| | void GpuMat<T>::Write(const std::string& path, const size_t slice) { |
| | std::vector<T> dest(width_ * height_); |
| | CUDA_SAFE_CALL(cudaMemcpy2D( |
| | (void*)dest.data(), width_ * sizeof(T), |
| | (void*)(array_ptr_ + slice * height_ * pitch_ / sizeof(T)), pitch_, |
| | width_ * sizeof(T), height_, cudaMemcpyDeviceToHost)); |
| |
|
| | std::fstream text_file(path, std::ios::out); |
| | text_file << width_ << "&" << height_ << "&" << 1 << "&"; |
| | text_file.close(); |
| |
|
| | std::fstream binary_file(path, |
| | std::ios::out | std::ios::binary | std::ios::app); |
| | WriteBinaryLittleEndian<T>(&binary_file, dest); |
| | binary_file.close(); |
| | } |
| |
|
| | template <typename T> |
| | void GpuMat<T>::ComputeCudaConfig() { |
| | blockSize_.x = kBlockDimX; |
| | blockSize_.y = kBlockDimY; |
| | blockSize_.z = 1; |
| |
|
| | gridSize_.x = (width_ - 1) / kBlockDimX + 1; |
| | gridSize_.y = (height_ - 1) / kBlockDimY + 1; |
| | gridSize_.z = 1; |
| | } |
| |
|
| | #endif |
| |
|
| | } |
| | } |
| |
|
| | #endif |
| |
|