// Copyright (c) 2022, ETH Zurich and UNC Chapel Hill.
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are met:
//
//     * Redistributions of source code must retain the above copyright
//       notice, this list of conditions and the following disclaimer.
//
//     * Redistributions in binary form must reproduce the above copyright
//       notice, this list of conditions and the following disclaimer in the
//       documentation and/or other materials provided with the distribution.
//
//     * Neither the name of ETH Zurich and UNC Chapel Hill nor the names of
//       its contributors may be used to endorse or promote products derived
//       from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDERS OR CONTRIBUTORS BE
// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
// POSSIBILITY OF SUCH DAMAGE.
//
// Author: Johannes L. Schoenberger (jsch-at-demuc-dot-de)

#ifndef COLMAP_SRC_MVS_GPU_MAT_H_
#define COLMAP_SRC_MVS_GPU_MAT_H_

#include <fstream>
#include <memory>
#include <string>
#include <vector>

#include <cuda_runtime.h>
#include <curand_kernel.h>

#include "mvs/cuda_flip.h"
#include "mvs/cuda_rotate.h"
#include "mvs/cuda_transpose.h"
#include "mvs/mat.h"
#include "util/cuda.h"
#include "util/cudacc.h"
#include "util/endian.h"

namespace colmap {
namespace mvs {

// Pitched 2D/3D array in GPU global memory, stored slice-by-slice with each
// row padded to the pitch returned by cudaMallocPitch. Element (row, col,
// slice) lives at byte offset pitch_ * (slice * height_ + row) + col *
// sizeof(T). The object is copyable into kernels by value (the copy shares
// the underlying allocation via the raw pointer; ownership stays on the host
// in array_).
template <typename T>
class GpuMat {
 public:
  // Allocates width x height x depth elements on the current CUDA device.
  GpuMat(const size_t width, const size_t height, const size_t depth = 1);
  ~GpuMat();

  __host__ __device__ const T* GetPtr() const;
  __host__ __device__ T* GetPtr();

  // Pitch in bytes of one row, as returned by cudaMallocPitch.
  __host__ __device__ size_t GetPitch() const;
  __host__ __device__ size_t GetWidth() const;
  __host__ __device__ size_t GetHeight() const;
  __host__ __device__ size_t GetDepth() const;

  // Element access from device code. No bounds checking is performed.
  __device__ T Get(const size_t row, const size_t col,
                   const size_t slice = 0) const;
  // Reads all depth_ values at (row, col) into values[0..depth_-1].
  __device__ void GetSlice(const size_t row, const size_t col,
                           T* values) const;

  __device__ T& GetRef(const size_t row, const size_t col);
  __device__ T& GetRef(const size_t row, const size_t col, const size_t slice);

  __device__ void Set(const size_t row, const size_t col, const T value);
  __device__ void Set(const size_t row, const size_t col, const size_t slice,
                      const T value);
  // Writes values[0..depth_-1] to all slices at (row, col).
  __device__ void SetSlice(const size_t row, const size_t col,
                           const T* values);

  // Host-side fills; each launches a kernel and synchronizes.
  void FillWithScalar(const T value);
  void FillWithVector(const T* values);
  void FillWithRandomNumbers(const T min_value, const T max_value,
                             GpuMat<curandState> random_state);

  // pitch is the row pitch in bytes of the host buffer.
  void CopyToDevice(const T* data, const size_t pitch);
  void CopyToHost(T* data, const size_t pitch) const;
  Mat<T> CopyToMat() const;

  // Transpose array by swapping x and y coordinates.
  void Transpose(GpuMat<T>* output);

  // Flip array along vertical axis.
  void FlipHorizontal(GpuMat<T>* output);

  // Rotate array in counter-clockwise direction.
  void Rotate(GpuMat<T>* output);

  // Serialization: a text header "width&height&depth&" followed by the
  // elements in little-endian binary order.
  void Read(const std::string& path);
  void Write(const std::string& path);
  void Write(const std::string& path, const size_t slice);

 protected:
  void ComputeCudaConfig();

  const static size_t kBlockDimX = 32;
  const static size_t kBlockDimY = 16;

  // array_ owns the allocation (deleter is cudaFree); array_ptr_ is the raw
  // pointer usable from device code after pass-by-value into a kernel.
  std::shared_ptr<T> array_;
  T* array_ptr_;

  size_t pitch_;
  size_t width_;
  size_t height_;
  size_t depth_;

  dim3 blockSize_;
  dim3 gridSize_;
};

////////////////////////////////////////////////////////////////////////////////
// Implementation
////////////////////////////////////////////////////////////////////////////////

#ifdef __CUDACC__

namespace internal {

// One thread per (row, col); each thread writes all depth slices.
template <typename T>
__global__ void FillWithScalarKernel(GpuMat<T> output, const T value) {
  const size_t row = blockIdx.y * blockDim.y + threadIdx.y;
  const size_t col = blockIdx.x * blockDim.x + threadIdx.x;
  if (row < output.GetHeight() && col < output.GetWidth()) {
    for (size_t slice = 0; slice < output.GetDepth(); ++slice) {
      output.Set(row, col, slice, value);
    }
  }
}

// values must point to device memory with output.GetDepth() elements.
template <typename T>
__global__ void FillWithVectorKernel(const T* values, GpuMat<T> output) {
  const size_t row = blockIdx.y * blockDim.y + threadIdx.y;
  const size_t col = blockIdx.x * blockDim.x + threadIdx.x;
  if (row < output.GetHeight() && col < output.GetWidth()) {
    for (size_t slice = 0; slice < output.GetDepth(); ++slice) {
      output.Set(row, col, slice, values[slice]);
    }
  }
}

// Draws uniform values in [min_value, max_value) per element and advances
// the per-pixel curand state in-place for the next call.
template <typename T>
__global__ void FillWithRandomNumbersKernel(GpuMat<T> output,
                                            GpuMat<curandState> random_state,
                                            const T min_value,
                                            const T max_value) {
  const size_t row = blockIdx.y * blockDim.y + threadIdx.y;
  const size_t col = blockIdx.x * blockDim.x + threadIdx.x;
  if (row < output.GetHeight() && col < output.GetWidth()) {
    curandState local_state = random_state.Get(row, col);
    for (size_t slice = 0; slice < output.GetDepth(); ++slice) {
      const T random_value =
          curand_uniform(&local_state) * (max_value - min_value) + min_value;
      output.Set(row, col, slice, random_value);
    }
    random_state.Set(row, col, local_state);
  }
}

}  // namespace internal

template <typename T>
GpuMat<T>::GpuMat(const size_t width, const size_t height, const size_t depth)
    : array_(nullptr),
      array_ptr_(nullptr),
      width_(width),
      height_(height),
      depth_(depth) {
  // Slices are stacked along the pitched y-dimension, hence height_ * depth_
  // rows of width_ elements each.
  CUDA_SAFE_CALL(cudaMallocPitch((void**)&array_ptr_, &pitch_,
                                 width_ * sizeof(T), height_ * depth_));
  array_ = std::shared_ptr<T>(array_ptr_, cudaFree);
  ComputeCudaConfig();
}

template <typename T>
GpuMat<T>::~GpuMat() {
  array_.reset();
  array_ptr_ = nullptr;
  pitch_ = 0;
  width_ = 0;
  height_ = 0;
  depth_ = 0;
}

template <typename T>
__host__ __device__ const T* GpuMat<T>::GetPtr() const {
  return array_ptr_;
}

template <typename T>
__host__ __device__ T* GpuMat<T>::GetPtr() {
  return array_ptr_;
}

template <typename T>
__host__ __device__ size_t GpuMat<T>::GetPitch() const {
  return pitch_;
}

template <typename T>
__host__ __device__ size_t GpuMat<T>::GetWidth() const {
  return width_;
}

template <typename T>
__host__ __device__ size_t GpuMat<T>::GetHeight() const {
  return height_;
}

template <typename T>
__host__ __device__ size_t GpuMat<T>::GetDepth() const {
  return depth_;
}

template <typename T>
__device__ T GpuMat<T>::Get(const size_t row, const size_t col,
                            const size_t slice) const {
  // Pitched addressing: advance (slice * height_ + row) rows of pitch_ bytes,
  // then col elements within the row.
  return *((T*)((char*)array_ptr_ + pitch_ * (slice * height_ + row)) + col);
}

template <typename T>
__device__ void GpuMat<T>::GetSlice(const size_t row, const size_t col,
                                    T* values) const {
  for (size_t slice = 0; slice < depth_; ++slice) {
    values[slice] = Get(row, col, slice);
  }
}

template <typename T>
__device__ T& GpuMat<T>::GetRef(const size_t row, const size_t col) {
  return GetRef(row, col, 0);
}

template <typename T>
__device__ T& GpuMat<T>::GetRef(const size_t row, const size_t col,
                                const size_t slice) {
  return *((T*)((char*)array_ptr_ + pitch_ * (slice * height_ + row)) + col);
}

template <typename T>
__device__ void GpuMat<T>::Set(const size_t row, const size_t col,
                               const T value) {
  Set(row, col, 0, value);
}

template <typename T>
__device__ void GpuMat<T>::Set(const size_t row, const size_t col,
                               const size_t slice, const T value) {
  *((T*)((char*)array_ptr_ + pitch_ * (slice * height_ + row)) + col) = value;
}

template <typename T>
__device__ void GpuMat<T>::SetSlice(const size_t row, const size_t col,
                                    const T* values) {
  for (size_t slice = 0; slice < depth_; ++slice) {
    Set(row, col, slice, values[slice]);
  }
}

template <typename T>
void GpuMat<T>::FillWithScalar(const T value) {
  internal::FillWithScalarKernel<T>
      <<<gridSize_, blockSize_>>>(*this, value);
  CUDA_SYNC_AND_CHECK();
}

template <typename T>
void GpuMat<T>::FillWithVector(const T* values) {
  // values is a host pointer; stage it in device memory for the kernel.
  T* values_device;
  CUDA_SAFE_CALL(cudaMalloc((void**)&values_device, depth_ * sizeof(T)));
  CUDA_SAFE_CALL(cudaMemcpy(values_device, values, depth_ * sizeof(T),
                            cudaMemcpyHostToDevice));
  internal::FillWithVectorKernel<T>
      <<<gridSize_, blockSize_>>>(values_device, *this);
  CUDA_SYNC_AND_CHECK();
  CUDA_SAFE_CALL(cudaFree(values_device));
}

template <typename T>
void GpuMat<T>::FillWithRandomNumbers(const T min_value, const T max_value,
                                      const GpuMat<curandState> random_state) {
  internal::FillWithRandomNumbersKernel<T>
      <<<gridSize_, blockSize_>>>(*this, random_state, min_value, max_value);
  CUDA_SYNC_AND_CHECK();
}

template <typename T>
void GpuMat<T>::CopyToDevice(const T* data, const size_t pitch) {
  CUDA_SAFE_CALL(cudaMemcpy2D((void*)array_ptr_, (size_t)pitch_, (void*)data,
                              pitch, width_ * sizeof(T), height_ * depth_,
                              cudaMemcpyHostToDevice));
}

template <typename T>
void GpuMat<T>::CopyToHost(T* data, const size_t pitch) const {
  CUDA_SAFE_CALL(cudaMemcpy2D((void*)data, pitch, (void*)array_ptr_,
                              (size_t)pitch_, width_ * sizeof(T),
                              height_ * depth_, cudaMemcpyDeviceToHost));
}

template <typename T>
Mat<T> GpuMat<T>::CopyToMat() const {
  Mat<T> mat(width_, height_, depth_);
  CopyToHost(mat.GetPtr(), mat.GetWidth() * sizeof(T));
  return mat;
}

template <typename T>
void GpuMat<T>::Transpose(GpuMat<T>* output) {
  for (size_t slice = 0; slice < depth_; ++slice) {
    CudaTranspose(array_ptr_ + slice * pitch_ / sizeof(T) * GetHeight(),
                  output->GetPtr() +
                      slice * output->pitch_ / sizeof(T) * output->GetHeight(),
                  width_, height_, pitch_, output->pitch_);
  }
  CUDA_SYNC_AND_CHECK();
}

template <typename T>
void GpuMat<T>::FlipHorizontal(GpuMat<T>* output) {
  for (size_t slice = 0; slice < depth_; ++slice) {
    CudaFlipHorizontal(
        array_ptr_ + slice * pitch_ / sizeof(T) * GetHeight(),
        output->GetPtr() +
            slice * output->pitch_ / sizeof(T) * output->GetHeight(),
        width_, height_, pitch_, output->pitch_);
  }
  CUDA_SYNC_AND_CHECK();
}

template <typename T>
void GpuMat<T>::Rotate(GpuMat<T>* output) {
  for (size_t slice = 0; slice < depth_; ++slice) {
    CudaRotate((T*)((char*)array_ptr_ + slice * pitch_ * GetHeight()),
               (T*)((char*)output->GetPtr() +
                    slice * output->pitch_ * output->GetHeight()),
               width_, height_, pitch_, output->pitch_);
  }
  CUDA_SYNC_AND_CHECK();

  // This is equivalent to the following code:
  //   GpuMat<T> flipped_array(width_, height_, GetDepth());
  //   FlipHorizontal(&flipped_array);
  //   flipped_array.Transpose(output);
}

template <typename T>
void GpuMat<T>::Read(const std::string& path) {
  std::fstream text_file(path, std::ios::in | std::ios::binary);
  CHECK(text_file.is_open()) << path;

  size_t width;
  size_t height;
  size_t depth;
  char unused_char;
  text_file >> width >> unused_char >> height >> unused_char >> depth >>
      unused_char;
  const std::streampos pos = text_file.tellg();
  text_file.close();

  // The file header must describe exactly this matrix, since the binary
  // payload is copied into the existing allocation without resizing.
  CHECK_EQ(width, width_) << path;
  CHECK_EQ(height, height_) << path;
  CHECK_EQ(depth, depth_) << path;

  std::fstream binary_file(path, std::ios::in | std::ios::binary);
  binary_file.seekg(pos);

  std::vector<T> source(width_ * height_ * depth_);
  ReadBinaryLittleEndian<T>(&binary_file, &source);
  binary_file.close();

  CopyToDevice(source.data(), width_ * sizeof(T));
}

template <typename T>
void GpuMat<T>::Write(const std::string& path) {
  std::vector<T> dest(width_ * height_ * depth_);
  CopyToHost(dest.data(), width_ * sizeof(T));

  // Truncate and write the text header, then append the binary payload.
  std::fstream text_file(path, std::ios::out);
  text_file << width_ << "&" << height_ << "&" << depth_ << "&";
  text_file.close();

  std::fstream binary_file(path,
                           std::ios::out | std::ios::binary | std::ios::app);
  WriteBinaryLittleEndian<T>(&binary_file, dest);
  binary_file.close();
}

template <typename T>
void GpuMat<T>::Write(const std::string& path, const size_t slice) {
  std::vector<T> dest(width_ * height_);
  CUDA_SAFE_CALL(cudaMemcpy2D(
      (void*)dest.data(), width_ * sizeof(T),
      (void*)(array_ptr_ + slice * height_ * pitch_ / sizeof(T)), pitch_,
      width_ * sizeof(T), height_, cudaMemcpyDeviceToHost));

  // A single slice is written with depth 1 in the header.
  std::fstream text_file(path, std::ios::out);
  text_file << width_ << "&" << height_ << "&" << 1 << "&";
  text_file.close();

  std::fstream binary_file(path,
                           std::ios::out | std::ios::binary | std::ios::app);
  WriteBinaryLittleEndian<T>(&binary_file, dest);
  binary_file.close();
}

template <typename T>
void GpuMat<T>::ComputeCudaConfig() {
  blockSize_.x = kBlockDimX;
  blockSize_.y = kBlockDimY;
  blockSize_.z = 1;
  // Ceil-division so partial blocks cover the tail; kernels bounds-check.
  gridSize_.x = (width_ - 1) / kBlockDimX + 1;
  gridSize_.y = (height_ - 1) / kBlockDimY + 1;
  gridSize_.z = 1;
}

#endif  // __CUDACC__

}  // namespace mvs
}  // namespace colmap

#endif  // COLMAP_SRC_MVS_GPU_MAT_H_