ceres-solver and colmap

7b7496d about 3 years ago

14 kB

	// Copyright (c) 2022, ETH Zurich and UNC Chapel Hill.
	// All rights reserved.
	//
	// Redistribution and use in source and binary forms, with or without
	// modification, are permitted provided that the following conditions are met:
	//
	// * Redistributions of source code must retain the above copyright
	// notice, this list of conditions and the following disclaimer.
	//
	// * Redistributions in binary form must reproduce the above copyright
	// notice, this list of conditions and the following disclaimer in the
	// documentation and/or other materials provided with the distribution.
	//
	// * Neither the name of ETH Zurich and UNC Chapel Hill nor the names of
	// its contributors may be used to endorse or promote products derived
	// from this software without specific prior written permission.
	//
	// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
	// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
	// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
	// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDERS OR CONTRIBUTORS BE
	// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
	// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
	// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
	// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
	// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
	// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
	// POSSIBILITY OF SUCH DAMAGE.
	//
	// Author: Johannes L. Schoenberger (jsch-at-demuc-dot-de)

	#ifndef COLMAP_SRC_MVS_GPU_MAT_H_
	#define COLMAP_SRC_MVS_GPU_MAT_H_

	#include <fstream>
	#include <iterator>
	#include <memory>
	#include <string>

	#include <cuda_runtime.h>
	#include <curand_kernel.h>

	#include "mvs/cuda_flip.h"
	#include "mvs/cuda_rotate.h"
	#include "mvs/cuda_transpose.h"
	#include "mvs/mat.h"
	#include "util/cuda.h"
	#include "util/cudacc.h"
	#include "util/endian.h"

	namespace colmap {
	namespace mvs {

	template <typename T>
	class GpuMat {
	public:
	GpuMat(const size_t width, const size_t height, const size_t depth = 1);
	~GpuMat();

	__host__ __device__ const T* GetPtr() const;
	__host__ __device__ T* GetPtr();

	__host__ __device__ size_t GetPitch() const;
	__host__ __device__ size_t GetWidth() const;
	__host__ __device__ size_t GetHeight() const;
	__host__ __device__ size_t GetDepth() const;

	__device__ T Get(const size_t row, const size_t col,
	const size_t slice = 0) const;
	__device__ void GetSlice(const size_t row, const size_t col, T* values) const;

	__device__ T& GetRef(const size_t row, const size_t col);
	__device__ T& GetRef(const size_t row, const size_t col, const size_t slice);

	__device__ void Set(const size_t row, const size_t col, const T value);
	__device__ void Set(const size_t row, const size_t col, const size_t slice,
	const T value);
	__device__ void SetSlice(const size_t row, const size_t col, const T* values);

	void FillWithScalar(const T value);
	void FillWithVector(const T* values);
	void FillWithRandomNumbers(const T min_value, const T max_value,
	GpuMat<curandState> random_state);

	void CopyToDevice(const T* data, const size_t pitch);
	void CopyToHost(T* data, const size_t pitch) const;
	Mat<T> CopyToMat() const;

	// Transpose array by swapping x and y coordinates.
	void Transpose(GpuMat<T>* output);

	// Flip array along vertical axis.
	void FlipHorizontal(GpuMat<T>* output);

	// Rotate array in counter-clockwise direction.
	void Rotate(GpuMat<T>* output);

	void Read(const std::string& path);
	void Write(const std::string& path);
	void Write(const std::string& path, const size_t slice);

	protected:
	void ComputeCudaConfig();

	const static size_t kBlockDimX = 32;
	const static size_t kBlockDimY = 16;

	std::shared_ptr<T> array_;
	T* array_ptr_;

	size_t pitch_;
	size_t width_;
	size_t height_;
	size_t depth_;

	dim3 blockSize_;
	dim3 gridSize_;
	};

	////////////////////////////////////////////////////////////////////////////////
	// Implementation
	////////////////////////////////////////////////////////////////////////////////

	#ifdef __CUDACC__

	namespace internal {

	template <typename T>
	__global__ void FillWithScalarKernel(GpuMat<T> output, const T value) {
	const size_t row = blockIdx.y * blockDim.y + threadIdx.y;
	const size_t col = blockIdx.x * blockDim.x + threadIdx.x;
	if (row < output.GetHeight() && col < output.GetWidth()) {
	for (size_t slice = 0; slice < output.GetDepth(); ++slice) {
	output.Set(row, col, slice, value);
	}
	}
	}

	template <typename T>
	__global__ void FillWithVectorKernel(const T* values, GpuMat<T> output) {
	const size_t row = blockIdx.y * blockDim.y + threadIdx.y;
	const size_t col = blockIdx.x * blockDim.x + threadIdx.x;
	if (row < output.GetHeight() && col < output.GetWidth()) {
	for (size_t slice = 0; slice < output.GetDepth(); ++slice) {
	output.Set(row, col, slice, values[slice]);
	}
	}
	}

	template <typename T>
	__global__ void FillWithRandomNumbersKernel(GpuMat<T> output,
	GpuMat<curandState> random_state,
	const T min_value,
	const T max_value) {
	const size_t row = blockIdx.y * blockDim.y + threadIdx.y;
	const size_t col = blockIdx.x * blockDim.x + threadIdx.x;
	if (row < output.GetHeight() && col < output.GetWidth()) {
	curandState local_state = random_state.Get(row, col);
	for (size_t slice = 0; slice < output.GetDepth(); ++slice) {
	const T random_value =
	curand_uniform(&local_state) * (max_value - min_value) + min_value;
	output.Set(row, col, slice, random_value);
	}
	random_state.Set(row, col, local_state);
	}
	}

	} // namespace internal

	template <typename T>
	GpuMat<T>::GpuMat(const size_t width, const size_t height, const size_t depth)
	: array_(nullptr),
	array_ptr_(nullptr),
	width_(width),
	height_(height),
	depth_(depth) {
	CUDA_SAFE_CALL(cudaMallocPitch((void**)&array_ptr_, &pitch_,
	width_ * sizeof(T), height_ * depth_));

	array_ = std::shared_ptr<T>(array_ptr_, cudaFree);

	ComputeCudaConfig();
	}

	template <typename T>
	GpuMat<T>::~GpuMat() {
	array_.reset();
	array_ptr_ = nullptr;
	pitch_ = 0;
	width_ = 0;
	height_ = 0;
	depth_ = 0;
	}

	template <typename T>
	__host__ __device__ const T* GpuMat<T>::GetPtr() const {
	return array_ptr_;
	}

	template <typename T>
	__host__ __device__ T* GpuMat<T>::GetPtr() {
	return array_ptr_;
	}

	template <typename T>
	__host__ __device__ size_t GpuMat<T>::GetPitch() const {
	return pitch_;
	}

	template <typename T>
	__host__ __device__ size_t GpuMat<T>::GetWidth() const {
	return width_;
	}

	template <typename T>
	__host__ __device__ size_t GpuMat<T>::GetHeight() const {
	return height_;
	}

	template <typename T>
	__host__ __device__ size_t GpuMat<T>::GetDepth() const {
	return depth_;
	}

	template <typename T>
	__device__ T GpuMat<T>::Get(const size_t row, const size_t col,
	const size_t slice) const {
	return ((T)((char)array_ptr_ + pitch_ (slice * height_ + row)) + col);
	}

	template <typename T>
	__device__ void GpuMat<T>::GetSlice(const size_t row, const size_t col,
	T* values) const {
	for (size_t slice = 0; slice < depth_; ++slice) {
	values[slice] = Get(row, col, slice);
	}
	}

	template <typename T>
	__device__ T& GpuMat<T>::GetRef(const size_t row, const size_t col) {
	return GetRef(row, col, 0);
	}

	template <typename T>
	__device__ T& GpuMat<T>::GetRef(const size_t row, const size_t col,
	const size_t slice) {
	return ((T)((char)array_ptr_ + pitch_ (slice * height_ + row)) + col);
	}

	template <typename T>
	__device__ void GpuMat<T>::Set(const size_t row, const size_t col,
	const T value) {
	Set(row, col, 0, value);
	}

	template <typename T>
	__device__ void GpuMat<T>::Set(const size_t row, const size_t col,
	const size_t slice, const T value) {
	((T)((char)array_ptr_ + pitch_ (slice * height_ + row)) + col) = value;
	}

	template <typename T>
	__device__ void GpuMat<T>::SetSlice(const size_t row, const size_t col,
	const T* values) {
	for (size_t slice = 0; slice < depth_; ++slice) {
	Set(row, col, slice, values[slice]);
	}
	}

	template <typename T>
	void GpuMat<T>::FillWithScalar(const T value) {
	internal::FillWithScalarKernel<T><<<gridSize_, blockSize_>>>(*this, value);
	CUDA_SYNC_AND_CHECK();
	}

	template <typename T>
	void GpuMat<T>::FillWithVector(const T* values) {
	T* values_device;
	CUDA_SAFE_CALL(cudaMalloc((void*)&values_device, depth_ sizeof(T)));
	CUDA_SAFE_CALL(cudaMemcpy(values_device, values, depth_ * sizeof(T),
	cudaMemcpyHostToDevice));
	internal::FillWithVectorKernel<T>
	<<<gridSize_, blockSize_>>>(values_device, *this);
	CUDA_SYNC_AND_CHECK();
	CUDA_SAFE_CALL(cudaFree(values_device));
	}

	template <typename T>
	void GpuMat<T>::FillWithRandomNumbers(const T min_value, const T max_value,
	const GpuMat<curandState> random_state) {
	internal::FillWithRandomNumbersKernel<T>
	<<<gridSize_, blockSize_>>>(*this, random_state, min_value, max_value);
	CUDA_SYNC_AND_CHECK();
	}

	template <typename T>
	void GpuMat<T>::CopyToDevice(const T* data, const size_t pitch) {
	CUDA_SAFE_CALL(cudaMemcpy2D((void)array_ptr_, (size_t)pitch_, (void)data,
	pitch, width_ * sizeof(T), height_ * depth_,
	cudaMemcpyHostToDevice));
	}

	template <typename T>
	void GpuMat<T>::CopyToHost(T* data, const size_t pitch) const {
	CUDA_SAFE_CALL(cudaMemcpy2D((void)data, pitch, (void)array_ptr_,
	(size_t)pitch_, width_ * sizeof(T),
	height_ * depth_, cudaMemcpyDeviceToHost));
	}

	template <typename T>
	Mat<T> GpuMat<T>::CopyToMat() const {
	Mat<T> mat(width_, height_, depth_);
	CopyToHost(mat.GetPtr(), mat.GetWidth() * sizeof(T));
	return mat;
	}

	template <typename T>
	void GpuMat<T>::Transpose(GpuMat<T>* output) {
	for (size_t slice = 0; slice < depth_; ++slice) {
	CudaTranspose(array_ptr_ + slice * pitch_ / sizeof(T) * GetHeight(),
	output->GetPtr() +
	slice * output->pitch_ / sizeof(T) * output->GetHeight(),
	width_, height_, pitch_, output->pitch_);
	}
	CUDA_SYNC_AND_CHECK();
	}

	template <typename T>
	void GpuMat<T>::FlipHorizontal(GpuMat<T>* output) {
	for (size_t slice = 0; slice < depth_; ++slice) {
	CudaFlipHorizontal(array_ptr_ + slice * pitch_ / sizeof(T) * GetHeight(),
	output->GetPtr() + slice * output->pitch_ / sizeof(T) *
	output->GetHeight(),
	width_, height_, pitch_, output->pitch_);
	}
	CUDA_SYNC_AND_CHECK();
	}

	template <typename T>
	void GpuMat<T>::Rotate(GpuMat<T>* output) {
	for (size_t slice = 0; slice < depth_; ++slice) {
	CudaRotate((T)((char)array_ptr_ + slice * pitch_ * GetHeight()),
	(T)((char)output->GetPtr() +
	slice * output->pitch_ * output->GetHeight()),
	width_, height_, pitch_, output->pitch_);
	}
	CUDA_SYNC_AND_CHECK();
	// This is equivalent to the following code:
	// GpuMat<T> flipped_array(width_, height_, GetDepth());
	// FlipHorizontal(&flipped_array);
	// flipped_array.Transpose(output);
	}

	template <typename T>
	void GpuMat<T>::Read(const std::string& path) {
	std::fstream text_file(path, std::ios::in \| std::ios::binary);
	CHECK(text_file.is_open()) << path;

	size_t width;
	size_t height;
	size_t depth;
	char unused_char;
	text_file >> width >> unused_char >> height >> unused_char >> depth >>
	unused_char;
	std::streampos pos = text_file.tellg();
	text_file.close();

	std::fstream binary_file(path, std::ios::in \| std::ios::binary);
	binary_file.seekg(pos);

	std::vector<T> source(width_ * height_ * depth_);
	ReadBinaryLittleEndian<T>(&binary_file, &source);
	binary_file.close();

	CopyToDevice(source.data(), width_ * sizeof(T));
	}

	template <typename T>
	void GpuMat<T>::Write(const std::string& path) {
	std::vector<T> dest(width_ * height_ * depth_);
	CopyToHost(dest.data(), width_ * sizeof(T));

	std::fstream text_file(path, std::ios::out);
	text_file << width_ << "&" << height_ << "&" << depth_ << "&";
	text_file.close();

	std::fstream binary_file(path,
	std::ios::out \| std::ios::binary \| std::ios::app);
	WriteBinaryLittleEndian<T>(&binary_file, dest);
	binary_file.close();
	}

	template <typename T>
	void GpuMat<T>::Write(const std::string& path, const size_t slice) {
	std::vector<T> dest(width_ * height_);
	CUDA_SAFE_CALL(cudaMemcpy2D(
	(void)dest.data(), width_ sizeof(T),
	(void)(array_ptr_ + slice height_ * pitch_ / sizeof(T)), pitch_,
	width_ * sizeof(T), height_, cudaMemcpyDeviceToHost));

	std::fstream text_file(path, std::ios::out);
	text_file << width_ << "&" << height_ << "&" << 1 << "&";
	text_file.close();

	std::fstream binary_file(path,
	std::ios::out \| std::ios::binary \| std::ios::app);
	WriteBinaryLittleEndian<T>(&binary_file, dest);
	binary_file.close();
	}

	template <typename T>
	void GpuMat<T>::ComputeCudaConfig() {
	blockSize_.x = kBlockDimX;
	blockSize_.y = kBlockDimY;
	blockSize_.z = 1;

	gridSize_.x = (width_ - 1) / kBlockDimX + 1;
	gridSize_.y = (height_ - 1) / kBlockDimY + 1;
	gridSize_.z = 1;
	}

	#endif // __CUDACC__

	} // namespace mvs
	} // namespace colmap

	#endif // COLMAP_SRC_MVS_GPU_MAT_H_