// Copyright (c) 2022, ETH Zurich and UNC Chapel Hill. // All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are met: // // * Redistributions of source code must retain the above copyright // notice, this list of conditions and the following disclaimer. // // * Redistributions in binary form must reproduce the above copyright // notice, this list of conditions and the following disclaimer in the // documentation and/or other materials provided with the distribution. // // * Neither the name of ETH Zurich and UNC Chapel Hill nor the names of // its contributors may be used to endorse or promote products derived // from this software without specific prior written permission. // // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE // ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDERS OR CONTRIBUTORS BE // LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR // CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF // SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS // INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN // CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) // ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE // POSSIBILITY OF SUCH DAMAGE. // // Author: Johannes L. Schoenberger (jsch-at-demuc-dot-de) #define _USE_MATH_DEFINES #include "mvs/patch_match_cuda.h" #include #include #include #include #include #include "util/cuda.h" #include "util/cudacc.h" #include "util/logging.h" // The number of threads per Cuda thread. Warning: Do not change this value, // since the templated window sizes rely on this value. 
#define THREADS_PER_BLOCK 32

// We must not include "util/math.h" to avoid any Eigen includes here,
// since Visual Studio cannot compile some of the Eigen/Boost expressions.
#ifndef DEG2RAD
// Degrees to radians. Fully parenthesized so that compound arguments
// (e.g. DEG2RAD(a + b)) and surrounding operators (e.g. 1.0 / DEG2RAD(x))
// expand correctly; the previous definition `deg * 0.017...` broke under
// operator precedence.
#define DEG2RAD(deg) ((deg) * 0.0174532925199432)
#endif

namespace colmap {
namespace mvs {

// NOTE(review): the `<...>` texture type argument lists appear to have been
// stripped from these declarations (likely by text extraction); restore the
// element type / texture type / read mode from the upstream source before
// compiling — TODO confirm.
texture ref_image_texture;
texture src_images_texture;
texture src_depth_maps_texture;
texture poses_texture;

// Calibration of reference image as {fx, cx, fy, cy}.
__constant__ float ref_K[4];

// Calibration of reference image as {1/fx, -cx/fx, 1/fy, -cy/fy}.
__constant__ float ref_inv_K[4];

// result = mat * vec, where mat is a row-major 3x3 matrix.
__device__ inline void Mat33DotVec3(const float mat[9], const float vec[3],
                                    float result[3]) {
  result[0] = mat[0] * vec[0] + mat[1] * vec[1] + mat[2] * vec[2];
  result[1] = mat[3] * vec[0] + mat[4] * vec[1] + mat[5] * vec[2];
  result[2] = mat[6] * vec[0] + mat[7] * vec[1] + mat[8] * vec[2];
}

// Apply a row-major 3x3 matrix to a 2D point in homogeneous coordinates
// (vec, 1) and dehomogenize the result by the third component.
__device__ inline void Mat33DotVec3Homogeneous(const float mat[9],
                                               const float vec[2],
                                               float result[2]) {
  const float inv_z = 1.0f / (mat[6] * vec[0] + mat[7] * vec[1] + mat[8]);
  result[0] = inv_z * (mat[0] * vec[0] + mat[1] * vec[1] + mat[2]);
  result[1] = inv_z * (mat[3] * vec[0] + mat[4] * vec[1] + mat[5]);
}

// Standard 3-vector dot product.
__device__ inline float DotProduct3(const float vec1[3], const float vec2[3]) {
  return vec1[0] * vec2[0] + vec1[1] * vec2[1] + vec1[2] * vec2[2];
}

// Sample a depth uniformly from [depth_min, depth_max] using the
// per-pixel cuRAND state.
__device__ inline float GenerateRandomDepth(const float depth_min,
                                            const float depth_max,
                                            curandState* rand_state) {
  return curand_uniform(rand_state) * (depth_max - depth_min) + depth_min;
}

// Sample a random unit normal at pixel (row, col) that faces towards the
// camera, i.e. whose dot product with the viewing ray is negative.
__device__ inline void GenerateRandomNormal(const int row, const int col,
                                            curandState* rand_state,
                                            float normal[3]) {
  // Unbiased sampling of normal, according to George Marsaglia, "Choosing a
  // Point from the Surface of a Sphere", 1972.
  float v1 = 0.0f;
  float v2 = 0.0f;
  float s = 2.0f;
  // Rejection-sample (v1, v2) from the unit disk.
  while (s >= 1.0f) {
    v1 = 2.0f * curand_uniform(rand_state) - 1.0f;
    v2 = 2.0f * curand_uniform(rand_state) - 1.0f;
    s = v1 * v1 + v2 * v2;
  }

  const float s_norm = sqrt(1.0f - s);
  normal[0] = 2.0f * v1 * s_norm;
  normal[1] = 2.0f * v2 * s_norm;
  normal[2] = 1.0f - 2.0f * s;

  // Make sure normal is looking away from camera.
  const float view_ray[3] = {ref_inv_K[0] * col + ref_inv_K[1],
                             ref_inv_K[2] * row + ref_inv_K[3], 1.0f};
  if (DotProduct3(normal, view_ray) > 0) {
    normal[0] = -normal[0];
    normal[1] = -normal[1];
    normal[2] = -normal[2];
  }
}

// Sample a new depth uniformly from the interval
// [(1 - perturbation) * depth, (1 + perturbation) * depth].
__device__ inline float PerturbDepth(const float perturbation,
                                     const float depth,
                                     curandState* rand_state) {
  const float depth_min = (1.0f - perturbation) * depth;
  const float depth_max = (1.0f + perturbation) * depth;
  return GenerateRandomDepth(depth_min, depth_max, rand_state);
}

// Rotate the given unit normal by small random Euler angles drawn from
// [-perturbation/2, perturbation/2] (radians). If the perturbed normal no
// longer faces the camera, retry with half the perturbation up to 3 times,
// falling back to the unperturbed normal.
__device__ inline void PerturbNormal(const int row, const int col,
                                     const float perturbation,
                                     const float normal[3],
                                     curandState* rand_state,
                                     float perturbed_normal[3],
                                     const int num_trials = 0) {
  // Perturbation rotation angles.
  const float a1 = (curand_uniform(rand_state) - 0.5f) * perturbation;
  const float a2 = (curand_uniform(rand_state) - 0.5f) * perturbation;
  const float a3 = (curand_uniform(rand_state) - 0.5f) * perturbation;

  const float sin_a1 = sin(a1);
  const float sin_a2 = sin(a2);
  const float sin_a3 = sin(a3);
  const float cos_a1 = cos(a1);
  const float cos_a2 = cos(a2);
  const float cos_a3 = cos(a3);

  // R = Rx * Ry * Rz
  float R[9];
  R[0] = cos_a2 * cos_a3;
  R[1] = -cos_a2 * sin_a3;
  R[2] = sin_a2;
  R[3] = cos_a1 * sin_a3 + cos_a3 * sin_a1 * sin_a2;
  R[4] = cos_a1 * cos_a3 - sin_a1 * sin_a2 * sin_a3;
  R[5] = -cos_a2 * sin_a1;
  R[6] = sin_a1 * sin_a3 - cos_a1 * cos_a3 * sin_a2;
  R[7] = cos_a3 * sin_a1 + cos_a1 * sin_a2 * sin_a3;
  R[8] = cos_a1 * cos_a2;

  // Perturb the normal vector.
  Mat33DotVec3(R, normal, perturbed_normal);

  // Make sure the perturbed normal is still looking in the same direction as
  // the viewing direction, otherwise try again but with smaller perturbation.
  const float view_ray[3] = {ref_inv_K[0] * col + ref_inv_K[1],
                             ref_inv_K[2] * row + ref_inv_K[3], 1.0f};
  if (DotProduct3(perturbed_normal, view_ray) >= 0.0f) {
    const int kMaxNumTrials = 3;
    if (num_trials < kMaxNumTrials) {
      PerturbNormal(row, col, 0.5f * perturbation, normal, rand_state,
                    perturbed_normal, num_trials + 1);
      return;
    } else {
      perturbed_normal[0] = normal[0];
      perturbed_normal[1] = normal[1];
      perturbed_normal[2] = normal[2];
      return;
    }
  }

  // Make sure normal has unit norm.
  const float inv_norm =
      rsqrt(DotProduct3(perturbed_normal, perturbed_normal));
  perturbed_normal[0] *= inv_norm;
  perturbed_normal[1] *= inv_norm;
  perturbed_normal[2] *= inv_norm;
}

// Back-project pixel (row, col) at the given depth into the reference
// camera frame using the inverse calibration ref_inv_K.
__device__ inline void ComputePointAtDepth(const float row, const float col,
                                           const float depth, float point[3]) {
  point[0] = depth * (ref_inv_K[0] * col + ref_inv_K[1]);
  point[1] = depth * (ref_inv_K[2] * row + ref_inv_K[3]);
  point[2] = depth;
}

// Transfer depth on plane from viewing ray at row1 to row2. The returned
// depth is the intersection of the viewing ray through row2 with the plane
// at row1 defined by the given depth and normal.
__device__ inline float PropagateDepth(const float depth1,
                                       const float normal1[3],
                                       const float row1, const float row2) {
  // Point along first viewing ray.
  const float x1 = depth1 * (ref_inv_K[2] * row1 + ref_inv_K[3]);
  const float y1 = depth1;
  // Point on plane defined by point along first viewing ray and plane normal1.
  const float x2 = x1 + normal1[2];
  const float y2 = y1 - normal1[1];

  // Origin of second viewing ray.
  // const float x3 = 0.0f;
  // const float y3 = 0.0f;
  // Point on second viewing ray.
  const float x4 = ref_inv_K[2] * row2 + ref_inv_K[3];
  // const float y4 = 1.0f;

  // Intersection of the lines ((x1, y1), (x2, y2)) and ((x3, y3), (x4, y4)).
  const float denom = x2 - x1 + x4 * (y1 - y2);
  constexpr float kEps = 1e-5f;
  // Near-parallel configuration: fall back to the untransferred depth.
  if (abs(denom) < kEps) {
    return depth1;
  }
  const float nom = y1 * x2 - x1 * y2;
  return nom / denom;
}

// First, compute triangulation angle between reference and source image for 3D
// point. Second, compute incident angle between viewing direction of source
// image and normal direction of 3D point. Both angles are cosine distances.
__device__ inline void ComputeViewingAngles(const float point[3],
                                            const float normal[3],
                                            const int image_idx,
                                            float* cos_triangulation_angle,
                                            float* cos_incident_angle) {
  *cos_triangulation_angle = 0.0f;
  *cos_incident_angle = 0.0f;

  // Projection center of source image, read from columns 16..18 of the
  // per-image row in the poses texture.
  float C[3];
  for (int i = 0; i < 3; ++i) {
    C[i] = tex2D(poses_texture, i + 16, image_idx);
  }

  // Ray from point to camera.
  const float SX[3] = {C[0] - point[0], C[1] - point[1], C[2] - point[2]};

  // Length of ray from reference image to point.
  const float RX_inv_norm = rsqrt(DotProduct3(point, point));

  // Length of ray from source image to point.
  const float SX_inv_norm = rsqrt(DotProduct3(SX, SX));

  *cos_incident_angle = DotProduct3(SX, normal) * SX_inv_norm;
  *cos_triangulation_angle =
      DotProduct3(SX, point) * RX_inv_norm * SX_inv_norm;
}

// Compose the plane-induced homography H that warps reference-image pixels
// to source image `image_idx` for the plane at pixel (row, col) given by
// (depth, normal). Source calibration K, relative rotation R, and relative
// translation T are read from the poses texture at columns 0..3, 4..12,
// and 13..15, respectively.
__device__ inline void ComposeHomography(const int image_idx, const int row,
                                         const int col, const float depth,
                                         const float normal[3], float H[9]) {
  // Calibration of source image.
  float K[4];
  for (int i = 0; i < 4; ++i) {
    K[i] = tex2D(poses_texture, i, image_idx);
  }

  // Relative rotation between reference and source image.
  float R[9];
  for (int i = 0; i < 9; ++i) {
    R[i] = tex2D(poses_texture, i + 4, image_idx);
  }

  // Relative translation between reference and source image.
  float T[3];
  for (int i = 0; i < 3; ++i) {
    T[i] = tex2D(poses_texture, i + 13, image_idx);
  }

  // Distance to the plane.
  const float dist =
      depth * (normal[0] * (ref_inv_K[0] * col + ref_inv_K[1]) +
               normal[1] * (ref_inv_K[2] * row + ref_inv_K[3]) + normal[2]);
  const float inv_dist = 1.0f / dist;

  const float inv_dist_N0 = inv_dist * normal[0];
  const float inv_dist_N1 = inv_dist * normal[1];
  const float inv_dist_N2 = inv_dist * normal[2];

  // Homography as H = K * (R - T * n' / d) * Kref^-1.
  H[0] = ref_inv_K[0] * (K[0] * (R[0] + inv_dist_N0 * T[0]) +
                         K[1] * (R[6] + inv_dist_N0 * T[2]));
  H[1] = ref_inv_K[2] * (K[0] * (R[1] + inv_dist_N1 * T[0]) +
                         K[1] * (R[7] + inv_dist_N1 * T[2]));
  H[2] = K[0] * (R[2] + inv_dist_N2 * T[0]) +
         K[1] * (R[8] + inv_dist_N2 * T[2]) +
         ref_inv_K[1] * (K[0] * (R[0] + inv_dist_N0 * T[0]) +
                         K[1] * (R[6] + inv_dist_N0 * T[2])) +
         ref_inv_K[3] * (K[0] * (R[1] + inv_dist_N1 * T[0]) +
                         K[1] * (R[7] + inv_dist_N1 * T[2]));
  H[3] = ref_inv_K[0] * (K[2] * (R[3] + inv_dist_N0 * T[1]) +
                         K[3] * (R[6] + inv_dist_N0 * T[2]));
  H[4] = ref_inv_K[2] * (K[2] * (R[4] + inv_dist_N1 * T[1]) +
                         K[3] * (R[7] + inv_dist_N1 * T[2]));
  H[5] = K[2] * (R[5] + inv_dist_N2 * T[1]) +
         K[3] * (R[8] + inv_dist_N2 * T[2]) +
         ref_inv_K[1] * (K[2] * (R[3] + inv_dist_N0 * T[1]) +
                         K[3] * (R[6] + inv_dist_N0 * T[2])) +
         ref_inv_K[3] * (K[2] * (R[4] + inv_dist_N1 * T[1]) +
                         K[3] * (R[7] + inv_dist_N1 * T[2]));
  H[6] = ref_inv_K[0] * (R[6] + inv_dist_N0 * T[2]);
  H[7] = ref_inv_K[2] * (R[7] + inv_dist_N1 * T[2]);
  H[8] = R[8] + ref_inv_K[1] * (R[6] + inv_dist_N0 * T[2]) +
         ref_inv_K[3] * (R[7] + inv_dist_N1 * T[2]) + inv_dist_N2 * T[2];
}

// Each thread in the current warp / thread block reads in 3 columns of the
// reference image. The shared memory holds 3 * THREADS_PER_BLOCK columns and
// kWindowSize rows of the reference image. Each thread copies every
// THREADS_PER_BLOCK-th column from global to shared memory offset by its ID.
// For example, if THREADS_PER_BLOCK = 32, then thread 0 reads columns 0, 32, 64
// and thread 1 columns 1, 33, 65.
// When computing the photoconsistency, which is
// shared among each thread block, each thread can then read the reference image
// colors from shared memory. Note that this limits the window radius to a
// maximum of THREADS_PER_BLOCK.
// NOTE(review): the template parameter list appears stripped (`template` with
// no `<...>`); the body references kWindowSize — restore from upstream source.
template
struct LocalRefImage {
  const static int kWindowRadius = kWindowSize / 2;
  const static int kThreadBlockRadius = 1;
  const static int kThreadBlockSize = 2 * kThreadBlockRadius + 1;
  const static int kNumRows = kWindowSize;
  const static int kNumColumns = kThreadBlockSize * THREADS_PER_BLOCK;
  const static int kDataSize = kNumRows * kNumColumns;

  // Points to the shared-memory buffer of kDataSize floats; owned and
  // assigned by the launching kernel.
  float* data = nullptr;

  __device__ inline void Read(const int row) {
    // For the first row, read the entire block into shared memory. For all
    // consecutive rows, it is only necessary to shift the rows in shared memory
    // up by one element and then read in a new row at the bottom of the shared
    // memory. Note that this assumes that the calling loop starts with the
    // first row and then consecutively reads in the next row.
    const int thread_id = threadIdx.x;
    const int thread_block_first_id = blockDim.x * blockIdx.x;

    // Each thread fills every THREADS_PER_BLOCK-th column, starting at its
    // own thread index and offset one thread-block width to the left.
    const int local_col_start = thread_id;
    const int global_col_start = thread_block_first_id -
                                 kThreadBlockRadius * THREADS_PER_BLOCK +
                                 thread_id;

    if (row == 0) {
      int global_row = row - kWindowRadius;
      for (int local_row = 0; local_row < kNumRows; ++local_row, ++global_row) {
        int local_col = local_col_start;
        int global_col = global_col_start;
#pragma unroll
        for (int block = 0; block < kThreadBlockSize; ++block) {
          data[local_row * kNumColumns + local_col] =
              tex2D(ref_image_texture, global_col, global_row);
          local_col += THREADS_PER_BLOCK;
          global_col += THREADS_PER_BLOCK;
        }
      }
    } else {
      // Move rows in shared memory up by one row.
      for (int local_row = 1; local_row < kNumRows; ++local_row) {
        int local_col = local_col_start;
#pragma unroll
        for (int block = 0; block < kThreadBlockSize; ++block) {
          data[(local_row - 1) * kNumColumns + local_col] =
              data[local_row * kNumColumns + local_col];
          local_col += THREADS_PER_BLOCK;
        }
      }

      // Read next row into the last row of shared memory.
      const int local_row = kNumRows - 1;
      const int global_row = row + kWindowRadius;
      int local_col = local_col_start;
      int global_col = global_col_start;
#pragma unroll
      for (int block = 0; block < kThreadBlockSize; ++block) {
        data[local_row * kNumColumns + local_col] =
            tex2D(ref_image_texture, global_col, global_row);
        local_col += THREADS_PER_BLOCK;
        global_col += THREADS_PER_BLOCK;
      }
    }
  }
};

// The return value is 1 - NCC, so the range is [0, 2], the smaller the
// value, the better the color consistency.
// NOTE(review): template parameter list stripped; the body references
// kWindowSize and kWindowStep — restore from upstream source.
template
struct PhotoConsistencyCostComputer {
  const static int kWindowRadius = kWindowSize / 2;

  __device__ PhotoConsistencyCostComputer(const float sigma_spatial,
                                          const float sigma_color)
      : bilateral_weight_computer_(sigma_spatial, sigma_color) {}

  // Maximum photo consistency cost as 1 - min(NCC).
  const float kMaxCost = 2.0f;

  // Thread warp local reference image data around current patch.
  typedef LocalRefImage LocalRefImageType;
  LocalRefImageType local_ref_image;

  // Precomputed sum of raw and squared image intensities.
  float local_ref_sum = 0.0f;
  float local_ref_squared_sum = 0.0f;

  // Index of source image.
  int src_image_idx = -1;

  // Center position of patch in reference image.
  int row = -1;
  int col = -1;

  // Depth and normal for which to warp patch.
  float depth = 0.0f;
  const float* normal = nullptr;

  // Load the shared-memory window for the given row; must be called by all
  // threads of the block (contains a barrier).
  __device__ inline void Read(const int row) {
    local_ref_image.Read(row);
    __syncthreads();
  }

  // Compute the bilaterally weighted NCC cost between the reference patch
  // around (row, col) and its homography-warped counterpart in the source
  // image. Returns 1 - NCC clamped to [0, kMaxCost].
  __device__ inline float Compute() const {
    float tform[9];
    ComposeHomography(src_image_idx, row, col, depth, normal, tform);

    // Per-step increments of the homography application, so the inner loop
    // can advance warped coordinates incrementally.
    float tform_step[8];
    for (int i = 0; i < 8; ++i) {
      tform_step[i] = kWindowStep * tform[i];
    }

    const int thread_id = threadIdx.x;
    const int row_start = row - kWindowRadius;
    const int col_start = col - kWindowRadius;

    // Warped (homogeneous) source coordinates of the window's top-left pixel.
    float col_src = tform[0] * col_start + tform[1] * row_start + tform[2];
    float row_src = tform[3] * col_start + tform[4] * row_start + tform[5];
    float z = tform[6] * col_start + tform[7] * row_start + tform[8];
    float base_col_src = col_src;
    float base_row_src = row_src;
    float base_z = z;

    // Index of this thread's window start within the shared reference image.
    int ref_image_idx = THREADS_PER_BLOCK - kWindowRadius + thread_id;
    int ref_image_base_idx = ref_image_idx;

    const float ref_center_color =
        local_ref_image
            .data[ref_image_idx + kWindowRadius * 3 * THREADS_PER_BLOCK +
                  kWindowRadius];
    const float ref_color_sum = local_ref_sum;
    const float ref_color_squared_sum = local_ref_squared_sum;
    float src_color_sum = 0.0f;
    float src_color_squared_sum = 0.0f;
    float src_ref_color_sum = 0.0f;
    float bilateral_weight_sum = 0.0f;

    for (int row = -kWindowRadius; row <= kWindowRadius; row += kWindowStep) {
      for (int col = -kWindowRadius; col <= kWindowRadius; col += kWindowStep) {
        const float inv_z = 1.0f / z;
        const float norm_col_src = inv_z * col_src + 0.5f;
        const float norm_row_src = inv_z * row_src + 0.5f;
        const float ref_color = local_ref_image.data[ref_image_idx];
        const float src_color = tex2DLayered(src_images_texture, norm_col_src,
                                             norm_row_src, src_image_idx);

        const float bilateral_weight = bilateral_weight_computer_.Compute(
            row, col, ref_center_color, ref_color);

        const float bilateral_weight_src = bilateral_weight * src_color;

        src_color_sum += bilateral_weight_src;
        src_color_squared_sum += bilateral_weight_src * src_color;
        src_ref_color_sum += bilateral_weight_src * ref_color;
        bilateral_weight_sum += bilateral_weight;

        ref_image_idx += kWindowStep;

        // Accumulate warped source coordinates per row to reduce numerical
        // errors. Note that this is necessary since coordinates usually are in
        // the order of 1000s as opposed to the color values which are
        // normalized to the range [0, 1].
        col_src += tform_step[0];
        row_src += tform_step[3];
        z += tform_step[6];
      }
      // Advance to the next window row: restart from the row base and step
      // the base coordinates down one row.
      ref_image_base_idx += kWindowStep * 3 * THREADS_PER_BLOCK;
      ref_image_idx = ref_image_base_idx;
      base_col_src += tform_step[1];
      base_row_src += tform_step[4];
      base_z += tform_step[7];
      col_src = base_col_src;
      row_src = base_row_src;
      z = base_z;
    }

    // Normalize the weighted sums for the source patch.
    const float inv_bilateral_weight_sum = 1.0f / bilateral_weight_sum;
    src_color_sum *= inv_bilateral_weight_sum;
    src_color_squared_sum *= inv_bilateral_weight_sum;
    src_ref_color_sum *= inv_bilateral_weight_sum;

    const float ref_color_var =
        ref_color_squared_sum - ref_color_sum * ref_color_sum;
    const float src_color_var =
        src_color_squared_sum - src_color_sum * src_color_sum;

    // Based on Jensen's Inequality for convex functions, the variance
    // should always be larger than 0. Do not make this threshold smaller.
    constexpr float kMinVar = 1e-5f;
    if (ref_color_var < kMinVar || src_color_var < kMinVar) {
      return kMaxCost;
    } else {
      const float src_ref_color_covar =
          src_ref_color_sum - ref_color_sum * src_color_sum;
      const float src_ref_color_var = sqrt(ref_color_var * src_color_var);
      return max(0.0f, min(kMaxCost,
                           1.0f - src_ref_color_covar / src_ref_color_var));
    }
  }

 private:
  const BilateralWeightComputer bilateral_weight_computer_;
};

// Forward-backward reprojection error for geometric consistency: project the
// reference pixel (row, col) at `depth` into the source image, look up the
// source depth there, back-project into the reference image, and return the
// reprojection distance in pixels, truncated at max_cost. Projection matrices
// P and inv_P are read from the poses texture at columns 19..30 and 31..42.
__device__ inline float ComputeGeomConsistencyCost(const float row,
                                                   const float col,
                                                   const float depth,
                                                   const int image_idx,
                                                   const float max_cost) {
  // Extract projection matrices for source image.
  float P[12];
  for (int i = 0; i < 12; ++i) {
    P[i] = tex2D(poses_texture, i + 19, image_idx);
  }
  float inv_P[12];
  for (int i = 0; i < 12; ++i) {
    inv_P[i] = tex2D(poses_texture, i + 31, image_idx);
  }

  // Project point in reference image to world.
  float forward_point[3];
  ComputePointAtDepth(row, col, depth, forward_point);

  // Project world point to source image.
  const float inv_forward_z =
      1.0f / (P[8] * forward_point[0] + P[9] * forward_point[1] +
              P[10] * forward_point[2] + P[11]);
  float src_col =
      inv_forward_z * (P[0] * forward_point[0] + P[1] * forward_point[1] +
                       P[2] * forward_point[2] + P[3]);
  float src_row =
      inv_forward_z * (P[4] * forward_point[0] + P[5] * forward_point[1] +
                       P[6] * forward_point[2] + P[7]);

  // Extract depth in source image.
  const float src_depth = tex2DLayered(src_depth_maps_texture, src_col + 0.5f,
                                       src_row + 0.5f, image_idx);

  // Projection outside of source image.
  if (src_depth == 0.0f) {
    return max_cost;
  }

  // Project point in source image to world.
  src_col *= src_depth;
  src_row *= src_depth;
  const float backward_point_x =
      inv_P[0] * src_col + inv_P[1] * src_row + inv_P[2] * src_depth + inv_P[3];
  const float backward_point_y =
      inv_P[4] * src_col + inv_P[5] * src_row + inv_P[6] * src_depth + inv_P[7];
  const float backward_point_z =
      inv_P[8] * src_col + inv_P[9] * src_row + inv_P[10] * src_depth +
      inv_P[11];
  const float inv_backward_point_z = 1.0f / backward_point_z;

  // Project world point back to reference image.
  const float backward_col =
      inv_backward_point_z *
      (ref_K[0] * backward_point_x + ref_K[1] * backward_point_z);
  const float backward_row =
      inv_backward_point_z *
      (ref_K[2] * backward_point_y + ref_K[3] * backward_point_z);

  // Return truncated reprojection error between original observation and
  // the forward-backward projected observation.
  const float diff_col = col - backward_col;
  const float diff_row = row - backward_row;
  return min(max_cost, sqrt(diff_col * diff_col + diff_row * diff_row));
}

// Find index of minimum in given values.
// NOTE(review): template parameter list stripped (`template` with no `<...>`);
// the body references kNumCosts — restore from upstream source.
template
__device__ inline int FindMinCost(const float costs[kNumCosts]) {
  float min_cost = costs[0];
  int min_cost_idx = 0;
  // On ties, the later index wins (<=), i.e. the last minimum is returned.
  for (int idx = 1; idx < kNumCosts; ++idx) {
    if (costs[idx] <= min_cost) {
      min_cost = costs[idx];
      min_cost_idx = idx;
    }
  }
  return min_cost_idx;
}

// Normalize the given unnormalized probabilities in-place and replace them
// with their cumulative distribution, so that probs[num_probs - 1] == 1.
__device__ inline void TransformPDFToCDF(float* probs, const int num_probs) {
  float prob_sum = 0.0f;
  for (int i = 0; i < num_probs; ++i) {
    prob_sum += probs[i];
  }
  const float inv_prob_sum = 1.0f / prob_sum;

  float cum_prob = 0.0f;
  for (int i = 0; i < num_probs; ++i) {
    const float prob = probs[i] * inv_prob_sum;
    cum_prob += prob;
    probs[i] = cum_prob;
  }
}

// Computes per-source-image selection likelihoods from NCC cost,
// triangulation angle, incident angle, and patch warping resolution, and
// runs the forward/backward message passing over image rows.
class LikelihoodComputer {
 public:
  __device__ LikelihoodComputer(const float ncc_sigma,
                                const float min_triangulation_angle,
                                const float incident_angle_sigma)
      : cos_min_triangulation_angle_(cos(min_triangulation_angle)),
        inv_incident_angle_sigma_square_(
            -0.5f / (incident_angle_sigma * incident_angle_sigma)),
        inv_ncc_sigma_square_(-0.5f / (ncc_sigma * ncc_sigma)),
        ncc_norm_factor_(ComputeNCCCostNormFactor(ncc_sigma)) {}

  // Compute forward message from current cost and forward message of
  // previous / neighboring pixel.
  __device__ float ComputeForwardMessage(const float cost,
                                         const float prev) const {
    return ComputeMessage(cost, prev);
  }

  // Compute backward message from current cost and backward message of
  // previous / neighboring pixel.
  __device__ float ComputeBackwardMessage(const float cost,
                                          const float prev) const {
    return ComputeMessage(cost, prev);
  }

  // Compute the selection probability from the forward and backward message.
  __device__ inline float ComputeSelProb(const float alpha, const float beta,
                                         const float prev,
                                         const float prev_weight) const {
    const float zn0 = (1.0f - alpha) * (1.0f - beta);
    const float zn1 = alpha * beta;
    const float curr = zn1 / (zn0 + zn1);
    // Blend with the previous sweep's probability.
    return prev_weight * prev + (1.0f - prev_weight) * curr;
  }

  // Compute NCC probability. Note that cost = 1 - NCC.
  __device__ inline float ComputeNCCProb(const float cost) const {
    return exp(cost * cost * inv_ncc_sigma_square_) * ncc_norm_factor_;
  }

  // Compute the triangulation angle probability.
  __device__ inline float ComputeTriProb(
      const float cos_triangulation_angle) const {
    const float abs_cos_triangulation_angle = abs(cos_triangulation_angle);
    if (abs_cos_triangulation_angle > cos_min_triangulation_angle_) {
      // Angle below the minimum: quadratic falloff towards zero likelihood.
      const float scaled = 1.0f - (1.0f - abs_cos_triangulation_angle) /
                                      (1.0f - cos_min_triangulation_angle_);
      const float likelihood = 1.0f - scaled * scaled;
      return min(1.0f, max(0.0f, likelihood));
    } else {
      return 1.0f;
    }
  }

  // Compute the incident angle probability.
  __device__ inline float ComputeIncProb(const float cos_incident_angle) const {
    const float x = 1.0f - max(0.0f, cos_incident_angle);
    return exp(x * x * inv_incident_angle_sigma_square_);
  }

  // Compute the warping/resolution prior probability.
  // NOTE(review): template parameter list stripped; the body references
  // kWindowSize — restore from upstream source.
  template
  __device__ inline float ComputeResolutionProb(const float H[9],
                                                const float row,
                                                const float col) const {
    const int kWindowRadius = kWindowSize / 2;

    // Warp corners of patch in reference image to source image.
    float src1[2];
    const float ref1[2] = {col - kWindowRadius, row - kWindowRadius};
    Mat33DotVec3Homogeneous(H, ref1, src1);
    float src2[2];
    const float ref2[2] = {col - kWindowRadius, row + kWindowRadius};
    Mat33DotVec3Homogeneous(H, ref2, src2);
    float src3[2];
    const float ref3[2] = {col + kWindowRadius, row + kWindowRadius};
    Mat33DotVec3Homogeneous(H, ref3, src3);
    float src4[2];
    const float ref4[2] = {col + kWindowRadius, row - kWindowRadius};
    Mat33DotVec3Homogeneous(H, ref4, src4);

    // Compute area of patches in reference and source image.
    const float ref_area = kWindowSize * kWindowSize;
    // Shoelace formula for the warped quadrilateral.
    const float src_area =
        abs(0.5f * (src1[0] * src2[1] - src2[0] * src1[1] -
                    src1[0] * src4[1] + src2[0] * src3[1] -
                    src3[0] * src2[1] + src4[0] * src1[1] +
                    src3[0] * src4[1] - src4[0] * src3[1]));

    // Ratio of smaller to larger area, so the prior is symmetric in
    // up- and down-sampling.
    if (ref_area > src_area) {
      return src_area / ref_area;
    } else {
      return ref_area / src_area;
    }
  }

 private:
  // The normalization for the likelihood function, i.e. the normalization for
  // the prior on the matching cost.
  __device__ static inline float ComputeNCCCostNormFactor(
      const float ncc_sigma) {
    // A = sqrt(2pi)*sigma/2*erf(sqrt(2)/sigma)
    // erf(x) = 2/sqrt(pi) * integral from 0 to x of exp(-t^2) dt
    return 2.0f / (sqrt(2.0f * M_PI) * ncc_sigma *
                   erff(2.0f / (ncc_sigma * 1.414213562f)));
  }

  // Compute the forward or backward message.
  // NOTE(review): template parameter list stripped; the body references the
  // boolean kForward — restore from upstream source.
  template
  __device__ inline float ComputeMessage(const float cost,
                                         const float prev) const {
    constexpr float kUniformProb = 0.5f;
    constexpr float kNoChangeProb = 0.99999f;
    const float kChangeProb = 1.0f - kNoChangeProb;
    const float emission = ComputeNCCProb(cost);

    float zn0;  // Message for selection probability = 0.
    float zn1;  // Message for selection probability = 1.
    if (kForward) {
      zn0 = (prev * kChangeProb + (1.0f - prev) * kNoChangeProb) * kUniformProb;
      zn1 = (prev * kNoChangeProb + (1.0f - prev) * kChangeProb) * emission;
    } else {
      zn0 = prev * emission * kChangeProb +
            (1.0f - prev) * kUniformProb * kNoChangeProb;
      zn1 = prev * emission * kNoChangeProb +
            (1.0f - prev) * kUniformProb * kChangeProb;
    }

    return zn1 / (zn0 + zn1);
  }

  float cos_min_triangulation_angle_;
  float inv_incident_angle_sigma_square_;
  float inv_ncc_sigma_square_;
  float ncc_norm_factor_;
};

// Initialize the normal map with random, camera-facing normals (the previous
// comment here, "Rotate normals by 90deg...", was a copy-paste of the one
// above RotateNormalMap and did not describe InitNormalMap).
// Fill the normal map with random camera-facing normals, consuming and
// updating the per-pixel cuRAND states.
// NOTE(review): the GpuMat element-type template arguments appear stripped
// throughout this section (`GpuMat` with no `<...>`); restore from the
// upstream source.
__global__ void InitNormalMap(GpuMat normal_map,
                              GpuMat rand_state_map) {
  const int row = blockDim.y * blockIdx.y + threadIdx.y;
  const int col = blockDim.x * blockIdx.x + threadIdx.x;
  if (col < normal_map.GetWidth() && row < normal_map.GetHeight()) {
    curandState rand_state = rand_state_map.Get(row, col);
    float normal[3];
    GenerateRandomNormal(row, col, &rand_state, normal);
    normal_map.SetSlice(row, col, normal);
    // Persist the advanced RNG state for the next kernel that samples.
    rand_state_map.Set(row, col, rand_state);
  }
}

// Rotate normals by 90deg around z-axis in counter-clockwise direction.
__global__ void RotateNormalMap(GpuMat normal_map) {
  const int row = blockDim.y * blockIdx.y + threadIdx.y;
  const int col = blockDim.x * blockIdx.x + threadIdx.x;
  if (col < normal_map.GetWidth() && row < normal_map.GetHeight()) {
    float normal[3];
    normal_map.GetSlice(row, col, normal);
    float rotated_normal[3];
    // (x, y) -> (y, -x); z is unchanged.
    rotated_normal[0] = normal[1];
    rotated_normal[1] = -normal[0];
    rotated_normal[2] = normal[2];
    normal_map.SetSlice(row, col, rotated_normal);
  }
}

// Compute the initial photometric cost for every pixel and source image,
// sweeping rows top to bottom so the shared-memory reference window can be
// shifted incrementally (see LocalRefImage::Read).
// NOTE(review): template parameter list stripped; restore from upstream.
template
__global__ void ComputeInitialCost(GpuMat cost_map,
                                   const GpuMat depth_map,
                                   const GpuMat normal_map,
                                   const GpuMat ref_sum_image,
                                   const GpuMat ref_squared_sum_image,
                                   const float sigma_spatial,
                                   const float sigma_color) {
  const int col = blockDim.x * blockIdx.x + threadIdx.x;

  typedef PhotoConsistencyCostComputer PhotoConsistencyCostComputerType;
  PhotoConsistencyCostComputerType pcc_computer(sigma_spatial, sigma_color);
  pcc_computer.col = col;

  // Shared window of the reference image, filled cooperatively per row.
  __shared__ float local_ref_image_data
      [PhotoConsistencyCostComputerType::LocalRefImageType::kDataSize];
  pcc_computer.local_ref_image.data = &local_ref_image_data[0];

  float normal[3] = {0};
  pcc_computer.normal = normal;

  for (int row = 0; row < cost_map.GetHeight(); ++row) {
    // Note that this must be executed even for pixels outside the borders,
    // since pixels are used in the local neighborhood of the current pixel.
    pcc_computer.Read(row);

    if (col < cost_map.GetWidth()) {
      pcc_computer.depth = depth_map.Get(row, col);
      normal_map.GetSlice(row, col, normal);

      pcc_computer.row = row;
      pcc_computer.local_ref_sum = ref_sum_image.Get(row, col);
      pcc_computer.local_ref_squared_sum = ref_squared_sum_image.Get(row, col);

      for (int image_idx = 0; image_idx < cost_map.GetDepth(); ++image_idx) {
        pcc_computer.src_image_idx = image_idx;
        cost_map.Set(row, col, image_idx, pcc_computer.Compute());
      }
    }
  }
}

// Tunable parameters for one propagation sweep.
struct SweepOptions {
  // Relative magnitude of random depth/normal perturbations.
  float perturbation = 1.0f;
  // Valid depth range for random sampling.
  float depth_min = 0.0f;
  float depth_max = 1.0f;
  // Number of Monte Carlo source-image samples per pixel.
  int num_samples = 15;
  // Bilateral weighting bandwidths.
  float sigma_spatial = 3.0f;
  float sigma_color = 0.3f;
  // Likelihood bandwidths (see LikelihoodComputer).
  float ncc_sigma = 0.6f;
  float min_triangulation_angle = 0.5f;
  float incident_angle_sigma = 0.9f;
  // Blend weight for the previous sweep's selection probability.
  float prev_sel_prob_weight = 0.0f;
  // Geometric-consistency term weight and truncation.
  float geom_consistency_regularizer = 0.1f;
  float geom_consistency_max_cost = 5.0f;
  // Filtering thresholds.
  float filter_min_ncc = 0.1f;
  float filter_min_triangulation_angle = 3.0f;
  int filter_min_num_consistent = 2;
  float filter_geom_consistency_max_cost = 1.0f;
};

// Main propagation kernel: one sweep of PatchMatch from top to bottom.
// NOTE(review): template parameter list stripped (the body references
// kGeomConsistencyTerm and others); the kernel body continues beyond this
// excerpt.
template
__global__ void SweepFromTopToBottom(
    GpuMat global_workspace, GpuMat rand_state_map,
    GpuMat cost_map, GpuMat depth_map, GpuMat normal_map,
    GpuMat consistency_mask, GpuMat sel_prob_map,
    const GpuMat prev_sel_prob_map, const GpuMat ref_sum_image,
    const GpuMat ref_squared_sum_image, const SweepOptions options) {
  const int col = blockDim.x * blockIdx.x + threadIdx.x;

  // Probability for boundary pixels.
  constexpr float kUniformProb = 0.5f;

  LikelihoodComputer likelihood_computer(options.ncc_sigma,
                                         options.min_triangulation_angle,
                                         options.incident_angle_sigma);

  // Per-column scratch buffers inside the global workspace: forward messages
  // in the first width*height slab, sampling probabilities in the second.
  float* forward_message =
      &global_workspace.GetPtr()[col * global_workspace.GetHeight()];
  float* sampling_probs =
      &global_workspace.GetPtr()[global_workspace.GetWidth() *
                                     global_workspace.GetHeight() +
                                 col * global_workspace.GetHeight()];

  //////////////////////////////////////////////////////////////////////////////
  // Compute backward message for all rows.
Note that the backward messages are
// temporarily stored in the sel_prob_map and replaced row by row as the
// updated forward messages are computed further below.
// (NOTE(review): the bare fragment above is the tail of a comment whose `//`
// opener is on the previous, unseen line of this chunk.)
//////////////////////////////////////////////////////////////////////////////

// Backward pass: sweep each column bottom-to-top, accumulating the backward
// message per source image. One thread handles one image column; the guard
// skips threads past the right image border (grid overshoot).
if (col < cost_map.GetWidth()) {
  for (int image_idx = 0; image_idx < cost_map.GetDepth(); ++image_idx) {
    // Compute backward message.
    float beta = kUniformProb;
    for (int row = cost_map.GetHeight() - 1; row >= 0; --row) {
      const float cost = cost_map.Get(row, col, image_idx);
      beta = likelihood_computer.ComputeBackwardMessage(cost, beta);
      // Stash beta in sel_prob_map; it is overwritten row by row during the
      // forward pass below once the backward value has been consumed.
      sel_prob_map.Set(row, col, image_idx, beta);
    }
    // Initialize forward message.
    forward_message[image_idx] = kUniformProb;
  }
}

//////////////////////////////////////////////////////////////////////////////
// Estimate parameters for remaining rows and compute selection probabilities.
//////////////////////////////////////////////////////////////////////////////

// NOTE(review): template arguments of PhotoConsistencyCostComputer appear to
// have been stripped by extraction (originally parameterized on window
// size/step); restore from upstream before compiling.
typedef PhotoConsistencyCostComputer PhotoConsistencyCostComputerType;
PhotoConsistencyCostComputerType pcc_computer(options.sigma_spatial,
                                              options.sigma_color);
pcc_computer.col = col;

// Block-shared staging buffer for the local reference-image window; every
// thread's pcc_computer points into the same shared array.
__shared__ float local_ref_image_data
    [PhotoConsistencyCostComputerType::LocalRefImageType::kDataSize];
pcc_computer.local_ref_image.data = &local_ref_image_data[0];

// Per-pixel plane hypothesis: a depth and a unit(?) normal.
// (NOTE(review): normalization of `normal` is established elsewhere — the
// perturb/propagate helpers are outside this chunk.)
struct ParamState {
  float depth = 0.0f;
  float normal[3] = {0};
};
// Parameters of previous pixel in column.
ParamState prev_param_state;
// Parameters of current pixel in column.
ParamState curr_param_state;
// Randomly sampled parameters.
ParamState rand_param_state;

// Cuda PRNG state for random sampling.
curandState rand_state;

if (col < cost_map.GetWidth()) {
  // Read random state for current column.
  rand_state = rand_state_map.Get(0, col);
  // Parameters for first row in column.
  prev_param_state.depth = depth_map.Get(0, col);
  normal_map.GetSlice(0, col, prev_param_state.normal);
}

// Forward pass: top-to-bottom over the column.
for (int row = 0; row < cost_map.GetHeight(); ++row) {
  // Note that this must be executed even for pixels outside the borders,
  // since pixels are used in the local neighborhood of the current pixel.
  // (Read() presumably cooperates block-wide on the shared window buffer —
  // all threads must reach it; hence the guard comes only after this call.)
  pcc_computer.Read(row);

  if (col >= cost_map.GetWidth()) {
    continue;
  }

  pcc_computer.row = row;
  pcc_computer.local_ref_sum = ref_sum_image.Get(row, col);
  pcc_computer.local_ref_squared_sum = ref_squared_sum_image.Get(row, col);

  // Propagate the depth at which the current ray intersects with the plane
  // of the normal of the previous ray. This helps to better estimate
  // the depth of very oblique structures, i.e. pixels whose normal direction
  // is significantly different from their viewing direction.
  prev_param_state.depth = PropagateDepth(
      prev_param_state.depth, prev_param_state.normal, row - 1, row);

  // Read parameters for current pixel from previous sweep.
  curr_param_state.depth = depth_map.Get(row, col);
  normal_map.GetSlice(row, col, curr_param_state.normal);

  // Generate random parameters (perturbations of the current estimate).
  rand_param_state.depth = PerturbDepth(options.perturbation,
                                        curr_param_state.depth, &rand_state);
  PerturbNormal(row, col, options.perturbation * M_PI,
                curr_param_state.normal, &rand_state,
                rand_param_state.normal);

  // Read in the backward message, compute selection probabilities and
  // modulate selection probabilities with priors.
  float point[3];
  ComputePointAtDepth(row, col, curr_param_state.depth, point);
  for (int image_idx = 0; image_idx < cost_map.GetDepth(); ++image_idx) {
    const float cost = cost_map.Get(row, col, image_idx);
    const float alpha = likelihood_computer.ComputeForwardMessage(
        cost, forward_message[image_idx]);
    // beta was stored during the backward pass above.
    const float beta = sel_prob_map.Get(row, col, image_idx);
    const float prev_prob = prev_sel_prob_map.Get(row, col, image_idx);
    const float sel_prob = likelihood_computer.ComputeSelProb(
        alpha, beta, prev_prob, options.prev_sel_prob_weight);
    // Geometric priors: triangulation angle, incident angle, and the
    // resolution change implied by the homography into the source image.
    float cos_triangulation_angle;
    float cos_incident_angle;
    ComputeViewingAngles(point, curr_param_state.normal, image_idx,
                         &cos_triangulation_angle, &cos_incident_angle);
    const float tri_prob =
        likelihood_computer.ComputeTriProb(cos_triangulation_angle);
    const float inc_prob =
        likelihood_computer.ComputeIncProb(cos_incident_angle);
    float H[9];
    ComposeHomography(image_idx, row, col, curr_param_state.depth,
                      curr_param_state.normal, H);
    const float res_prob =
        likelihood_computer.ComputeResolutionProb(H, row, col);
    sampling_probs[image_idx] = sel_prob * tri_prob * inc_prob * res_prob;
  }
  TransformPDFToCDF(sampling_probs, cost_map.GetDepth());

  // Compute matching cost using Monte Carlo sampling of source images. Images
  // with higher selection probability are more likely to be sampled. Hence,
  // if only very few source images see the reference image pixel, the same
  // source image is likely to be sampled many times. Instead of taking
  // the best K probabilities, this sampling scheme has the advantage of
  // being adaptive to any distribution of selection probabilities.
  // Five competing hypotheses: current, propagated, random, and the two
  // cross-combinations of depth/normal.
  constexpr int kNumCosts = 5;
  float costs[kNumCosts] = {0};
  const float depths[kNumCosts] = {
      curr_param_state.depth, prev_param_state.depth, rand_param_state.depth,
      curr_param_state.depth, rand_param_state.depth};
  const float* normals[kNumCosts] = {
      curr_param_state.normal, prev_param_state.normal,
      rand_param_state.normal, rand_param_state.normal,
      curr_param_state.normal};

  for (int sample = 0; sample < options.num_samples; ++sample) {
    // Inverse-CDF sampling; FLT_EPSILON guards against curand_uniform's
    // inclusive upper bound of 1.
    const float rand_prob = curand_uniform(&rand_state) - FLT_EPSILON;

    pcc_computer.src_image_idx = -1;
    for (int image_idx = 0; image_idx < cost_map.GetDepth(); ++image_idx) {
      const float prob = sampling_probs[image_idx];
      if (prob > rand_prob) {
        pcc_computer.src_image_idx = image_idx;
        break;
      }
    }

    if (pcc_computer.src_image_idx == -1) {
      continue;
    }

    // Hypothesis 0 (current parameters): reuse the cached cost.
    costs[0] += cost_map.Get(row, col, pcc_computer.src_image_idx);
    if (kGeomConsistencyTerm) {
      costs[0] += options.geom_consistency_regularizer *
                  ComputeGeomConsistencyCost(row, col, depths[0],
                                             pcc_computer.src_image_idx,
                                             options.geom_consistency_max_cost);
    }

    // Hypotheses 1..4: recompute photo-consistency for the sampled image.
    for (int i = 1; i < kNumCosts; ++i) {
      pcc_computer.depth = depths[i];
      pcc_computer.normal = normals[i];
      costs[i] += pcc_computer.Compute();
      if (kGeomConsistencyTerm) {
        costs[i] += options.geom_consistency_regularizer *
                    ComputeGeomConsistencyCost(
                        row, col, depths[i], pcc_computer.src_image_idx,
                        options.geom_consistency_max_cost);
      }
    }
  }

  // Find the parameters of the minimum cost.
  // (NOTE(review): template argument of FindMinCost appears stripped by
  // extraction; originally FindMinCost<kNumCosts>.)
  const int min_cost_idx = FindMinCost(costs);
  const float best_depth = depths[min_cost_idx];
  const float* best_normal = normals[min_cost_idx];

  // Save best new parameters.
  depth_map.Set(row, col, best_depth);
  normal_map.SetSlice(row, col, best_normal);

  // Use the new cost to recompute the updated forward message and
  // the selection probability.
  pcc_computer.depth = best_depth;
  pcc_computer.normal = best_normal;
  for (int image_idx = 0; image_idx < cost_map.GetDepth(); ++image_idx) {
    // Determine the cost for best depth.
    float cost;
    if (min_cost_idx == 0) {
      // Current parameters won: cached cost is still valid.
      cost = cost_map.Get(row, col, image_idx);
    } else {
      pcc_computer.src_image_idx = image_idx;
      cost = pcc_computer.Compute();
      cost_map.Set(row, col, image_idx, cost);
    }

    const float alpha = likelihood_computer.ComputeForwardMessage(
        cost, forward_message[image_idx]);
    const float beta = sel_prob_map.Get(row, col, image_idx);
    const float prev_prob = prev_sel_prob_map.Get(row, col, image_idx);
    const float prob = likelihood_computer.ComputeSelProb(
        alpha, beta, prev_prob, options.prev_sel_prob_weight);
    forward_message[image_idx] = alpha;
    // Overwrite the stashed backward message with the final probability.
    sel_prob_map.Set(row, col, image_idx, prob);
  }

  // Optional filtering: mark which source images consistently observe this
  // pixel, by photometric and/or geometric criteria.
  if (kFilterPhotoConsistency || kFilterGeomConsistency) {
    int num_consistent = 0;

    float best_point[3];
    ComputePointAtDepth(row, col, best_depth, best_point);

    const float min_ncc_prob =
        likelihood_computer.ComputeNCCProb(1.0f - options.filter_min_ncc);
    // NOTE(review): double-precision cos() in device code; cosf() would avoid
    // the implicit promotion — behavior kept unchanged here.
    const float cos_min_triangulation_angle =
        cos(options.filter_min_triangulation_angle);

    for (int image_idx = 0; image_idx < cost_map.GetDepth(); ++image_idx) {
      float cos_triangulation_angle;
      float cos_incident_angle;
      ComputeViewingAngles(best_point, best_normal, image_idx,
                           &cos_triangulation_angle, &cos_incident_angle);
      // Reject views with too small a triangulation angle or back-facing
      // incidence.
      if (cos_triangulation_angle > cos_min_triangulation_angle ||
          cos_incident_angle <= 0.0f) {
        continue;
      }

      if (!kFilterGeomConsistency) {
        // Photometric criterion only.
        if (sel_prob_map.Get(row, col, image_idx) >= min_ncc_prob) {
          consistency_mask.Set(row, col, image_idx, 1);
          num_consistent += 1;
        }
      } else if (!kFilterPhotoConsistency) {
        // Geometric criterion only.
        if (ComputeGeomConsistencyCost(row, col, best_depth, image_idx,
                                       options.geom_consistency_max_cost) <=
            options.filter_geom_consistency_max_cost) {
          consistency_mask.Set(row, col, image_idx, 1);
          num_consistent += 1;
        }
      } else {
        // Both criteria must hold.
        if (sel_prob_map.Get(row, col, image_idx) >= min_ncc_prob &&
            ComputeGeomConsistencyCost(row, col, best_depth, image_idx,
                                       options.geom_consistency_max_cost) <=
                options.filter_geom_consistency_max_cost) {
          consistency_mask.Set(row, col, image_idx, 1);
          num_consistent += 1;
        }
      }
    }

    // Invalidate the pixel if too few views agree; the condition continues
    // on the next source line.
    if
(num_consistent < options.filter_min_num_consistent) {
      // Too few consistent views: zero out depth, normal, and mask.
      depth_map.Set(row, col, 0.0f);
      normal_map.Set(row, col, 0, 0.0f);
      normal_map.Set(row, col, 1, 0.0f);
      normal_map.Set(row, col, 2, 0.0f);
      for (int image_idx = 0; image_idx < cost_map.GetDepth(); ++image_idx) {
        consistency_mask.Set(row, col, image_idx, 0);
      }
    }
  }

  // Update previous depth for next row.
  prev_param_state.depth = best_depth;
  for (int i = 0; i < 3; ++i) {
    prev_param_state.normal[i] = best_normal[i];
  }
}

// Persist the PRNG state so the next sweep continues the random sequence.
if (col < cost_map.GetWidth()) {
  rand_state_map.Set(0, col, rand_state);
}
}

// Constructs the CUDA PatchMatch solver: selects the CUDA device given by
// options.gpu_index and uploads reference/source images, transforms, and
// workspace buffers to the device.
PatchMatchCuda::PatchMatchCuda(const PatchMatchOptions& options,
                               const PatchMatch::Problem& problem)
    : options_(options),
      problem_(problem),
      ref_width_(0),
      ref_height_(0),
      rotation_in_half_pi_(0) {
  SetBestCudaDevice(std::stoi(options_.gpu_index));
  InitRefImage();
  InitSourceImages();
  InitTransforms();
  InitWorkspaceMemory();
}

// Releases the per-rotation pose arrays; other device memory is released by
// the member smart pointers.
PatchMatchCuda::~PatchMatchCuda() {
  for (size_t i = 0; i < 4; ++i) {
    poses_device_[i].reset();
  }
}

// Dispatches to the compile-time specialized RunWithWindowSizeAndStep for the
// configured window radius (1..20) and step (1..2).
void PatchMatchCuda::Run() {
#define CASE_WINDOW_RADIUS(window_radius, window_step) \
  case window_radius: \
    RunWithWindowSizeAndStep<2 * window_radius + 1, window_step>(); \
    break;

#define CASE_WINDOW_STEP(window_step) \
  case window_step: \
    switch (options_.window_radius) { \
      CASE_WINDOW_RADIUS(1, window_step) \
      CASE_WINDOW_RADIUS(2, window_step) \
      CASE_WINDOW_RADIUS(3, window_step) \
      CASE_WINDOW_RADIUS(4, window_step) \
      CASE_WINDOW_RADIUS(5, window_step) \
      CASE_WINDOW_RADIUS(6, window_step) \
      CASE_WINDOW_RADIUS(7, window_step) \
      CASE_WINDOW_RADIUS(8, window_step) \
      CASE_WINDOW_RADIUS(9, window_step) \
      CASE_WINDOW_RADIUS(10, window_step) \
      CASE_WINDOW_RADIUS(11, window_step) \
      CASE_WINDOW_RADIUS(12, window_step) \
      CASE_WINDOW_RADIUS(13, window_step) \
      CASE_WINDOW_RADIUS(14, window_step) \
      CASE_WINDOW_RADIUS(15, window_step) \
      CASE_WINDOW_RADIUS(16, window_step) \
      CASE_WINDOW_RADIUS(17, window_step) \
      CASE_WINDOW_RADIUS(18, window_step) \
      CASE_WINDOW_RADIUS(19, window_step) \
      CASE_WINDOW_RADIUS(20, window_step) \
      default: { \
        std::cerr << "Error: Window size not supported" << std::endl; \
        break; \
      } \
    } \
    break;

  switch (options_.window_step) {
    CASE_WINDOW_STEP(1)
    CASE_WINDOW_STEP(2)
    default: {
      std::cerr << "Error: Window step not supported" << std::endl;
      break;
    }
  }

// Fix: the original undefined SWITCH_WINDOW_RADIUS / CALL_RUN_FUNC, which were
// never defined, leaving the actually-defined helper macros to leak past this
// function. Undefine the macros defined above instead.
#undef CASE_WINDOW_RADIUS
#undef CASE_WINDOW_STEP
}

// Downloads the current depth estimate from the device.
DepthMap PatchMatchCuda::GetDepthMap() const {
  return DepthMap(depth_map_->CopyToMat(), options_.depth_min,
                  options_.depth_max);
}

// Downloads the current normal estimate from the device.
NormalMap PatchMatchCuda::GetNormalMap() const {
  return NormalMap(normal_map_->CopyToMat());
}

// Downloads the per-source-image selection probability map.
// (NOTE(review): template arguments of Mat appear stripped by extraction.)
Mat PatchMatchCuda::GetSelProbMap() const {
  return prev_sel_prob_map_->CopyToMat();
}

// Serializes the consistency mask as a flat list of records
// {col, row, count, image_idx...} for every pixel with at least one
// consistent source image.
// (NOTE(review): element types of std::vector appear stripped by extraction.)
std::vector PatchMatchCuda::GetConsistentImageIdxs() const {
  const Mat mask = consistency_mask_->CopyToMat();
  std::vector consistent_image_idxs;
  std::vector pixel_consistent_image_idxs;
  pixel_consistent_image_idxs.reserve(mask.GetDepth());
  for (size_t r = 0; r < mask.GetHeight(); ++r) {
    for (size_t c = 0; c < mask.GetWidth(); ++c) {
      pixel_consistent_image_idxs.clear();
      for (size_t d = 0; d < mask.GetDepth(); ++d) {
        if (mask.Get(r, c, d)) {
          pixel_consistent_image_idxs.push_back(problem_.src_image_idxs[d]);
        }
      }
      if (pixel_consistent_image_idxs.size() > 0) {
        consistent_image_idxs.push_back(c);
        consistent_image_idxs.push_back(r);
        consistent_image_idxs.push_back(pixel_consistent_image_idxs.size());
        consistent_image_idxs.insert(consistent_image_idxs.end(),
                                     pixel_consistent_image_idxs.begin(),
                                     pixel_consistent_image_idxs.end());
      }
    }
  }
  return consistent_image_idxs;
}

// Main optimization loop, specialized at compile time on window size/step.
// (NOTE(review): template parameter list appears stripped by extraction;
// originally template <int kWindowSize, int kWindowStep>.)
template
void PatchMatchCuda::RunWithWindowSizeAndStep() {
  // Wait for all initializations to finish.
CUDA_SYNC_AND_CHECK();

  CudaTimer total_timer;
  CudaTimer init_timer;

  ComputeCudaConfig();

  // Initialize the per-image cost map from the initial (random or provided)
  // depth/normal estimates.
  // (NOTE(review): template arguments and the <<<grid, block>>> launch
  // configuration appear stripped by extraction.)
  ComputeInitialCost
      <<>>(
          *cost_map_, *depth_map_, *normal_map_, *ref_image_->sum_image,
          *ref_image_->squared_sum_image, options_.sigma_spatial,
          options_.sigma_color);
  CUDA_SYNC_AND_CHECK();

  init_timer.Print("Initialization");

  // 4 sweep directions per iteration (the image is rotated by 90 degrees
  // between sweeps; see Rotate()).
  const float total_num_steps = options_.num_iterations * 4;

  SweepOptions sweep_options;
  sweep_options.depth_min = options_.depth_min;
  sweep_options.depth_max = options_.depth_max;
  sweep_options.sigma_spatial = options_.sigma_spatial;
  sweep_options.sigma_color = options_.sigma_color;
  sweep_options.num_samples = options_.num_samples;
  sweep_options.ncc_sigma = options_.ncc_sigma;
  // NOTE(review): DEG2RAD is an unparenthesized macro; safe here because a
  // plain identifier is passed as the argument.
  sweep_options.min_triangulation_angle =
      DEG2RAD(options_.min_triangulation_angle);
  sweep_options.incident_angle_sigma = options_.incident_angle_sigma;
  sweep_options.geom_consistency_regularizer =
      options_.geom_consistency_regularizer;
  sweep_options.geom_consistency_max_cost = options_.geom_consistency_max_cost;
  sweep_options.filter_min_ncc = options_.filter_min_ncc;
  sweep_options.filter_min_triangulation_angle =
      DEG2RAD(options_.filter_min_triangulation_angle);
  sweep_options.filter_min_num_consistent = options_.filter_min_num_consistent;
  sweep_options.filter_geom_consistency_max_cost =
      options_.filter_geom_consistency_max_cost;

  for (int iter = 0; iter < options_.num_iterations; ++iter) {
    CudaTimer iter_timer;

    for (int sweep = 0; sweep < 4; ++sweep) {
      CudaTimer sweep_timer;

      // Exponentially reduce amount of perturbation during the optimization.
      sweep_options.perturbation = 1.0f / std::pow(2.0f, iter + sweep / 4.0f);

      // Linearly increase the influence of previous selection probabilities.
      sweep_options.prev_sel_prob_weight =
          static_cast(iter * 4 + sweep) / total_num_steps;
      // (NOTE(review): static_cast target type appears stripped by
      // extraction; originally static_cast<float>.)

      const bool last_sweep = iter == options_.num_iterations - 1 && sweep == 3;

// Launch helper; the template flags (geometric-consistency term and the two
// filter flags) are taken from same-named local const bools at the expansion
// site. (NOTE(review): template arguments and launch configuration appear
// stripped by extraction.)
#define CALL_SWEEP_FUNC \
  SweepFromTopToBottom \
      <<>>( \
          *global_workspace_, *rand_state_map_, *cost_map_, *depth_map_, \
          *normal_map_, *consistency_mask_, *sel_prob_map_, \
          *prev_sel_prob_map_, *ref_image_->sum_image, \
          *ref_image_->squared_sum_image, sweep_options);

      if (last_sweep) {
        // Filtering only happens on the very last sweep, where the
        // consistency mask is allocated and filled.
        if (options_.filter) {
          consistency_mask_.reset(new GpuMat(cost_map_->GetWidth(),
                                             cost_map_->GetHeight(),
                                             cost_map_->GetDepth()));
          consistency_mask_->FillWithScalar(0);
        }
        if (options_.geom_consistency) {
          const bool kGeomConsistencyTerm = true;
          if (options_.filter) {
            const bool kFilterPhotoConsistency = true;
            const bool kFilterGeomConsistency = true;
            CALL_SWEEP_FUNC
          } else {
            const bool kFilterPhotoConsistency = false;
            const bool kFilterGeomConsistency = false;
            CALL_SWEEP_FUNC
          }
        } else {
          const bool kGeomConsistencyTerm = false;
          if (options_.filter) {
            const bool kFilterPhotoConsistency = true;
            const bool kFilterGeomConsistency = false;
            CALL_SWEEP_FUNC
          } else {
            const bool kFilterPhotoConsistency = false;
            const bool kFilterGeomConsistency = false;
            CALL_SWEEP_FUNC
          }
        }
      } else {
        // Non-final sweeps never filter.
        const bool kFilterPhotoConsistency = false;
        const bool kFilterGeomConsistency = false;
        if (options_.geom_consistency) {
          const bool kGeomConsistencyTerm = true;
          CALL_SWEEP_FUNC
        } else {
          const bool kGeomConsistencyTerm = false;
          CALL_SWEEP_FUNC
        }
      }

#undef CALL_SWEEP_FUNC

      CUDA_SYNC_AND_CHECK();

      // Rotate all maps by 90 degrees so the next sweep processes the next
      // direction with the same top-to-bottom kernel.
      Rotate();

      // Rotate selected image map.
if (last_sweep && options_.filter) {
        // The consistency mask must follow the same rotation as the other
        // maps so it lines up with the final (unrotated) output.
        // (NOTE(review): template arguments appear stripped by extraction,
        // e.g. std::unique_ptr<GpuMat<...>>.)
        std::unique_ptr> rot_consistency_mask_(
            new GpuMat(cost_map_->GetWidth(), cost_map_->GetHeight(),
                       cost_map_->GetDepth()));
        consistency_mask_->Rotate(rot_consistency_mask_.get());
        consistency_mask_.swap(rot_consistency_mask_);
      }

      sweep_timer.Print(" Sweep " + std::to_string(sweep + 1));
    }

    iter_timer.Print("Iteration " + std::to_string(iter + 1));
  }

  total_timer.Print("Total");
}

// Derives kernel launch dimensions from the current depth map extent:
// a 1D config for the column-parallel sweep kernel and a 2D config for
// element-wise kernels. Re-invoked after every rotation.
void PatchMatchCuda::ComputeCudaConfig() {
  sweep_block_size_.x = THREADS_PER_BLOCK;
  sweep_block_size_.y = 1;
  sweep_block_size_.z = 1;
  // Ceil-division so partial blocks cover the image border.
  sweep_grid_size_.x = (depth_map_->GetWidth() - 1) / THREADS_PER_BLOCK + 1;
  sweep_grid_size_.y = 1;
  sweep_grid_size_.z = 1;

  elem_wise_block_size_.x = THREADS_PER_BLOCK;
  elem_wise_block_size_.y = THREADS_PER_BLOCK;
  elem_wise_block_size_.z = 1;
  elem_wise_grid_size_.x =
      (depth_map_->GetWidth() - 1) / THREADS_PER_BLOCK + 1;
  elem_wise_grid_size_.y =
      (depth_map_->GetHeight() - 1) / THREADS_PER_BLOCK + 1;
  elem_wise_grid_size_.z = 1;
}

// Uploads the reference image (plus filtered sum / squared-sum images) to the
// device and binds it to the global reference-image texture.
void PatchMatchCuda::InitRefImage() {
  const Image& ref_image = problem_.images->at(problem_.ref_image_idx);

  ref_width_ = ref_image.GetWidth();
  ref_height_ = ref_image.GetHeight();

  // Upload to device.
  ref_image_.reset(new GpuMatRefImage(ref_width_, ref_height_));
  // (NOTE(review): template arguments of std::vector / CudaArrayWrapper
  // appear stripped by extraction.)
  const std::vector ref_image_array =
      ref_image.GetBitmap().ConvertToRowMajorArray();
  ref_image_->Filter(ref_image_array.data(), options_.window_radius,
                     options_.window_step, options_.sigma_spatial,
                     options_.sigma_color);

  ref_image_device_.reset(
      new CudaArrayWrapper(ref_width_, ref_height_, 1));
  ref_image_device_->CopyFromGpuMat(*ref_image_->image);

  // Create texture: border addressing and nearest-neighbor (point) sampling.
  ref_image_texture.addressMode[0] = cudaAddressModeBorder;
  ref_image_texture.addressMode[1] = cudaAddressModeBorder;
  ref_image_texture.addressMode[2] = cudaAddressModeBorder;
  ref_image_texture.filterMode = cudaFilterModePoint;
  ref_image_texture.normalized = false;
  CUDA_SAFE_CALL(
      cudaBindTextureToArray(ref_image_texture, ref_image_device_->GetPtr()));
}

// Packs all source images into one contiguous layered array (padded to the
// maximum image size) and binds it to the source-images texture. If geometric
// consistency is enabled, the same is done for the source depth maps.
void PatchMatchCuda::InitSourceImages() {
  // Determine maximum image size.
  size_t max_width = 0;
  size_t max_height = 0;
  for (const auto image_idx : problem_.src_image_idxs) {
    const Image& image = problem_.images->at(image_idx);
    if (image.GetWidth() > max_width) {
      max_width = image.GetWidth();
    }
    if (image.GetHeight() > max_height) {
      max_height = image.GetHeight();
    }
  }

  // Upload source images to device.
  {
    // Copy source images to contiguous memory block; smaller images are
    // zero-padded to the maximum extent.
    const uint8_t kDefaultValue = 0;
    std::vector src_images_host_data(
        static_cast(max_width * max_height *
                    problem_.src_image_idxs.size()),
        kDefaultValue);
    for (size_t i = 0; i < problem_.src_image_idxs.size(); ++i) {
      const Image& image = problem_.images->at(problem_.src_image_idxs[i]);
      const Bitmap& bitmap = image.GetBitmap();
      // Row-by-row copy with stride max_width (destination pitch).
      uint8_t* dest =
          src_images_host_data.data() + max_width * max_height * i;
      for (size_t r = 0; r < image.GetHeight(); ++r) {
        memcpy(dest, bitmap.GetScanline(r),
               image.GetWidth() * sizeof(uint8_t));
        dest += max_width;
      }
    }

    // Upload to device.
    src_images_device_.reset(new CudaArrayWrapper(
        max_width, max_height, problem_.src_image_idxs.size()));
    src_images_device_->CopyToDevice(src_images_host_data.data());

    // Create source images texture: bilinear filtering for sub-pixel lookups.
    src_images_texture.addressMode[0] = cudaAddressModeBorder;
    src_images_texture.addressMode[1] = cudaAddressModeBorder;
    src_images_texture.addressMode[2] = cudaAddressModeBorder;
    src_images_texture.filterMode = cudaFilterModeLinear;
    src_images_texture.normalized = false;
    CUDA_SAFE_CALL(cudaBindTextureToArray(src_images_texture,
                                          src_images_device_->GetPtr()));
  }

  // Upload source depth maps to device.
if (options_.geom_consistency) {
    // Pack all source depth maps, zero-padded to the maximum extent,
    // mirroring the layout of the source images above.
    const float kDefaultValue = 0.0f;
    // (NOTE(review): template arguments of std::vector / static_cast /
    // CudaArrayWrapper appear stripped by extraction.)
    std::vector src_depth_maps_host_data(
        static_cast(max_width * max_height *
                    problem_.src_image_idxs.size()),
        kDefaultValue);
    for (size_t i = 0; i < problem_.src_image_idxs.size(); ++i) {
      const DepthMap& depth_map =
          problem_.depth_maps->at(problem_.src_image_idxs[i]);
      float* dest =
          src_depth_maps_host_data.data() + max_width * max_height * i;
      for (size_t r = 0; r < depth_map.GetHeight(); ++r) {
        memcpy(dest, depth_map.GetPtr() + r * depth_map.GetWidth(),
               depth_map.GetWidth() * sizeof(float));
        dest += max_width;
      }
    }

    src_depth_maps_device_.reset(new CudaArrayWrapper(
        max_width, max_height, problem_.src_image_idxs.size()));
    src_depth_maps_device_->CopyToDevice(src_depth_maps_host_data.data());

    // Create source depth maps texture.
    src_depth_maps_texture.addressMode[0] = cudaAddressModeBorder;
    src_depth_maps_texture.addressMode[1] = cudaAddressModeBorder;
    src_depth_maps_texture.addressMode[2] = cudaAddressModeBorder;
    // TODO: Check if linear interpolation improves results or not.
    src_depth_maps_texture.filterMode = cudaFilterModePoint;
    src_depth_maps_texture.normalized = false;
    CUDA_SAFE_CALL(cudaBindTextureToArray(src_depth_maps_texture,
                                          src_depth_maps_device_->GetPtr()));
  }
}

// Precomputes, for all four 90-degree rotations of the reference frame, the
// reference calibration (uploaded to __constant__ memory) and the relative
// poses to all source images (uploaded as one array per rotation; see the
// layout in the loop below).
void PatchMatchCuda::InitTransforms() {
  const Image& ref_image = problem_.images->at(problem_.ref_image_idx);

  //////////////////////////////////////////////////////////////////////////////
  // Generate rotated versions (counter-clockwise) of calibration matrix.
  //////////////////////////////////////////////////////////////////////////////

  // ref_K_host_[i] holds {fx, cx, fy, cy} for rotation i * 90 degrees.
  for (size_t i = 0; i < 4; ++i) {
    ref_K_host_[i][0] = ref_image.GetK()[0];
    ref_K_host_[i][1] = ref_image.GetK()[2];
    ref_K_host_[i][2] = ref_image.GetK()[4];
    ref_K_host_[i][3] = ref_image.GetK()[5];
  }

  // Rotated by 90 degrees: swap fx/fy and cx/cy, then flip cy.
  std::swap(ref_K_host_[1][0], ref_K_host_[1][2]);
  std::swap(ref_K_host_[1][1], ref_K_host_[1][3]);
  ref_K_host_[1][3] = ref_width_ - 1 - ref_K_host_[1][3];

  // Rotated by 180 degrees: flip both principal point coordinates.
  ref_K_host_[2][1] = ref_width_ - 1 - ref_K_host_[2][1];
  ref_K_host_[2][3] = ref_height_ - 1 - ref_K_host_[2][3];

  // Rotated by 270 degrees: swap fx/fy and cx/cy, then flip cx.
  std::swap(ref_K_host_[3][0], ref_K_host_[3][2]);
  std::swap(ref_K_host_[3][1], ref_K_host_[3][3]);
  ref_K_host_[3][1] = ref_height_ - 1 - ref_K_host_[3][1];

  // Extract 1/fx, -cx/fx, 1/fy, -cy/fy (inverse calibration parameters).
  for (size_t i = 0; i < 4; ++i) {
    ref_inv_K_host_[i][0] = 1.0f / ref_K_host_[i][0];
    ref_inv_K_host_[i][1] = -ref_K_host_[i][1] / ref_K_host_[i][0];
    ref_inv_K_host_[i][2] = 1.0f / ref_K_host_[i][2];
    ref_inv_K_host_[i][3] = -ref_K_host_[i][3] / ref_K_host_[i][2];
  }

  // Bind 0 degrees version to constant global memory.
  CUDA_SAFE_CALL(cudaMemcpyToSymbol(ref_K, ref_K_host_[0], sizeof(float) * 4,
                                    0, cudaMemcpyHostToDevice));
  CUDA_SAFE_CALL(cudaMemcpyToSymbol(ref_inv_K, ref_inv_K_host_[0],
                                    sizeof(float) * 4, 0,
                                    cudaMemcpyHostToDevice));

  //////////////////////////////////////////////////////////////////////////////
  // Generate rotated versions of camera poses.
  //////////////////////////////////////////////////////////////////////////////

  // Start from the unrotated reference pose; each loop iteration below
  // rotates it by a further 90 degrees.
  float rotated_R[9];
  memcpy(rotated_R, ref_image.GetR(), 9 * sizeof(float));

  float rotated_T[3];
  memcpy(rotated_T, ref_image.GetT(), 3 * sizeof(float));

  // Matrix for 90deg rotation around Z-axis in counter-clockwise direction.
const float R_z90[9] = {0, 1, 0, -1, 0, 0, 0, 0, 1};

  for (size_t i = 0; i < 4; ++i) {
    // Per source image: K (4) + R (9) + T (3) + center C (3) + P (12) +
    // inv_P (12) floats, packed consecutively.
    const size_t kNumTformParams = 4 + 9 + 3 + 3 + 12 + 12;
    // (NOTE(review): template arguments of std::vector / CudaArrayWrapper
    // appear stripped by extraction.)
    std::vector poses_host_data(kNumTformParams *
                                problem_.src_image_idxs.size());
    int offset = 0;
    for (const auto image_idx : problem_.src_image_idxs) {
      const Image& image = problem_.images->at(image_idx);

      // Source calibration as {fx, cx, fy, cy}.
      const float K[4] = {image.GetK()[0], image.GetK()[2], image.GetK()[4],
                          image.GetK()[5]};
      memcpy(poses_host_data.data() + offset, K, 4 * sizeof(float));
      offset += 4;

      // Pose of the source camera relative to the (rotated) reference frame.
      float rel_R[9];
      float rel_T[3];
      ComputeRelativePose(rotated_R, rotated_T, image.GetR(), image.GetT(),
                          rel_R, rel_T);
      memcpy(poses_host_data.data() + offset, rel_R, 9 * sizeof(float));
      offset += 9;
      memcpy(poses_host_data.data() + offset, rel_T, 3 * sizeof(float));
      offset += 3;

      float C[3];
      ComputeProjectionCenter(rel_R, rel_T, C);
      memcpy(poses_host_data.data() + offset, C, 3 * sizeof(float));
      offset += 3;

      float P[12];
      ComposeProjectionMatrix(image.GetK(), rel_R, rel_T, P);
      memcpy(poses_host_data.data() + offset, P, 12 * sizeof(float));
      offset += 12;

      float inv_P[12];
      ComposeInverseProjectionMatrix(image.GetK(), rel_R, rel_T, inv_P);
      memcpy(poses_host_data.data() + offset, inv_P, 12 * sizeof(float));
      offset += 12;
    }

    poses_device_[i].reset(new CudaArrayWrapper(
        kNumTformParams, problem_.src_image_idxs.size(), 1));
    poses_device_[i]->CopyToDevice(poses_host_data.data());

    // Rotate reference camera pose for next iteration (next 90 degrees).
    RotatePose(R_z90, rotated_R, rotated_T);
  }

  // Initially bind the 0-degree pose array; Rotate() rebinds per rotation.
  poses_texture.addressMode[0] = cudaAddressModeBorder;
  poses_texture.addressMode[1] = cudaAddressModeBorder;
  poses_texture.addressMode[2] = cudaAddressModeBorder;
  poses_texture.filterMode = cudaFilterModePoint;
  poses_texture.normalized = false;
  CUDA_SAFE_CALL(
      cudaBindTextureToArray(poses_texture, poses_device_[0]->GetPtr()));
}

// Allocates all device-side working buffers (PRNG states, depth/normal maps,
// selection probability maps, cost map, sweep workspace) and initializes the
// depth/normal maps either from the provided maps (geometric consistency) or
// randomly.
void PatchMatchCuda::InitWorkspaceMemory() {
  rand_state_map_.reset(new GpuMatPRNG(ref_width_, ref_height_));

  // (NOTE(review): template arguments of GpuMat appear stripped by
  // extraction throughout this function.)
  depth_map_.reset(new GpuMat(ref_width_, ref_height_));
  if (options_.geom_consistency) {
    // Seed from the photometric-pass result for geometric refinement.
    const DepthMap& init_depth_map =
        problem_.depth_maps->at(problem_.ref_image_idx);
    depth_map_->CopyToDevice(init_depth_map.GetPtr(),
                             init_depth_map.GetWidth() * sizeof(float));
  } else {
    depth_map_->FillWithRandomNumbers(options_.depth_min, options_.depth_max,
                                      *rand_state_map_);
  }

  normal_map_.reset(new GpuMat(ref_width_, ref_height_, 3));

  // Note that it is not necessary to keep the selection probability map in
  // memory for all pixels. Theoretically, it is possible to incorporate
  // the temporary selection probabilities in the global_workspace_.
  // However, it is useful to keep the probabilities for the entire image
  // in memory, so that it can be exported.
  sel_prob_map_.reset(new GpuMat(ref_width_, ref_height_,
                                 problem_.src_image_idxs.size()));
  prev_sel_prob_map_.reset(new GpuMat(ref_width_, ref_height_,
                                      problem_.src_image_idxs.size()));
  // Uninformative prior: every source image equally likely at the start.
  prev_sel_prob_map_->FillWithScalar(0.5f);

  cost_map_.reset(new GpuMat(ref_width_, ref_height_,
                             problem_.src_image_idxs.size()));

  // Workspace sized for the longest sweep direction (image gets rotated).
  const int ref_max_dim = std::max(ref_width_, ref_height_);
  global_workspace_.reset(
      new GpuMat(ref_max_dim, problem_.src_image_idxs.size(), 2));

  // Allocated lazily with real dimensions on the last sweep when filtering.
  consistency_mask_.reset(new GpuMat(0, 0, 0));

  ComputeCudaConfig();

  if (options_.geom_consistency) {
    const NormalMap& init_normal_map =
        problem_.normal_maps->at(problem_.ref_image_idx);
    normal_map_->CopyToDevice(init_normal_map.GetPtr(),
                              init_normal_map.GetWidth() * sizeof(float));
  } else {
    // (NOTE(review): launch configuration appears stripped by extraction.)
    InitNormalMap<<>>(*normal_map_, *rand_state_map_);
  }
}

// Rotates all device state by 90 degrees counter-clockwise so that the next
// sweep direction can reuse the same top-to-bottom sweep kernel, and rebinds
// textures/constants for the new orientation.
void PatchMatchCuda::Rotate() {
  rotation_in_half_pi_ = (rotation_in_half_pi_ + 1) % 4;

  // Width and height swap on odd rotations.
  size_t width;
  size_t height;
  if (rotation_in_half_pi_ % 2 == 0) {
    width = ref_width_;
    height = ref_height_;
  } else {
    width = ref_height_;
    height = ref_width_;
  }

  // Rotate random map.
  {
    std::unique_ptr rotated_rand_state_map(
        new GpuMatPRNG(width, height));
    rand_state_map_->Rotate(rotated_rand_state_map.get());
    rand_state_map_.swap(rotated_rand_state_map);
  }

  // Rotate depth map.
{
    // (NOTE(review): template arguments of std::unique_ptr / GpuMat appear
    // stripped by extraction throughout this function.)
    std::unique_ptr> rotated_depth_map(new GpuMat(width, height));
    depth_map_->Rotate(rotated_depth_map.get());
    depth_map_.swap(rotated_depth_map);
  }

  // Rotate normal map.
  {
    // First rotate the normal vectors themselves in-place, then rotate the
    // map layout.
    // (NOTE(review): launch configuration appears stripped by extraction.)
    RotateNormalMap<<>>(*normal_map_);
    std::unique_ptr> rotated_normal_map(
        new GpuMat(width, height, 3));
    normal_map_->Rotate(rotated_normal_map.get());
    normal_map_.swap(rotated_normal_map);
  }

  // Rotate reference image (image plus filtered sum/squared-sum images).
  {
    std::unique_ptr rotated_ref_image(
        new GpuMatRefImage(width, height));
    ref_image_->image->Rotate(rotated_ref_image->image.get());
    ref_image_->sum_image->Rotate(rotated_ref_image->sum_image.get());
    ref_image_->squared_sum_image->Rotate(
        rotated_ref_image->squared_sum_image.get());
    ref_image_.swap(rotated_ref_image);
  }

  // Bind rotated reference image to texture.
  ref_image_device_.reset(new CudaArrayWrapper(width, height, 1));
  ref_image_device_->CopyFromGpuMat(*ref_image_->image);
  CUDA_SAFE_CALL(cudaUnbindTexture(ref_image_texture));
  CUDA_SAFE_CALL(
      cudaBindTextureToArray(ref_image_texture, ref_image_device_->GetPtr()));

  // Rotate selection probability map: the current sel_prob_map_ becomes the
  // (rotated) previous map for the next sweep, and a fresh current map is
  // allocated.
  prev_sel_prob_map_.reset(
      new GpuMat(width, height, problem_.src_image_idxs.size()));
  sel_prob_map_->Rotate(prev_sel_prob_map_.get());
  sel_prob_map_.reset(
      new GpuMat(width, height, problem_.src_image_idxs.size()));

  // Rotate cost map.
  {
    std::unique_ptr> rotated_cost_map(
        new GpuMat(width, height, problem_.src_image_idxs.size()));
    cost_map_->Rotate(rotated_cost_map.get());
    cost_map_.swap(rotated_cost_map);
  }

  // Rotate transformations: the poses were precomputed per rotation in
  // InitTransforms(); just rebind the texture.
  CUDA_SAFE_CALL(cudaUnbindTexture(poses_texture));
  CUDA_SAFE_CALL(cudaBindTextureToArray(
      poses_texture, poses_device_[rotation_in_half_pi_]->GetPtr()));

  // Rotate calibration in constant memory.
  CUDA_SAFE_CALL(cudaMemcpyToSymbol(ref_K, ref_K_host_[rotation_in_half_pi_],
                                    sizeof(float) * 4, 0,
                                    cudaMemcpyHostToDevice));
  CUDA_SAFE_CALL(
      cudaMemcpyToSymbol(ref_inv_K, ref_inv_K_host_[rotation_in_half_pi_],
                         sizeof(float) * 4, 0, cudaMemcpyHostToDevice));

  // Recompute Cuda configuration for rotated reference image.
  ComputeCudaConfig();
}

}  // namespace mvs
}  // namespace colmap