|
|
#pragma once
|
|
|
#include <ATen/core/TensorAccessor.h>
|
|
|
#include <ATen/cuda/Atomic.cuh>
|
|
|
|
|
|
#include <c10/util/ArrayRef.h>
|
|
|
#include <c10/util/SmallVector.h>
|
|
|
#include <c10/util/OptionalArrayRef.h>
|
|
|
|
|
|
#include <math.h>
|
|
|
#include <optional>
|
|
|
|
|
|
namespace at::native {
|
|
|
|
|
|
namespace upsample {

// Resolves the output spatial size (up to 3 dims) for an upsample op from
// either an explicit `output_size` or per-dimension `scale_factors`
// (exactly one of the two is expected to be provided by the caller).
// Declared here for use by the CUDA kernels; defined elsewhere — presumably
// in the corresponding .cpp translation unit (TODO confirm).
TORCH_API c10::SmallVector<int64_t, 3> compute_output_size(
    c10::IntArrayRef input_size,
    at::OptionalIntArrayRef output_size,
    std::optional<c10::ArrayRef<double>> scale_factors);

} // namespace upsample
|
|
|
|
|
|
namespace upsample_cuda {

// Fetches the user-supplied scale factor for dimension `idx`, or
// std::nullopt when no scale factors were provided at all.
// Uses ArrayRef::at, so an out-of-range idx is checked rather than UB.
inline std::optional<double> get_scale_value(std::optional<c10::ArrayRef<double>> scales, int idx) {
  if (scales.has_value()) {
    return scales->at(idx);
  }
  return std::nullopt;
}

} // namespace upsample_cuda
|
|
|
|
|
|
|
|
|
|
|
|
// Device-scope helper returning the smaller of the two values
// (returns b when they compare equal).
template <typename scalar_t>
__device__ inline scalar_t min(scalar_t a, scalar_t b) {
  if (a < b) {
    return a;
  }
  return b;
}
|
|
|
|
|
|
// Device-scope helper returning the larger of the two values
// (returns b when they compare equal).
template <typename scalar_t>
__device__ inline scalar_t max(scalar_t a, scalar_t b) {
  if (a > b) {
    return a;
  }
  return b;
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// Computes the forward sampling step (input units per output pixel).
// A positive, explicitly supplied scale wins, and the kernel needs its
// reciprocal; otherwise the step is derived from the size ratio.
template <typename accscalar_t>
__host__ __forceinline__ accscalar_t compute_scales_value(
    const std::optional<double> scale,
    int64_t src_size,
    int64_t dst_size) {
  if (scale.has_value() && scale.value() > 0.) {
    return (accscalar_t)(1.0 / scale.value());
  }
  return (accscalar_t)src_size / dst_size;
}
|
|
|
|
|
|
|
|
|
// Backward counterpart of compute_scales_value: an explicitly supplied
// positive scale is used directly (not inverted); otherwise the ratio of
// the two sizes is used.
template <typename accscalar_t>
__host__ __forceinline__ accscalar_t compute_scales_value_backwards(
    const std::optional<double> scale,
    int64_t src_size,
    int64_t dst_size) {
  if (scale.has_value() && scale.value() > 0.) {
    return (accscalar_t)scale.value();
  }
  return (accscalar_t)src_size / dst_size;
}
|
|
|
|
|
|
// Computes the coordinate-mapping scale for area-pixel resampling.
// With align_corners the endpoints of input and output map onto each
// other, so the step is (input_size - 1) / (output_size - 1); a single
// output element degenerates to a scale of 0. Without align_corners the
// generic (optionally user-provided) scale is used.
template <typename accscalar_t>
__host__ __forceinline__ accscalar_t area_pixel_compute_scale(
    int input_size,
    int output_size,
    bool align_corners,
    const std::optional<double> scale) {
  if (!align_corners) {
    return compute_scales_value<accscalar_t>(scale, input_size, output_size);
  }
  return output_size > 1
      ? (accscalar_t)(input_size - 1) / (output_size - 1)
      : static_cast<accscalar_t>(0);
}
|
|
|
|
|
|
// Maps an output index to a (fractional) source index.
// align_corners: plain linear mapping. Otherwise: half-pixel convention
// (sample at pixel centers). Linear modes clamp negative source indices
// to 0; cubic keeps them so boundary kernel weights stay correct.
template <typename accscalar_t>
__device__ __forceinline__ accscalar_t area_pixel_compute_source_index(
    accscalar_t scale,
    int dst_index,
    bool align_corners,
    bool cubic) {
  if (align_corners) {
    return scale * dst_index;
  }
  const accscalar_t half = static_cast<accscalar_t>(0.5);
  accscalar_t src_idx = scale * (dst_index + half) - half;
  if (!cubic && src_idx < static_cast<accscalar_t>(0)) {
    return static_cast<accscalar_t>(0);
  }
  return src_idx;
}
|
|
|
|
|
|
|
|
|
// Legacy (non-exact) nearest-neighbor forward mapping:
// src = floor(dst * scale), clamped to the last valid input element.
__device__ __forceinline__ int nearest_neighbor_compute_source_index(
    const float scale,
    int dst_index,
    int input_size) {
  const int unclamped = static_cast<int>(floorf(dst_index * scale));
  return min(unclamped, input_size - 1);
}
|
|
|
|
|
|
// "Exact" nearest-neighbor forward mapping: offsets the destination index
// by half a pixel before scaling (half-pixel centers), then floors and
// clamps to the last valid input element.
__device__ __forceinline__ int nearest_neighbor_exact_compute_source_index(
    const float scale,
    int dst_index,
    int input_size) {
  const int unclamped =
      static_cast<int>(floorf((dst_index + static_cast<float>(0.5)) * scale));
  return min(unclamped, input_size - 1);
}
|
|
|
|
|
|
|
|
|
// Backward counterpart of the legacy nearest-neighbor mapping:
// src = ceil(dst * scale), clamped to output_size (NOT output_size - 1) —
// presumably used as an exclusive upper bound of a gradient-accumulation
// range by the backward kernels; verify against callers.
__device__ __forceinline__ int nearest_neighbor_bw_compute_source_index(
    const float scale,
    int dst_index,
    int output_size) {
  const int unclamped = static_cast<int>(ceilf(dst_index * scale));
  return min(unclamped, output_size);
}
|
|
|
|
|
|
|
|
|
// Backward counterpart of the "exact" nearest-neighbor mapping: subtracts
// the half-pixel offset before ceiling, clamped to output_size (NOT
// output_size - 1) — presumably an exclusive range bound, as in
// nearest_neighbor_bw_compute_source_index.
__device__ __forceinline__ int nearest_neighbor_exact_bw_compute_source_index(
    const float scale,
    int dst_index,
    int output_size) {
  const int unclamped =
      static_cast<int>(ceilf(dst_index * scale - static_cast<float>(0.5)));
  return min(unclamped, output_size);
}
|
|
|
|
|
|
|
|
|
// Reads data[batch][channel][y][x] with the spatial coordinates clamped
// into the valid image rectangle, so reads just outside the border
// replicate the nearest edge pixel.
template <typename scalar_t>
__device__ __forceinline__ scalar_t upsample_get_value_bounded(
    const PackedTensorAccessor64<const scalar_t, 4>& data,
    int batch,
    int channel,
    int height,
    int width,
    int y,
    int x) {
  // Upper clamp first, then lower: matches max(min(v, size - 1), 0).
  int access_y = y;
  if (access_y > height - 1) {
    access_y = height - 1;
  }
  if (access_y < 0) {
    access_y = 0;
  }
  int access_x = x;
  if (access_x > width - 1) {
    access_x = width - 1;
  }
  if (access_x < 0) {
    access_x = 0;
  }
  return data[batch][channel][access_y][access_x];
}
|
|
|
|
|
|
|
|
|
// Adds `value` into data[batch][channel][y][x] with the spatial
// coordinates clamped into the valid image rectangle (same edge-replicate
// convention as upsample_get_value_bounded). The add is atomic because
// several source pixels may scatter into the same destination element.
template <typename scalar_t, typename accscalar_t>
__device__ __forceinline__ void upsample_increment_value_bounded(
    PackedTensorAccessor64<scalar_t, 4>& data,
    int batch,
    int channel,
    int height,
    int width,
    int y,
    int x,
    accscalar_t value) {
  // Upper clamp first, then lower: matches max(min(v, size - 1), 0).
  int access_y = y;
  if (access_y > height - 1) {
    access_y = height - 1;
  }
  if (access_y < 0) {
    access_y = 0;
  }
  int access_x = x;
  if (access_x > width - 1) {
    access_x = width - 1;
  }
  if (access_x < 0) {
    access_x = 0;
  }
  gpuAtomicAddNoReturn(
      &data[batch][channel][access_y][access_x], static_cast<scalar_t>(value));
}
|
|
|
|
|
|
|
|
|
|
|
|
// Cubic convolution kernel for the inner interval |x| <= 1:
// (a + 2)|x|^3 - (a + 3)|x|^2 + 1, written in Horner form.
// The expression's evaluation order is kept as-is to preserve bit-exact
// floating-point results.
template <typename accscalar_t>
__device__ __forceinline__ accscalar_t cubic_convolution1(
    accscalar_t t,
    accscalar_t a) {
  return ((a + 2) * t - (a + 3)) * t * t + 1;
}
|
|
|
|
|
|
// Cubic convolution kernel for the outer interval 1 < |x| < 2:
// a|x|^3 - 5a|x|^2 + 8a|x| - 4a, written in Horner form.
// The expression's evaluation order is kept as-is to preserve bit-exact
// floating-point results.
template <typename accscalar_t>
__device__ __forceinline__ accscalar_t cubic_convolution2(
    accscalar_t t,
    accscalar_t a) {
  return ((a * t - 5 * a) * t + 8 * a) * t - 4 * a;
}
|
|
|
|
|
|
// Fills coeffs[0..3] with the weights of the four taps around an
// interpolation point at fractional offset t, using the cubic convolution
// kernels with coefficient A = -0.75.
template <typename accscalar_t>
__device__ __forceinline__ void get_cubic_upsampling_coefficients(
    accscalar_t coeffs[4],
    accscalar_t t) {
  const accscalar_t A = -0.75;

  // Taps at distance (t + 1) and t on the left side of the point.
  coeffs[0] = cubic_convolution2<accscalar_t>(t + 1.0, A);
  coeffs[1] = cubic_convolution1<accscalar_t>(t, A);

  // Taps at distance (1 - t) and (2 - t) on the right side of the point.
  accscalar_t u = 1.0 - t;
  coeffs[2] = cubic_convolution1<accscalar_t>(u, A);
  coeffs[3] = cubic_convolution2<accscalar_t>(u + 1.0, A);
}
|
|
|
|
|
|
// One-dimensional cubic interpolation: weights the four neighboring taps
// x0..x3 with the cubic convolution coefficients for fractional offset t.
template <typename scalar_t, typename accscalar_t>
__device__ __forceinline__ accscalar_t cubic_interp1d(
    scalar_t x0,
    scalar_t x1,
    scalar_t x2,
    scalar_t x3,
    accscalar_t t) {
  accscalar_t w[4];
  get_cubic_upsampling_coefficients<accscalar_t>(w, t);
  return x0 * w[0] + x1 * w[1] + x2 * w[2] + x3 * w[3];
}
|
|
|
|
|
|
namespace upsample_antialias {
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// Tent (triangle) filter of radius 1 used by antialiased bilinear
// resampling: f(x) = 1 - |x| for |x| < 1, else 0.
// `size` is the filter's total support in input pixels.
struct BilinearFilterFunctor {

  template <typename accscalar_t>
  __device__ accscalar_t operator()(accscalar_t x) const {
    // The filter is symmetric, so evaluate on |x|.
    accscalar_t d = x < 0 ? -x : x;
    if (d < 1) {
      return 1 - d;
    }
    return 0;
  }

  static const int size = 2;
};
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// Cubic filter of radius 2 used by antialiased bicubic resampling,
// piecewise cubic convolution kernel with coefficient a = -0.5.
// `size` is the filter's total support in input pixels.
struct BicubicFilterFunctor {

  template <typename accscalar_t>
  __device__ accscalar_t operator()(accscalar_t x) const {
    const accscalar_t a = -0.5;
    // The filter is symmetric, so evaluate on |x|.
    accscalar_t d = x < 0 ? -x : x;
    if (d < 1) {
      // Inner lobe: (a + 2)|x|^3 - (a + 3)|x|^2 + 1.
      return ((a + 2) * d - (a + 3)) * d * d + 1;
    }
    if (d < 2) {
      // Outer lobe: a(|x|^3 - 5|x|^2 + 8|x| - 4).
      return (((d - 5) * d + 8) * d - 4) * a;
    }
    return 0;
  }

  static const int size = 4;
};
|
|
|
|
|
|
// Computes, for output pixel i, the span of contributing input pixels:
// `center` (the pixel's center in input coordinates, half-pixel
// convention), `xmin` (first contributing input index, clamped to 0) and
// `xsize` (number of contributing taps, clamped to the end of the row).
template <typename accscalar_t>
__device__ __forceinline__ void _compute_weights_span(
    const int i,
    const int input_size,
    const accscalar_t scale,
    const accscalar_t support,
    int& xmin,
    int& xsize,
    accscalar_t& center) {
  const accscalar_t half = static_cast<accscalar_t>(0.5);
  center = scale * (i + half);
  xmin = max(static_cast<int>(center - support + half), static_cast<int>(0));
  xsize = min(static_cast<int>(center + support + half), input_size) - xmin;
}
|
|
|
|
|
|
// Evaluates the antialiasing filter at each of the `xsize` taps of a span,
// normalizes the weights to sum to 1, and zero-fills the remaining slots
// up to `interp_size`. `xmin_m_center` is (xmin - center) for the span.
template <typename scalar_t, typename accscalar_t, typename interp_filter_t>
__device__ __forceinline__ void _compute_weights(
    scalar_t* wt_ptr,
    const accscalar_t scale,
    int interp_size,
    const interp_filter_t& interp_filter,
    accscalar_t xmin_m_center,
    int xsize) {
  // When downscaling (scale >= 1) the filter is stretched by `scale`, so
  // it is sampled with the reciprocal step; upscaling uses it as-is.
  accscalar_t invscale = (scale >= 1.0) ? 1.0 / scale : 1.0;

  // Pass 1: evaluate the filter at each tap center and accumulate the sum.
  accscalar_t total_w = 0.0;
  int j;
  for (j = 0; j < xsize; j++) {
    accscalar_t w = interp_filter((j + xmin_m_center + static_cast<accscalar_t>(0.5)) * invscale);
    wt_ptr[j] = static_cast<scalar_t>(w);
    total_w += w;
  }

  // Pass 2: normalize (skipped entirely when every tap came out zero;
  // total_w is loop-invariant here, so the check is hoisted).
  if (total_w != 0.0) {
    for (j = 0; j < xsize; j++) {
      wt_ptr[j] /= total_w;
    }
  }

  // Pass 3: zero the unused tail so kernels can always read interp_size
  // weights. After the loops above, j == xsize in all paths.
  for (j = xsize; j < interp_size; j++) {
    wt_ptr[j] = static_cast<scalar_t>(0.0);
  }
}
|
|
|
|
|
|
// Weighted sum of `size` consecutive source values along one dimension:
// output = sum_j src[j] * weights[j], accumulated in accscalar_t.
//
// Fix: the intermediates `t` and `wts` were previously declared scalar_t
// even though each was initialized with static_cast<accscalar_t>(...), so
// the widening cast was immediately discarded and the multiply-accumulate
// ran at scalar_t precision. They are now accscalar_t, matching the
// accumulator `output` and the function's return type.
template <typename scalar_t, typename accscalar_t>
__device__ __forceinline__ accscalar_t interpolate_aa_single_dim(
    const scalar_t* src,
    const scalar_t* weights,
    int size) {
  accscalar_t t = static_cast<accscalar_t>(*src);
  accscalar_t wts = static_cast<accscalar_t>(weights[0]);
  accscalar_t output = t * wts;

  // Remaining taps: widen each value/weight, then multiply-accumulate.
  int j = 1;
  for (; j < size; j++) {
    wts = static_cast<accscalar_t>(weights[j]);
    t = static_cast<accscalar_t>(*(src + j));
    output += t * wts;
  }
  return output;
}
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|