File size: 7,020 Bytes
f5bb0c0 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 |
#ifndef OPENPOSE_PRIVATE_GPU_CUDA_HU
#define OPENPOSE_PRIVATE_GPU_CUDA_HU
// Note: This header should only be included if CUDA is enabled
#include <cuda.h>
#include <cuda_runtime.h>
namespace op
{
// VERY IMPORTANT: These fast functions do NOT work for negative numbers.
// E.g., positiveIntRoundCuda(-180.f) = -179.
// Round functions
// Signed
// Fast round-to-nearest into `char` for non-negative inputs.
// Adds 0.5f to `a` and converts the sum to `char`; the conversion drops the
// fractional part, so negative inputs are NOT rounded to the nearest value.
template<typename T>
inline __device__ char positiveCharRoundCuda(const T a)
{
    const auto shifted = a + 0.5f;
    return static_cast<char>(shifted);
}
// Fast round-to-nearest into `signed char` for non-negative inputs.
// Adds 0.5f to `a` and converts the sum to `signed char`; the conversion drops
// the fractional part, so negative inputs are NOT rounded to the nearest value.
template<typename T>
inline __device__ signed char positiveSCharRoundCuda(const T a)
{
    const auto shifted = a + 0.5f;
    return static_cast<signed char>(shifted);
}
// Fast round-to-nearest into `int` for non-negative inputs.
// Adds 0.5f to `a` and converts the sum to `int`; the conversion drops the
// fractional part, so negative inputs are NOT rounded to the nearest value
// (e.g., -180.f yields -179).
template<typename T>
inline __device__ int positiveIntRoundCuda(const T a)
{
    const auto shifted = a + 0.5f;
    return static_cast<int>(shifted);
}
// Fast round-to-nearest into `long` for non-negative inputs.
// Adds 0.5f to `a` and converts the sum to `long`; the conversion drops the
// fractional part, so negative inputs are NOT rounded to the nearest value.
template<typename T>
inline __device__ long positiveLongRoundCuda(const T a)
{
    const auto shifted = a + 0.5f;
    return static_cast<long>(shifted);
}
// Fast round-to-nearest into `long long` for non-negative inputs.
// Adds 0.5f to `a` and converts the sum to `long long`; the conversion drops
// the fractional part, so negative inputs are NOT rounded to the nearest value.
template<typename T>
inline __device__ long long positiveLongLongRoundCuda(const T a)
{
    const auto shifted = a + 0.5f;
    return static_cast<long long>(shifted);
}
// Unsigned
// Fast round-to-nearest into `unsigned char`.
// Adds 0.5f to `a` and converts the sum to `unsigned char`, dropping the
// fractional part. Only meaningful for non-negative inputs.
template<typename T>
inline __device__ unsigned char uCharRoundCuda(const T a)
{
    const auto shifted = a + 0.5f;
    return static_cast<unsigned char>(shifted);
}
// Fast round-to-nearest into `unsigned int`.
// Adds 0.5f to `a` and converts the sum to `unsigned int`, dropping the
// fractional part. Only meaningful for non-negative inputs.
template<typename T>
inline __device__ unsigned int uIntRoundCuda(const T a)
{
    const auto shifted = a + 0.5f;
    return static_cast<unsigned int>(shifted);
}
// Fast round-to-nearest into `unsigned long`.
// Adds 0.5f to `a` and converts the sum to `unsigned long`, dropping the
// fractional part. Only meaningful for non-negative inputs.
template<typename T>
inline __device__ unsigned long ulongRoundCuda(const T a)
{
    const auto shifted = a + 0.5f;
    return static_cast<unsigned long>(shifted);
}
// Fast round-to-nearest into `unsigned long long`.
// Adds 0.5f to `a` and converts the sum to `unsigned long long`, dropping the
// fractional part. Only meaningful for non-negative inputs.
template<typename T>
inline __device__ unsigned long long uLongLongRoundCuda(const T a)
{
    const auto shifted = a + 0.5f;
    return static_cast<unsigned long long>(shifted);
}
// Max/min functions
// Returns `a` when `a > b`, otherwise `b` (so `b` is returned on ties).
template<class T>
inline __device__ T fastMaxCuda(const T a, const T b)
{
    if (a > b)
        return a;
    return b;
}
// Returns `a` when `a < b`, otherwise `b` (so `b` is returned on ties).
template<class T>
inline __device__ T fastMinCuda(const T a, const T b)
{
    if (a < b)
        return a;
    return b;
}
// Clamps `value` into the range [min, max] (defaults: [0, 1]).
// The lower bound is applied first via fastMaxCuda, then the upper bound via
// fastMinCuda.
template<class T>
inline __device__ T fastTruncateCuda(const T value, const T min = 0, const T max = 1)
{
    const T raisedToMin = fastMaxCuda(min, value);
    return fastMinCuda(max, raisedToMin);
}
// Cubic interpolation
// Computes the 4 column indexes and 4 row indexes used for a cubic
// interpolation around (xSource, ySource), plus the offsets dx and dy.
//   - xIntArray / yIntArray: output arrays; elements [0]..[3] are written.
//   - Element [1] is floor(coordinate) clamped to [0, size - 1]; [0] is one
//     less (not below 0); [2] and [3] are successive +1 steps (not above
//     size - 1).
//   - dx / dy: outputs, the source coordinate minus element [1] of the
//     corresponding index array.
template <typename T>
inline __device__ void cubicSequentialData(
    int* xIntArray, int* yIntArray, T& dx, T& dy, const T xSource, const T ySource, const int widthSource,
    const int heightSource)
{
    // Horizontal axis
    const int lastColumn = widthSource - 1;
    const int baseColumn = fastTruncateCuda(int(floor(xSource)), 0, lastColumn);
    xIntArray[1] = baseColumn;
    xIntArray[0] = fastMaxCuda(0, baseColumn - 1);
    xIntArray[2] = fastMinCuda(lastColumn, baseColumn + 1);
    xIntArray[3] = fastMinCuda(lastColumn, xIntArray[2] + 1);
    dx = xSource - baseColumn;

    // Vertical axis
    const int lastRow = heightSource - 1;
    const int baseRow = fastTruncateCuda(int(floor(ySource)), 0, lastRow);
    yIntArray[1] = baseRow;
    yIntArray[0] = fastMaxCuda(0, baseRow - 1);
    yIntArray[2] = fastMinCuda(lastRow, baseRow + 1);
    yIntArray[3] = fastMinCuda(lastRow, yIntArray[2] + 1);
    dy = ySource - baseRow;
}
// Evaluates a cubic polynomial in `dx` built from 4 consecutive samples
// (v0, v1, v2, v3). In exact arithmetic the result equals v1 at dx = 0 and
// v2 at dx = 1.
// Reference: http://www.paulinternet.nl/?page=bicubic
template <typename T>
inline __device__ T cubicInterpolate(const T v0, const T v1, const T v2, const T v3, const T dx)
{
    // Polynomial: cubicCoeff * dx^3 + squareCoeff * dx^2 - halfSlope * dx + v1
    const auto cubicCoeff = -0.5f * v0 + 1.5f * v1 - 1.5f * v2 + 0.5f * v3;
    const auto squareCoeff = v0 - 2.5f * v1 + 2.f * v2 - 0.5f * v3;
    // Negated linear coefficient, i.e., the linear term is (-0.5f * v0 + 0.5f * v2) * dx
    const auto halfSlope = 0.5f * (v0 - v2);
    return cubicCoeff * dx * dx * dx
        + squareCoeff * dx * dx
        - halfSlope * dx
        + v1;
}
// Bicubic interpolation of `sourcePtr` at (xSource, ySource).
//   - widthSource / heightSource: used to clamp the sampled column and row
//     indexes (see cubicSequentialData).
//   - widthSourcePtr: number of elements between the start of consecutive rows
//     in `sourcePtr`.
// It interpolates horizontally on each of the 4 selected rows, and then
// vertically across those 4 partial results.
template <typename T>
inline __device__ T bicubicInterpolate(
    const T* const sourcePtr, const T xSource, const T ySource, const int widthSource, const int heightSource,
    const int widthSourcePtr)
{
    int columns[4];
    int rows[4];
    T dx;
    T dy;
    cubicSequentialData(columns, rows, dx, dy, xSource, ySource, widthSource, heightSource);

    // Horizontal pass: 1 value per selected row
    T rowValues[4];
    for (int r = 0; r < 4; ++r)
    {
        const T* const rowPtr = sourcePtr + rows[r] * widthSourcePtr;
        rowValues[r] = cubicInterpolate(
            rowPtr[columns[0]], rowPtr[columns[1]], rowPtr[columns[2]], rowPtr[columns[3]], dx);
    }

    // Vertical pass
    return cubicInterpolate(rowValues[0], rowValues[1], rowValues[2], rowValues[3], dy);
}
// Bicubic interpolation of an `unsigned char` source at (xSource, ySource),
// returning the result as T.
//   - widthSource / heightSource: used to clamp the sampled column and row
//     indexes (see cubicSequentialData).
//   - widthSourcePtr: number of elements between the start of consecutive rows
//     in `sourcePtr`.
// Each sample is converted to T before being interpolated.
template <typename T>
inline __device__ T bicubicInterpolate(
    const unsigned char* const sourcePtr, const T xSource, const T ySource, const int widthSource,
    const int heightSource, const int widthSourcePtr)
{
    int columns[4];
    int rows[4];
    T dx;
    T dy;
    cubicSequentialData(columns, rows, dx, dy, xSource, ySource, widthSource, heightSource);

    // Horizontal pass: 1 value per selected row
    T rowValues[4];
    for (int r = 0; r < 4; ++r)
    {
        const unsigned char* const rowPtr = sourcePtr + rows[r] * widthSourcePtr;
        rowValues[r] = cubicInterpolate(
            static_cast<T>(rowPtr[columns[0]]), static_cast<T>(rowPtr[columns[1]]),
            static_cast<T>(rowPtr[columns[2]]), static_cast<T>(rowPtr[columns[3]]), dx);
    }

    // Vertical pass
    return cubicInterpolate(rowValues[0], rowValues[1], rowValues[2], rowValues[3], dy);
}
// Bicubic interpolation over a small pre-gathered block of samples.
// `sourcePtr` is read with a fixed row stride of 5 elements; the 4x4 window
// used starts 1 row lower when threadIdxY > 3 and 1 column to the right when
// threadIdxX > 3.
// NOTE(review): presumably `sourcePtr` is a 5x5 tile shared by an 8x8 thread
// block -- confirm against the calling kernel.
// widthSource / heightSource are only used to clamp floor(xSource) and
// floor(ySource) when computing the offsets dx and dy.
template <typename T>
inline __device__ T bicubicInterpolate8Times(
    const T* const sourcePtr, const T xSource, const T ySource, const int widthSource, const int heightSource,
    const int threadIdxX, const int threadIdxY)
{
    // Only the offsets are needed, the sample positions are implicit
    const T dx = xSource - fastTruncateCuda(int(floor(xSource)), 0, widthSource - 1);
    const T dy = ySource - fastTruncateCuda(int(floor(ySource)), 0, heightSource - 1);

    // Window displacement inside the block (loop invariant)
    const int rowShift = (threadIdxY > 3 ? 1 : 0);
    const int columnShift = (threadIdxX > 3 ? 1 : 0);

    // Horizontal pass: 1 value per window row
    T rowValues[4];
    for (int r = 0; r < 4; ++r)
    {
        const T* const window = sourcePtr + (5 * (r + rowShift) + columnShift);
        rowValues[r] = cubicInterpolate(window[0], window[1], window[2], window[3], dx);
    }

    // Vertical pass
    return cubicInterpolate(rowValues[0], rowValues[1], rowValues[2], rowValues[3], dy);
}
// Linear blend of 2 values: value1 weighted by (1 - alphaValue2) plus value2
// weighted by alphaValue2.
template <typename T>
inline __device__ T addWeighted(const T value1, const T value2, const T alphaValue2)
{
    const auto alphaValue1 = 1.f - alphaValue2;
    return alphaValue1 * value1 + alphaValue2 * value2;
}
// Blends a 3-channel color into (colorR, colorG, colorB) in place.
//   - colorToAdd: elements [0], [1] and [2] are blended into colorR, colorG
//     and colorB respectively.
//   - alphaColorToAdd: weight given to `colorToAdd` (see addWeighted).
template <typename T>
inline __device__ void addColorWeighted(
    T& colorR, T& colorG, T& colorB, const T* const colorToAdd, const T alphaColorToAdd)
{
    const T alpha = alphaColorToAdd;
    // Channels are updated one by one, in R, G, B order
    colorR = addWeighted(colorR, colorToAdd[0], alpha);
    colorG = addWeighted(colorG, colorToAdd[1], alpha);
    colorB = addWeighted(colorB, colorToAdd[2], alpha);
}
}
#endif // OPENPOSE_PRIVATE_GPU_CUDA_HU
|