| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
|
|
| |
| |
| |
| |
| |
|
|
| #pragma once |
|
|
| #include <gpu/common.h> |
| #include <atomic> |
| #include <stdexcept> |
| #include <stdint.h> |
| #include <string> |
| #include <vector> |
|
|
| |
| #define CUDA_CHECK_THROW(x) \ |
| do { \ |
| cudaError_t result = x; \ |
| if (result != cudaSuccess) \ |
| throw std::runtime_error(std::string("CUDA Error: " #x " failed with error ") + cudaGetErrorString(result)); \ |
| } while(0) |
|
|
|
|
| namespace cubvh { |
|
|
| #define DEBUG_GUARD_SIZE 0 |
|
|
| inline std::atomic<size_t>& total_n_bytes_allocated() { |
| static std::atomic<size_t> s_total_n_bytes_allocated{0}; |
| return s_total_n_bytes_allocated; |
| } |
|
|
| |
| template<class T> |
| class GPUMemory { |
| private: |
| T* m_data = nullptr; |
| size_t m_size = 0; |
| bool m_owned = true; |
|
|
| public: |
| GPUMemory() {} |
|
|
| GPUMemory<T>& operator=(GPUMemory<T>&& other) { |
| std::swap(m_data, other.m_data); |
| std::swap(m_size, other.m_size); |
| return *this; |
| } |
|
|
| GPUMemory(GPUMemory<T>&& other) { |
| *this = std::move(other); |
| } |
|
|
| __host__ __device__ GPUMemory(const GPUMemory<T> &other) : m_data{other.m_data}, m_size{other.m_size}, m_owned{false} {} |
|
|
| void check_guards() const { |
| #if DEBUG_GUARD_SIZE > 0 |
| if (!m_data) |
| return; |
| uint8_t buf[DEBUG_GUARD_SIZE]; |
| const uint8_t *rawptr=(const uint8_t *)m_data; |
| cudaMemcpy(buf, rawptr-DEBUG_GUARD_SIZE, DEBUG_GUARD_SIZE, cudaMemcpyDeviceToHost); |
| for (int i=0;i<DEBUG_GUARD_SIZE;++i) if (buf[i] != 0xff) { |
| printf("TRASH BEFORE BLOCK offset %d data %p, read 0x%02x expected 0xff!\n", i, m_data, buf[i] ); |
| break; |
| } |
| cudaMemcpy(buf, rawptr+m_size*sizeof(T), DEBUG_GUARD_SIZE, cudaMemcpyDeviceToHost); |
| for (int i=0;i<DEBUG_GUARD_SIZE;++i) if (buf[i] != 0xfe) { |
| printf("TRASH AFTER BLOCK offset %d data %p, read 0x%02x expected 0xfe!\n", i, m_data, buf[i] ); |
| break; |
| } |
| #endif |
| } |
|
|
| void allocate_memory(size_t n_bytes) { |
| if (n_bytes == 0) { |
| return; |
| } |
|
|
| #ifdef TCNN_VERBOSE_MEMORY_ALLOCS |
| std::cout << "GPUMemory: Allocating " << bytes_to_string(n_bytes) << "." << std::endl; |
| #endif |
|
|
| uint8_t *rawptr = nullptr; |
| CUDA_CHECK_THROW(cudaMalloc(&rawptr, n_bytes+DEBUG_GUARD_SIZE*2)); |
| #if DEBUG_GUARD_SIZE > 0 |
| CUDA_CHECK_THROW(cudaMemset(rawptr , 0xff, DEBUG_GUARD_SIZE)); |
| CUDA_CHECK_THROW(cudaMemset(rawptr+n_bytes+DEBUG_GUARD_SIZE , 0xfe, DEBUG_GUARD_SIZE)); |
| #endif |
| if (rawptr) rawptr+=DEBUG_GUARD_SIZE; |
| m_data=(T*)(rawptr); |
| total_n_bytes_allocated() += n_bytes; |
| } |
|
|
| void free_memory() { |
| if (!m_data) { |
| return; |
| } |
|
|
| uint8_t *rawptr = (uint8_t*)m_data; |
| if (rawptr) rawptr-=DEBUG_GUARD_SIZE; |
| CUDA_CHECK_THROW(cudaFree(rawptr)); |
|
|
| total_n_bytes_allocated() -= get_bytes(); |
|
|
| m_data = nullptr; |
| } |
|
|
| |
| GPUMemory(const size_t size) { |
| resize(size); |
| } |
|
|
| |
| __host__ __device__ ~GPUMemory() { |
| #ifndef __CUDA_ARCH__ |
| if (!m_owned) { |
| return; |
| } |
|
|
| try { |
| if (m_data) { |
| free_memory(); |
| m_size = 0; |
| } |
| } catch (std::runtime_error error) { |
| |
| if (std::string{error.what()}.find("driver shutting down") == std::string::npos) { |
| fprintf(stderr, "Could not free memory: %s\n", error.what()); |
| } |
| } |
| #endif |
| } |
|
|
| |
| |
| |
| |
| void resize(const size_t size) { |
| if (!m_owned) { |
| throw std::runtime_error("Cannot resize non-owned memory."); |
| } |
|
|
| if (m_size != size) { |
| if (m_size) { |
| try { |
| free_memory(); |
| } catch (std::runtime_error error) { |
| throw std::runtime_error(std::string("Could not free memory: ") + error.what()); |
| } |
| } |
|
|
| if (size > 0) { |
| try { |
| allocate_memory(size * sizeof(T)); |
| } catch (std::runtime_error error) { |
| throw std::runtime_error(std::string("Could not allocate memory: ") + error.what()); |
| } |
| } |
|
|
| m_size = size; |
| } |
| } |
|
|
| |
| void enlarge(const size_t size) { |
| if (size > m_size) { |
| resize(size); |
| } |
| } |
| |
|
|
| |
| |
| |
| |
| void memset(const int value, const size_t num_elements, const size_t offset = 0) { |
| if (num_elements + offset > m_size) { |
| throw std::runtime_error("Could not set memory: Number of elements larger than allocated memory"); |
| } |
|
|
| try { |
| CUDA_CHECK_THROW(cudaMemset(m_data + offset, value, num_elements * sizeof(T))); |
| } catch (std::runtime_error error) { |
| throw std::runtime_error(std::string("Could not set memory: ") + error.what()); |
| } |
| } |
|
|
| |
| void memset(const int value) { |
| memset(value, m_size); |
| } |
| |
|
|
| |
| |
| |
| |
| void copy_from_host(const T* host_data, const size_t num_elements) { |
| try { |
| CUDA_CHECK_THROW(cudaMemcpy(data(), host_data, num_elements * sizeof(T), cudaMemcpyHostToDevice)); |
| } catch (std::runtime_error error) { |
| throw std::runtime_error(std::string("Could not copy from host: ") + error.what()); |
| } |
| } |
|
|
| |
| void copy_from_host(const std::vector<T>& data, const size_t num_elements) { |
| if (data.size() < num_elements) { |
| throw std::runtime_error(std::string("Trying to copy ") + std::to_string(num_elements) + std::string(" elements, but vector size is only ") + std::to_string(data.size())); |
| } |
| copy_from_host(data.data(), num_elements); |
| } |
|
|
| |
| void copy_from_host(const T* data) { |
| copy_from_host(data, m_size); |
| } |
|
|
| |
| void enlarge_and_copy_from_host(const T* data, const size_t num_elements) { |
| enlarge(num_elements); |
| copy_from_host(data, num_elements); |
| } |
|
|
| |
| void enlarge_and_copy_from_host(const std::vector<T>& data, const size_t num_elements) { |
| enlarge_and_copy_from_host(data.data(), num_elements); |
| } |
|
|
| |
| void enlarge_and_copy_from_host(const std::vector<T>& data) { |
| enlarge_and_copy_from_host(data.data(), data.size()); |
| } |
|
|
| |
| void resize_and_copy_from_host(const T* data, const size_t num_elements) { |
| resize(num_elements); |
| copy_from_host(data, num_elements); |
| } |
|
|
| |
| void resize_and_copy_from_host(const std::vector<T>& data, const size_t num_elements) { |
| resize_and_copy_from_host(data.data(), num_elements); |
| } |
|
|
| |
| void resize_and_copy_from_host(const std::vector<T>& data) { |
| resize_and_copy_from_host(data.data(), data.size()); |
| } |
|
|
| |
| void copy_from_host(const std::vector<T>& data) { |
| if (data.size() < m_size) { |
| throw std::runtime_error(std::string("Trying to copy ") + std::to_string(m_size) + std::string(" elements, but vector size is only ") + std::to_string(data.size())); |
| } |
| copy_from_host(data.data(), m_size); |
| } |
|
|
| |
| void copy_to_host(T* host_data, const size_t num_elements) const { |
| if (num_elements > m_size) { |
| throw std::runtime_error(std::string("Trying to copy ") + std::to_string(num_elements) + std::string(" elements, but vector size is only ") + std::to_string(m_size)); |
| } |
| try { |
| CUDA_CHECK_THROW(cudaMemcpy(host_data, data(), num_elements * sizeof(T), cudaMemcpyDeviceToHost)); |
| } catch (std::runtime_error error) { |
| throw std::runtime_error(std::string("Could not copy to host: ") + error.what()); |
| } |
| } |
|
|
| |
| void copy_to_host(std::vector<T>& data, const size_t num_elements) const { |
| if (data.size() < num_elements) { |
| throw std::runtime_error(std::string("Trying to copy ") + std::to_string(num_elements) + std::string(" elements, but vector size is only ") + std::to_string(data.size())); |
| } |
| copy_to_host(data.data(), num_elements); |
| } |
|
|
| |
| void copy_to_host(T* data) const { |
| copy_to_host(data, m_size); |
| } |
|
|
| |
| void copy_to_host(std::vector<T>& data) const { |
| if (data.size() < m_size) { |
| throw std::runtime_error(std::string("Trying to copy ") + std::to_string(m_size) + std::string(" elements, but vector size is only ") + std::to_string(data.size())); |
| } |
| copy_to_host(data.data(), m_size); |
| } |
|
|
| |
| void copy_from_device(const GPUMemory<T> &other) { |
| if (m_size != other.m_size) { |
| resize(other.m_size); |
| } |
|
|
| try { |
| CUDA_CHECK_THROW(cudaMemcpy(m_data, other.m_data, m_size * sizeof(T), cudaMemcpyDeviceToDevice)); |
| } catch (std::runtime_error error) { |
| throw std::runtime_error(std::string("Could not copy from device: ") + error.what()); |
| } |
| } |
|
|
| |
| void copy_from_device(const GPUMemory<T> &other, const size_t size) { |
| if (m_size < size) { |
| resize(size); |
| } |
|
|
| try { |
| CUDA_CHECK_THROW(cudaMemcpy(m_data, other.m_data, size * sizeof(T), cudaMemcpyDeviceToDevice)); |
| } catch (std::runtime_error error) { |
| throw std::runtime_error(std::string("Could not copy from device: ") + error.what()); |
| } |
| } |
|
|
| |
| GPUMemory<T> copy() const { |
| GPUMemory<T> result{m_size}; |
| result.copy_from_device(*this); |
| return result; |
| } |
|
|
| T* data() const { |
| check_guards(); |
| return m_data; |
| } |
|
|
| __host__ __device__ T& operator[](size_t idx) const { |
| #ifdef DEBUG_BUFFER_OVERRUN |
| if (idx > m_size) { |
| printf("WARNING: buffer overrun of %p at idx %zu\n", idx); |
| } |
| #endif |
| return m_data[idx]; |
| } |
|
|
| __host__ __device__ T& operator[](uint32_t idx) const { |
| #ifdef DEBUG_BUFFER_OVERRUN |
| if (idx > m_size) { |
| printf("WARNING: buffer overrun of %p at idx %u\n", idx); |
| } |
| #endif |
| return m_data[idx]; |
| } |
|
|
| size_t get_num_elements() const { |
| return m_size; |
| } |
|
|
| size_t size() const { |
| return get_num_elements(); |
| } |
|
|
| size_t get_bytes() const { |
| return m_size * sizeof(T); |
| } |
|
|
| size_t bytes() const { |
| return get_bytes(); |
| } |
| }; |
|
|
| } |