/* NOTE(review): stripped non-source extraction artifacts ("Spaces:", "Build error")
 * that preceded the file header. */
| /* | |
| * SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. | |
| * SPDX-License-Identifier: Apache-2.0 | |
| * | |
| * Licensed under the Apache License, Version 2.0 (the "License"); | |
| * you may not use this file except in compliance with the License. | |
| * You may obtain a copy of the License at | |
| * | |
| * http://www.apache.org/licenses/LICENSE-2.0 | |
| * | |
| * Unless required by applicable law or agreed to in writing, software | |
| * distributed under the License is distributed on an "AS IS" BASIS, | |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
| * See the License for the specific language governing permissions and | |
| * limitations under the License. | |
| */ | |
| /** @file gpu_memory.h | |
| * @author Thomas Müller and Nikolaus Binder, NVIDIA | |
| * @brief Managed memory on the GPU. Like a std::vector, memory is allocated either explicitly (resize/enlarge) | |
| * or implicitly (resize_and_copy_from_host etc). Memory is always and automatically released in the destructor. | |
| * Also contains a GPU memory arena for light-weight stream-ordered allocations of temporary memory. The | |
| * memory arena makes use of virtual memory when available to avoid re-allocations during progressive growing. | |
| */ | |
| namespace tcnn { | |
/// Process-wide tally (in bytes) of memory currently allocated through
/// GPUMemory and GPUMemoryArena. Function-local static guarantees
/// thread-safe, on-first-use initialization.
inline std::atomic<size_t>& total_n_bytes_allocated() {
	static std::atomic<size_t> s_bytes{0};
	return s_bytes;
}
/// Managed memory on the Device. Owns a device (or CUDA-managed) allocation of
/// `m_size` elements of `T`, surrounded by two DEBUG_GUARD_SIZE-byte guard
/// regions used to detect out-of-bounds writes.
template<class T>
class GPUMemory {
private:
	T* m_data = nullptr;    // Points just past the leading guard region; nullptr when empty.
	size_t m_size = 0;      // Number of elements (not bytes).
	bool m_managed = false; // If true, allocate with cudaMallocManaged instead of cudaMalloc.

public:
	using Type = T;
	using View = T*;
	using ConstView = const T*;

	GPUMemory() {}

	/// Allocates `size` elements up front. `managed` selects unified memory.
	GPUMemory(size_t size, bool managed = false) : m_managed{managed} {
		resize(size);
	}

	// Move assignment swaps state so `other`'s destructor releases our old allocation.
	GPUMemory<T>& operator=(GPUMemory<T>&& other) {
		std::swap(m_data, other.m_data);
		std::swap(m_size, other.m_size);
		std::swap(m_managed, other.m_managed);
		return *this;
	}

	GPUMemory(GPUMemory<T>&& other) {
		*this = std::move(other);
	}

	// Don't permit copy assignment to prevent performance accidents.
	// Copy is permitted through an explicit copy constructor.
	GPUMemory<T>& operator=(const GPUMemory<T>& other) = delete;
	explicit GPUMemory(const GPUMemory<T>& other) {
		m_managed = other.managed();
		copy_from_device(other);
	}

	/// Verifies the guard patterns surrounding the allocation and prints a
	/// warning if either was overwritten. Best-effort: the cudaMemcpy results
	/// are deliberately unchecked so this is safe to call from cleanup paths.
	void check_guards() const {
		if (!m_data) {
			return;
		}

		uint8_t buf[DEBUG_GUARD_SIZE];
		const uint8_t* rawptr = (const uint8_t*)m_data;

		// Leading guard must still hold the 0xff pattern written at allocation.
		cudaMemcpy(buf, rawptr - DEBUG_GUARD_SIZE, DEBUG_GUARD_SIZE, cudaMemcpyDeviceToHost);
		for (int i = 0; i < DEBUG_GUARD_SIZE; ++i) {
			if (buf[i] != 0xff) {
				printf("TRASH BEFORE BLOCK offset %d data %p, read 0x%02x expected 0xff!\n", i, m_data, buf[i]);
				break;
			}
		}

		// Trailing guard must still hold the 0xfe pattern.
		cudaMemcpy(buf, rawptr + m_size * sizeof(T), DEBUG_GUARD_SIZE, cudaMemcpyDeviceToHost);
		for (int i = 0; i < DEBUG_GUARD_SIZE; ++i) {
			if (buf[i] != 0xfe) {
				printf("TRASH AFTER BLOCK offset %d data %p, read 0x%02x expected 0xfe!\n", i, m_data, buf[i]);
				break;
			}
		}
	}

	/// Allocates `n_bytes` of payload plus two guard regions, initializes the
	/// guard patterns, and points m_data at the payload. Only the payload size
	/// is added to the global tally (matching the subtraction in free_memory).
	void allocate_memory(size_t n_bytes) {
		if (n_bytes == 0) {
			return;
		}

		log_debug("GPUMemory: allocating {}.", bytes_to_string(n_bytes));

		uint8_t* rawptr = nullptr;
		if (m_managed) {
			CUDA_CHECK_THROW(cudaMallocManaged((void**)&rawptr, n_bytes + DEBUG_GUARD_SIZE * 2));
		} else {
			CUDA_CHECK_THROW(cudaMalloc((void**)&rawptr, n_bytes + DEBUG_GUARD_SIZE * 2));
		}

		// If writing the guard patterns fails, release the fresh allocation
		// again rather than leaking it.
		try {
			CUDA_CHECK_THROW(cudaMemset(rawptr, 0xff, DEBUG_GUARD_SIZE));
			CUDA_CHECK_THROW(cudaMemset(rawptr + n_bytes + DEBUG_GUARD_SIZE, 0xfe, DEBUG_GUARD_SIZE));
		} catch (...) {
			cudaFree(rawptr);
			throw;
		}

		m_data = (T*)(rawptr + DEBUG_GUARD_SIZE);
		total_n_bytes_allocated() += n_bytes;
	}

	/// Frees the underlying allocation (including the guard regions) and
	/// resets this object to the empty state. No-op when already empty.
	void free_memory() {
		if (!m_data) {
			return;
		}

		// The allocation actually begins one guard region before m_data.
		uint8_t* rawptr = (uint8_t*)m_data - DEBUG_GUARD_SIZE;
		CUDA_CHECK_THROW(cudaFree(rawptr));

		total_n_bytes_allocated() -= get_bytes();

		m_data = nullptr;
		m_size = 0;
	}

	/// Frees memory again. Host-only body: try/catch (and the CUDA runtime
	/// calls underneath) are not valid in device code, so device-side
	/// destruction is a no-op.
	TCNN_HOST_DEVICE ~GPUMemory() {
#ifndef __CUDA_ARCH__
		try {
			if (m_data) {
				free_memory();
			}
		} catch (const std::runtime_error& error) {
			// Don't need to report on memory-free problems when the driver is shutting down.
			if (std::string{error.what()}.find("driver shutting down") == std::string::npos) {
				log_warning("Could not free memory: {}", error.what());
			}
		}
#endif
	}

	/** @name Resizing/enlargement
	 * @{
	 */
	/// Resizes the array to the exact new size, even if it is already larger.
	/// Existing contents are NOT preserved across a resize.
	void resize(const size_t size) {
		if (m_size == size) {
			return;
		}

		if (m_size) {
			try {
				free_memory();
			} catch (const std::runtime_error& error) {
				throw std::runtime_error{fmt::format("Could not free memory: {}", error.what())};
			}
		}

		if (size > 0) {
			try {
				allocate_memory(size * sizeof(T));
			} catch (const std::runtime_error& error) {
				throw std::runtime_error{fmt::format("Could not allocate memory: {}", error.what())};
			}
		}

		m_size = size;
	}

	/// Enlarges the array if its size is smaller; never shrinks.
	void enlarge(const size_t size) {
		if (size > m_size) {
			resize(size);
		}
	}
	/** @} */

	/** @name Memset
	 * @{
	 */
	/// Byte-wise sets the memory of the first num_elements (starting at
	/// `offset` elements) to `value`.
	void memset(const int value, const size_t num_elements, const size_t offset = 0) {
		if (num_elements + offset > m_size) {
			throw std::runtime_error{fmt::format("Could not set memory: Number of elements {}+{} larger than allocated memory {}.", num_elements, offset, m_size)};
		}

		CUDA_CHECK_THROW(cudaMemset(m_data + offset, value, num_elements * sizeof(T)));
	}

	/// Byte-wise sets the memory of all elements to `value`.
	void memset(const int value) {
		memset(value, m_size);
	}
	/** @} */

	/** @name Copy operations
	 * @{
	 */
	/// Copy data of num_elements from the raw pointer on the host.
	/// Throws if the allocation cannot hold num_elements.
	void copy_from_host(const T* host_data, const size_t num_elements) {
		if (num_elements > m_size) {
			throw std::runtime_error{fmt::format("Trying to copy {} elements, but memory size is only {}.", num_elements, m_size)};
		}
		CUDA_CHECK_THROW(cudaMemcpy(data(), host_data, num_elements * sizeof(T), cudaMemcpyHostToDevice));
	}

	/// Copy num_elements from the host vector
	void copy_from_host(const std::vector<T>& data, const size_t num_elements) {
		if (data.size() < num_elements) {
			throw std::runtime_error{fmt::format("Trying to copy {} elements, but vector size is only {}.", num_elements, data.size())};
		}
		copy_from_host(data.data(), num_elements);
	}

	/// Copies data from the raw host pointer to fill the entire array
	void copy_from_host(const T* data) {
		copy_from_host(data, m_size);
	}

	/// Copies num_elements of data from the raw host pointer after enlarging the array so that everything fits in
	void enlarge_and_copy_from_host(const T* data, const size_t num_elements) {
		enlarge(num_elements);
		copy_from_host(data, num_elements);
	}

	/// Copies num_elements from the host vector after enlarging the array so that everything fits in
	void enlarge_and_copy_from_host(const std::vector<T>& data, const size_t num_elements) {
		enlarge_and_copy_from_host(data.data(), num_elements);
	}

	/// Copies the entire host vector after enlarging the array so that everything fits in
	void enlarge_and_copy_from_host(const std::vector<T>& data) {
		enlarge_and_copy_from_host(data.data(), data.size());
	}

	/// Copies num_elements of data from the raw host pointer after resizing the array
	void resize_and_copy_from_host(const T* data, const size_t num_elements) {
		resize(num_elements);
		copy_from_host(data, num_elements);
	}

	/// Copies num_elements from the host vector after resizing the array
	void resize_and_copy_from_host(const std::vector<T>& data, const size_t num_elements) {
		resize_and_copy_from_host(data.data(), num_elements);
	}

	/// Copies the entire host vector after resizing the array
	void resize_and_copy_from_host(const std::vector<T>& data) {
		resize_and_copy_from_host(data.data(), data.size());
	}

	/// Copies the entire host vector to the device. Fails if there is not enough space available.
	void copy_from_host(const std::vector<T>& data) {
		if (data.size() < m_size) {
			throw std::runtime_error{fmt::format("Trying to copy {} elements, but vector size is only {}.", m_size, data.size())};
		}
		copy_from_host(data.data(), m_size);
	}

	/// Copies num_elements from the device to a raw pointer on the host.
	/// Fails if this array holds fewer than num_elements.
	void copy_to_host(T* host_data, const size_t num_elements) const {
		if (num_elements > m_size) {
			throw std::runtime_error{fmt::format("Trying to copy {} elements, but memory size is only {}.", num_elements, m_size)};
		}
		CUDA_CHECK_THROW(cudaMemcpy(host_data, data(), num_elements * sizeof(T), cudaMemcpyDeviceToHost));
	}

	/// Copies num_elements from the device to a vector on the host
	void copy_to_host(std::vector<T>& data, const size_t num_elements) const {
		if (data.size() < num_elements) {
			throw std::runtime_error{fmt::format("Trying to copy {} elements, but vector size is only {}.", num_elements, data.size())};
		}
		copy_to_host(data.data(), num_elements);
	}

	/// Copies all elements from the device to a raw pointer on the host
	void copy_to_host(T* data) const {
		copy_to_host(data, m_size);
	}

	/// Copies all elements from the device to a vector on the host
	void copy_to_host(std::vector<T>& data) const {
		if (data.size() < m_size) {
			throw std::runtime_error{fmt::format("Trying to copy {} elements, but vector size is only {}", m_size, data.size())};
		}
		copy_to_host(data.data(), m_size);
	}

	/// Copies size elements from another device array to this one,
	/// automatically enlarging this array if it is too small.
	void copy_from_device(const GPUMemory<T>& other, const size_t size) {
		if (size == 0) {
			return;
		}

		if (m_size < size) {
			resize(size);
		}

		CUDA_CHECK_THROW(cudaMemcpy(m_data, other.m_data, size * sizeof(T), cudaMemcpyDeviceToDevice));
	}

	/// Copies data from another device array to this one, automatically resizing it
	void copy_from_device(const GPUMemory<T>& other) {
		copy_from_device(other, other.m_size);
	}

	/// Creates an (owned) copy of the data. `size` is the initial allocation
	/// size of the result; all m_size elements are copied (the result is
	/// enlarged further if m_size > size).
	GPUMemory<T> copy(size_t size) const {
		GPUMemory<T> result{size};
		result.copy_from_device(*this);
		return result;
	}

	GPUMemory<T> copy() const {
		return copy(m_size);
	}

	/// Raw device pointer to the payload; checks the debug guards first.
	T* data() const {
		check_guards();
		return m_data;
	}

	View view() const { return data(); }
	ConstView const_view() const { return view(); }

	bool managed() const {
		return m_managed;
	}

	/// Host-side element access; only valid for managed (unified) memory.
	T& at(size_t idx) const {
		if (!m_managed) {
			throw std::runtime_error{fmt::format("GPUMemory::at() not permitted if not managed.")};
		}

		// idx == m_size is already one past the end, hence >=.
		if (idx >= m_size) {
			throw std::runtime_error{fmt::format("GPUMemory out of bounds: idx={} size={}", idx, m_size)};
		}

		return m_data[idx];
	}

	TCNN_HOST_DEVICE T& operator[](size_t idx) const {
		if (idx >= m_size) {
			// Both format arguments must be supplied (%p and %zu).
			printf("WARNING: buffer overrun of %p at idx %zu\n", (void*)m_data, idx);
		}
		return m_data[idx];
	}

	TCNN_HOST_DEVICE T& operator[](uint32_t idx) const {
		if (idx >= m_size) {
			printf("WARNING: buffer overrun of %p at idx %u\n", (void*)m_data, idx);
		}
		return m_data[idx];
	}
	/** @} */

	size_t get_num_elements() const {
		return m_size;
	}

	size_t size() const {
		return get_num_elements();
	}

	size_t get_bytes() const {
		return m_size * sizeof(T);
	}

	size_t n_bytes() const {
		return get_bytes();
	}

	size_t bytes() const {
		return get_bytes();
	}
};
/// Arena for stream-ordered, light-weight allocations of temporary GPU memory.
/// When the driver supports virtual memory management, a large address range is
/// reserved once and physical memory is mapped into it on demand (no
/// re-allocations during progressive growing); otherwise a regular GPUMemory
/// buffer is used as backing storage.
class GPUMemoryArena {
public:
	GPUMemoryArena() {
		m_device = cuda_device();

		// Align memory at least by a cache line (128 bytes).
		m_alignment = (size_t)128;
		m_max_size = previous_multiple(cuda_memory_info().total, cuda_memory_granularity());

		m_free_intervals = {{0, m_max_size}};

		// Reserve an address range that would be sufficient for housing the entire
		// available GPU RAM (if nothing else was using the GPU). This is unlikely
		// to exhaust all available addresses (even if multiple GPUMemoryArenas are
		// used simultaneously), while also ensuring that we never exhaust the
		// reserved address range without running out of physical memory beforehand.
		if (cuda_supports_virtual_memory() && cuMemAddressReserve(&m_base_address, m_max_size, 0, 0, 0) == CUDA_SUCCESS) {
			return;
		}

		// Use regular memory as fallback
		m_fallback_memory = std::make_shared<GPUMemory<uint8_t>>();

		static bool printed_warning = false;
		if (!printed_warning) {
			printed_warning = true;
			log_warning(
				"GPUMemoryArena: GPU {} does not support virtual memory. "
				"Falling back to regular allocations, which will be larger and can cause occasional stutter.",
				m_device
			);
		}
	}

	// Custom move constructor (a defaulted one would merely copy
	// m_base_address and leave m_handles intact in the moved-from object,
	// whose destructor would then unmap/release them a second time and
	// double-subtract from total_n_bytes_allocated()).
	GPUMemoryArena(GPUMemoryArena&& other) noexcept :
		m_free_intervals{std::move(other.m_free_intervals)},
		m_allocated_intervals{std::move(other.m_allocated_intervals)},
		m_device{other.m_device},
		m_base_address{other.m_base_address},
		m_size{other.m_size},
		m_handles{std::move(other.m_handles)},
		m_fallback_memory{std::move(other.m_fallback_memory)},
		m_alignment{other.m_alignment},
		m_max_size{other.m_max_size}
	{
		// Leave `other` in a valid, empty state: nothing to unmap or release,
		// and in_use() reports false.
		other.m_base_address = {};
		other.m_size = 0;
		other.m_handles.clear();
		other.m_allocated_intervals.clear();
		other.m_free_intervals = {{0, other.m_max_size}};
	}

	GPUMemoryArena(const GPUMemoryArena& other) = delete;
	GPUMemoryArena& operator=(GPUMemoryArena&& other) = delete;
	GPUMemoryArena& operator=(const GPUMemoryArena& other) = delete;

	~GPUMemoryArena() {
		if (in_use()) {
			log_warning("Attempting to free memory arena while it is still in use.");
		}

		try {
			// Make sure we're clearing the GPU memory arena on the correct device.
			int previous_device = cuda_device();
			set_cuda_device(m_device);
			ScopeGuard revert_device = {[&]() { set_cuda_device(previous_device); }};

			CUDA_CHECK_THROW(cudaDeviceSynchronize());

			if (m_base_address) {
				total_n_bytes_allocated() -= m_size;

				CU_CHECK_THROW(cuMemUnmap(m_base_address, m_size));

				for (const auto& handle : m_handles) {
					CU_CHECK_THROW(cuMemRelease(handle));
				}

				CU_CHECK_THROW(cuMemAddressFree(m_base_address, m_max_size));
			}
		} catch (const std::runtime_error& error) {
			// Don't need to report on memory-free problems when the driver is shutting down.
			if (std::string{error.what()}.find("driver shutting down") == std::string::npos) {
				log_warning("Could not free memory arena: {}", error.what());
			}
		}
	}

	/// Base pointer of the arena's address space (virtual-memory path) or of
	/// the fallback buffer.
	uint8_t* data() {
		return m_fallback_memory ? m_fallback_memory->data() : (uint8_t*)m_base_address;
	}

	/// Non-null only when the arena runs in fallback (non-virtual-memory) mode.
	std::shared_ptr<GPUMemory<uint8_t>> backing_memory() {
		return m_fallback_memory;
	}

	// Finds the smallest interval of free memory in the GPUMemoryArena that's
	// large enough to hold the requested number of bytes. Then allocates
	// that memory.
	size_t allocate(size_t n_bytes) {
		// Permitting zero-sized allocations is error prone
		if (n_bytes == 0) {
			n_bytes = m_alignment;
		}

		// Align allocations with the nearest cache line (at least the granularity of the memory allocations)
		n_bytes = next_multiple(n_bytes, m_alignment);

		Interval<size_t>* best_candidate = &m_free_intervals.back();
		for (auto& f : m_free_intervals) {
			if (f.size() >= n_bytes && f.size() < best_candidate->size()) {
				best_candidate = &f;
			}
		}

		// If even the fallback candidate (normally the trailing free interval,
		// which extends to m_max_size) cannot hold the request, the arena's
		// address range is exhausted. Fail loudly rather than overrunning it.
		if (best_candidate->size() < n_bytes) {
			throw std::runtime_error{fmt::format("GPUMemoryArena: can not allocate {}: arena is out of address space.", bytes_to_string(n_bytes))};
		}

		size_t start = best_candidate->start;

		// Note: the += operator can turn `best_candidate` into an empty interval, which is fine because it will
		// be absorbed into adjacent free intervals in later calls to `merge_adjacent_intervals`.
		m_allocated_intervals[start] = best_candidate->start += n_bytes;

		enlarge(size());

		return start;
	}

	/// Returns an allocation (identified by its start offset) to the free list.
	void free(size_t start) {
		if (m_allocated_intervals.count(start) == 0) {
			throw std::runtime_error{"Attempted to free arena memory that was not allocated."};
		}

		Interval<size_t> interval = {start, m_allocated_intervals[start]};
		m_allocated_intervals.erase(start);

		// Keep m_free_intervals sorted, then coalesce neighbors.
		m_free_intervals.insert(
			std::upper_bound(std::begin(m_free_intervals), std::end(m_free_intervals), interval),
			interval
		);

		merge_adjacent_intervals();
	}

	/// Grows the backed portion of the arena to at least n_bytes. In fallback
	/// mode this reallocates (and copies) the backing GPUMemory; otherwise it
	/// maps additional physical memory into the reserved address range.
	void enlarge(size_t n_bytes) {
		if (n_bytes <= m_size) {
			return;
		}

		if (cuda_device() != m_device) {
			throw std::runtime_error{fmt::format("Attempted to use a GPUMemoryArena of device {} from the wrong device {}.", m_device, cuda_device())};
		}

		log_debug("GPUMemoryArena: enlarging from {} to {}", bytes_to_string(m_size), bytes_to_string(n_bytes));

		if (m_fallback_memory) {
			static const double GROWTH_FACTOR = 1.5;

			CUDA_CHECK_THROW(cudaDeviceSynchronize());

			// Over-allocate to amortize the cost of repeated growth.
			m_size = next_multiple((size_t)(n_bytes * GROWTH_FACTOR), cuda_memory_granularity());
			m_fallback_memory = std::make_shared<GPUMemory<uint8_t>>(m_fallback_memory->copy(m_size));

			CUDA_CHECK_THROW(cudaDeviceSynchronize());

			return;
		}

		// Virtual-memory path: create a new physical allocation and map it
		// directly behind the currently mapped region.
		size_t n_bytes_to_allocate = n_bytes - m_size;
		n_bytes_to_allocate = next_multiple(n_bytes_to_allocate, cuda_memory_granularity());

		CUmemAllocationProp prop = {};
		prop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
		prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
		prop.location.id = m_device;

		m_handles.emplace_back();
		CU_CHECK_THROW(cuMemCreate(&m_handles.back(), n_bytes_to_allocate, &prop, 0));

		CUmemAccessDesc access_desc = {};
		access_desc.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
		access_desc.location.id = prop.location.id;
		access_desc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;

		CU_CHECK_THROW(cuMemMap(m_base_address + m_size, n_bytes_to_allocate, 0, m_handles.back(), 0));
		CU_CHECK_THROW(cuMemSetAccess(m_base_address + m_size, n_bytes_to_allocate, &access_desc, 1));

		m_size += n_bytes_to_allocate;
		total_n_bytes_allocated() += n_bytes_to_allocate;

		// Need to synchronize the device to make sure memory is available to all streams.
		if (current_capture()) {
			current_capture()->schedule_synchronize();
		} else {
			CUDA_CHECK_THROW(cudaDeviceSynchronize());
		}
	}

	/// Number of bytes in use, i.e. the start of the trailing free interval.
	size_t size() const {
		return m_free_intervals.back().start;
	}

	/// True while any allocation is outstanding.
	bool in_use() const {
		return m_free_intervals.size() != 1 || m_free_intervals.front().size() != m_max_size;
	}

	/// RAII handle for one arena allocation; frees its interval on destruction
	/// and keeps both the arena and (if any) its backing memory alive.
	class Allocation {
	public:
		Allocation() = default;
		Allocation(cudaStream_t stream, size_t offset, const std::shared_ptr<GPUMemoryArena>& workspace)
			: m_stream{stream}, m_data{workspace->data() + offset}, m_offset{offset}, m_workspace{workspace}, m_backing_memory{workspace->backing_memory()}
		{}

		~Allocation() {
			if (m_workspace) {
				m_workspace->free(m_offset);
			}
		}

		Allocation(const Allocation& other) = delete;

		// Swap-based move: `other` adopts our (possibly empty) state and frees
		// it on destruction.
		Allocation& operator=(Allocation&& other) {
			std::swap(m_stream, other.m_stream);
			std::swap(m_data, other.m_data);
			std::swap(m_offset, other.m_offset);
			std::swap(m_workspace, other.m_workspace);
			std::swap(m_backing_memory, other.m_backing_memory);
			return *this;
		}

		Allocation(Allocation&& other) {
			*this = std::move(other);
		}

		uint8_t* data() {
			return m_data;
		}

		const uint8_t* data() const {
			return m_data;
		}

		cudaStream_t stream() const {
			return m_stream;
		}

	private:
		cudaStream_t m_stream = nullptr;
		uint8_t* m_data = nullptr;
		size_t m_offset = 0;
		std::shared_ptr<GPUMemoryArena> m_workspace = nullptr;

		// Backing GPUMemory (if backed by a GPUMemory). Ensures that
		// the backing memory is only freed once all allocations that
		// use it were destroyed.
		std::shared_ptr<GPUMemory<uint8_t>> m_backing_memory = nullptr;
	};

private:
	/// Coalesces neighboring free intervals in-place; assumes m_free_intervals
	/// is sorted by start offset.
	void merge_adjacent_intervals() {
		size_t j = 0;
		for (size_t i = 1; i < m_free_intervals.size(); ++i) {
			Interval<size_t>& prev = m_free_intervals[j];
			Interval<size_t>& cur = m_free_intervals[i];

			if (prev.end == cur.start) {
				prev.end = cur.end;
			} else {
				++j;
				m_free_intervals[j] = m_free_intervals[i];
			}
		}
		m_free_intervals.resize(j+1);
	}

	std::vector<Interval<size_t>> m_free_intervals;           // Sorted, disjoint free ranges covering [0, m_max_size).
	std::unordered_map<size_t, size_t> m_allocated_intervals; // start offset -> end offset of live allocations.

	int m_device = 0;
	CUdeviceptr m_base_address = {}; // Non-zero only on the virtual-memory path.
	size_t m_size = 0;               // Bytes currently backed by physical memory.

	std::vector<CUmemGenericAllocationHandle> m_handles;

	// Used when virtual memory isn't supported.
	// Requires more storage + memcpy, but is more portable.
	std::shared_ptr<GPUMemory<uint8_t>> m_fallback_memory = nullptr;

	size_t m_alignment;
	size_t m_max_size;
};
/// Registry mapping each CUDA stream to its dedicated memory arena.
/// Intentionally heap-allocated and never deleted so it outlives all
/// static-destruction-order hazards at process shutdown.
inline std::unordered_map<cudaStream_t, std::shared_ptr<GPUMemoryArena>>& stream_gpu_memory_arenas() {
	static auto* s_arenas = new std::unordered_map<cudaStream_t, std::shared_ptr<GPUMemoryArena>>{};
	return *s_arenas;
}
/// Registry mapping each device id to its global (stream-less) memory arena.
/// Intentionally leaked for the same static-destruction-order reason as
/// stream_gpu_memory_arenas().
inline std::unordered_map<int, std::shared_ptr<GPUMemoryArena>>& global_gpu_memory_arenas() {
	static auto* s_arenas = new std::unordered_map<int, std::shared_ptr<GPUMemoryArena>>{};
	return *s_arenas;
}
/// Allocates n_bytes of temporary workspace from the arena associated with
/// `stream` (or the device-global arena when stream is null), creating the
/// arena on first use. A request of zero bytes yields a null allocation.
inline GPUMemoryArena::Allocation allocate_workspace(cudaStream_t stream, size_t n_bytes) {
	if (n_bytes == 0) {
		// Return a null allocation if no bytes were requested.
		return {};
	}

	auto& arena_slot = stream ? stream_gpu_memory_arenas()[stream] : global_gpu_memory_arenas()[cuda_device()];
	if (arena_slot == nullptr) {
		arena_slot = std::make_shared<GPUMemoryArena>();
	}

	return GPUMemoryArena::Allocation{stream, arena_slot->allocate(n_bytes), arena_slot};
}
/// Rounds `bytes` up to the next multiple of the 128-byte cache line used as
/// the arena's allocation alignment.
inline size_t align_to_cacheline(size_t bytes) {
	constexpr size_t CACHELINE_BYTES = 128;
	return next_multiple(bytes, CACHELINE_BYTES);
}
/// Terminal overload of the workspace distributor: performs the single arena
/// allocation sized to cover everything up to and including the final buffer,
/// then hands out a pointer at `offset`.
template <typename First, typename FirstSize>
std::tuple<First*> allocate_workspace_and_distribute(cudaStream_t stream, GPUMemoryArena::Allocation* alloc, size_t offset, FirstSize first_size) {
	const size_t total_bytes = offset + align_to_cacheline(first_size * sizeof(First));
	*alloc = allocate_workspace(stream, total_bytes);
	return std::tuple<First*>{(First*)(alloc->data() + offset)};
}
/// Recursive overload: computes the cacheline-aligned offset of the next
/// buffer and recurses, so the terminal overload performs the one actual
/// allocation. Returns one pointer per requested type, all carved out of the
/// same workspace allocation.
template <typename First, typename ...Types, typename FirstSize, typename ...Sizes, std::enable_if_t<sizeof...(Types) != 0 && sizeof...(Types) == sizeof...(Sizes), int> = 0>
std::tuple<First*, Types*...> allocate_workspace_and_distribute(cudaStream_t stream, GPUMemoryArena::Allocation* alloc, size_t offset, FirstSize first_size, Sizes... sizes) {
	const size_t next_offset = offset + align_to_cacheline(first_size * sizeof(First));
	auto remaining = allocate_workspace_and_distribute<Types...>(stream, alloc, next_offset, sizes...);
	return std::tuple_cat(std::tuple<First*>{(First*)(alloc->data() + offset)}, remaining);
}
/// Convenience entry point: distributes buffers starting at offset zero.
template <typename ...Types, typename ...Sizes, std::enable_if_t<sizeof...(Types) == sizeof...(Sizes), int> = 0>
std::tuple<Types*...> allocate_workspace_and_distribute(cudaStream_t stream, GPUMemoryArena::Allocation* alloc, Sizes... sizes) {
	return allocate_workspace_and_distribute<Types...>(stream, alloc, size_t{0}, sizes...);
}
/// Destroys the arena bound to `stream`; a null stream refers to the
/// device-global arena of the current device. No-op if no arena exists.
inline void free_gpu_memory_arena(cudaStream_t stream) {
	if (!stream) {
		global_gpu_memory_arenas().erase(cuda_device());
		return;
	}
	stream_gpu_memory_arenas().erase(stream);
}
/// Destroys every per-stream and per-device memory arena in the process.
inline void free_all_gpu_memory_arenas() {
	stream_gpu_memory_arenas().clear();
	global_gpu_memory_arenas().clear();
}
| } | |