#ifndef CAFFE_SYNCEDMEM_HPP_ #define CAFFE_SYNCEDMEM_HPP_ #include #ifdef USE_MKL #include "mkl.h" #endif #include "caffe/common.hpp" namespace caffe { // If CUDA is available and in GPU mode, host memory will be allocated pinned, // using cudaMallocHost. It avoids dynamic pinning for transfers (DMA). // The improvement in performance seems negligible in the single GPU case, // but might be more significant for parallel training. Most importantly, // it improved stability for large models on many GPUs. inline void CaffeMallocHost(void** ptr, size_t size, bool* use_cuda) { #ifndef CPU_ONLY if (Caffe::mode() == Caffe::GPU) { CUDA_CHECK(cudaMallocHost(ptr, size)); *use_cuda = true; return; } #endif #ifdef USE_MKL *ptr = mkl_malloc(size ? size:1, 64); #else *ptr = malloc(size); #endif *use_cuda = false; CHECK(*ptr) << "host allocation of size " << size << " failed"; } inline void CaffeFreeHost(void* ptr, bool use_cuda) { #ifndef CPU_ONLY if (use_cuda) { CUDA_CHECK(cudaFreeHost(ptr)); return; } #endif #ifdef USE_MKL mkl_free(ptr); #else free(ptr); #endif } /** * @brief Manages memory allocation and synchronization between the host (CPU) * and device (GPU). * * TODO(dox): more thorough description. */ class SyncedMemory { public: SyncedMemory(); explicit SyncedMemory(size_t size); ~SyncedMemory(); const void* cpu_data(); void set_cpu_data(void* data); const void* gpu_data(); void set_gpu_data(void* data); void* mutable_cpu_data(); void* mutable_gpu_data(); enum SyncedHead { UNINITIALIZED, HEAD_AT_CPU, HEAD_AT_GPU, SYNCED }; SyncedHead head() const { return head_; } size_t size() const { return size_; } #ifndef CPU_ONLY void async_gpu_push(const cudaStream_t& stream); #endif private: void check_device(); void to_cpu(); void to_gpu(); void* cpu_ptr_; void* gpu_ptr_; size_t size_; SyncedHead head_; bool own_cpu_data_; bool cpu_malloc_use_cuda_; bool own_gpu_data_; int device_; DISABLE_COPY_AND_ASSIGN(SyncedMemory); }; // class SyncedMemory } // namespace caffe #endif // CAFFE_SYNCEDMEM_HPP_