|
|
#pragma once

#include <c10/core/CachingDeviceAllocator.h>
#include <c10/cuda/CUDAGraphsC10Utils.h>
#include <c10/cuda/CUDAMacros.h>
#include <c10/cuda/CUDAStream.h>
#include <c10/util/ApproximateClock.h>
#include <c10/util/Exception.h>
#include <c10/util/Registry.h>

#include <atomic>
#include <cstddef>
#include <cstdint>
#include <functional>
#include <memory>
#include <string>
#include <unordered_map>
#include <unordered_set>
#include <utility>
#include <vector>
|
|
|
|
|
|
namespace c10 {

// Hook invoked by the CUDA caching allocator when it fails to serve an
// allocation: implementations release memory held by other subsystems so the
// allocation can be retried. Registered via REGISTER_FREE_MEMORY_CALLBACK.
class C10_CUDA_API FreeMemoryCallback {
 public:
  virtual ~FreeMemoryCallback() = default;
  // Attempts to free memory. NOTE(review): the bool presumably reports
  // whether anything was released — confirm against registered callbacks.
  virtual bool Execute() = 0;
};

C10_DECLARE_REGISTRY(FreeCudaMemoryCallbacksRegistry, FreeMemoryCallback);

// Registers a FreeMemoryCallback subclass under `name` in
// FreeCudaMemoryCallbacksRegistry so the allocator can discover it.
#define REGISTER_FREE_MEMORY_CALLBACK(name, ...) \
  C10_REGISTER_CLASS(FreeCudaMemoryCallbacksRegistry, name, __VA_ARGS__)

} // namespace c10
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
namespace c10::cuda::CUDACachingAllocator {

// Re-exported so existing code can keep writing
// CUDACachingAllocator::DeviceStats after the type moved to c10 core.
using c10::CachingDeviceAllocator::DeviceStats;

// Size threshold (bytes) separating "large" from "small" allocations/segments.
// Defined in the allocator implementation.
extern const size_t kLargeBuffer;

// Factory that captures an opaque context (e.g. a stack trace — see the
// allocator implementation) to attach to allocation trace/history entries.
typedef std::shared_ptr<GatheredContext> (*CreateContextFn)();
|
|
|
|
|
|
|
|
|
|
|
|
// Per-block metadata reported in allocator snapshots; one entry per block
// inside a SegmentInfo.
struct BlockInfo {
  size_t size = 0; // bytes held by this block
  size_t requested_size = 0; // bytes the caller originally asked for
  int32_t gc_counter = 0; // garbage-collection age counter (allocator impl)
  bool allocated = false; // currently handed out to a client
  bool active = false; // NOTE(review): distinct from `allocated` — presumably
                       // also covers blocks pending stream-ordered frees
  // Context captured at allocation time when history recording is enabled;
  // null otherwise.
  std::shared_ptr<GatheredContext> context_when_allocated;
};
|
|
|
|
|
|
|
|
|
// Metadata for one cached memory segment (a contiguous reservation carved
// into blocks), as reported by snapshot().
struct SegmentInfo {
  c10::DeviceIndex device = 0;
  size_t address = 0; // start address of the segment
  size_t total_size = 0; // bytes reserved for the whole segment
  size_t requested_size = 0; // sum of client-requested bytes (unrounded)
  size_t allocated_size = 0; // bytes in blocks currently allocated
  size_t active_size = 0; // bytes in blocks marked active
  cudaStream_t stream = nullptr; // stream the segment is associated with
  bool is_large = false; // large-pool vs small-pool segment
  bool is_expandable = false; // backed by an expandable segment
  MempoolId_t owner_private_pool_id = {0, 0}; // {0,0} = default (non-private) pool
  std::vector<BlockInfo> blocks; // per-block breakdown of this segment
  std::shared_ptr<GatheredContext> context_when_allocated;
};
|
|
|
|
|
|
// Opaque handle to a saved allocator pool state (see getCheckpointState /
// setCheckpointPoolState); the concrete type lives in the implementation.
struct AllocatorState {
  virtual ~AllocatorState() = default;
};
|
|
|
|
|
|
// Timestamp storage for trace entries. Only one member is meaningful at a
// time: entries are recorded with the cheap approximate clock (approx_t_);
// any conversion to wall-clock t_ happens outside this header.
union trace_time_ {
  time_t t_;
  approx_time_t approx_t_;
};
|
|
|
|
|
|
// One event in the per-device allocator trace collected when history
// recording is enabled (see recordHistory / SnapshotInfo::device_traces).
struct TraceEntry {
  // What happened. NOTE(review): value semantics below are inferred from the
  // names — confirm exact emission points against the allocator impl.
  enum Action {
    ALLOC, // memory handed out to a client
    FREE_REQUESTED, // client asked to free memory
    FREE_COMPLETED, // the free actually took effect (may be deferred)
    SEGMENT_ALLOC, // new segment reserved from the CUDA driver
    SEGMENT_FREE, // segment returned to the CUDA driver
    SEGMENT_MAP, // expandable-segment region mapped
    SEGMENT_UNMAP, // expandable-segment region unmapped
    SNAPSHOT, // snapshot() was called (correlates snapshots with traces)
    OOM // allocation failed with out-of-memory
  };
  // `context` optionally carries a captured allocation context;
  // `compile_context` tags the entry with the active compile region, if any.
  TraceEntry(
      Action action,
      c10::DeviceIndex device,
      size_t addr,
      size_t size,
      cudaStream_t stream,
      MempoolId_t mempool,
      approx_time_t time,
      std::shared_ptr<GatheredContext> context = nullptr,
      std::string compile_context = "")
      : action_(action),
        device_(device),
        addr_(addr),
        context_(std::move(context)),
        stream_(stream),
        size_(size),
        mempool_(std::move(mempool)),
        compile_context_(std::move(compile_context)) {
    // The union member is assigned in the body; only approx_t_ is valid here.
    time_.approx_t_ = time;
  }
  Action action_;
  c10::DeviceIndex device_;
  size_t addr_; // address for alloc/free; for OOM presumably carries other
                // info — confirm against the allocator impl
  std::shared_ptr<GatheredContext> context_;
  cudaStream_t stream_{};
  size_t size_;
  MempoolId_t mempool_;
  trace_time_ time_{};
  std::string compile_context_{};
};
|
|
|
|
|
|
|
|
|
// A user-supplied annotation attached to the allocator timeline (exposed via
// SnapshotInfo::external_annotations), timestamped with the approximate clock.
struct AnnotationEntry {
  AnnotationEntry(c10::DeviceIndex device, approx_time_t time)
      : device_(device) {
    time_.approx_t_ = time;
  }

  // Stores one key/value pair of user metadata; overwrites an existing key.
  void recordUserMetadata(const std::string& name, std::string value) {
    metadata_[name] = std::move(value);
  }

  c10::DeviceIndex device_;
  trace_time_ time_{};
  std::unordered_map<std::string, std::string> metadata_;
};
|
|
|
|
|
|
// Snapshot of the allocator's configuration at the time snapshot() was taken.
// NOTE(review): fields presumably mirror PYTORCH_CUDA_ALLOC_CONF knobs —
// confirm against the config parser in the implementation.
struct AllocatorConfigInfo {
  double garbage_collection_threshold;
  size_t max_split_size;
  size_t pinned_num_register_threads;
  bool expandable_segments;
  bool release_lock_on_malloc;
  bool pinned_use_host_register;
  std::string last_allocator_settings; // raw settings string last applied
  std::vector<size_t> roundup_power2_divisions;
};
|
|
|
|
|
|
// Full memory snapshot: current segments, per-device event traces, external
// annotations, and the allocator configuration in effect.
struct SnapshotInfo {
  std::vector<SegmentInfo> segments;
  std::vector<std::vector<TraceEntry>> device_traces; // indexed by device
  std::vector<AnnotationEntry> external_annotations;
  AllocatorConfigInfo config_metadata;
};
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// Result of rolling a pool back to a checkpoint (setCheckpointPoolState):
// pointers freed by the rollback and DataPtrs for allocations re-created to
// match the checkpointed state.
struct CheckpointDelta {
  std::vector<void*> ptrs_freed;
  std::vector<at::DataPtr> dataptrs_allocd;
};
|
|
|
|
|
|
// How much context to capture when history recording is enabled. Values are
// ordered: each level includes the capture points of the previous ones.
enum struct RecordContext {
  NEVER = 0,
  STATE = 1, // capture on state changes (e.g. snapshots/checkpoints)
  ALLOC = 2, // also capture at allocation
  ALL = 3, // capture everywhere (including frees)
};
|
|
|
|
|
|
// Callback fired when an allocation fails with out-of-memory; receives the
// device and the memory accounting at the time of failure.
using OutOfMemoryObserver = std::function<void(
    int64_t device,
    size_t allocated,
    size_t device_total,
    size_t device_free)>;

// Callback fired for allocator trace events (see attachAllocatorTraceTracker
// for the locking caveats documented at the attach point).
using AllocatorTraceTracker = std::function<void(const TraceEntry&)>;

// IPC export of an allocation: an opaque handle string plus the offset of
// the allocation within the shared segment the handle refers to.
struct ShareableHandle {
  ptrdiff_t offset;
  std::string handle;
};
|
|
|
|
|
|
// Abstract interface implemented by the CUDA allocator backends (the native
// caching allocator and the cudaMallocAsync-based allocator). The active
// backend is installed in the `allocator` atomic below; free functions in
// this namespace forward to it. Methods with TORCH_CHECK(false, ...) bodies
// are optional capabilities a backend may not support.
class CUDAAllocator : public Allocator {
 public:
  // Raw malloc/free-style entry points (no DataPtr wrapping).
  virtual void* raw_alloc(size_t nbytes) = 0;
  virtual void* raw_alloc_with_stream(size_t nbytes, cudaStream_t stream) = 0;
  virtual void raw_delete(void* ptr) = 0;
  // One-time setup with the number of visible devices.
  virtual void init(int device_count) = 0;
  virtual bool initialized() = 0;
  // Fraction of device memory this process may use on `device`.
  virtual double getMemoryFraction(c10::DeviceIndex device) = 0;
  virtual void setMemoryFraction(double fraction, c10::DeviceIndex device) = 0;
  // Releases cached memory; {0, 0} targets the default (non-private) pool.
  virtual void emptyCache(MempoolId_t mempool_id = {0, 0}) = 0;
  virtual void enable(bool value) = 0;
  virtual bool isEnabled() const = 0;
  virtual void cacheInfo(c10::DeviceIndex device, size_t* largestBlock) = 0;
  // Maps an interior pointer back to its base allocation (size optional out).
  virtual void* getBaseAllocation(void* ptr, size_t* size) = 0;
  // Marks the allocation as used on `stream` so it is not reused until that
  // stream's pending work completes.
  virtual void recordStream(const DataPtr&, CUDAStream stream) = 0;
  virtual c10::CachingDeviceAllocator::DeviceStats getDeviceStats(
      c10::DeviceIndex device) = 0;
  virtual void resetAccumulatedStats(c10::DeviceIndex device) = 0;
  virtual void resetPeakStats(c10::DeviceIndex device) = 0;
  virtual SnapshotInfo snapshot(MempoolId_t mempool_id = {0, 0}) = 0;
  // Routes allocations on streams accepted by `filter` into the given pool
  // until endAllocateToPool is called (used for CUDA graph capture).
  virtual void beginAllocateToPool(
      c10::DeviceIndex device,
      MempoolId_t mempool_id,
      std::function<bool(cudaStream_t)> filter) = 0;
  virtual void endAllocateToPool(
      c10::DeviceIndex device,
      MempoolId_t mempool_id) = 0;
  virtual void releasePool(c10::DeviceIndex device, MempoolId_t mempool_id) = 0;
  // Optional: reference count of a private pool.
  virtual int getPoolUseCount(
      c10::DeviceIndex ,
      MempoolId_t ) {
    TORCH_CHECK(
        false,
        name(),
        " does not yet support getPoolUseCount. "
        "If you need it, please file an issue describing your use case.");
  }
  // Optional: create a private pool (or bump its refcount), optionally
  // backed by a different allocator.
  virtual void createOrIncrefPool(
      c10::DeviceIndex ,
      MempoolId_t ,
      CUDAAllocator* allocator = nullptr) {
    TORCH_CHECK(
        false,
        name(),
        " does not yet support createOrIncrefPool. "
        "If you need it, please file an issue describing your use case.");
  }
  // Optional: allow the given pool to be used to satisfy allocations that
  // would otherwise OOM.
  virtual void setUseOnOOM(c10::DeviceIndex device, MempoolId_t mempool_id) {
    TORCH_CHECK(
        false,
        name(),
        " does not yet support setUseOnOOM. "
        "If you need it, please file an issue describing your use case.");
  }
  // Optional: verify a pool's live allocations match the expected set.
  virtual bool checkPoolLiveAllocations(
      c10::DeviceIndex ,
      MempoolId_t ,
      const std::unordered_set<void*>& ) {
    TORCH_CHECK(
        false,
        name(),
        " does not yet support checkPoolLiveAllocations. "
        "If you need it, please file an issue describing your use case.");
  }
  // IPC: export an allocation to another process / import a handle here.
  virtual ShareableHandle shareIpcHandle(void* ptr) = 0;
  virtual std::shared_ptr<void> getIpcDevPtr(std::string handle) = 0;
  // Optional: history recording (allocation traces, see recordHistory).
  virtual bool isHistoryEnabled() {
    TORCH_CHECK(
        false,
        name(),
        " does not yet support recordHistory. "
        "If you need it, please file an issue describing your use case.");
  }
  virtual void recordHistory(
      bool enabled,
      CreateContextFn context_recorder,
      size_t alloc_trace_max_entries,
      RecordContext when,
      bool clearHistory) = 0;
  // Default no-ops: backends may ignore annotations / compile contexts.
  virtual void recordAnnotation(
      const std::vector<std::pair<std::string, std::string>>& ) {}
  virtual void pushCompileContext(std::string& md) {}
  virtual void popCompileContext() {}
  virtual void attachOutOfMemoryObserver(OutOfMemoryObserver observer) = 0;
  // Attaches a tracker called on every trace event. NOTE(review): likely
  // invoked under allocator-internal locks — confirm re-entrancy rules in
  // the implementation before doing real work in the callback.
  virtual void attachAllocatorTraceTracker(AllocatorTraceTracker tracker) = 0;
  virtual void enablePeerAccess(
      c10::DeviceIndex dev,
      c10::DeviceIndex dev_to_access) = 0;
  // memcpy that may take the peer-to-peer fast path when p2p_enabled.
  virtual cudaError_t memcpyAsync(
      void* dst,
      int dstDevice,
      const void* src,
      int srcDevice,
      size_t count,
      cudaStream_t stream,
      bool p2p_enabled) = 0;
  // Pool checkpointing: save a pool's state and later restore it.
  virtual std::shared_ptr<AllocatorState> getCheckpointState(
      c10::DeviceIndex device,
      MempoolId_t id) = 0;
  virtual CheckpointDelta setCheckpointPoolState(
      c10::DeviceIndex device,
      std::shared_ptr<AllocatorState> pps) = 0;
  // Human-readable backend name (used in the error messages above).
  virtual std::string name() = 0;
};
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// Process-wide pointer to the active allocator backend. Atomic because it is
// read from many threads; it is set during backend selection — see the
// implementation for when (if ever) it may change afterwards.
C10_CUDA_API extern std::atomic<CUDAAllocator*> allocator;

// Returns the currently installed allocator backend.
inline CUDAAllocator* get() {
  return allocator.load();
}
|
|
|
|
|
|
|
|
|
inline void* raw_alloc(size_t nbytes) {
|
|
|
return get()->raw_alloc(nbytes);
|
|
|
}
|
|
|
|
|
|
inline void* raw_alloc_with_stream(size_t nbytes, cudaStream_t stream) {
|
|
|
return get()->raw_alloc_with_stream(nbytes, stream);
|
|
|
}
|
|
|
|
|
|
inline void raw_delete(void* ptr) {
|
|
|
return get()->raw_delete(ptr);
|
|
|
}
|
|
|
|
|
|
inline void init(int device_count) {
|
|
|
return get()->init(device_count);
|
|
|
}
|
|
|
|
|
|
inline double getMemoryFraction(c10::DeviceIndex device) {
|
|
|
return get()->getMemoryFraction(device);
|
|
|
}
|
|
|
|
|
|
inline void setMemoryFraction(double fraction, c10::DeviceIndex device) {
|
|
|
return get()->setMemoryFraction(fraction, device);
|
|
|
}
|
|
|
|
|
|
inline void emptyCache(MempoolId_t mempool_id = {0, 0}) {
|
|
|
return get()->emptyCache(mempool_id);
|
|
|
}
|
|
|
|
|
|
inline void enable(bool value) {
|
|
|
return get()->enable(value);
|
|
|
}
|
|
|
|
|
|
inline bool isEnabled() {
|
|
|
return get()->isEnabled();
|
|
|
}
|
|
|
|
|
|
inline void cacheInfo(c10::DeviceIndex device, size_t* largestBlock) {
|
|
|
return get()->cacheInfo(device, largestBlock);
|
|
|
}
|
|
|
|
|
|
inline void* getBaseAllocation(void* ptr, size_t* size) {
|
|
|
return get()->getBaseAllocation(ptr, size);
|
|
|
}
|
|
|
|
|
|
inline void recordStream(const DataPtr& dataPtr, CUDAStream stream) {
|
|
|
return get()->recordStream(dataPtr, stream);
|
|
|
}
|
|
|
|
|
|
inline c10::CachingDeviceAllocator::DeviceStats getDeviceStats(
|
|
|
c10::DeviceIndex device) {
|
|
|
return get()->getDeviceStats(device);
|
|
|
}
|
|
|
|
|
|
inline void resetAccumulatedStats(c10::DeviceIndex device) {
|
|
|
return get()->resetAccumulatedStats(device);
|
|
|
}
|
|
|
|
|
|
inline void resetPeakStats(c10::DeviceIndex device) {
|
|
|
return get()->resetPeakStats(device);
|
|
|
}
|
|
|
|
|
|
inline SnapshotInfo snapshot(MempoolId_t mempool_id = {0, 0}) {
|
|
|
return get()->snapshot(mempool_id);
|
|
|
}
|
|
|
|
|
|
inline std::shared_ptr<AllocatorState> getCheckpointState(
|
|
|
c10::DeviceIndex device,
|
|
|
MempoolId_t id) {
|
|
|
return get()->getCheckpointState(device, id);
|
|
|
}
|
|
|
|
|
|
inline CheckpointDelta setCheckpointPoolState(
|
|
|
c10::DeviceIndex device,
|
|
|
std::shared_ptr<AllocatorState> pps) {
|
|
|
return get()->setCheckpointPoolState(device, std::move(pps));
|
|
|
}
|
|
|
|
|
|
|
|
|
inline void beginAllocateToPool(
|
|
|
c10::DeviceIndex device,
|
|
|
MempoolId_t mempool_id,
|
|
|
std::function<bool(cudaStream_t)> filter) {
|
|
|
get()->beginAllocateToPool(device, mempool_id, std::move(filter));
|
|
|
}
|
|
|
|
|
|
inline void endAllocateToPool(c10::DeviceIndex device, MempoolId_t mempool_id) {
|
|
|
get()->endAllocateToPool(device, mempool_id);
|
|
|
}
|
|
|
|
|
|
inline void recordHistory(
|
|
|
bool enabled,
|
|
|
CreateContextFn context_recorder,
|
|
|
size_t alloc_trace_max_entries,
|
|
|
RecordContext when,
|
|
|
bool clearHistory) {
|
|
|
return get()->recordHistory(
|
|
|
enabled, context_recorder, alloc_trace_max_entries, when, clearHistory);
|
|
|
}
|
|
|
|
|
|
inline void recordAnnotation(
|
|
|
const std::vector<std::pair<std::string, std::string>>& md) {
|
|
|
return get()->recordAnnotation(md);
|
|
|
}
|
|
|
|
|
|
inline void pushCompileContext(std::string& md) {
|
|
|
return get()->pushCompileContext(md);
|
|
|
}
|
|
|
|
|
|
inline void popCompileContext() {
|
|
|
return get()->popCompileContext();
|
|
|
}
|
|
|
|
|
|
inline bool isHistoryEnabled() {
|
|
|
return get()->isHistoryEnabled();
|
|
|
}
|
|
|
|
|
|
inline bool checkPoolLiveAllocations(
|
|
|
c10::DeviceIndex device,
|
|
|
MempoolId_t mempool_id,
|
|
|
const std::unordered_set<void*>& expected_live_allocations) {
|
|
|
return get()->checkPoolLiveAllocations(
|
|
|
device, mempool_id, expected_live_allocations);
|
|
|
}
|
|
|
|
|
|
inline void attachOutOfMemoryObserver(OutOfMemoryObserver observer) {
|
|
|
return get()->attachOutOfMemoryObserver(std::move(observer));
|
|
|
}
|
|
|
|
|
|
inline void attachAllocatorTraceTracker(AllocatorTraceTracker tracker) {
|
|
|
return get()->attachAllocatorTraceTracker(std::move(tracker));
|
|
|
}
|
|
|
|
|
|
inline void releasePool(c10::DeviceIndex device, MempoolId_t mempool_id) {
|
|
|
return get()->releasePool(device, mempool_id);
|
|
|
}
|
|
|
inline void createOrIncrefPool(
|
|
|
c10::DeviceIndex device,
|
|
|
MempoolId_t mempool_id,
|
|
|
CUDAAllocator* allocator_ptr = nullptr) {
|
|
|
get()->createOrIncrefPool(device, mempool_id, allocator_ptr);
|
|
|
}
|
|
|
inline void setUseOnOOM(c10::DeviceIndex device, MempoolId_t mempool_id) {
|
|
|
get()->setUseOnOOM(device, mempool_id);
|
|
|
}
|
|
|
|
|
|
inline int getPoolUseCount(c10::DeviceIndex device, MempoolId_t mempool_id) {
|
|
|
return get()->getPoolUseCount(device, mempool_id);
|
|
|
}
|
|
|
|
|
|
|
|
|
inline std::shared_ptr<void> getIpcDevPtr(std::string handle) {
|
|
|
return get()->getIpcDevPtr(std::move(handle));
|
|
|
}
|
|
|
|
|
|
inline ShareableHandle shareIpcHandle(void* ptr) {
|
|
|
return get()->shareIpcHandle(ptr);
|
|
|
}
|
|
|
|
|
|
inline std::string name() {
|
|
|
return get()->name();
|
|
|
}
|
|
|
|
|
|
inline cudaError_t memcpyAsync(
|
|
|
void* dst,
|
|
|
int dstDevice,
|
|
|
const void* src,
|
|
|
int srcDevice,
|
|
|
size_t count,
|
|
|
cudaStream_t stream,
|
|
|
bool p2p_enabled) {
|
|
|
return get()->memcpyAsync(
|
|
|
dst, dstDevice, src, srcDevice, count, stream, p2p_enabled);
|
|
|
}
|
|
|
|
|
|
inline void enablePeerAccess(
|
|
|
c10::DeviceIndex dev,
|
|
|
c10::DeviceIndex dev_to_access) {
|
|
|
return get()->enablePeerAccess(dev, dev_to_access);
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
namespace c10::cuda {

// RAII-style handle to a private allocator memory pool. Constructing a
// MemPool obtains a pool id (and registers it with the given allocator when
// provided); move-only so ownership of the pool reference is unique.
struct C10_CUDA_API MemPool {
  MemPool(
      CUDACachingAllocator::CUDAAllocator* allocator = nullptr,
      bool is_user_created = true,
      bool use_on_oom = false,
      bool symmetric = false);
  MemPool(const MemPool&) = delete;
  MemPool(MemPool&&) = default;
  MemPool& operator=(const MemPool&) = delete;
  MemPool& operator=(MemPool&&) = default;
  ~MemPool();

  // Identifier used with the CUDACachingAllocator pool APIs.
  MempoolId_t id();
  bool is_symmetric();
  // Backend this pool allocates from; may be null (default backend).
  CUDACachingAllocator::CUDAAllocator* allocator();
  int use_count();
  c10::DeviceIndex device();
  // Generates a fresh pool id without constructing a MemPool (used for CUDA
  // graph capture pools).
  static MempoolId_t graph_pool_handle(bool is_user_created = true);

 private:
  // Monotonic id sources; two counters presumably distinguish user-created
  // from internally created pools — confirm in the implementation.
  static std::atomic<CaptureId_t> uid_;
  static std::atomic<CaptureId_t> uuid_;
  CUDACachingAllocator::CUDAAllocator* allocator_;
  bool is_user_created_;
  MempoolId_t id_;
  bool symmetric_;
  c10::DeviceIndex device_;
};

} // namespace c10::cuda
|
|
|
|