// Copyright © 2022 Apple Inc.
#pragma once
#include <ATen/mps/MPSAllocatorInterface.h>
#include <ATen/mps/MPSEvent.h>
#include <ATen/mps/MPSStream.h>
#include <c10/util/flat_hash_map.h>
#include <mach/vm_page_size.h>
#include <cstdio>
#include <mutex>
#include <set>
#include <unordered_set>
// this implementation is based on CUDACachingAllocator.
// It utilizes Metal Heaps to improve buffer allocation performance.
// Do not include this header. Use MPSAllocatorInterface.h instead.
// TODO: Unify the logic with CUDACachingAllocator and remove redundant code.
namespace at::mps::HeapAllocator {
static const size_t kMaxSmallAlloc = MB(1); // largest "small" allocation is 1 MiB
static const size_t kMinLargeAlloc = MB(10); // allocations between 1 and 10 MiB may use kLargeHeap
static const size_t kRoundLarge = MB(2); // round up large allocations to 2 MiB
static const size_t kSmallHeap = MB(8); // "small" allocations are packed in 8 MiB heaps
static const size_t kLargeHeap = MB(32); // "large" allocations may be packed in 32 MiB heaps
static const size_t kXLargeHeapD =
MB(128); // "extra large" allocations on Discrete devices may be packed in 128 MiB heaps
static const size_t kXLargeHeapU =
MB(1024); // "extra large" allocations on Unified devices may be packed in 1 GiB heaps
static const size_t kMaxScalarAlloc = (sizeof(int64_t)); // largest "scalar" allocation
// buffer pools can be customized with a combination of usage flags
enum UsageFlags : uint32_t {
PRIVATE = 0,
SMALL = (1 << 0), // small heaps have sizes of kSmallHeap, and large ones kLargeHeap
  SHARED = (1 << 1), // pools are shared between host and device on unified-memory systems; otherwise, buffers stay private to the device
MANAGED = (1 << 2), // managed storage mode
HAZARD = (1 << 3), // enables Automatic Hazard Tracking for the resources allocated on the pool
SCALAR = (1 << 4), // used to import CPU scalar values to GPU and use them in MPS Stream
};
// debug verbosity flags
enum DebugVerbosity : uint32_t {
SILENT = 0,
PROFILING = (1 << 0), // print generic profiling data for total system memory usage
ALLOCATIONS = (1 << 1), // print buffer allocations
RECYCLES = (1 << 2), // print buffer recycling
RELEASES = (1 << 3), // print buffer releases
LARGE_ONLY = (1 << 4), // only log large buffer pool transactions
};
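// e.g., setting PYTORCH_DEBUG_MPS_ALLOCATOR=3 would combine PROFILING and
// ALLOCATIONS (assuming the env-var value is parsed as this bitmask; see
// m_debug_verbosity below)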
struct HeapBlock;
struct BufferBlock {
id<MTLBuffer> buffer;
void* cpu_ptr = nullptr; // stores the pointer to CPU mapping of a Shared MTLBuffer
size_t size; // size after alignment
size_t requested_size; // requested size (before alignment)
// buffer shape is used for retrieving base of views in cached graphs
std::vector<int64_t> shape;
bool in_use = false;
HeapBlock* heap;
id_t buf_id;
  // counter used to identify the least-recently-used buffers as candidates for garbage collection
uint32_t gc_count = 0;
uint32_t use_count = 0;
// counter to assign unique ids to buffer blocks
static uint64_t buffer_counter;
// Metal events used to sync GPU/CPU operations on the shared-storage buffers
MPSEventPtr event;
BufferBlock(size_t Size, size_t RequestedSize = 0, const id<MTLBuffer> Buffer = nullptr, HeapBlock* Heap = nullptr)
: buffer(Buffer), size(Size), requested_size(RequestedSize), heap(Heap), buf_id(Buffer ? ++buffer_counter : 0) {}
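  // orders blocks by size, with the buffer address as a tie-breaker, so a
  // std::set-based pool can serve best-fit lookups (e.g., two 4 KiB blocks
  // compare by address, while a 4 KiB block always precedes an 8 KiB one)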
static bool Comparator(const BufferBlock* a, const BufferBlock* b) {
return (a->size != b->size) ? a->size < b->size : (uintptr_t)a->buffer < (uintptr_t)b->buffer;
}
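  // rounds Size up to the next multiple of Alignment, which must be a power
  // of two; e.g., alignUp(18, 16) == 32 and alignUp(32, 16) == 32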
static size_t alignUp(size_t Size, size_t Alignment) {
assert(((Alignment - 1) & Alignment) == 0);
return ((Size + Alignment - 1) & ~(Alignment - 1));
}
uint32_t retainCount() const {
return [buffer retainCount];
}
};
typedef bool (*BufferComparison)(const BufferBlock*, const BufferBlock*);
struct BufferPool;
struct AllocParams {
AllocParams(size_t Alloc_Size, size_t Requested_Size, BufferPool* Pool)
: search_key(Alloc_Size), pool(Pool), requested_size(Requested_Size) {}
size_t size() const {
return search_key.size;
}
BufferBlock search_key;
BufferPool* pool;
BufferBlock* buffer_block = nullptr;
size_t requested_size;
// true if we exceed the low watermark limit. In this case
// we apply strategies to relieve the pressure before allocation.
bool has_memory_pressure = false;
// true if we're allocating on a unified memory device
bool has_unified_memory = true;
};
struct HeapBlock {
id<MTLHeap> heap;
struct {
size_t total, available;
} size;
BufferPool* pool;
unsigned int n_buffers = 0;
id_t heap_id;
  // indicates whether this heap is split to sub-allocate several buffers (otherwise it holds a single buffer)
bool is_split;
// counter to assign unique ids to heap blocks
static uint64_t heap_counter;
HeapBlock(size_t Size, const id<MTLHeap> Heap = nullptr, BufferPool* Pool = nullptr)
: heap(Heap),
size({.total = Size, .available = Size}),
pool(Pool),
heap_id(Heap ? ++heap_counter : 0),
is_split(true) {}
static MTLResourceOptions getOptions(uint32_t usage) {
// TODO: check the caching performance of write-combined mode
MTLResourceOptions options = MTLResourceCPUCacheModeDefaultCache;
if (usage & UsageFlags::MANAGED)
options |= MTLResourceStorageModeManaged;
else if (usage & UsageFlags::SHARED)
options |= MTLResourceStorageModeShared;
else
options |= MTLResourceStorageModePrivate;
options |=
(usage & UsageFlags::HAZARD) ? MTLResourceHazardTrackingModeTracked : MTLResourceHazardTrackingModeUntracked;
return options;
}
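  // worked example: getOptions(UsageFlags::SHARED | UsageFlags::HAZARD) yields
  // MTLResourceStorageModeShared | MTLResourceHazardTrackingModeTracked
  // (combined with the default CPU cache mode)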
static HeapBlock* createHeapBlock(AllocParams& params, id<MTLDevice> device, uint32_t usage) {
HeapBlock* heapBlock = nullptr;
bool is_split = true;
const size_t size = params.size();
MTLHeapDescriptor* d = [MTLHeapDescriptor new];
if (d) {
const size_t kXLargeHeap = params.has_unified_memory ? kXLargeHeapU : kXLargeHeapD;
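      // sizing tiers, by way of example: a 512 KiB request is served from an
      // 8 MiB small heap, a 4 MiB request from a 32 MiB large heap, and a
      // 20 MiB request (unified memory, no memory pressure) from a 1 GiB
      // XLarge heap; larger requests, or mid-size requests made under memory
      // pressure, get their own unsplit heap rounded up to a multiple of
      // kRoundLarge (2 MiB)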
if (size <= kMaxSmallAlloc) {
d.size = kSmallHeap;
} else if (size < kMinLargeAlloc) {
d.size = kLargeHeap;
} else if (size < kXLargeHeap / 2 && !params.has_memory_pressure) {
d.size = kXLargeHeap;
} else {
d.size = kRoundLarge * ((size + kRoundLarge - 1) / kRoundLarge);
is_split = false;
}
d.storageMode = (usage & UsageFlags::SHARED) ? MTLStorageModeShared : MTLStorageModePrivate;
d.cpuCacheMode = MTLCPUCacheModeDefaultCache;
// this automatically handles Metal buffer access synchronizations at the
// cost of slightly lower performance.
d.hazardTrackingMode =
(usage & UsageFlags::HAZARD) ? MTLHazardTrackingModeTracked : MTLHazardTrackingModeUntracked;
d.resourceOptions = getOptions(usage);
d.type = MTLHeapTypeAutomatic;
id<MTLHeap> heap = [device newHeapWithDescriptor:d];
if (heap) {
[heap setPurgeableState:MTLPurgeableStateNonVolatile];
const size_t heap_size = heapAvailableSize(heap);
heapBlock = new HeapBlock(heap_size, heap, params.pool);
if (heapBlock) {
heapBlock->is_split = is_split;
}
}
[d release];
}
return heapBlock;
}
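  // orders heaps by available size, with the heap address as a tie-breaker,
  // so the pool's std::set can pick the heap with the least (but sufficient)
  // free space for a new buffer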
static bool Comparator(const HeapBlock* a, const HeapBlock* b) {
return (a->size.available != b->size.available) ? a->size.available < b->size.available
: (uintptr_t)a->heap < (uintptr_t)b->heap;
}
static NSUInteger heapAvailableSize(id<MTLHeap> heap, size_t Alignment = vm_page_size) {
return [heap maxAvailableSizeWithAlignment:Alignment];
}
NSUInteger Size() {
return [heap size];
}
id<MTLBuffer> newMTLBuffer(size_t length, uint32_t usage) {
id<MTLBuffer> buf = [heap newBufferWithLength:length options:getOptions(usage)];
if (buf) {
updateAvailableSize();
n_buffers++;
}
return buf;
}
// returns the retainCount before releasing the buffer
uint32_t releaseMTLBuffer(id<MTLBuffer>& buffer) {
const uint32_t retainCount = [buffer retainCount];
[buffer release];
buffer = nil;
updateAvailableSize();
n_buffers--;
return retainCount;
}
// returns the retainCount before releasing the heap
uint32_t releaseMTLHeap() {
const uint32_t retainCount = [heap retainCount];
    TORCH_INTERNAL_ASSERT(!n_buffers); // the heap must be empty (no outstanding buffers) before it's released
[heap setPurgeableState:MTLPurgeableStateEmpty];
[heap release];
heap = nil;
size.available = 0;
return retainCount;
}
uint32_t retainCount() const {
return [heap retainCount];
}
void updateAvailableSize() {
size.available = heapAvailableSize(heap);
}
};
typedef bool (*HeapComparison)(const HeapBlock*, const HeapBlock*);
struct BufferPool {
enum class Kind {
PRIVATE_SMALL,
PRIVATE_LARGE,
SHARED_SMALL,
SHARED_LARGE,
SCALAR,
};
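  // illustrative mapping of kinds to usage flags (an assumption; see UsageFlags):
  // e.g., a SHARED_SMALL pool would be built with (SMALL | SHARED), while a
  // PRIVATE_LARGE pool sets neither of those flags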
BufferPool(const id<MTLDevice> Device, uint32_t Usage)
: device(Device), usage(Usage), heaps(HeapBlock::Comparator), available_buffers(BufferBlock::Comparator) {}
const id<MTLDevice> device;
// usage flags to customize the pool for various purposes (see UsageFlags enum)
const uint32_t usage;
// total number of buffers in the pool
uint32_t n_buffers = 0;
  // total size of allocations in this pool
size_t allocated_size = 0;
// total memory available in the pool
size_t available_size = 0;
// list of heaps ordered by their "available" (not total) memory size
std::set<HeapBlock*, HeapComparison> heaps;
// list of only "available" buffers in the pool (i.e., buffers not in-use)
std::set<BufferBlock*, BufferComparison> available_buffers;
  // list of buffers in a state of "limbo": they've already been freed on the
  // PyTorch side, but couldn't be returned to the pool yet because command
  // buffers with retainCount > 1 are still using them. In this state, a buffer
  // is neither ready to be recycled nor available in the pool.
  // These buffers are returned to the pool once the command buffers'
  // completionHandler callbacks are called.
  std::unordered_set<BufferBlock*> buffers_pending_free;
// list of heaps pending size update
std::unordered_set<HeapBlock*> heaps_pending_update;
};
class MPSHeapAllocatorImpl {
public:
explicit MPSHeapAllocatorImpl()
: m_device(at::mps::MPSDevice::getInstance()->device()),
m_max_buffer_size([m_device maxBufferLength]),
m_stream(getDefaultMPSStream()),
m_event_pool(getMPSEventPool()) {
init_allocator();
}
~MPSHeapAllocatorImpl() {
emptyCache();
}
// interface exposed to at::Allocator
id<MTLBuffer> malloc(size_t size, uint32_t usage);
  // frees a buffer and returns it to its buffer pool
void free(void* ptr);
// releases all the cached buffers and their associated heaps
void emptyCache();
// free inactive buffers that are pending to be freed
void freeInactiveBuffers();
// returns true if buffer was allocated from the shared pool
bool isSharedBuffer(const void* ptr);
// get the requested unaligned size of an MTLBuffer
ssize_t getUnalignedBufferSize(const void* ptr);
// set the shape of a base tensor from a view tensor
void setBufferShape(const void* ptr, const IntArrayRef& shape);
// retrieve the shape of a base tensor from a view tensor
IntArrayRef getBufferShape(const void* ptr);
// get the unique ID of the buffer
id_t getBufferId(const void* ptr);
// allocate a buffer from a specialized pool to import CPU scalars into GPU
id<MTLBuffer> allocScalarBufferWithValue(void* value, size_t size);
  // returns a CPU mapping of the input buffer and its retainCount,
  // but only if the buffer has Shared storage mode and was allocated by MPSAllocator
  std::pair<const void*, uint32_t> getSharedBufferPtr(const void* buffer);
  // records events for a list of MTLBuffers (a list is used so the mutex is locked only once)
  // returns true if any event was recorded (i.e., the passed buffers exist and are shared-storage)
  bool recordEvents(c10::ArrayRef<const void*> buffers);
  // waits for the events that signal the completion of GPU execution
  // on the passed shared buffers (a list is used so the mutex is locked only once)
  // returns true if we actually waited on any event
  bool waitForEvents(c10::ArrayRef<const void*> buffers);
  // indicates how far (in Megabytes) the current total allocations are from the
  // low watermark limit, which is used to detect whether we're under memory pressure;
  // returns zero if we've reached the low watermark limit
  ssize_t getLowWatermarkValue();
// (see m_low_watermark_ratio for description)
void setLowWatermarkRatio(double ratio);
// (see m_high_watermark_ratio for description)
void setHighWatermarkRatio(double ratio);
// (see m_low_watermark_limit for description)
size_t getLowWatermarkLimit() const {
return m_low_watermark_limit;
}
// (see m_max_total_allowed_size for description)
size_t getHighWatermarkLimit() const {
return m_max_total_allowed_size;
}
// (see m_total_allocated_memory for description)
size_t getTotalAllocatedMemory() const {
return m_total_allocated_memory;
}
// (see m_current_allocated_memory for description)
size_t getCurrentAllocatedMemory() const {
return m_current_allocated_memory;
}
  // total GPU memory allocated in the process by the Metal driver, including
  // implicit allocations from the MPS/MPSGraph frameworks and from MPSHeapAllocatorImpl
size_t getDriverAllocatedMemory() const {
return current_allocated_size();
}
  // recommended maximum working-set memory for Metal
size_t getRecommendedMaxMemory() const {
return max_device_size();
}
// (see enum DebugVerbosity for description)
uint32_t getDebugVerbosity() const {
return m_debug_verbosity;
}
// returns the device that we allocate from
inline id<MTLDevice> Device() const {
return m_device;
}
inline std::string format_size(uint64_t size) const;
private:
// (see m_high_watermark_ratio for description)
constexpr static double default_high_watermark_ratio = 1.7;
// we set the allowed upper bound to twice the size of recommendedMaxWorkingSetSize.
constexpr static double default_high_watermark_upper_bound = 2.0;
// (see m_low_watermark_ratio for description)
// on unified memory, we could allocate beyond the recommendedMaxWorkingSetSize
constexpr static double default_low_watermark_ratio_unified = 1.4;
constexpr static double default_low_watermark_ratio_discrete = 1.0;
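  // worked example (assuming a 32 GiB unified-memory machine where
  // recommendedMaxWorkingSetSize is ~75% of system memory, i.e., ~24 GiB):
  // the default high watermark ratio of 1.7 caps total allocations at
  // ~40.8 GiB, and the default low watermark ratio of 1.4 starts relieving
  // memory pressure at ~33.6 GiB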
const id<MTLDevice> m_device;
std::recursive_mutex m_mutex;
// allocated buffers by device pointer
ska::flat_hash_map<const void*, BufferBlock*> m_allocated_buffers;
// using a container for pools to simplify iterating them
ska::flat_hash_map<BufferPool::Kind, std::unique_ptr<BufferPool>> m_pools;
// total memory allocated by HeapAllocator (including blocks in pools)
size_t m_total_allocated_memory = 0;
// currently active memory allocations in use (i.e., blocks not in pools)
size_t m_current_allocated_memory = 0;
// max buffer size allowed by Metal
size_t m_max_buffer_size = 0;
// maximum total size allowed to be allocated
size_t m_max_total_allowed_size = 0;
  // high watermark ratio is a hard limit for the total allowed allocations
  // 0. : disables the high watermark limit (may cause system failure if system-wide OOM occurs)
  // 1. : the recommended maximum allocation size (i.e., device.recommendedMaxWorkingSetSize)
  // >1.: allows limits beyond device.recommendedMaxWorkingSetSize
  // e.g., a value of 0.95 means we allocate up to 95% of the recommended maximum
  // size; beyond that, allocations fail with an OOM error
double m_high_watermark_ratio;
  // low watermark ratio is a soft limit: we attempt to keep allocations below the low watermark
  // level via garbage collection or by committing command buffers more frequently (a.k.a. adaptive commit).
  // Value between 0 and m_high_watermark_ratio (setting 0.0 disables adaptive commit and garbage collection)
  // e.g., a value of 0.9 means we 'attempt' to limit allocations to 90% of the recommended maximum
  // allocation size
double m_low_watermark_ratio;
// low watermark size limit (in Bytes) at the time we initialize the allocator
size_t m_low_watermark_limit;
// use "PYTORCH_DEBUG_MPS_ALLOCATOR" env-var to set debug verbosity
uint32_t m_debug_verbosity;
// default MPS stream
MPSStream* m_stream;
  // we hold a reference to MPSEventPool so that it is destroyed after MPSAllocator
std::shared_ptr<MPSEventPool> m_event_pool;
void init_allocator();
void init_buffer_pools();
HeapBlock* get_free_heap(AllocParams& params);
bool get_free_buffer(AllocParams& params);
BufferBlock* get_allocated_buffer_block(const void* ptr);
BufferBlock* alloc_buffer_block(size_t size, uint32_t usage);
bool alloc_buffer(AllocParams& params);
void free_buffer(BufferBlock* buffer_block);
// returns true if the container heap is also released
bool release_buffer(BufferBlock* buffer_block, bool remove_empty_heap = true);
void release_buffers(BufferPool& pool);
bool release_available_cached_buffers(AllocParams& params);
bool release_cached_buffers();
// free unused cached blocks to reclaim GPU memory if memory pressure is high
void garbage_collect_cached_buffers(AllocParams& params);
  // returns the suitable buffer pool for the given usage flags and the
  // requested/aligned sizes
  BufferPool& get_pool(size_t requested_size, size_t aligned_size, uint32_t usage);
  // returns an aligned allocation size that is optimized so that
  // buffers get reused frequently
size_t get_allocation_size(size_t size, uint32_t usage) const;
// maximum size of device memory available for allocation in current process
// Note: the recommendedMaxWorkingSetSize is typically 75% of the total system memory.
size_t max_device_size() const {
return [m_device recommendedMaxWorkingSetSize];
}
  // there are implicit allocations from the MPS backend, so we need to query the 'device'
  // for the total allocated size instead of tracking it manually in MPSAllocator
size_t current_allocated_size() const {
return [m_device currentAllocatedSize];
}
bool trigger_memory_callbacks(BufferBlock* buffer_block, IMpsAllocatorCallback::EventType event) const {
for (const auto& name : MPSAllocatorCallbacksRegistry()->Keys()) {
MPSAllocatorCallbacksRegistry()->Create(name)->executeMPSAllocatorCallback(
buffer_block ? buffer_block->buffer : nullptr, event);
}
return true;
}
};
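// Illustrative usage sketch (hypothetical; clients should go through the
// at::Allocator interface declared in MPSAllocatorInterface.h rather than
// using MPSHeapAllocatorImpl directly):
//
//   MPSHeapAllocatorImpl allocator;
//   id<MTLBuffer> buf = allocator.malloc(1024, UsageFlags::SHARED | UsageFlags::HAZARD);
//   // ... enqueue GPU work that reads/writes 'buf' on the MPS stream ...
//   allocator.free((void*)buf); // returns the buffer to its pool for reuse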
} // namespace at::mps::HeapAllocator