Add files using upload-large-folder tool

c1af2fa verified 3 months ago

7.17 kB

	// Some stateful GPU libraries, such as cuDNN, cuBLAS, use handles to store states.
	// These handles are tied to device, and these libraries requires/recommends not to
	// share handles across host threads.
	//
	// These libraries recommend using one handle per host thread. We may not want to do
	// this because threads are relatively light-weight, but creating and destroying
	// handles is expensive (destroying the handle causes synchronizations). DataParallel,
	// for example, creates new threads for each forward pass.
	//
	// This file implements a handle pool mechanism. The handle pool returns handles on
	// demand as threads request them. If all existing handles in the pool are in use,
	// it creates a new one. As threads terminate, they release handles back into the pool.
	// In this way, the handle pool never creates more handles than the high-water mark of
	// active threads, so it's efficient with DataParallel.

	#pragma once

	#include <unordered_map>
	#include <vector>
	#include <utility>
	#include <mutex>
	#include <memory>

	#include <c10/util/Exception.h>

	namespace at::cuda { namespace {

	template <typename Handle_t, void Create(Handle_t *), void Destroy(Handle_t)>
	struct DeviceThreadHandlePool : public std::enable_shared_from_this<DeviceThreadHandlePool<Handle_t, Create, Destroy>> {

	struct Handle {
	Handle_t handle;
	Handle(bool create = false) : handle(nullptr)
	{
	if(create) Create(&handle);
	}
	// std::vector.emplace() and push_back() may route through temporaries and call
	// copy/move constructors along the way. If this is the case, we don't want
	// the destructors of temporaries to call cudnnDestroy on the handle.
	// We can achieve safety (for the narrow case of stashing within std::vectors)
	// by making Handle moveable but not copyable, and transferring handle ownership
	// to the latest constructed object. This is not a substitute for full-blown
	// reference counting, but reference counting may be overkill here.
	// Another alternative is to wrap the saved Handles in unique_ptrs, i.e.,
	// unordered_map<int, vector<unique_ptr<Handle>>> created_handles;
	Handle(const Handle& rhs) = delete;
	// Following https://stackoverflow.com/questions/3279543/what-is-the-copy-and-swap-idiom
	Handle(Handle&& rhs) noexcept : Handle() { std::swap(handle, rhs.handle); }
	// operator= takes argument by value
	Handle& operator=(Handle rhs) { std::swap(handle, rhs.handle); return *this; }
	~Handle() {
	if(handle) Destroy(handle);
	}
	};

	std::mutex mutex;

	// Handles are lazily created as different threads request them,
	// but are never destroyed until the end of the process.
	// The maximum number of handles this process will create for each device is equal
	// to the high-water mark of the number of concurrently active threads that request
	// handles for that device.
	// When threads terminate, they release their handles back into the pool for reuse.
	// Otherwise, new handles would be created every time new threads were spawned,
	// resulting in poor performance for Python modules that repeatedly or frequently
	// spawned new sets of threads (like DataParallel, which creates a new set of threads
	// for each forward pass).
	//
	// To prevent potential deadlocks, we explicitly choose not to cap the number
	// of handles that are created per device.
	// Example of danger: If we cap the max handles at 4, and 5 threads are sharing a device,
	// only 4 can make forward progress at any time. The other 4 will not release their
	// handles until they exit, so the fifth cannot make progress until then. This is
	// not a problem...UNLESS all 5 threads attempt some sort of synchronization at an
	// intermediate point (ie, before any of them have exited). We have no way to anticipate
	// or enforce that user threads will not attempt such intermediate synchronization.
	// The only way to ensure safety is to avoid imposing a cap on the number of handles.
	std::unordered_map<int, std::vector<Handle>> created_handles;
	std::unordered_map<int, std::vector<Handle_t>> available_handles;

	// PoolWindow lazily creates and caches the handles that a particular thread is using,
	// so in the common case handle access doesn't incur either handle creation or a mutex lock.
	class PoolWindow
	{
	public:
	PoolWindow(std::shared_ptr<DeviceThreadHandlePool> parent): weak_parent(std::move(parent)) {}
	~PoolWindow(){ release(); }

	Handle_t reserve(int device)
	{
	// If this thread already has a handle for this device, return it
	if(my_handles.find(device) != my_handles.end())
	return my_handles[device];

	// otherwise, either grab a handle from the pool if one is available,
	// or if not, create a new one.
	auto parent = weak_parent.lock();
	TORCH_CHECK(parent, "Cannot create handle during program termination");
	std::lock_guard<std::mutex> guard(parent->mutex);

	if(parent->available_handles[device].size() > 0)
	{
	my_handles[device] = parent->available_handles[device].back();
	parent->available_handles[device].pop_back();
	}
	else
	{
	// In local testing, I do observe that emplace_back sometimes routes through temporaries
	// that incur move-constructor and destructor calls. See comments in Handle above.
	parent->created_handles[device].emplace_back(true /create/);
	my_handles[device] = parent->created_handles[device].back().handle;
	}

	return my_handles[device];
	}

	private:
	// Stores the per-device handles currently owned by this thread
	std::unordered_map<int, Handle_t> my_handles;

	std::weak_ptr<DeviceThreadHandlePool> weak_parent;

	// Called by the destructor. Releases this thread's handles back into the pool.
	void release() {
	if(my_handles.size() > 0) {
	auto parent = weak_parent.lock();
	if (!parent) {
	// If this thread exits after atexit handlers have completed, the
	// cuda context itself may be invalid, so we must leak the handles.
	return;
	}

	std::lock_guard<std::mutex> guard(parent->mutex);
	for(auto d_h : my_handles)
	parent->available_handles[d_h.first].push_back(d_h.second);
	}
	}
	};

	// Warning:
	// If you want to change this function, be aware that this function will be called
	// by multiple threads and there is no mutex guarding the call of this function, so
	// make sure your implementation is thread-safe.
	PoolWindow *newPoolWindow() {
	// The returned pointer will be owned by a thread local variable
	// so that different threads does not share the same PoolWindow.
	return new PoolWindow(this->shared_from_this());
	}
	};

	}} // namespace at::cuda::detail::<anonymous>