// Phi2-Fine-Tuning/phivenv/Lib/site-packages/torch/include/c10/cuda/CUDADeviceAssertionHost.h
/// Number of assertion failure messages we can store. If this is too small
/// threads will fail silently.
constexpr int C10_CUDA_DSA_ASSERTION_COUNT = 10;
/// Maximum length in bytes of each stored assertion-related string
/// (message, filename, function name), including the terminating NUL.
constexpr int C10_CUDA_DSA_MAX_STR_LEN = 512;
| namespace c10::cuda { | |
/// Holds information about any device-side assertions that fail.
/// Held in managed memory and accessed by both the CPU and the GPU.
struct DeviceAssertionData {
  /// Stringification of the assertion
  // NOLINTNEXTLINE(*-c-arrays)
  char assertion_msg[C10_CUDA_DSA_MAX_STR_LEN]{};
  /// File the assertion was in
  // NOLINTNEXTLINE(*-c-arrays)
  char filename[C10_CUDA_DSA_MAX_STR_LEN]{};
  /// Name of the function the assertion was in
  // NOLINTNEXTLINE(*-c-arrays)
  char function_name[C10_CUDA_DSA_MAX_STR_LEN]{};
  /// Line number the assertion was at
  int line_number{};
  /// Number uniquely identifying the kernel launch that triggered the assertion
  uint32_t caller{};
  /// block_id of the thread that failed the assertion
  // NOLINTNEXTLINE(*-c-arrays)
  int32_t block_id[3]{};
  /// thread_id of the thread that failed the assertion
  // NOLINTNEXTLINE(*-c-arrays)
  int32_t thread_id[3]{};
};
/// Used to hold assertions generated by the device
/// Held in managed memory and accessed by both the CPU and the GPU.
struct DeviceAssertionsData {
  /// Total number of assertions found; a subset of these will be recorded
  /// in `assertions` (at most C10_CUDA_DSA_ASSERTION_COUNT entries).
  int32_t assertion_count{};
  /// An array of assertions that will be written to in a race-free manner
  // NOLINTNEXTLINE(*-c-arrays)
  DeviceAssertionData assertions[C10_CUDA_DSA_ASSERTION_COUNT]{};
};
/// Used to hold info about kernel launches so that we can run kernels
/// asynchronously and still associate launches with device-side
/// assertion failures.
struct CUDAKernelLaunchInfo {
  /// Filename of the code where the kernel was launched from
  const char* launch_filename;
  /// Function from which the kernel was launched
  const char* launch_function;
  /// Line number of where the code was launched from
  uint32_t launch_linenum;
  /// Backtrace of where the kernel was launched from, only populated if
  /// CUDAKernelLaunchRegistry::gather_launch_stacktrace is True
  std::string launch_stacktrace;
  /// Kernel that was launched
  const char* kernel_name;
  /// Device the kernel was launched on
  int device;
  /// Stream the kernel was launched on
  int32_t stream;
  /// A number that uniquely identifies the kernel launch
  uint64_t generation_number;
};
| /// Circular buffer used to hold information about kernel launches | |
| /// this is later used to reconstruct how a device-side kernel assertion failure | |
| /// occurred CUDAKernelLaunchRegistry is used as a singleton | |
| class C10_CUDA_API CUDAKernelLaunchRegistry { | |
| private: | |
| /// Assume that this is the max number of kernel launches that might ever be | |
| /// enqueued across all streams on a single device | |
| static constexpr int max_kernel_launches = 1024; | |
| /// How many kernel launch infos we've inserted. Used to ensure that circular | |
| /// queue doesn't provide false information by always increasing, but also to | |
| /// mark where we are inserting into the queue | |
| uint64_t generation_number = 0; | |
| /// Shared mutex between writer and accessor to ensure multi-threaded safety. | |
| mutable std::mutex read_write_mutex; | |
| /// Used to ensure prevent race conditions in GPU memory allocation | |
| mutable std::mutex gpu_alloc_mutex; | |
| /// Pointer to managed memory keeping track of device-side assertions. There | |
| /// is one entry for each possible device the process might work with. Unused | |
| /// entries are nullptrs. We could also use an unordered_set here, but this | |
| /// vector design will be faster and the wasted memory is small since we | |
| /// expect the number of GPUs per node will always be small | |
| std::vector< | |
| std::unique_ptr<DeviceAssertionsData, void (*)(DeviceAssertionsData*)>> | |
| uvm_assertions; | |
| /// A single circular buffer holds information about every kernel launch the | |
| /// process makes across all devices. | |
| std::vector<CUDAKernelLaunchInfo> kernel_launches; | |
| bool check_env_for_enable_launch_stacktracing() const; | |
| bool check_env_for_dsa_enabled() const; | |
| public: | |
| CUDAKernelLaunchRegistry(); | |
| /// Register a new kernel launch and obtain a generation number back to be | |
| /// passed to the kernel | |
| uint32_t insert( | |
| const char* launch_filename, | |
| const char* launch_function, | |
| const uint32_t launch_linenum, | |
| const char* kernel_name, | |
| const int32_t stream_id); | |
| /// Get copies of the kernel launch registry and each device's assertion | |
| /// failure buffer so they can be inspected without raising race conditions | |
| std:: | |
| pair<std::vector<DeviceAssertionsData>, std::vector<CUDAKernelLaunchInfo>> | |
| snapshot() const; | |
| /// Get a pointer to the current device's assertion failure buffer. If no such | |
| /// buffer exists then one is created. This means that the first kernel launch | |
| /// made on each device will be slightly slower because memory allocations are | |
| /// required | |
| DeviceAssertionsData* get_uvm_assertions_ptr_for_current_device(); | |
| /// Gets the global singleton of the registry | |
| static CUDAKernelLaunchRegistry& get_singleton_ref(); | |
| /// If not all devices support DSA, we disable it | |
| const bool do_all_devices_support_managed_memory = false; | |
| /// Whether or not to gather stack traces when launching kernels | |
| bool gather_launch_stacktrace = false; | |
| /// Whether or not host-side DSA is enabled or disabled at run-time | |
| /// Note: Device-side code cannot be enabled/disabled at run-time | |
| bool enabled_at_runtime = false; | |
| /// Whether or not a device has indicated a failure | |
| bool has_failed() const; | |
| const bool enabled_at_compile_time = true; | |
| const bool enabled_at_compile_time = false; | |
| }; | |
| C10_CUDA_API std::string c10_retrieve_device_side_assertion_info(); | |
| } // namespace c10::cuda | |
| // Each kernel launched with TORCH_DSA_KERNEL_LAUNCH | |
| // requires the same input arguments. We introduce the following macro to | |
| // standardize these. | |
| [[maybe_unused]] c10::cuda::DeviceAssertionsData *const assertions_data, \ | |
| [[maybe_unused]] uint32_t assertion_caller_id | |
| // This macro can be used to pass the DSA arguments onward to another | |
| // function | |