|
|
|
|
|
|
|
|
#pragma once
|
|
|
|
|
|
#include <ATen/Tensor.h>
#include <ATen/mps/MPSAllocatorInterface.h>
#include <ATen/mps/MPSStream.h>

#include <os/log.h>
#include <os/signpost.h>
#include <signal.h>

#include <atomic>
#include <cstdio>
#include <ctime>
#include <memory>
#include <sstream>
#include <string>
#include <unordered_map>
#include <utility>
|
|
|
|
|
|
#ifndef __OBJC__
|
|
|
typedef void* MTLCaptureManager;
|
|
|
#endif
|
|
|
|
|
|
namespace at::mps {
|
|
|
|
|
|
namespace Profiler {
|
|
|
|
|
|
// Base record type for all profiled entities (graph/kernel executions,
// copies, and CPU fallbacks). Holds the record's identity, its signpost
// ids, and the timing totals accumulated across runs.
struct BaseInfo {
  // The category of operation this record describes.
  enum class Type {
    GRAPH,
    KERNEL,
    COPY,
    CPU_FALLBACK,
  };

  BaseInfo(Type infoType, uint64_t Id, const uintptr_t Handle)
      : type(infoType), profileId(Id), handle(Handle) {}
  virtual ~BaseInfo() = default;

  // category of this profiling record
  Type type;
  // unique id used to track this record across begin/end profiling calls
  uint64_t profileId;
  // signpost ids for the event and interval emitted for this record
  // (0 until a signpost id is generated)
  os_signpost_id_t eventSignpostId = 0, intervalSignpostId = 0;
  // total GPU time accumulated over runs (atomic: presumably updated from
  // Metal completion handlers on another thread — see
  // addProfilerCompletedHandler in MPSProfiler)
  std::atomic<double> totalGpuTime{0.0};
  // total scheduling time accumulated over runs (same atomicity note)
  std::atomic<double> totalSchedulingTime{0.0};
  // set once this record's execution has finished
  std::atomic_bool completed{false};
  // opaque handle identifying the profiled object (e.g. graph/kernel ptr);
  // 0 when there is no underlying handle (CPU fallback)
  const uintptr_t handle;

  // Formats this record (and the given per-run times) for logging;
  // implemented out-of-line and overridden by each subclass.
  virtual const std::string toString(
      double gpuTime = 0,
      double schedulingTime = 0) const;
  // Builds a descriptive string for a tensor, optionally including its
  // buffer id (implemented out-of-line).
  static std::string buildTensorString(
      const Tensor& tensor,
      bool includeBufferId = false);
  // Raw monotonic timestamp in nanoseconds (Apple-specific clock API).
  static uint64_t getTime() {
    return clock_gettime_nsec_np(CLOCK_MONOTONIC_RAW);
  }
};
|
|
|
|
|
|
struct OperationInfo : BaseInfo {
|
|
|
OperationInfo(
|
|
|
const void* Handle,
|
|
|
bool IsGraph,
|
|
|
uint64_t Id,
|
|
|
const std::string& StrKey)
|
|
|
: BaseInfo(IsGraph ? Type::GRAPH : Type::KERNEL, Id, uintptr_t(Handle)),
|
|
|
strKey(StrKey) {}
|
|
|
|
|
|
uint64_t runCount = 0;
|
|
|
std::string strKey;
|
|
|
|
|
|
const std::string toString(double gpuTime = 0, double schedulingTime = 0)
|
|
|
const override;
|
|
|
|
|
|
|
|
|
static std::string buildKernelString(
|
|
|
const std::string& kernelName,
|
|
|
const TensorList& tensors,
|
|
|
bool includeBufferId = false) {
|
|
|
std::stringstream kernelStr;
|
|
|
kernelStr << kernelName;
|
|
|
for (const Tensor& tensor : tensors) {
|
|
|
kernelStr << ":" << BaseInfo::buildTensorString(tensor, includeBufferId);
|
|
|
}
|
|
|
return kernelStr.str();
|
|
|
}
|
|
|
};
|
|
|
|
|
|
struct CpuFbInfo : BaseInfo {
|
|
|
CpuFbInfo(uint64_t Id, const std::string& OpName)
|
|
|
: BaseInfo(Type::CPU_FALLBACK, Id, 0), opName(OpName) {}
|
|
|
|
|
|
uint64_t runCount = 0;
|
|
|
|
|
|
|
|
|
size_t currentCopyOverhead = 0;
|
|
|
size_t totalCopyOverhead = 0;
|
|
|
std::string opName;
|
|
|
std::string strKey;
|
|
|
uint64_t startTime = 0;
|
|
|
|
|
|
const std::string toString(double gpuTime = 0, double schedulingTime = 0)
|
|
|
const override;
|
|
|
|
|
|
void updateCopyOverhead(const TensorList& tensors) {
|
|
|
currentCopyOverhead = 0;
|
|
|
for (const Tensor& tensor : tensors) {
|
|
|
if (tensor.defined()) {
|
|
|
currentCopyOverhead += tensor.nbytes();
|
|
|
}
|
|
|
}
|
|
|
totalCopyOverhead += currentCopyOverhead;
|
|
|
}
|
|
|
};
|
|
|
|
|
|
struct CopyInfo : BaseInfo {
|
|
|
enum class Kind {
|
|
|
MPS_TO_MPS,
|
|
|
MPS_TO_CPU,
|
|
|
CPU_TO_MPS,
|
|
|
};
|
|
|
|
|
|
CopyInfo(
|
|
|
const void* Handle,
|
|
|
size_t Length,
|
|
|
uint64_t Id,
|
|
|
bool IsNonBlocking,
|
|
|
bool UsesBlitter)
|
|
|
: BaseInfo(Type::COPY, Id, uintptr_t(Handle)),
|
|
|
kind(Kind::MPS_TO_MPS),
|
|
|
length(Length),
|
|
|
isNonBlocking(IsNonBlocking),
|
|
|
usesBlitter(UsesBlitter) {}
|
|
|
|
|
|
Kind kind;
|
|
|
size_t length;
|
|
|
bool isNonBlocking;
|
|
|
bool usesBlitter;
|
|
|
std::string srcStrKey;
|
|
|
std::string dstStrKey;
|
|
|
|
|
|
uint64_t startTime = 0;
|
|
|
|
|
|
const std::string toString(double gpuTime = 0, double schedulingTime = 0)
|
|
|
const override;
|
|
|
|
|
|
static std::string buildTensorString(
|
|
|
const void* buffer,
|
|
|
const OptionalTensorRef tensor,
|
|
|
bool includeBufferId = false);
|
|
|
|
|
|
static bool isStorageOnMPS(
|
|
|
const void* buffer,
|
|
|
const OptionalTensorRef tensor) {
|
|
|
if (tensor.has_value()) {
|
|
|
return tensor->device().type() == at::kMPS;
|
|
|
}
|
|
|
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(buffer);
|
|
|
|
|
|
return getIMPSAllocator()->getUnalignedBufferSize(buffer) >= 0;
|
|
|
}
|
|
|
|
|
|
static Kind getCopyKind(
|
|
|
const void* srcBuffer,
|
|
|
const void* dstBuffer,
|
|
|
const OptionalTensorRef srcTensor,
|
|
|
const OptionalTensorRef dstTensor) {
|
|
|
const bool isSrcOnMPS = isStorageOnMPS(srcBuffer, srcTensor);
|
|
|
const bool isDstOnMPS = isStorageOnMPS(dstBuffer, dstTensor);
|
|
|
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(isSrcOnMPS || isDstOnMPS);
|
|
|
if (isSrcOnMPS && !isDstOnMPS) {
|
|
|
return Kind::MPS_TO_CPU;
|
|
|
} else if (!isSrcOnMPS && isDstOnMPS) {
|
|
|
return Kind::CPU_TO_MPS;
|
|
|
}
|
|
|
return Kind::MPS_TO_MPS;
|
|
|
}
|
|
|
};
|
|
|
|
|
|
// Aggregated statistics for one Kind of copy (used for stats logging);
// reuses CopyInfo's timing fields with a null handle and zero length/id.
struct CopyStat : CopyInfo {
  explicit CopyStat(std::string CopyKindStr)
      : CopyInfo(nullptr, 0, 0, false, false),
        kindStr(std::move(CopyKindStr)) {}

  // total number of copies of this kind
  size_t totalCount = 0;
  // number of copies involving scalars — presumably tiny copies; confirm
  // against how MPSProfiler.mm updates this counter
  size_t scalarsCount = 0;
  // number of blocking (synchronous) copies
  size_t blockingCount = 0;
  // number of copies performed via memcpy (i.e. not using the blitter —
  // NOTE(review): inferred from CopyInfo::usesBlitter; confirm)
  size_t memcpyCount = 0;
  // accumulated GPU time for scalar copies (atomic: updated from
  // completion handlers)
  std::atomic<double> scalarsGpuTime{0.0};
  // human-readable name of this copy kind, used in logs
  std::string kindStr;
};
|
|
|
|
|
|
class MPSProfiler {
 public:
  // Bit-flags selecting how profiling results are emitted as signposts.
  // (How these flags get set is not visible here — presumably parsed from
  // an environment variable in initialize(); confirm in MPSProfiler.mm.)
  enum ProfileOptions : uint32_t {
    OPTIONS_NONE = 0,
    // emit signpost events for all profiling categories
    ALL_SIGNPOST_EVENTS = (1 << 0),
    // emit signpost intervals for all profiling categories
    ALL_SIGNPOST_INTERVALS = (1 << 1),
    // wait until GPU work completes before continuing
    WAIT_UNTIL_COMPLETED = (1 << 2),
    // include scheduling time in the signpost intervals
    INCLUDE_SCHEDULE_INTERVAL = (1 << 3),
    // explicitly select interval-based signposts
    USE_INTERVALS = (1 << 4),
    // explicitly select event-based signposts
    USE_EVENTS = (1 << 5),
    // mask covering all option bits above
    OPTIONS_COUNT = (USE_EVENTS << 1) - 1,
  };

  // Bit-flags selecting which categories are traced with signposts.
  // Note: these intentionally occupy the upper bits (16+), keeping them
  // disjoint from ProfileOptions/LogOptions bit ranges.
  enum SignpostTypes : uint32_t {
    SIGNPOST_NONE = 0,
    // trace graph/kernel executions
    RUN_OPERATION = (1 << 16),
    // trace blit copies
    BLIT_COPY = (1 << 17),
    // trace ops that fall back to CPU
    CPU_FALLBACK = (1 << 18),
    // mask covering all signpost type bits above
    SIGNPOST_COUNT = (CPU_FALLBACK << 1) - 1,
  };

  // Bit-flags selecting what gets logged (per-run info and/or aggregated
  // stats) and which extra fields to include.
  enum LogOptions : uint32_t {
    LOG_NONE = 0,
    // log per-run info for graph/kernel executions
    OPERATION_INFO = (1 << 0),
    // log per-run info for copies
    COPY_INFO = (1 << 1),
    // log per-run info for CPU fallbacks
    CPU_FALLBACK_INFO = (1 << 2),
    // log aggregated stats for all categories
    ALL_STATS = (1 << 3),
    // log aggregated stats for operations only
    OPERATION_STATS = (1 << 4),
    // log aggregated stats for copies only
    COPY_STATS = (1 << 5),
    // log aggregated stats for CPU fallbacks only
    CPU_FALLBACK_STATS = (1 << 6),
    // include GPU time in logged output
    INCLUDE_GPU_TIME = (1 << 7),
    // include kernel (scheduling) time in logged output — confirm exact
    // semantics in MPSProfiler.mm
    INCLUDE_KERNEL_TIME = (1 << 8),
    // include buffer ids in tensor strings
    INCLUDE_BUFFER_ID = (1 << 9),
    // mask covering all log option bits above
    LOG_COUNT = (INCLUDE_BUFFER_ID << 1) - 1,
  };

  explicit MPSProfiler();
  ~MPSProfiler();

  // --- begin/end profiling of a unit of work -------------------------------
  // Each begin* call creates (or updates) the matching *Info record and
  // returns its profileId; the matching end* call finalizes it.

  // begin profiling a kernel/graph identified by a prebuilt string key
  uint64_t beginProfileKernel(
      const void* handle,
      const std::string& strKey,
      bool isGraph);
  // begin profiling a kernel, building the key from name + tensor args
  uint64_t beginProfileKernel(
      const void* handle,
      const std::string& kernelName,
      const TensorList& tensors);
  uint64_t beginProfileCopy(
      const void* srcBuffer,
      const void* dstBuffer,
      const OptionalTensorRef srcTensor,
      const OptionalTensorRef dstTensor,
      size_t length,
      bool isNonBlocking,
      bool usesBlitter = true);
  uint64_t beginProfileCPUFallback(
      const std::string& opName,
      const TensorList& tensors);
  // begin the GPU-side interval for the operation owning `handle`
  void beginProfileGPUInterval(const void* handle);

  void endProfileCopy(uint64_t profileId, SyncType syncType);
  void endProfileKernel(const void* handle, SyncType syncType = SyncType::NONE);
  void endProfileCPUFallback(const std::string& opName);

  // --- OS signpost trace control ------------------------------------------
  void StartTrace(const std::string& mode, bool waitUntilCompleted);
  void StopTrace();

  // --- Metal capture (GPU frame capture) control ---------------------------
  bool isCaptureEnabled() const;
  bool isCapturing() const;
  void startCapture(const std::string& name, MPSStream* stream = nullptr);
  void stopCapture(MPSStream* stream = nullptr);

  // --- enablement queries (cheap bitmask tests) ----------------------------
  bool isOperationProfilingEnabled() const {
    return (m_signpost_types & SignpostTypes::RUN_OPERATION) ||
        (m_log_options &
         (LogOptions::OPERATION_INFO | LogOptions::OPERATION_STATS));
  }
  bool isCopyProfilingEnabled() const {
    return (m_signpost_types & SignpostTypes::BLIT_COPY) ||
        (m_log_options & (LogOptions::COPY_INFO | LogOptions::COPY_STATS));
  }
  bool isCPUFallbackProfilingEnabled() const {
    return (m_signpost_types & SignpostTypes::CPU_FALLBACK) ||
        (m_log_options &
         (LogOptions::CPU_FALLBACK_INFO | LogOptions::CPU_FALLBACK_STATS));
  }
  bool isSignpostTracingEnabled() const {
    return (m_signpost_types != SignpostTypes::SIGNPOST_NONE);
  }

 private:
  // enabled SignpostTypes / ProfileOptions / LogOptions bitmasks
  uint32_t m_signpost_types = 0;
  uint32_t m_profile_options = 0;
  uint32_t m_log_options = 0;
  // monotonically increasing counters used to mint profile ids per category
  uint64_t m_kernel_counter = 0;
  uint64_t m_graph_counter = 0;
  uint64_t m_cpu_fb_counter = 0;
  uint64_t m_copy_counter = 0;
  // os_log handles for event- and interval-based signposts respectively
  // (uninitialized until initialize() runs — confirm in MPSProfiler.mm)
  os_log_t m_os_log_events;
  os_log_t m_os_log_intervals;
  // guards against logging the aggregated stats more than once
  std::atomic_bool hasLoggedStats{false};
  // true while GPU completion handlers are still outstanding
  std::atomic_bool hasPendingCompletionHandlers{false};
  // SIGINT handlers: ours and the previously installed one (restored on
  // teardown — see handleIntSignal); static since signal handlers are global
  static struct sigaction currentSigint, previousSigint;

  // per-category records, keyed by handle / op name / profile id / kind
  std::unordered_map<uintptr_t, std::unique_ptr<OperationInfo>>
      m_op_info_list{};
  std::unordered_map<std::string, std::unique_ptr<CpuFbInfo>>
      m_cpu_fb_info_list{};
  std::unordered_map<uint64_t, std::unique_ptr<CopyInfo>> m_copy_info_list{};
  // aggregated per-kind copy stats
  std::unordered_map<CopyInfo::Kind, std::unique_ptr<CopyStat>>
      m_copy_stat_list{};

  // Metal capture manager (raw void* when compiled without Objective-C);
  // mutable so const query methods may lazily fetch it
  mutable MTLCaptureManager* captureManager = nil;
  // number of captures taken so far (presumably used to name capture files)
  unsigned captureCount = 0;

  void initialize();
  // common begin path shared by kernel/copy/CPU-fallback profiling
  void beginProfileExecution(BaseInfo& info, bool cpuExecution = false);
  // common end path: emits signposts and accumulates timings into `info`
  void endProfileExecution(
      BaseInfo& info,
      os_signpost_id_t event_signpost_id,
      os_signpost_id_t interval_signpost_id,
      double gpuTime,
      double schedulingTime);
  void addProfilerScheduledHandler(BaseInfo& info);
  void addProfilerCompletedHandler(BaseInfo& info, SyncType syncType);
  void emitSignpostEvent(
      SignpostTypes signpost_type,
      os_signpost_id_t signpost_id,
      const std::string& msg) const;
  void beginSignpostInterval(
      SignpostTypes signpost_type,
      os_signpost_id_t signpost_id,
      const std::string& msg) const;
  void endSignpostInterval(
      SignpostTypes signpost_type,
      os_signpost_id_t signpost_id) const;

  // folds one finished copy into the per-kind CopyStat aggregates
  void updateCopyStats(
      const CopyInfo& copyInfo,
      double gpuTime,
      double schedulingTime);

  // whether per-run info logging is enabled for `infoType` at this stage
  // (begin vs. end of execution)
  bool isProfileInfoLoggingEnabled(
      BaseInfo::Type infoType,
      bool isExecutionEnded);

  // dumps the aggregated stats (delegating to the per-category loggers)
  void logProfilingStats();
  void logOperationsProfilingStats(std::FILE* f) const;
  void logCPUFallbackProfilingStats(std::FILE* f) const;
  void logCopyProfilingStats(std::FILE* f) const;

  // mints an os_signpost id, optionally derived from a pointer
  os_signpost_id_t generateSignpostId(
      os_signpost_type_t signpostType,
      const void* ptr = nullptr);
  // maps a record's Type to its SignpostTypes bit
  static SignpostTypes getSignpostType(BaseInfo::Type infoType);
  // SIGINT handler (installed via `currentSigint`) — presumably flushes
  // stats before the process dies; confirm in MPSProfiler.mm
  static void handleIntSignal(int signal);
};
|
|
|
|
|
|
}
|
|
|
|
|
|
Profiler::MPSProfiler& getMPSProfiler();
|
|
|
|
|
|
}
|
|
|
|