diff --git "a/.venv/lib/python3.11/site-packages/nvidia/cuda_cupti/include/cupti_activity_deprecated.h" "b/.venv/lib/python3.11/site-packages/nvidia/cuda_cupti/include/cupti_activity_deprecated.h" new file mode 100644--- /dev/null +++ "b/.venv/lib/python3.11/site-packages/nvidia/cuda_cupti/include/cupti_activity_deprecated.h" @@ -0,0 +1,4784 @@ +/* + * Copyright 2011-2023 NVIDIA Corporation. All rights reserved. + * + * NOTICE TO LICENSEE: + * + * This source code and/or documentation ("Licensed Deliverables") are + * subject to NVIDIA intellectual property rights under U.S. and + * international Copyright laws. + * + * These Licensed Deliverables contained herein is PROPRIETARY and + * CONFIDENTIAL to NVIDIA and is being provided under the terms and + * conditions of a form of NVIDIA software license agreement by and + * between NVIDIA and Licensee ("License Agreement") or electronically + * accepted by Licensee. Notwithstanding any terms or conditions to + * the contrary in the License Agreement, reproduction or disclosure + * of the Licensed Deliverables to any third party without the express + * written consent of NVIDIA is prohibited. + * + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE + * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS + * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. + * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED + * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, + * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY + * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY + * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, + * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS + * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE + * OF THESE LICENSED DELIVERABLES. + * + * U.S. Government End Users. These Licensed Deliverables are a + * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT + * 1995), consisting of "commercial computer software" and "commercial + * computer software documentation" as such terms are used in 48 + * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government + * only as a commercial end item. Consistent with 48 C.F.R.12.212 and + * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all + * U.S. Government End Users acquire the Licensed Deliverables with + * only those rights set forth herein. + * + * Any use of the Licensed Deliverables in individual and commercial + * software must include, in the user documentation and internal + * comments to the code, the above Disclaimer and U.S. Government End + * Users Notice. + */ + +#if !defined(_CUPTI_ACTIVITY_DEPRECATED_H_) +#define _CUPTI_ACTIVITY_DEPRECATED_H_ + +#if defined(__cplusplus) +extern "C" { +#endif + +#if defined(__GNUC__) && defined(CUPTI_LIB) + #pragma GCC visibility push(default) +#endif + +/** + * \brief The kinds of activity records. + * + * Each activity record kind represents information about a GPU or an + * activity occurring on a CPU or GPU. Each kind is associated with a + * activity record structure that holds the information associated + * with the kind. + * \see CUpti_ActivityOverhead + * \see CUpti_ActivityOverhead2 + * \see CUpti_ActivityDevice + * \see CUpti_ActivityDevice2 + * \see CUpti_ActivityDevice3 + * \see CUpti_ActivityDevice4 + * \see CUpti_ActivityKernel + * \see CUpti_ActivityKernel2 + * \see CUpti_ActivityKernel3 + * \see CUpti_ActivityKernel4 + * \see CUpti_ActivityKernel5 + * \see CUpti_ActivityKernel6 + * \see CUpti_ActivityKernel7 + * \see CUpti_ActivityKernel8 + * \see CUpti_ActivityMemcpy + * \see CUpti_ActivityMemcpy3 + * \see CUpti_ActivityMemcpy4 + * \see CUpti_ActivityMemcpyPtoP + * \see CUpti_ActivityMemcpyPtoP2 + * \see CUpti_ActivityMemcpyPtoP3 + * \see CUpti_ActivityMemset + * \see CUpti_ActivityMemset2 + * \see CUpti_ActivityMemset3 + * \see CUpti_ActivityMemory2 + * \see CUpti_ActivityMemoryPool + * \see CUpti_ActivityMarker + * \see CUpti_ActivityGlobalAccess + * \see CUpti_ActivityGlobalAccess2 + * \see CUpti_ActivityBranch + * \see CUpti_ActivityPCSampling + * \see CUpti_ActivityPCSampling2 + * \see CUpti_ActivityUnifiedMemoryCounter + * \see CUpti_ActivityNvLink + * \see CUpti_ActivityNvLink2 + * \see CUpti_ActivityNvLink3 + */ + +/** + * \brief The activity record for CUPTI and driver overheads. + * (Deprecated in CUDA 12.2) + * + * This activity record provides CUPTI and driver overhead information + * (CUPTI_ACTIVITY_OVERHEAD). These records are now reported using + * CUpti_ActivityOverhead3 + */ +typedef struct PACKED_ALIGNMENT { + /** + * The activity record kind, must be CUPTI_ACTIVITY_OVERHEAD. + */ + CUpti_ActivityKind kind; + + /** + * The kind of overhead, CUPTI, DRIVER, COMPILER etc. + */ + CUpti_ActivityOverheadKind overheadKind; + + /** + * The kind of activity object that the overhead is associated with. + */ + CUpti_ActivityObjectKind objectKind; + + /** + * The identifier for the activity object. 'objectKind' indicates + * which ID is valid for this record. + */ + CUpti_ActivityObjectKindId objectId; + + /** + * The start timestamp for the overhead, in ns. A value of 0 for + * both the start and end timestamps indicates that timestamp + * information could not be collected for the overhead. + */ + uint64_t start; + + /** + * The end timestamp for the overhead, in ns. A value of 0 for both + * the start and end timestamps indicates that timestamp information + * could not be collected for the overhead. + */ + uint64_t end; +} CUpti_ActivityOverhead; + +/** + * \brief The activity record for CUPTI and driver overheads. + * + * This activity record provides CUPTI and driver overhead information + * (CUPTI_ACTIVITY_OVERHEAD). + */ +typedef struct PACKED_ALIGNMENT { + /** + * The activity record kind, must be CUPTI_ACTIVITY_OVERHEAD. + */ + CUpti_ActivityKind kind; + + /** + * The kind of overhead, CUPTI, DRIVER, COMPILER etc. + */ + CUpti_ActivityOverheadKind overheadKind; + + /** + * The kind of activity object that the overhead is associated with. + */ + CUpti_ActivityObjectKind objectKind; + + /** + * The identifier for the activity object. 'objectKind' indicates + * which ID is valid for this record. + */ + CUpti_ActivityObjectKindId objectId; + + /** + * The start timestamp for the overhead, in ns. A value of 0 for + * both the start and end timestamps indicates that timestamp + * information could not be collected for the overhead. + */ + uint64_t start; + + /** + * The end timestamp for the overhead, in ns. A value of 0 for both + * the start and end timestamps indicates that timestamp information + * could not be collected for the overhead. + */ + uint64_t end; + + /** + * The correlation ID of the overhead operation to which + * records belong to. This ID is identical to the + * correlation ID in the driver or runtime API activity record that + * launched the overhead operation. + * In some cases, it can be zero, such as for CUPTI_ACTIVITY_OVERHEAD_CUPTI_BUFFER_FLUSH records. + */ + uint32_t correlationId; + + /** + * Reserved for internal use. + */ + uint32_t reserved0; +} CUpti_ActivityOverhead2; + +/** + * \brief The activity record for a device. (deprecated) + * + * This activity record represents information about a GPU device + * (CUPTI_ACTIVITY_KIND_DEVICE). + * Device activity is now reported using the + * CUpti_ActivityDevice5 activity record. + */ +typedef struct PACKED_ALIGNMENT { + /** + * The activity record kind, must be CUPTI_ACTIVITY_KIND_DEVICE. + */ + CUpti_ActivityKind kind; + + /** + * The flags associated with the device. \see CUpti_ActivityFlag + */ + CUpti_ActivityFlag flags; + + /** + * The global memory bandwidth available on the device, in + * kBytes/sec. + */ + uint64_t globalMemoryBandwidth; + + /** + * The amount of global memory on the device, in bytes. + */ + uint64_t globalMemorySize; + + /** + * The amount of constant memory on the device, in bytes. + */ + uint32_t constantMemorySize; + + /** + * The size of the L2 cache on the device, in bytes. + */ + uint32_t l2CacheSize; + + /** + * The number of threads per warp on the device. + */ + uint32_t numThreadsPerWarp; + + /** + * The core clock rate of the device, in kHz. + */ + uint32_t coreClockRate; + + /** + * Number of memory copy engines on the device. + */ + uint32_t numMemcpyEngines; + + /** + * Number of multiprocessors on the device. + */ + uint32_t numMultiprocessors; + + /** + * The maximum "instructions per cycle" possible on each device + * multiprocessor. + */ + uint32_t maxIPC; + + /** + * Maximum number of warps that can be present on a multiprocessor + * at any given time. + */ + uint32_t maxWarpsPerMultiprocessor; + + /** + * Maximum number of blocks that can be present on a multiprocessor + * at any given time. + */ + uint32_t maxBlocksPerMultiprocessor; + + /** + * Maximum number of registers that can be allocated to a block. + */ + uint32_t maxRegistersPerBlock; + + /** + * Maximum amount of shared memory that can be assigned to a block, + * in bytes. + */ + uint32_t maxSharedMemoryPerBlock; + + /** + * Maximum number of threads allowed in a block. + */ + uint32_t maxThreadsPerBlock; + + /** + * Maximum allowed X dimension for a block. + */ + uint32_t maxBlockDimX; + + /** + * Maximum allowed Y dimension for a block. + */ + uint32_t maxBlockDimY; + + /** + * Maximum allowed Z dimension for a block. + */ + uint32_t maxBlockDimZ; + + /** + * Maximum allowed X dimension for a grid. + */ + uint32_t maxGridDimX; + + /** + * Maximum allowed Y dimension for a grid. + */ + uint32_t maxGridDimY; + + /** + * Maximum allowed Z dimension for a grid. + */ + uint32_t maxGridDimZ; + + /** + * Compute capability for the device, major number. + */ + uint32_t computeCapabilityMajor; + + /** + * Compute capability for the device, minor number. + */ + uint32_t computeCapabilityMinor; + + /** + * The device ID. + */ + uint32_t id; + +#ifdef CUPTILP64 + /** + * Undefined. Reserved for internal use. + */ + uint32_t pad; +#endif + + /** + * The device name. This name is shared across all activity records + * representing instances of the device, and so should not be + * modified. + */ + const char *name; +} CUpti_ActivityDevice; + +/** + * \brief The activity record for a device. (deprecated) + * + * This activity record represents information about a GPU device + * (CUPTI_ACTIVITY_KIND_DEVICE). + * Device activity is now reported using the + * CUpti_ActivityDevice5 activity record. + */ +typedef struct PACKED_ALIGNMENT { + /** + * The activity record kind, must be CUPTI_ACTIVITY_KIND_DEVICE. + */ + CUpti_ActivityKind kind; + + /** + * The flags associated with the device. \see CUpti_ActivityFlag + */ + CUpti_ActivityFlag flags; + + /** + * The global memory bandwidth available on the device, in + * kBytes/sec. + */ + uint64_t globalMemoryBandwidth; + + /** + * The amount of global memory on the device, in bytes. + */ + uint64_t globalMemorySize; + + /** + * The amount of constant memory on the device, in bytes. + */ + uint32_t constantMemorySize; + + /** + * The size of the L2 cache on the device, in bytes. + */ + uint32_t l2CacheSize; + + /** + * The number of threads per warp on the device. + */ + uint32_t numThreadsPerWarp; + + /** + * The core clock rate of the device, in kHz. + */ + uint32_t coreClockRate; + + /** + * Number of memory copy engines on the device. + */ + uint32_t numMemcpyEngines; + + /** + * Number of multiprocessors on the device. + */ + uint32_t numMultiprocessors; + + /** + * The maximum "instructions per cycle" possible on each device + * multiprocessor. + */ + uint32_t maxIPC; + + /** + * Maximum number of warps that can be present on a multiprocessor + * at any given time. + */ + uint32_t maxWarpsPerMultiprocessor; + + /** + * Maximum number of blocks that can be present on a multiprocessor + * at any given time. + */ + uint32_t maxBlocksPerMultiprocessor; + + /** + * Maximum amount of shared memory available per multiprocessor, in bytes. + */ + uint32_t maxSharedMemoryPerMultiprocessor; + + /** + * Maximum number of 32-bit registers available per multiprocessor. + */ + uint32_t maxRegistersPerMultiprocessor; + + /** + * Maximum number of registers that can be allocated to a block. + */ + uint32_t maxRegistersPerBlock; + + /** + * Maximum amount of shared memory that can be assigned to a block, + * in bytes. + */ + uint32_t maxSharedMemoryPerBlock; + + /** + * Maximum number of threads allowed in a block. + */ + uint32_t maxThreadsPerBlock; + + /** + * Maximum allowed X dimension for a block. + */ + uint32_t maxBlockDimX; + + /** + * Maximum allowed Y dimension for a block. + */ + uint32_t maxBlockDimY; + + /** + * Maximum allowed Z dimension for a block. + */ + uint32_t maxBlockDimZ; + + /** + * Maximum allowed X dimension for a grid. + */ + uint32_t maxGridDimX; + + /** + * Maximum allowed Y dimension for a grid. + */ + uint32_t maxGridDimY; + + /** + * Maximum allowed Z dimension for a grid. + */ + uint32_t maxGridDimZ; + + /** + * Compute capability for the device, major number. + */ + uint32_t computeCapabilityMajor; + + /** + * Compute capability for the device, minor number. + */ + uint32_t computeCapabilityMinor; + + /** + * The device ID. + */ + uint32_t id; + + /** + * ECC enabled flag for device + */ + uint32_t eccEnabled; + + /** + * The device UUID. This value is the globally unique immutable + * alphanumeric identifier of the device. + */ + CUuuid uuid; + +#ifndef CUPTILP64 + /** + * Undefined. Reserved for internal use. + */ + uint32_t pad; +#endif + + /** + * The device name. This name is shared across all activity records + * representing instances of the device, and so should not be + * modified. + */ + const char *name; +} CUpti_ActivityDevice2; + +/** + * \brief The activity record for a device. (CUDA 7.0 onwards) + * + * This activity record represents information about a GPU device + * (CUPTI_ACTIVITY_KIND_DEVICE). + * Device activity is now reported using the + * CUpti_ActivityDevice5 activity record. + */ +typedef struct PACKED_ALIGNMENT { + /** + * The activity record kind, must be CUPTI_ACTIVITY_KIND_DEVICE. + */ + CUpti_ActivityKind kind; + + /** + * The flags associated with the device. \see CUpti_ActivityFlag + */ + CUpti_ActivityFlag flags; + + /** + * The global memory bandwidth available on the device, in + * kBytes/sec. + */ + uint64_t globalMemoryBandwidth; + + /** + * The amount of global memory on the device, in bytes. + */ + uint64_t globalMemorySize; + + /** + * The amount of constant memory on the device, in bytes. + */ + uint32_t constantMemorySize; + + /** + * The size of the L2 cache on the device, in bytes. + */ + uint32_t l2CacheSize; + + /** + * The number of threads per warp on the device. + */ + uint32_t numThreadsPerWarp; + + /** + * The core clock rate of the device, in kHz. + */ + uint32_t coreClockRate; + + /** + * Number of memory copy engines on the device. + */ + uint32_t numMemcpyEngines; + + /** + * Number of multiprocessors on the device. + */ + uint32_t numMultiprocessors; + + /** + * The maximum "instructions per cycle" possible on each device + * multiprocessor. + */ + uint32_t maxIPC; + + /** + * Maximum number of warps that can be present on a multiprocessor + * at any given time. + */ + uint32_t maxWarpsPerMultiprocessor; + + /** + * Maximum number of blocks that can be present on a multiprocessor + * at any given time. + */ + uint32_t maxBlocksPerMultiprocessor; + + /** + * Maximum amount of shared memory available per multiprocessor, in bytes. + */ + uint32_t maxSharedMemoryPerMultiprocessor; + + /** + * Maximum number of 32-bit registers available per multiprocessor. + */ + uint32_t maxRegistersPerMultiprocessor; + + /** + * Maximum number of registers that can be allocated to a block. + */ + uint32_t maxRegistersPerBlock; + + /** + * Maximum amount of shared memory that can be assigned to a block, + * in bytes. + */ + uint32_t maxSharedMemoryPerBlock; + + /** + * Maximum number of threads allowed in a block. + */ + uint32_t maxThreadsPerBlock; + + /** + * Maximum allowed X dimension for a block. + */ + uint32_t maxBlockDimX; + + /** + * Maximum allowed Y dimension for a block. + */ + uint32_t maxBlockDimY; + + /** + * Maximum allowed Z dimension for a block. + */ + uint32_t maxBlockDimZ; + + /** + * Maximum allowed X dimension for a grid. + */ + uint32_t maxGridDimX; + + /** + * Maximum allowed Y dimension for a grid. + */ + uint32_t maxGridDimY; + + /** + * Maximum allowed Z dimension for a grid. + */ + uint32_t maxGridDimZ; + + /** + * Compute capability for the device, major number. + */ + uint32_t computeCapabilityMajor; + + /** + * Compute capability for the device, minor number. + */ + uint32_t computeCapabilityMinor; + + /** + * The device ID. + */ + uint32_t id; + + /** + * ECC enabled flag for device + */ + uint32_t eccEnabled; + + /** + * The device UUID. This value is the globally unique immutable + * alphanumeric identifier of the device. + */ + CUuuid uuid; + +#ifndef CUPTILP64 + /** + * Undefined. Reserved for internal use. + */ + uint32_t pad; +#endif + + /** + * The device name. This name is shared across all activity records + * representing instances of the device, and so should not be + * modified. + */ + const char *name; + + /** + * Flag to indicate whether the device is visible to CUDA. Users can + * set the device visibility using CUDA_VISIBLE_DEVICES environment + */ + uint8_t isCudaVisible; + + uint8_t reserved[7]; +} CUpti_ActivityDevice3; + +/** + * \brief The activity record for a device. (CUDA 11.6 onwards) + * + * This activity record represents information about a GPU device + * (CUPTI_ACTIVITY_KIND_DEVICE). + * Device activity is now reported using the + * CUpti_ActivityDevice5 activity record. + */ +typedef struct PACKED_ALIGNMENT { + /** + * The activity record kind, must be CUPTI_ACTIVITY_KIND_DEVICE. + */ + CUpti_ActivityKind kind; + + /** + * The flags associated with the device. \see CUpti_ActivityFlag + */ + CUpti_ActivityFlag flags; + + /** + * The global memory bandwidth available on the device, in + * kBytes/sec. + */ + uint64_t globalMemoryBandwidth; + + /** + * The amount of global memory on the device, in bytes. + */ + uint64_t globalMemorySize; + + /** + * The amount of constant memory on the device, in bytes. + */ + uint32_t constantMemorySize; + + /** + * The size of the L2 cache on the device, in bytes. + */ + uint32_t l2CacheSize; + + /** + * The number of threads per warp on the device. + */ + uint32_t numThreadsPerWarp; + + /** + * The core clock rate of the device, in kHz. + */ + uint32_t coreClockRate; + + /** + * Number of memory copy engines on the device. + */ + uint32_t numMemcpyEngines; + + /** + * Number of multiprocessors on the device. + */ + uint32_t numMultiprocessors; + + /** + * The maximum "instructions per cycle" possible on each device + * multiprocessor. + */ + uint32_t maxIPC; + + /** + * Maximum number of warps that can be present on a multiprocessor + * at any given time. + */ + uint32_t maxWarpsPerMultiprocessor; + + /** + * Maximum number of blocks that can be present on a multiprocessor + * at any given time. + */ + uint32_t maxBlocksPerMultiprocessor; + + /** + * Maximum amount of shared memory available per multiprocessor, in bytes. + */ + uint32_t maxSharedMemoryPerMultiprocessor; + + /** + * Maximum number of 32-bit registers available per multiprocessor. + */ + uint32_t maxRegistersPerMultiprocessor; + + /** + * Maximum number of registers that can be allocated to a block. + */ + uint32_t maxRegistersPerBlock; + + /** + * Maximum amount of shared memory that can be assigned to a block, + * in bytes. + */ + uint32_t maxSharedMemoryPerBlock; + + /** + * Maximum number of threads allowed in a block. + */ + uint32_t maxThreadsPerBlock; + + /** + * Maximum allowed X dimension for a block. + */ + uint32_t maxBlockDimX; + + /** + * Maximum allowed Y dimension for a block. + */ + uint32_t maxBlockDimY; + + /** + * Maximum allowed Z dimension for a block. + */ + uint32_t maxBlockDimZ; + + /** + * Maximum allowed X dimension for a grid. + */ + uint32_t maxGridDimX; + + /** + * Maximum allowed Y dimension for a grid. + */ + uint32_t maxGridDimY; + + /** + * Maximum allowed Z dimension for a grid. + */ + uint32_t maxGridDimZ; + + /** + * Compute capability for the device, major number. + */ + uint32_t computeCapabilityMajor; + + /** + * Compute capability for the device, minor number. + */ + uint32_t computeCapabilityMinor; + + /** + * The device ID. + */ + uint32_t id; + + /** + * ECC enabled flag for device + */ + uint32_t eccEnabled; + + /** + * The device UUID. This value is the globally unique immutable + * alphanumeric identifier of the device. + */ + CUuuid uuid; + +#ifndef CUPTILP64 + /** + * Undefined. Reserved for internal use. + */ + uint32_t pad; +#endif + + /** + * The device name. This name is shared across all activity records + * representing instances of the device, and so should not be + * modified. + */ + const char *name; + + /** + * Flag to indicate whether the device is visible to CUDA. Users can + * set the device visibility using CUDA_VISIBLE_DEVICES environment + */ + uint8_t isCudaVisible; + + /** + * MIG enabled flag for device + */ + uint8_t isMigEnabled; + + uint8_t reserved[6]; + + /** + * GPU Instance id for MIG enabled devices. + * If mig mode is disabled value is set to UINT32_MAX + */ + uint32_t gpuInstanceId; + + /** + * Compute Instance id for MIG enabled devices. + * If mig mode is disabled value is set to UINT32_MAX + */ + uint32_t computeInstanceId; + + /** + * The MIG UUID. This value is the globally unique immutable + * alphanumeric identifier of the device. + */ + CUuuid migUuid; + +} CUpti_ActivityDevice4; + +/** + * \brief The activity record for kernel. (deprecated) + * + * This activity record represents a kernel execution + * (CUPTI_ACTIVITY_KIND_KERNEL and + * CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL) but is no longer generated + * by CUPTI. Kernel activities are now reported using the + * CUpti_ActivityKernel9 activity record. + */ +typedef struct PACKED_ALIGNMENT { + /** + * The activity record kind, must be CUPTI_ACTIVITY_KIND_KERNEL + * or CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL. + */ + CUpti_ActivityKind kind; + + /** + * The cache configuration requested by the kernel. The value is one + * of the CUfunc_cache enumeration values from cuda.h. + */ + uint8_t cacheConfigRequested; + + /** + * The cache configuration used for the kernel. The value is one of + * the CUfunc_cache enumeration values from cuda.h. + */ + uint8_t cacheConfigExecuted; + + /** + * The number of registers required for each thread executing the + * kernel. + */ + uint16_t registersPerThread; + + /** + * The start timestamp for the kernel execution, in ns. A value of 0 + * for both the start and end timestamps indicates that timestamp + * information could not be collected for the kernel. + */ + uint64_t start; + + /** + * The end timestamp for the kernel execution, in ns. A value of 0 + * for both the start and end timestamps indicates that timestamp + * information could not be collected for the kernel. + */ + uint64_t end; + + /** + * The ID of the device where the kernel is executing. + */ + uint32_t deviceId; + + /** + * The ID of the context where the kernel is executing. + */ + uint32_t contextId; + + /** + * The ID of the stream where the kernel is executing. + */ + uint32_t streamId; + + /** + * The X-dimension grid size for the kernel. + */ + int32_t gridX; + + /** + * The Y-dimension grid size for the kernel. + */ + int32_t gridY; + + /** + * The Z-dimension grid size for the kernel. + */ + int32_t gridZ; + + /** + * The X-dimension block size for the kernel. + */ + int32_t blockX; + + /** + * The Y-dimension block size for the kernel. + */ + int32_t blockY; + + /** + * The Z-dimension grid size for the kernel. + */ + int32_t blockZ; + + /** + * The static shared memory allocated for the kernel, in bytes. + */ + int32_t staticSharedMemory; + + /** + * The dynamic shared memory reserved for the kernel, in bytes. + */ + int32_t dynamicSharedMemory; + + /** + * The amount of local memory reserved for each thread, in bytes. + */ + uint32_t localMemoryPerThread; + + /** + * The total amount of local memory reserved for the kernel, in + * bytes. + */ + uint32_t localMemoryTotal; + + /** + * The correlation ID of the kernel. Each kernel execution is + * assigned a unique correlation ID that is identical to the + * correlation ID in the driver API activity record that launched + * the kernel. + */ + uint32_t correlationId; + + /** + * The runtime correlation ID of the kernel. Each kernel execution + * is assigned a unique runtime correlation ID that is identical to + * the correlation ID in the runtime API activity record that + * launched the kernel. + */ + uint32_t runtimeCorrelationId; + + /** + * Undefined. Reserved for internal use. + */ + uint32_t pad; + + /** + * The name of the kernel. This name is shared across all activity + * records representing the same kernel, and so should not be + * modified. + */ + const char *name; + + /** + * Undefined. Reserved for internal use. + */ + void *reserved0; +} CUpti_ActivityKernel; + +/** + * \brief The activity record for kernel. (deprecated) + * + * This activity record represents a kernel execution + * (CUPTI_ACTIVITY_KIND_KERNEL and + * CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL) but is no longer generated + * by CUPTI. Kernel activities are now reported using the + * CUpti_ActivityKernel9 activity record. + */ +typedef struct PACKED_ALIGNMENT { + /** + * The activity record kind, must be CUPTI_ACTIVITY_KIND_KERNEL or + * CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL. + */ + CUpti_ActivityKind kind; + + union { + uint8_t both; + struct { + /** + * The cache configuration requested by the kernel. The value is one + * of the CUfunc_cache enumeration values from cuda.h. + */ + uint8_t requested:4; + + /** + * The cache configuration used for the kernel. The value is one of + * the CUfunc_cache enumeration values from cuda.h. + */ + uint8_t executed:4; + } config; + } cacheConfig; + + /** + * The shared memory configuration used for the kernel. The value is one of + * the CUsharedconfig enumeration values from cuda.h. + */ + uint8_t sharedMemoryConfig; + + /** + * The number of registers required for each thread executing the + * kernel. + */ + uint16_t registersPerThread; + + /** + * The start timestamp for the kernel execution, in ns. A value of 0 + * for both the start and end timestamps indicates that timestamp + * information could not be collected for the kernel. + */ + uint64_t start; + + /** + * The end timestamp for the kernel execution, in ns. A value of 0 + * for both the start and end timestamps indicates that timestamp + * information could not be collected for the kernel. + */ + uint64_t end; + + /** + * The completed timestamp for the kernel execution, in ns. It + * represents the completion of all it's child kernels and the + * kernel itself. A value of CUPTI_TIMESTAMP_UNKNOWN indicates that + * the completion time is unknown. + */ + uint64_t completed; + + /** + * The ID of the device where the kernel is executing. + */ + uint32_t deviceId; + + /** + * The ID of the context where the kernel is executing. + */ + uint32_t contextId; + + /** + * The ID of the stream where the kernel is executing. + */ + uint32_t streamId; + + /** + * The X-dimension grid size for the kernel. + */ + int32_t gridX; + + /** + * The Y-dimension grid size for the kernel. + */ + int32_t gridY; + + /** + * The Z-dimension grid size for the kernel. + */ + int32_t gridZ; + + /** + * The X-dimension block size for the kernel. + */ + int32_t blockX; + + /** + * The Y-dimension block size for the kernel. + */ + int32_t blockY; + + /** + * The Z-dimension grid size for the kernel. + */ + int32_t blockZ; + + /** + * The static shared memory allocated for the kernel, in bytes. + */ + int32_t staticSharedMemory; + + /** + * The dynamic shared memory reserved for the kernel, in bytes. + */ + int32_t dynamicSharedMemory; + + /** + * The amount of local memory reserved for each thread, in bytes. + */ + uint32_t localMemoryPerThread; + + /** + * The total amount of local memory reserved for the kernel, in + * bytes. + */ + uint32_t localMemoryTotal; + + /** + * The correlation ID of the kernel. Each kernel execution is + * assigned a unique correlation ID that is identical to the + * correlation ID in the driver or runtime API activity record that + * launched the kernel. + */ + uint32_t correlationId; + + /** + * The grid ID of the kernel. Each kernel is assigned a unique + * grid ID at runtime. + */ + int64_t gridId; + + /** + * The name of the kernel. This name is shared across all activity + * records representing the same kernel, and so should not be + * modified. + */ + const char *name; + + /** + * Undefined. Reserved for internal use. + */ + void *reserved0; +} CUpti_ActivityKernel2; + +/** + * \brief The activity record for a kernel (CUDA 6.5(with sm_52 support) onwards). + * (deprecated in CUDA 9.0) + * + * This activity record represents a kernel execution + * (CUPTI_ACTIVITY_KIND_KERNEL and + * CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL). + * Kernel activities are now reported using the CUpti_ActivityKernel9 activity + * record. + */ +typedef struct PACKED_ALIGNMENT { + /** + * The activity record kind, must be CUPTI_ACTIVITY_KIND_KERNEL or + * CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL. + */ + CUpti_ActivityKind kind; + + union { + uint8_t both; + struct { + /** + * The cache configuration requested by the kernel. The value is one + * of the CUfunc_cache enumeration values from cuda.h. + */ + uint8_t requested:4; + + /** + * The cache configuration used for the kernel. The value is one of + * the CUfunc_cache enumeration values from cuda.h. + */ + uint8_t executed:4; + } config; + } cacheConfig; + + /** + * The shared memory configuration used for the kernel. The value is one of + * the CUsharedconfig enumeration values from cuda.h. + */ + uint8_t sharedMemoryConfig; + + /** + * The number of registers required for each thread executing the + * kernel. + */ + uint16_t registersPerThread; + + /** + * The partitioned global caching requested for the kernel. Partitioned + * global caching is required to enable caching on certain chips, such as + * devices with compute capability 5.2. + */ + CUpti_ActivityPartitionedGlobalCacheConfig partitionedGlobalCacheRequested; + + /** + * The partitioned global caching executed for the kernel. Partitioned + * global caching is required to enable caching on certain chips, such as + * devices with compute capability 5.2. Partitioned global caching can be + * automatically disabled if the occupancy requirement of the launch cannot + * support caching. + */ + CUpti_ActivityPartitionedGlobalCacheConfig partitionedGlobalCacheExecuted; + + /** + * The start timestamp for the kernel execution, in ns. A value of 0 + * for both the start and end timestamps indicates that timestamp + * information could not be collected for the kernel. + */ + uint64_t start; + + /** + * The end timestamp for the kernel execution, in ns. A value of 0 + * for both the start and end timestamps indicates that timestamp + * information could not be collected for the kernel. + */ + uint64_t end; + + /** + * The completed timestamp for the kernel execution, in ns. It + * represents the completion of all it's child kernels and the + * kernel itself. A value of CUPTI_TIMESTAMP_UNKNOWN indicates that + * the completion time is unknown. + */ + uint64_t completed; + + /** + * The ID of the device where the kernel is executing. + */ + uint32_t deviceId; + + /** + * The ID of the context where the kernel is executing. + */ + uint32_t contextId; + + /** + * The ID of the stream where the kernel is executing. + */ + uint32_t streamId; + + /** + * The X-dimension grid size for the kernel. + */ + int32_t gridX; + + /** + * The Y-dimension grid size for the kernel. + */ + int32_t gridY; + + /** + * The Z-dimension grid size for the kernel. + */ + int32_t gridZ; + + /** + * The X-dimension block size for the kernel. + */ + int32_t blockX; + + /** + * The Y-dimension block size for the kernel. + */ + int32_t blockY; + + /** + * The Z-dimension grid size for the kernel. + */ + int32_t blockZ; + + /** + * The static shared memory allocated for the kernel, in bytes. + */ + int32_t staticSharedMemory; + + /** + * The dynamic shared memory reserved for the kernel, in bytes. + */ + int32_t dynamicSharedMemory; + + /** + * The amount of local memory reserved for each thread, in bytes. + */ + uint32_t localMemoryPerThread; + + /** + * The total amount of local memory reserved for the kernel, in + * bytes. + */ + uint32_t localMemoryTotal; + + /** + * The correlation ID of the kernel. Each kernel execution is + * assigned a unique correlation ID that is identical to the + * correlation ID in the driver or runtime API activity record that + * launched the kernel. + */ + uint32_t correlationId; + + /** + * The grid ID of the kernel. Each kernel is assigned a unique + * grid ID at runtime. + */ + int64_t gridId; + + /** + * The name of the kernel. This name is shared across all activity + * records representing the same kernel, and so should not be + * modified. + */ + const char *name; + + /** + * Undefined. Reserved for internal use. + */ + void *reserved0; +} CUpti_ActivityKernel3; + +/** + * \brief The activity record for a kernel (CUDA 9.0(with sm_70 support) onwards). + * (deprecated in CUDA 11.0) + * + * This activity record represents a kernel execution + * (CUPTI_ACTIVITY_KIND_KERNEL and + * CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL). + * Kernel activities are now reported using the CUpti_ActivityKernel9 activity + * record. + */ +typedef struct PACKED_ALIGNMENT { + /** + * The activity record kind, must be CUPTI_ACTIVITY_KIND_KERNEL or + * CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL. + */ + CUpti_ActivityKind kind; + + /** + * For devices with compute capability 7.0+ cacheConfig values are not updated + * in case field isSharedMemoryCarveoutRequested is set + */ + union { + uint8_t both; + struct { + /** + * The cache configuration requested by the kernel. The value is one + * of the CUfunc_cache enumeration values from cuda.h. + */ + uint8_t requested:4; + + /** + * The cache configuration used for the kernel. The value is one of + * the CUfunc_cache enumeration values from cuda.h. + */ + uint8_t executed:4; + } config; + } cacheConfig; + + /** + * The shared memory configuration used for the kernel. The value is one of + * the CUsharedconfig enumeration values from cuda.h. + */ + uint8_t sharedMemoryConfig; + + /** + * The number of registers required for each thread executing the + * kernel. + */ + uint16_t registersPerThread; + + /** + * The partitioned global caching requested for the kernel. Partitioned + * global caching is required to enable caching on certain chips, such as + * devices with compute capability 5.2. + */ + CUpti_ActivityPartitionedGlobalCacheConfig partitionedGlobalCacheRequested; + + /** + * The partitioned global caching executed for the kernel. Partitioned + * global caching is required to enable caching on certain chips, such as + * devices with compute capability 5.2. Partitioned global caching can be + * automatically disabled if the occupancy requirement of the launch cannot + * support caching. + */ + CUpti_ActivityPartitionedGlobalCacheConfig partitionedGlobalCacheExecuted; + + /** + * The start timestamp for the kernel execution, in ns. A value of 0 + * for both the start and end timestamps indicates that timestamp + * information could not be collected for the kernel. + */ + uint64_t start; + + /** + * The end timestamp for the kernel execution, in ns. A value of 0 + * for both the start and end timestamps indicates that timestamp + * information could not be collected for the kernel. + */ + uint64_t end; + + /** + * The completed timestamp for the kernel execution, in ns. It + * represents the completion of all it's child kernels and the + * kernel itself. A value of CUPTI_TIMESTAMP_UNKNOWN indicates that + * the completion time is unknown. + */ + uint64_t completed; + + /** + * The ID of the device where the kernel is executing. + */ + uint32_t deviceId; + + /** + * The ID of the context where the kernel is executing. + */ + uint32_t contextId; + + /** + * The ID of the stream where the kernel is executing. + */ + uint32_t streamId; + + /** + * The X-dimension grid size for the kernel. + */ + int32_t gridX; + + /** + * The Y-dimension grid size for the kernel. + */ + int32_t gridY; + + /** + * The Z-dimension grid size for the kernel. + */ + int32_t gridZ; + + /** + * The X-dimension block size for the kernel. + */ + int32_t blockX; + + /** + * The Y-dimension block size for the kernel. + */ + int32_t blockY; + + /** + * The Z-dimension grid size for the kernel. + */ + int32_t blockZ; + + /** + * The static shared memory allocated for the kernel, in bytes. + */ + int32_t staticSharedMemory; + + /** + * The dynamic shared memory reserved for the kernel, in bytes. + */ + int32_t dynamicSharedMemory; + + /** + * The amount of local memory reserved for each thread, in bytes. + */ + uint32_t localMemoryPerThread; + + /** + * The total amount of local memory reserved for the kernel, in + * bytes. + */ + uint32_t localMemoryTotal; + + /** + * The correlation ID of the kernel. Each kernel execution is + * assigned a unique correlation ID that is identical to the + * correlation ID in the driver or runtime API activity record that + * launched the kernel. + */ + uint32_t correlationId; + + /** + * The grid ID of the kernel. Each kernel is assigned a unique + * grid ID at runtime. + */ + int64_t gridId; + + /** + * The name of the kernel. This name is shared across all activity + * records representing the same kernel, and so should not be + * modified. + */ + const char *name; + + /** + * Undefined. Reserved for internal use. + */ + void *reserved0; + + /** + * The timestamp when the kernel is queued up in the command buffer, in ns. + * A value of CUPTI_TIMESTAMP_UNKNOWN indicates that the queued time + * could not be collected for the kernel. This timestamp is not collected + * by default. Use API \ref cuptiActivityEnableLatencyTimestamps() to + * enable collection. + * + * Command buffer is a buffer written by CUDA driver to send commands + * like kernel launch, memory copy etc to the GPU. All launches of CUDA + * kernels are asynchronous with respect to the host, the host requests + * the launch by writing commands into the command buffer, then returns + * without checking the GPU's progress. + */ + uint64_t queued; + + /** + * The timestamp when the command buffer containing the kernel launch + * is submitted to the GPU, in ns. A value of CUPTI_TIMESTAMP_UNKNOWN + * indicates that the submitted time could not be collected for the kernel. + * This timestamp is not collected by default. Use API \ref + * cuptiActivityEnableLatencyTimestamps() to enable collection. + */ + uint64_t submitted; + + /** + * The indicates if the kernel was executed via a regular launch or via a + * single/multi device cooperative launch. \see CUpti_ActivityLaunchType + */ + uint8_t launchType; + + /** + * This indicates if CU_FUNC_ATTRIBUTE_PREFERRED_SHARED_MEMORY_CARVEOUT was + * updated for the kernel launch + */ + uint8_t isSharedMemoryCarveoutRequested; + + /** + * Shared memory carveout value requested for the function in percentage of + * the total resource. The value will be updated only if field + * isSharedMemoryCarveoutRequested is set. + */ + uint8_t sharedMemoryCarveoutRequested; + + /** + * Undefined. Reserved for internal use. + */ + uint8_t padding; + + /** + * Shared memory size set by the driver. + */ + uint32_t sharedMemoryExecuted; +} CUpti_ActivityKernel4; + +/** + * \brief The activity record for a kernel (CUDA 11.0(with sm_80 support) onwards). + * (deprecated in CUDA 11.2) + * This activity record represents a kernel execution + * (CUPTI_ACTIVITY_KIND_KERNEL and + * CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL) but is no longer generated + * by CUPTI. Kernel activities are now reported using the + * CUpti_ActivityKernel9 activity record. + */ +typedef struct PACKED_ALIGNMENT { + /** + * The activity record kind, must be CUPTI_ACTIVITY_KIND_KERNEL or + * CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL. + */ + CUpti_ActivityKind kind; + + /** + * For devices with compute capability 7.0+ cacheConfig values are not updated + * in case field isSharedMemoryCarveoutRequested is set + */ + union { + uint8_t both; + struct { + /** + * The cache configuration requested by the kernel. The value is one + * of the CUfunc_cache enumeration values from cuda.h. + */ + uint8_t requested:4; + + /** + * The cache configuration used for the kernel. The value is one of + * the CUfunc_cache enumeration values from cuda.h. + */ + uint8_t executed:4; + } config; + } cacheConfig; + + /** + * The shared memory configuration used for the kernel. The value is one of + * the CUsharedconfig enumeration values from cuda.h. + */ + uint8_t sharedMemoryConfig; + + /** + * The number of registers required for each thread executing the + * kernel. + */ + uint16_t registersPerThread; + + /** + * The partitioned global caching requested for the kernel. Partitioned + * global caching is required to enable caching on certain chips, such as + * devices with compute capability 5.2. + */ + CUpti_ActivityPartitionedGlobalCacheConfig partitionedGlobalCacheRequested; + + /** + * The partitioned global caching executed for the kernel. Partitioned + * global caching is required to enable caching on certain chips, such as + * devices with compute capability 5.2. Partitioned global caching can be + * automatically disabled if the occupancy requirement of the launch cannot + * support caching. + */ + CUpti_ActivityPartitionedGlobalCacheConfig partitionedGlobalCacheExecuted; + + /** + * The start timestamp for the kernel execution, in ns. A value of 0 + * for both the start and end timestamps indicates that timestamp + * information could not be collected for the kernel. + */ + uint64_t start; + + /** + * The end timestamp for the kernel execution, in ns. A value of 0 + * for both the start and end timestamps indicates that timestamp + * information could not be collected for the kernel. + */ + uint64_t end; + + /** + * The completed timestamp for the kernel execution, in ns. It + * represents the completion of all it's child kernels and the + * kernel itself. A value of CUPTI_TIMESTAMP_UNKNOWN indicates that + * the completion time is unknown. + */ + uint64_t completed; + + /** + * The ID of the device where the kernel is executing. + */ + uint32_t deviceId; + + /** + * The ID of the context where the kernel is executing. + */ + uint32_t contextId; + + /** + * The ID of the stream where the kernel is executing. + */ + uint32_t streamId; + + /** + * The X-dimension grid size for the kernel. + */ + int32_t gridX; + + /** + * The Y-dimension grid size for the kernel. + */ + int32_t gridY; + + /** + * The Z-dimension grid size for the kernel. + */ + int32_t gridZ; + + /** + * The X-dimension block size for the kernel. + */ + int32_t blockX; + + /** + * The Y-dimension block size for the kernel. + */ + int32_t blockY; + + /** + * The Z-dimension grid size for the kernel. + */ + int32_t blockZ; + + /** + * The static shared memory allocated for the kernel, in bytes. + */ + int32_t staticSharedMemory; + + /** + * The dynamic shared memory reserved for the kernel, in bytes. + */ + int32_t dynamicSharedMemory; + + /** + * The amount of local memory reserved for each thread, in bytes. + */ + uint32_t localMemoryPerThread; + + /** + * The total amount of local memory reserved for the kernel, in + * bytes. + */ + uint32_t localMemoryTotal; + + /** + * The correlation ID of the kernel. Each kernel execution is + * assigned a unique correlation ID that is identical to the + * correlation ID in the driver or runtime API activity record that + * launched the kernel. + */ + uint32_t correlationId; + + /** + * The grid ID of the kernel. Each kernel is assigned a unique + * grid ID at runtime. + */ + int64_t gridId; + + /** + * The name of the kernel. This name is shared across all activity + * records representing the same kernel, and so should not be + * modified. + */ + const char *name; + + /** + * Undefined. Reserved for internal use. + */ + void *reserved0; + + /** + * The timestamp when the kernel is queued up in the command buffer, in ns. + * A value of CUPTI_TIMESTAMP_UNKNOWN indicates that the queued time + * could not be collected for the kernel. This timestamp is not collected + * by default. Use API \ref cuptiActivityEnableLatencyTimestamps() to + * enable collection. + * + * Command buffer is a buffer written by CUDA driver to send commands + * like kernel launch, memory copy etc to the GPU. All launches of CUDA + * kernels are asynchronous with respect to the host, the host requests + * the launch by writing commands into the command buffer, then returns + * without checking the GPU's progress. + */ + uint64_t queued; + + /** + * The timestamp when the command buffer containing the kernel launch + * is submitted to the GPU, in ns. A value of CUPTI_TIMESTAMP_UNKNOWN + * indicates that the submitted time could not be collected for the kernel. + * This timestamp is not collected by default. Use API \ref + * cuptiActivityEnableLatencyTimestamps() to enable collection. + */ + uint64_t submitted; + + /** + * The indicates if the kernel was executed via a regular launch or via a + * single/multi device cooperative launch. \see CUpti_ActivityLaunchType + */ + uint8_t launchType; + + /** + * This indicates if CU_FUNC_ATTRIBUTE_PREFERRED_SHARED_MEMORY_CARVEOUT was + * updated for the kernel launch + */ + uint8_t isSharedMemoryCarveoutRequested; + + /** + * Shared memory carveout value requested for the function in percentage of + * the total resource. The value will be updated only if field + * isSharedMemoryCarveoutRequested is set. + */ + uint8_t sharedMemoryCarveoutRequested; + + /** + * Undefined. Reserved for internal use. + */ + uint8_t padding; + + /** + * Shared memory size set by the driver. + */ + uint32_t sharedMemoryExecuted; + + /** + * The unique ID of the graph node that launched this kernel through graph launch APIs. + * This field will be 0 if the kernel is not launched through graph launch APIs. + */ + uint64_t graphNodeId; + + /** + * The shared memory limit config for the kernel. This field shows whether user has opted for a + * higher per block limit of dynamic shared memory. + */ + CUpti_FuncShmemLimitConfig shmemLimitConfig; + + /** + * The unique ID of the graph that launched this kernel through graph launch APIs. + * This field will be 0 if the kernel is not launched through graph launch APIs. + */ + uint32_t graphId; +} CUpti_ActivityKernel5; + +/** + * \brief The activity record for kernel. (deprecated in CUDA 11.6) + * + * This activity record represents a kernel execution + * (CUPTI_ACTIVITY_KIND_KERNEL and + * CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL) but is no longer generated + * by CUPTI. Kernel activities are now reported using the + * CUpti_ActivityKernel9 activity record. + */ +typedef struct PACKED_ALIGNMENT { + /** + * The activity record kind, must be CUPTI_ACTIVITY_KIND_KERNEL or + * CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL. + */ + CUpti_ActivityKind kind; + + /** + * For devices with compute capability 7.0+ cacheConfig values are not updated + * in case field isSharedMemoryCarveoutRequested is set + */ + union { + uint8_t both; + struct { + /** + * The cache configuration requested by the kernel. The value is one + * of the CUfunc_cache enumeration values from cuda.h. + */ + uint8_t requested:4; + + /** + * The cache configuration used for the kernel. The value is one of + * the CUfunc_cache enumeration values from cuda.h. + */ + uint8_t executed:4; + } config; + } cacheConfig; + + /** + * The shared memory configuration used for the kernel. The value is one of + * the CUsharedconfig enumeration values from cuda.h. + */ + uint8_t sharedMemoryConfig; + + /** + * The number of registers required for each thread executing the + * kernel. + */ + uint16_t registersPerThread; + + /** + * The partitioned global caching requested for the kernel. Partitioned + * global caching is required to enable caching on certain chips, such as + * devices with compute capability 5.2. + */ + CUpti_ActivityPartitionedGlobalCacheConfig partitionedGlobalCacheRequested; + + /** + * The partitioned global caching executed for the kernel. Partitioned + * global caching is required to enable caching on certain chips, such as + * devices with compute capability 5.2. Partitioned global caching can be + * automatically disabled if the occupancy requirement of the launch cannot + * support caching. + */ + CUpti_ActivityPartitionedGlobalCacheConfig partitionedGlobalCacheExecuted; + + /** + * The start timestamp for the kernel execution, in ns. A value of 0 + * for both the start and end timestamps indicates that timestamp + * information could not be collected for the kernel. + */ + uint64_t start; + + /** + * The end timestamp for the kernel execution, in ns. A value of 0 + * for both the start and end timestamps indicates that timestamp + * information could not be collected for the kernel. + */ + uint64_t end; + + /** + * The completed timestamp for the kernel execution, in ns. It + * represents the completion of all it's child kernels and the + * kernel itself. A value of CUPTI_TIMESTAMP_UNKNOWN indicates that + * the completion time is unknown. + */ + uint64_t completed; + + /** + * The ID of the device where the kernel is executing. + */ + uint32_t deviceId; + + /** + * The ID of the context where the kernel is executing. + */ + uint32_t contextId; + + /** + * The ID of the stream where the kernel is executing. + */ + uint32_t streamId; + + /** + * The X-dimension grid size for the kernel. + */ + int32_t gridX; + + /** + * The Y-dimension grid size for the kernel. + */ + int32_t gridY; + + /** + * The Z-dimension grid size for the kernel. + */ + int32_t gridZ; + + /** + * The X-dimension block size for the kernel. + */ + int32_t blockX; + + /** + * The Y-dimension block size for the kernel. + */ + int32_t blockY; + + /** + * The Z-dimension grid size for the kernel. + */ + int32_t blockZ; + + /** + * The static shared memory allocated for the kernel, in bytes. + */ + int32_t staticSharedMemory; + + /** + * The dynamic shared memory reserved for the kernel, in bytes. + */ + int32_t dynamicSharedMemory; + + /** + * The amount of local memory reserved for each thread, in bytes. + */ + uint32_t localMemoryPerThread; + + /** + * The total amount of local memory reserved for the kernel, in + * bytes. + */ + uint32_t localMemoryTotal; + + /** + * The correlation ID of the kernel. Each kernel execution is + * assigned a unique correlation ID that is identical to the + * correlation ID in the driver or runtime API activity record that + * launched the kernel. + */ + uint32_t correlationId; + + /** + * The grid ID of the kernel. Each kernel is assigned a unique + * grid ID at runtime. + */ + int64_t gridId; + + /** + * The name of the kernel. This name is shared across all activity + * records representing the same kernel, and so should not be + * modified. + */ + const char *name; + + /** + * Undefined. Reserved for internal use. + */ + void *reserved0; + + /** + * The timestamp when the kernel is queued up in the command buffer, in ns. + * A value of CUPTI_TIMESTAMP_UNKNOWN indicates that the queued time + * could not be collected for the kernel. This timestamp is not collected + * by default. Use API \ref cuptiActivityEnableLatencyTimestamps() to + * enable collection. + * + * Command buffer is a buffer written by CUDA driver to send commands + * like kernel launch, memory copy etc to the GPU. All launches of CUDA + * kernels are asynchronous with respect to the host, the host requests + * the launch by writing commands into the command buffer, then returns + * without checking the GPU's progress. + */ + uint64_t queued; + + /** + * The timestamp when the command buffer containing the kernel launch + * is submitted to the GPU, in ns. A value of CUPTI_TIMESTAMP_UNKNOWN + * indicates that the submitted time could not be collected for the kernel. + * This timestamp is not collected by default. Use API \ref + * cuptiActivityEnableLatencyTimestamps() to enable collection. + */ + uint64_t submitted; + + /** + * The indicates if the kernel was executed via a regular launch or via a + * single/multi device cooperative launch. \see CUpti_ActivityLaunchType + */ + uint8_t launchType; + + /** + * This indicates if CU_FUNC_ATTRIBUTE_PREFERRED_SHARED_MEMORY_CARVEOUT was + * updated for the kernel launch + */ + uint8_t isSharedMemoryCarveoutRequested; + + /** + * Shared memory carveout value requested for the function in percentage of + * the total resource. The value will be updated only if field + * isSharedMemoryCarveoutRequested is set. + */ + uint8_t sharedMemoryCarveoutRequested; + + /** + * Undefined. Reserved for internal use. + */ + uint8_t padding; + + /** + * Shared memory size set by the driver. + */ + uint32_t sharedMemoryExecuted; + + /** + * The unique ID of the graph node that launched this kernel through graph launch APIs. + * This field will be 0 if the kernel is not launched through graph launch APIs. + */ + uint64_t graphNodeId; + + /** + * The shared memory limit config for the kernel. This field shows whether user has opted for a + * higher per block limit of dynamic shared memory. + */ + CUpti_FuncShmemLimitConfig shmemLimitConfig; + + /** + * The unique ID of the graph that launched this kernel through graph launch APIs. + * This field will be 0 if the kernel is not launched through graph launch APIs. + */ + uint32_t graphId; + + /** + * The pointer to the access policy window. The structure CUaccessPolicyWindow is + * defined in cuda.h. + */ + CUaccessPolicyWindow *pAccessPolicyWindow; +} CUpti_ActivityKernel6; + +/** + * \brief The activity record for kernel. (deprecated in CUDA 11.8) + * + * This activity record represents a kernel execution + * (CUPTI_ACTIVITY_KIND_KERNEL and + * CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL) but is no longer generated + * by CUPTI. Kernel activities are now reported using the + * CUpti_ActivityKernel9 activity record. + */ +typedef struct PACKED_ALIGNMENT { + /** + * The activity record kind, must be CUPTI_ACTIVITY_KIND_KERNEL or + * CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL. + */ + CUpti_ActivityKind kind; + + /** + * For devices with compute capability 7.0+ cacheConfig values are not updated + * in case field isSharedMemoryCarveoutRequested is set + */ + union { + uint8_t both; + struct { + /** + * The cache configuration requested by the kernel. The value is one + * of the CUfunc_cache enumeration values from cuda.h. + */ + uint8_t requested:4; + + /** + * The cache configuration used for the kernel. The value is one of + * the CUfunc_cache enumeration values from cuda.h. + */ + uint8_t executed:4; + } config; + } cacheConfig; + + /** + * The shared memory configuration used for the kernel. The value is one of + * the CUsharedconfig enumeration values from cuda.h. + */ + uint8_t sharedMemoryConfig; + + /** + * The number of registers required for each thread executing the + * kernel. + */ + uint16_t registersPerThread; + + /** + * The partitioned global caching requested for the kernel. Partitioned + * global caching is required to enable caching on certain chips, such as + * devices with compute capability 5.2. + */ + CUpti_ActivityPartitionedGlobalCacheConfig partitionedGlobalCacheRequested; + + /** + * The partitioned global caching executed for the kernel. Partitioned + * global caching is required to enable caching on certain chips, such as + * devices with compute capability 5.2. Partitioned global caching can be + * automatically disabled if the occupancy requirement of the launch cannot + * support caching. + */ + CUpti_ActivityPartitionedGlobalCacheConfig partitionedGlobalCacheExecuted; + + /** + * The start timestamp for the kernel execution, in ns. A value of 0 + * for both the start and end timestamps indicates that timestamp + * information could not be collected for the kernel. + */ + uint64_t start; + + /** + * The end timestamp for the kernel execution, in ns. A value of 0 + * for both the start and end timestamps indicates that timestamp + * information could not be collected for the kernel. + */ + uint64_t end; + + /** + * The completed timestamp for the kernel execution, in ns. It + * represents the completion of all it's child kernels and the + * kernel itself. A value of CUPTI_TIMESTAMP_UNKNOWN indicates that + * the completion time is unknown. + */ + uint64_t completed; + + /** + * The ID of the device where the kernel is executing. + */ + uint32_t deviceId; + + /** + * The ID of the context where the kernel is executing. + */ + uint32_t contextId; + + /** + * The ID of the stream where the kernel is executing. + */ + uint32_t streamId; + + /** + * The X-dimension grid size for the kernel. + */ + int32_t gridX; + + /** + * The Y-dimension grid size for the kernel. + */ + int32_t gridY; + + /** + * The Z-dimension grid size for the kernel. + */ + int32_t gridZ; + + /** + * The X-dimension block size for the kernel. + */ + int32_t blockX; + + /** + * The Y-dimension block size for the kernel. + */ + int32_t blockY; + + /** + * The Z-dimension grid size for the kernel. + */ + int32_t blockZ; + + /** + * The static shared memory allocated for the kernel, in bytes. + */ + int32_t staticSharedMemory; + + /** + * The dynamic shared memory reserved for the kernel, in bytes. + */ + int32_t dynamicSharedMemory; + + /** + * The amount of local memory reserved for each thread, in bytes. + */ + uint32_t localMemoryPerThread; + + /** + * The total amount of local memory reserved for the kernel, in + * bytes. + */ + uint32_t localMemoryTotal; + + /** + * The correlation ID of the kernel. Each kernel execution is + * assigned a unique correlation ID that is identical to the + * correlation ID in the driver or runtime API activity record that + * launched the kernel. + */ + uint32_t correlationId; + + /** + * The grid ID of the kernel. Each kernel is assigned a unique + * grid ID at runtime. + */ + int64_t gridId; + + /** + * The name of the kernel. This name is shared across all activity + * records representing the same kernel, and so should not be + * modified. + */ + const char *name; + + /** + * Undefined. Reserved for internal use. + */ + void *reserved0; + + /** + * The timestamp when the kernel is queued up in the command buffer, in ns. + * A value of CUPTI_TIMESTAMP_UNKNOWN indicates that the queued time + * could not be collected for the kernel. This timestamp is not collected + * by default. Use API \ref cuptiActivityEnableLatencyTimestamps() to + * enable collection. + * + * Command buffer is a buffer written by CUDA driver to send commands + * like kernel launch, memory copy etc to the GPU. All launches of CUDA + * kernels are asynchronous with respect to the host, the host requests + * the launch by writing commands into the command buffer, then returns + * without checking the GPU's progress. + */ + uint64_t queued; + + /** + * The timestamp when the command buffer containing the kernel launch + * is submitted to the GPU, in ns. A value of CUPTI_TIMESTAMP_UNKNOWN + * indicates that the submitted time could not be collected for the kernel. + * This timestamp is not collected by default. Use API \ref + * cuptiActivityEnableLatencyTimestamps() to enable collection. + */ + uint64_t submitted; + + /** + * The indicates if the kernel was executed via a regular launch or via a + * single/multi device cooperative launch. \see CUpti_ActivityLaunchType + */ + uint8_t launchType; + + /** + * This indicates if CU_FUNC_ATTRIBUTE_PREFERRED_SHARED_MEMORY_CARVEOUT was + * updated for the kernel launch + */ + uint8_t isSharedMemoryCarveoutRequested; + + /** + * Shared memory carveout value requested for the function in percentage of + * the total resource. The value will be updated only if field + * isSharedMemoryCarveoutRequested is set. + */ + uint8_t sharedMemoryCarveoutRequested; + + /** + * Undefined. Reserved for internal use. + */ + uint8_t padding; + + /** + * Shared memory size set by the driver. + */ + uint32_t sharedMemoryExecuted; + + /** + * The unique ID of the graph node that launched this kernel through graph launch APIs. + * This field will be 0 if the kernel is not launched through graph launch APIs. + */ + uint64_t graphNodeId; + + /** + * The shared memory limit config for the kernel. This field shows whether user has opted for a + * higher per block limit of dynamic shared memory. + */ + CUpti_FuncShmemLimitConfig shmemLimitConfig; + + /** + * The unique ID of the graph that launched this kernel through graph launch APIs. + * This field will be 0 if the kernel is not launched through graph launch APIs. + */ + uint32_t graphId; + + /** + * The pointer to the access policy window. The structure CUaccessPolicyWindow is + * defined in cuda.h. + */ + CUaccessPolicyWindow *pAccessPolicyWindow; + + /** + * The ID of the HW channel on which the kernel is launched. + */ + uint32_t channelID; + + /** + * The type of the channel + */ + CUpti_ChannelType channelType; +} CUpti_ActivityKernel7; + +/** + * \brief The activity record for kernel. + * + * This activity record represents a kernel execution + * (CUPTI_ACTIVITY_KIND_KERNEL and + * CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL) + */ +typedef struct PACKED_ALIGNMENT { + /** + * The activity record kind, must be CUPTI_ACTIVITY_KIND_KERNEL or + * CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL. + */ + CUpti_ActivityKind kind; + + /** + * For devices with compute capability 7.0+ cacheConfig values are not updated + * in case field isSharedMemoryCarveoutRequested is set + */ + union { + uint8_t both; + struct { + /** + * The cache configuration requested by the kernel. The value is one + * of the CUfunc_cache enumeration values from cuda.h. + */ + uint8_t requested:4; + + /** + * The cache configuration used for the kernel. The value is one of + * the CUfunc_cache enumeration values from cuda.h. + */ + uint8_t executed:4; + } config; + } cacheConfig; + + /** + * The shared memory configuration used for the kernel. The value is one of + * the CUsharedconfig enumeration values from cuda.h. + */ + uint8_t sharedMemoryConfig; + + /** + * The number of registers required for each thread executing the + * kernel. + */ + uint16_t registersPerThread; + + /** + * The partitioned global caching requested for the kernel. Partitioned + * global caching is required to enable caching on certain chips, such as + * devices with compute capability 5.2. + */ + CUpti_ActivityPartitionedGlobalCacheConfig partitionedGlobalCacheRequested; + + /** + * The partitioned global caching executed for the kernel. Partitioned + * global caching is required to enable caching on certain chips, such as + * devices with compute capability 5.2. Partitioned global caching can be + * automatically disabled if the occupancy requirement of the launch cannot + * support caching. + */ + CUpti_ActivityPartitionedGlobalCacheConfig partitionedGlobalCacheExecuted; + + /** + * The start timestamp for the kernel execution, in ns. A value of 0 + * for both the start and end timestamps indicates that timestamp + * information could not be collected for the kernel. + */ + uint64_t start; + + /** + * The end timestamp for the kernel execution, in ns. A value of 0 + * for both the start and end timestamps indicates that timestamp + * information could not be collected for the kernel. + */ + uint64_t end; + + /** + * The completed timestamp for the kernel execution, in ns. It + * represents the completion of all it's child kernels and the + * kernel itself. A value of CUPTI_TIMESTAMP_UNKNOWN indicates that + * the completion time is unknown. + */ + uint64_t completed; + + /** + * The ID of the device where the kernel is executing. + */ + uint32_t deviceId; + + /** + * The ID of the context where the kernel is executing. + */ + uint32_t contextId; + + /** + * The ID of the stream where the kernel is executing. + */ + uint32_t streamId; + + /** + * The X-dimension grid size for the kernel. + */ + int32_t gridX; + + /** + * The Y-dimension grid size for the kernel. + */ + int32_t gridY; + + /** + * The Z-dimension grid size for the kernel. + */ + int32_t gridZ; + + /** + * The X-dimension block size for the kernel. + */ + int32_t blockX; + + /** + * The Y-dimension block size for the kernel. + */ + int32_t blockY; + + /** + * The Z-dimension grid size for the kernel. + */ + int32_t blockZ; + + /** + * The static shared memory allocated for the kernel, in bytes. + */ + int32_t staticSharedMemory; + + /** + * The dynamic shared memory reserved for the kernel, in bytes. + */ + int32_t dynamicSharedMemory; + + /** + * The amount of local memory reserved for each thread, in bytes. + */ + uint32_t localMemoryPerThread; + + /** + * The total amount of local memory reserved for the kernel, in + * bytes (deprecated in CUDA 11.8). + * Refer field localMemoryTotal_v2 + */ + uint32_t localMemoryTotal; + + /** + * The correlation ID of the kernel. Each kernel execution is + * assigned a unique correlation ID that is identical to the + * correlation ID in the driver or runtime API activity record that + * launched the kernel. + */ + uint32_t correlationId; + + /** + * The grid ID of the kernel. Each kernel is assigned a unique + * grid ID at runtime. + */ + int64_t gridId; + + /** + * The name of the kernel. This name is shared across all activity + * records representing the same kernel, and so should not be + * modified. + */ + const char *name; + + /** + * Undefined. Reserved for internal use. + */ + void *reserved0; + + /** + * The timestamp when the kernel is queued up in the command buffer, in ns. + * A value of CUPTI_TIMESTAMP_UNKNOWN indicates that the queued time + * could not be collected for the kernel. This timestamp is not collected + * by default. Use API \ref cuptiActivityEnableLatencyTimestamps() to + * enable collection. + * + * Command buffer is a buffer written by CUDA driver to send commands + * like kernel launch, memory copy etc to the GPU. All launches of CUDA + * kernels are asynchronous with respect to the host, the host requests + * the launch by writing commands into the command buffer, then returns + * without checking the GPU's progress. + */ + uint64_t queued; + + /** + * The timestamp when the command buffer containing the kernel launch + * is submitted to the GPU, in ns. A value of CUPTI_TIMESTAMP_UNKNOWN + * indicates that the submitted time could not be collected for the kernel. + * This timestamp is not collected by default. Use API \ref + * cuptiActivityEnableLatencyTimestamps() to enable collection. + */ + uint64_t submitted; + + /** + * The indicates if the kernel was executed via a regular launch or via a + * single/multi device cooperative launch. \see CUpti_ActivityLaunchType + */ + uint8_t launchType; + + /** + * This indicates if CU_FUNC_ATTRIBUTE_PREFERRED_SHARED_MEMORY_CARVEOUT was + * updated for the kernel launch + */ + uint8_t isSharedMemoryCarveoutRequested; + + /** + * Shared memory carveout value requested for the function in percentage of + * the total resource. The value will be updated only if field + * isSharedMemoryCarveoutRequested is set. + */ + uint8_t sharedMemoryCarveoutRequested; + + /** + * Undefined. Reserved for internal use. + */ + uint8_t padding; + + /** + * Shared memory size set by the driver. + */ + uint32_t sharedMemoryExecuted; + + /** + * The unique ID of the graph node that launched this kernel through graph launch APIs. + * This field will be 0 if the kernel is not launched through graph launch APIs. + */ + uint64_t graphNodeId; + + /** + * The shared memory limit config for the kernel. This field shows whether user has opted for a + * higher per block limit of dynamic shared memory. + */ + CUpti_FuncShmemLimitConfig shmemLimitConfig; + + /** + * The unique ID of the graph that launched this kernel through graph launch APIs. + * This field will be 0 if the kernel is not launched through graph launch APIs. + */ + uint32_t graphId; + + /** + * The pointer to the access policy window. The structure CUaccessPolicyWindow is + * defined in cuda.h. + */ + CUaccessPolicyWindow *pAccessPolicyWindow; + + /** + * The ID of the HW channel on which the kernel is launched. + */ + uint32_t channelID; + + /** + * The type of the channel + */ + CUpti_ChannelType channelType; + + /** + * The X-dimension cluster size for the kernel. + * Field is valid for devices with compute capability 9.0 and higher + */ + uint32_t clusterX; + + /** + * The Y-dimension cluster size for the kernel. + * Field is valid for devices with compute capability 9.0 and higher + */ + uint32_t clusterY; + + /** + * The Z-dimension cluster size for the kernel. + * Field is valid for devices with compute capability 9.0 and higher + */ + uint32_t clusterZ; + + /** + * The cluster scheduling policy for the kernel. Refer CUclusterSchedulingPolicy + * Field is valid for devices with compute capability 9.0 and higher + */ + uint32_t clusterSchedulingPolicy; + + /** + * The total amount of local memory reserved for the kernel, in + * bytes. + */ + uint64_t localMemoryTotal_v2; +} CUpti_ActivityKernel8; + +/** + * \brief The activity record for memory copies. (deprecated) + * + * This activity record represents a memory copy + * (CUPTI_ACTIVITY_KIND_MEMCPY). + */ +typedef struct PACKED_ALIGNMENT { + /** + * The activity record kind, must be CUPTI_ACTIVITY_KIND_MEMCPY. + */ + CUpti_ActivityKind kind; + + /** + * The kind of the memory copy, stored as a byte to reduce record + * size. \see CUpti_ActivityMemcpyKind + */ + uint8_t copyKind; + + /** + * The source memory kind read by the memory copy, stored as a byte + * to reduce record size. \see CUpti_ActivityMemoryKind + */ + uint8_t srcKind; + + /** + * The destination memory kind read by the memory copy, stored as a + * byte to reduce record size. \see CUpti_ActivityMemoryKind + */ + uint8_t dstKind; + + /** + * The flags associated with the memory copy. \see CUpti_ActivityFlag + */ + uint8_t flags; + + /** + * The number of bytes transferred by the memory copy. + */ + uint64_t bytes; + + /** + * The start timestamp for the memory copy, in ns. A value of 0 for + * both the start and end timestamps indicates that timestamp + * information could not be collected for the memory copy. + */ + uint64_t start; + + /** + * The end timestamp for the memory copy, in ns. A value of 0 for + * both the start and end timestamps indicates that timestamp + * information could not be collected for the memory copy. + */ + uint64_t end; + + /** + * The ID of the device where the memory copy is occurring. + */ + uint32_t deviceId; + + /** + * The ID of the context where the memory copy is occurring. + */ + uint32_t contextId; + + /** + * The ID of the stream where the memory copy is occurring. + */ + uint32_t streamId; + + /** + * The correlation ID of the memory copy. Each memory copy is + * assigned a unique correlation ID that is identical to the + * correlation ID in the driver API activity record that launched + * the memory copy. + */ + uint32_t correlationId; + + /** + * The runtime correlation ID of the memory copy. Each memory copy + * is assigned a unique runtime correlation ID that is identical to + * the correlation ID in the runtime API activity record that + * launched the memory copy. + */ + uint32_t runtimeCorrelationId; + +#ifdef CUPTILP64 + /** + * Undefined. Reserved for internal use. + */ + uint32_t pad; +#endif + + /** + * Undefined. Reserved for internal use. + */ + void *reserved0; +} CUpti_ActivityMemcpy; + +/** + * \brief The activity record for memory copies. (deprecated in CUDA 11.1) + * + * This activity record represents a memory copy + * (CUPTI_ACTIVITY_KIND_MEMCPY). + */ +typedef struct PACKED_ALIGNMENT { + /** + * The activity record kind, must be CUPTI_ACTIVITY_KIND_MEMCPY. + */ + CUpti_ActivityKind kind; + + /** + * The kind of the memory copy, stored as a byte to reduce record + * size. \see CUpti_ActivityMemcpyKind + */ + uint8_t copyKind; + + /** + * The source memory kind read by the memory copy, stored as a byte + * to reduce record size. \see CUpti_ActivityMemoryKind + */ + uint8_t srcKind; + + /** + * The destination memory kind read by the memory copy, stored as a + * byte to reduce record size. \see CUpti_ActivityMemoryKind + */ + uint8_t dstKind; + + /** + * The flags associated with the memory copy. \see CUpti_ActivityFlag + */ + uint8_t flags; + + /** + * The number of bytes transferred by the memory copy. + */ + uint64_t bytes; + + /** + * The start timestamp for the memory copy, in ns. A value of 0 for + * both the start and end timestamps indicates that timestamp + * information could not be collected for the memory copy. + */ + uint64_t start; + + /** + * The end timestamp for the memory copy, in ns. A value of 0 for + * both the start and end timestamps indicates that timestamp + * information could not be collected for the memory copy. + */ + uint64_t end; + + /** + * The ID of the device where the memory copy is occurring. + */ + uint32_t deviceId; + + /** + * The ID of the context where the memory copy is occurring. + */ + uint32_t contextId; + + /** + * The ID of the stream where the memory copy is occurring. + */ + uint32_t streamId; + + /** + * The correlation ID of the memory copy. Each memory copy is + * assigned a unique correlation ID that is identical to the + * correlation ID in the driver API activity record that launched + * the memory copy. + */ + uint32_t correlationId; + + /** + * The runtime correlation ID of the memory copy. Each memory copy + * is assigned a unique runtime correlation ID that is identical to + * the correlation ID in the runtime API activity record that + * launched the memory copy. + */ + uint32_t runtimeCorrelationId; + +#ifdef CUPTILP64 + /** + * Undefined. Reserved for internal use. + */ + uint32_t pad; +#endif + + /** + * Undefined. Reserved for internal use. + */ + void *reserved0; + + /** + * The unique ID of the graph node that executed this memcpy through graph launch. + * This field will be 0 if the memcpy is not done through graph launch. + */ + uint64_t graphNodeId; +} CUpti_ActivityMemcpy3; + +/** + * \brief The activity record for memory copies. (deprecated in CUDA 11.6) + * + * This activity record represents a memory copy + * (CUPTI_ACTIVITY_KIND_MEMCPY). + */ +typedef struct PACKED_ALIGNMENT { + /** + * The activity record kind, must be CUPTI_ACTIVITY_KIND_MEMCPY. + */ + CUpti_ActivityKind kind; + + /** + * The kind of the memory copy, stored as a byte to reduce record + * size. \see CUpti_ActivityMemcpyKind + */ + uint8_t copyKind; + + /** + * The source memory kind read by the memory copy, stored as a byte + * to reduce record size. \see CUpti_ActivityMemoryKind + */ + uint8_t srcKind; + + /** + * The destination memory kind read by the memory copy, stored as a + * byte to reduce record size. \see CUpti_ActivityMemoryKind + */ + uint8_t dstKind; + + /** + * The flags associated with the memory copy. \see CUpti_ActivityFlag + */ + uint8_t flags; + + /** + * The number of bytes transferred by the memory copy. + */ + uint64_t bytes; + + /** + * The start timestamp for the memory copy, in ns. A value of 0 for + * both the start and end timestamps indicates that timestamp + * information could not be collected for the memory copy. + */ + uint64_t start; + + /** + * The end timestamp for the memory copy, in ns. A value of 0 for + * both the start and end timestamps indicates that timestamp + * information could not be collected for the memory copy. + */ + uint64_t end; + + /** + * The ID of the device where the memory copy is occurring. + */ + uint32_t deviceId; + + /** + * The ID of the context where the memory copy is occurring. + */ + uint32_t contextId; + + /** + * The ID of the stream where the memory copy is occurring. + */ + uint32_t streamId; + + /** + * The correlation ID of the memory copy. Each memory copy is + * assigned a unique correlation ID that is identical to the + * correlation ID in the driver API activity record that launched + * the memory copy. + */ + uint32_t correlationId; + + /** + * The runtime correlation ID of the memory copy. Each memory copy + * is assigned a unique runtime correlation ID that is identical to + * the correlation ID in the runtime API activity record that + * launched the memory copy. + */ + uint32_t runtimeCorrelationId; + +#ifdef CUPTILP64 + /** + * Undefined. Reserved for internal use. + */ + uint32_t pad; +#endif + + /** + * Undefined. Reserved for internal use. + */ + void *reserved0; + + /** + * The unique ID of the graph node that executed this memcpy through graph launch. + * This field will be 0 if the memcpy is not done through graph launch. + */ + uint64_t graphNodeId; + + /** + * The unique ID of the graph that executed this memcpy through graph launch. + * This field will be 0 if the memcpy is not done through graph launch. + */ + uint32_t graphId; + + /** + * Undefined. Reserved for internal use. + */ + uint32_t padding; +} CUpti_ActivityMemcpy4; + +/** + * \brief The activity record for peer-to-peer memory copies. + * + * This activity record represents a peer-to-peer memory copy + * (CUPTI_ACTIVITY_KIND_MEMCPY2) but is no longer generated + * by CUPTI. Peer-to-peer memory copy activities are now reported using the + * CUpti_ActivityMemcpyPtoP2 activity record.. + */ +typedef struct PACKED_ALIGNMENT { + /** + * The activity record kind, must be CUPTI_ACTIVITY_KIND_MEMCPY2. + */ + CUpti_ActivityKind kind; + + /** + * The kind of the memory copy, stored as a byte to reduce record + * size. \see CUpti_ActivityMemcpyKind + */ + uint8_t copyKind; + + /** + * The source memory kind read by the memory copy, stored as a byte + * to reduce record size. \see CUpti_ActivityMemoryKind + */ + uint8_t srcKind; + + /** + * The destination memory kind read by the memory copy, stored as a + * byte to reduce record size. \see CUpti_ActivityMemoryKind + */ + uint8_t dstKind; + + /** + * The flags associated with the memory copy. \see + * CUpti_ActivityFlag + */ + uint8_t flags; + + /** + * The number of bytes transferred by the memory copy. + */ + uint64_t bytes; + + /** + * The start timestamp for the memory copy, in ns. A value of 0 for + * both the start and end timestamps indicates that timestamp + * information could not be collected for the memory copy. + */ + uint64_t start; + + /** + * The end timestamp for the memory copy, in ns. A value of 0 for + * both the start and end timestamps indicates that timestamp + * information could not be collected for the memory copy. + */ + uint64_t end; + + /** + * The ID of the device where the memory copy is occurring. + */ + uint32_t deviceId; + + /** + * The ID of the context where the memory copy is occurring. + */ + uint32_t contextId; + + /** + * The ID of the stream where the memory copy is occurring. + */ + uint32_t streamId; + + /** + * The ID of the device where memory is being copied from. + */ + uint32_t srcDeviceId; + + /** + * The ID of the context owning the memory being copied from. + */ + uint32_t srcContextId; + + /** + * The ID of the device where memory is being copied to. + */ + uint32_t dstDeviceId; + + /** + * The ID of the context owning the memory being copied to. + */ + uint32_t dstContextId; + + /** + * The correlation ID of the memory copy. Each memory copy is + * assigned a unique correlation ID that is identical to the + * correlation ID in the driver and runtime API activity record that + * launched the memory copy. + */ + uint32_t correlationId; + +#ifndef CUPTILP64 + /** + * Undefined. Reserved for internal use. + */ + uint32_t pad; +#endif + + /** + * Undefined. Reserved for internal use. + */ + void *reserved0; +} CUpti_ActivityMemcpyPtoP; + +typedef CUpti_ActivityMemcpyPtoP CUpti_ActivityMemcpy2; + +/** + * \brief The activity record for peer-to-peer memory copies. + * (deprecated in CUDA 11.1) + * + * This activity record represents a peer-to-peer memory copy + * (CUPTI_ACTIVITY_KIND_MEMCPY2). + */ +typedef struct PACKED_ALIGNMENT { + /** + * The activity record kind, must be CUPTI_ACTIVITY_KIND_MEMCPY2. + */ + CUpti_ActivityKind kind; + + /** + * The kind of the memory copy, stored as a byte to reduce record + * size. \see CUpti_ActivityMemcpyKind + */ + uint8_t copyKind; + + /** + * The source memory kind read by the memory copy, stored as a byte + * to reduce record size. \see CUpti_ActivityMemoryKind + */ + uint8_t srcKind; + + /** + * The destination memory kind read by the memory copy, stored as a + * byte to reduce record size. \see CUpti_ActivityMemoryKind + */ + uint8_t dstKind; + + /** + * The flags associated with the memory copy. \see + * CUpti_ActivityFlag + */ + uint8_t flags; + + /** + * The number of bytes transferred by the memory copy. + */ + uint64_t bytes; + + /** + * The start timestamp for the memory copy, in ns. A value of 0 for + * both the start and end timestamps indicates that timestamp + * information could not be collected for the memory copy. + */ + uint64_t start; + + /** + * The end timestamp for the memory copy, in ns. A value of 0 for + * both the start and end timestamps indicates that timestamp + * information could not be collected for the memory copy. + */ + uint64_t end; + + /** + * The ID of the device where the memory copy is occurring. + */ + uint32_t deviceId; + + /** + * The ID of the context where the memory copy is occurring. + */ + uint32_t contextId; + + /** + * The ID of the stream where the memory copy is occurring. + */ + uint32_t streamId; + + /** + * The ID of the device where memory is being copied from. + */ + uint32_t srcDeviceId; + + /** + * The ID of the context owning the memory being copied from. + */ + uint32_t srcContextId; + + /** + * The ID of the device where memory is being copied to. + */ + uint32_t dstDeviceId; + + /** + * The ID of the context owning the memory being copied to. + */ + uint32_t dstContextId; + + /** + * The correlation ID of the memory copy. Each memory copy is + * assigned a unique correlation ID that is identical to the + * correlation ID in the driver and runtime API activity record that + * launched the memory copy. + */ + uint32_t correlationId; + +#ifndef CUPTILP64 + /** + * Undefined. Reserved for internal use. + */ + uint32_t pad; +#endif + + /** + * Undefined. Reserved for internal use. + */ + void *reserved0; + + /** + * The unique ID of the graph node that executed the memcpy through graph launch. + * This field will be 0 if memcpy is not done using graph launch. + */ + uint64_t graphNodeId; +} CUpti_ActivityMemcpyPtoP2; + +/** + * \brief The activity record for peer-to-peer memory copies. + * (deprecated in CUDA 11.6) + * + * This activity record represents a peer-to-peer memory copy + * (CUPTI_ACTIVITY_KIND_MEMCPY2). + */ +typedef struct PACKED_ALIGNMENT { + /** + * The activity record kind, must be CUPTI_ACTIVITY_KIND_MEMCPY2. + */ + CUpti_ActivityKind kind; + + /** + * The kind of the memory copy, stored as a byte to reduce record + * size. \see CUpti_ActivityMemcpyKind + */ + uint8_t copyKind; + + /** + * The source memory kind read by the memory copy, stored as a byte + * to reduce record size. \see CUpti_ActivityMemoryKind + */ + uint8_t srcKind; + + /** + * The destination memory kind read by the memory copy, stored as a + * byte to reduce record size. \see CUpti_ActivityMemoryKind + */ + uint8_t dstKind; + + /** + * The flags associated with the memory copy. \see + * CUpti_ActivityFlag + */ + uint8_t flags; + + /** + * The number of bytes transferred by the memory copy. + */ + uint64_t bytes; + + /** + * The start timestamp for the memory copy, in ns. A value of 0 for + * both the start and end timestamps indicates that timestamp + * information could not be collected for the memory copy. + */ + uint64_t start; + + /** + * The end timestamp for the memory copy, in ns. A value of 0 for + * both the start and end timestamps indicates that timestamp + * information could not be collected for the memory copy. + */ + uint64_t end; + + /** + * The ID of the device where the memory copy is occurring. + */ + uint32_t deviceId; + + /** + * The ID of the context where the memory copy is occurring. + */ + uint32_t contextId; + + /** + * The ID of the stream where the memory copy is occurring. + */ + uint32_t streamId; + + /** + * The ID of the device where memory is being copied from. + */ + uint32_t srcDeviceId; + + /** + * The ID of the context owning the memory being copied from. + */ + uint32_t srcContextId; + + /** + * The ID of the device where memory is being copied to. + */ + uint32_t dstDeviceId; + + /** + * The ID of the context owning the memory being copied to. + */ + uint32_t dstContextId; + + /** + * The correlation ID of the memory copy. Each memory copy is + * assigned a unique correlation ID that is identical to the + * correlation ID in the driver and runtime API activity record that + * launched the memory copy. + */ + uint32_t correlationId; + +#ifndef CUPTILP64 + /** + * Undefined. Reserved for internal use. + */ + uint32_t pad; +#endif + + /** + * Undefined. Reserved for internal use. + */ + void *reserved0; + + /** + * The unique ID of the graph node that executed the memcpy through graph launch. + * This field will be 0 if memcpy is not done using graph launch. + */ + uint64_t graphNodeId; + + /** + * The unique ID of the graph that executed this memcpy through graph launch. + * This field will be 0 if the memcpy is not done through graph launch. + */ + uint32_t graphId; + + /** + * Undefined. Reserved for internal use. + */ + uint32_t padding; +} CUpti_ActivityMemcpyPtoP3; + +/** + * \brief The activity record for memset. (deprecated) + * + * This activity record represents a memory set operation + * (CUPTI_ACTIVITY_KIND_MEMSET). + */ +typedef struct PACKED_ALIGNMENT { + /** + * The activity record kind, must be CUPTI_ACTIVITY_KIND_MEMSET. + */ + CUpti_ActivityKind kind; + + /** + * The value being assigned to memory by the memory set. + */ + uint32_t value; + + /** + * The number of bytes being set by the memory set. + */ + uint64_t bytes; + + /** + * The start timestamp for the memory set, in ns. A value of 0 for + * both the start and end timestamps indicates that timestamp + * information could not be collected for the memory set. + */ + uint64_t start; + + /** + * The end timestamp for the memory set, in ns. A value of 0 for + * both the start and end timestamps indicates that timestamp + * information could not be collected for the memory set. + */ + uint64_t end; + + /** + * The ID of the device where the memory set is occurring. + */ + uint32_t deviceId; + + /** + * The ID of the context where the memory set is occurring. + */ + uint32_t contextId; + + /** + * The ID of the stream where the memory set is occurring. + */ + uint32_t streamId; + + /** + * The correlation ID of the memory set. Each memory set is assigned + * a unique correlation ID that is identical to the correlation ID + * in the driver API activity record that launched the memory set. + */ + uint32_t correlationId; + + /** + * The flags associated with the memset. \see CUpti_ActivityFlag + */ + uint16_t flags; + + /** + * The memory kind of the memory set \see CUpti_ActivityMemoryKind + */ + uint16_t memoryKind; + +#ifdef CUPTILP64 + /** + * Undefined. Reserved for internal use. + */ + uint32_t pad; +#endif + + /** + * Undefined. Reserved for internal use. + */ + void *reserved0; +} CUpti_ActivityMemset; + +/** + * \brief The activity record for memset. (deprecated in CUDA 11.1) + * + * This activity record represents a memory set operation + * (CUPTI_ACTIVITY_KIND_MEMSET). + */ +typedef struct PACKED_ALIGNMENT { + /** + * The activity record kind, must be CUPTI_ACTIVITY_KIND_MEMSET. + */ + CUpti_ActivityKind kind; + + /** + * The value being assigned to memory by the memory set. + */ + uint32_t value; + + /** + * The number of bytes being set by the memory set. + */ + uint64_t bytes; + + /** + * The start timestamp for the memory set, in ns. A value of 0 for + * both the start and end timestamps indicates that timestamp + * information could not be collected for the memory set. + */ + uint64_t start; + + /** + * The end timestamp for the memory set, in ns. A value of 0 for + * both the start and end timestamps indicates that timestamp + * information could not be collected for the memory set. + */ + uint64_t end; + + /** + * The ID of the device where the memory set is occurring. + */ + uint32_t deviceId; + + /** + * The ID of the context where the memory set is occurring. + */ + uint32_t contextId; + + /** + * The ID of the stream where the memory set is occurring. + */ + uint32_t streamId; + + /** + * The correlation ID of the memory set. Each memory set is assigned + * a unique correlation ID that is identical to the correlation ID + * in the driver API activity record that launched the memory set. + */ + uint32_t correlationId; + + /** + * The flags associated with the memset. \see CUpti_ActivityFlag + */ + uint16_t flags; + + /** + * The memory kind of the memory set \see CUpti_ActivityMemoryKind + */ + uint16_t memoryKind; + +#ifdef CUPTILP64 + /** + * Undefined. Reserved for internal use. + */ + uint32_t pad; +#endif + + /** + * Undefined. Reserved for internal use. + */ + void *reserved0; + + /** + * The unique ID of the graph node that executed this memset through graph launch. + * This field will be 0 if the memset is not executed through graph launch. + */ + uint64_t graphNodeId; +} CUpti_ActivityMemset2; + +/** + * \brief The activity record for memset. (deprecated in CUDA 11.6) + * + * This activity record represents a memory set operation + * (CUPTI_ACTIVITY_KIND_MEMSET). + */ +typedef struct PACKED_ALIGNMENT { + /** + * The activity record kind, must be CUPTI_ACTIVITY_KIND_MEMSET. + */ + CUpti_ActivityKind kind; + + /** + * The value being assigned to memory by the memory set. + */ + uint32_t value; + + /** + * The number of bytes being set by the memory set. + */ + uint64_t bytes; + + /** + * The start timestamp for the memory set, in ns. A value of 0 for + * both the start and end timestamps indicates that timestamp + * information could not be collected for the memory set. + */ + uint64_t start; + + /** + * The end timestamp for the memory set, in ns. A value of 0 for + * both the start and end timestamps indicates that timestamp + * information could not be collected for the memory set. + */ + uint64_t end; + + /** + * The ID of the device where the memory set is occurring. + */ + uint32_t deviceId; + + /** + * The ID of the context where the memory set is occurring. + */ + uint32_t contextId; + + /** + * The ID of the stream where the memory set is occurring. + */ + uint32_t streamId; + + /** + * The correlation ID of the memory set. Each memory set is assigned + * a unique correlation ID that is identical to the correlation ID + * in the driver API activity record that launched the memory set. + */ + uint32_t correlationId; + + /** + * The flags associated with the memset. \see CUpti_ActivityFlag + */ + uint16_t flags; + + /** + * The memory kind of the memory set \see CUpti_ActivityMemoryKind + */ + uint16_t memoryKind; + +#ifdef CUPTILP64 + /** + * Undefined. Reserved for internal use. + */ + uint32_t pad; +#endif + + /** + * Undefined. Reserved for internal use. + */ + void *reserved0; + + /** + * The unique ID of the graph node that executed this memset through graph launch. + * This field will be 0 if the memset is not executed through graph launch. + */ + uint64_t graphNodeId; + + /** + * The unique ID of the graph that executed this memset through graph launch. + * This field will be 0 if the memset is not executed through graph launch. + */ + uint32_t graphId; + + /** + * Undefined. Reserved for internal use. + */ + uint32_t padding; +} CUpti_ActivityMemset3; + +/** + * \brief The activity record for memory. + * + * This activity record represents a memory allocation and free operation + * (CUPTI_ACTIVITY_KIND_MEMORY2). + * This activity record provides separate records for memory allocation and + * memory release operations. + * This allows to correlate the corresponding driver and runtime API + * activity record with the memory operation. + * + * Note: This activity record is an upgrade over \ref CUpti_ActivityMemory + * enabled using the kind \ref CUPTI_ACTIVITY_KIND_MEMORY. + * \ref CUpti_ActivityMemory provides a single record for the memory + * allocation and memory release operations. + */ +typedef struct PACKED_ALIGNMENT { + /** + * The activity record kind, must be CUPTI_ACTIVITY_KIND_MEMORY2 + */ + CUpti_ActivityKind kind; + + /** + * The memory operation requested by the user, \ref CUpti_ActivityMemoryOperationType. + */ + CUpti_ActivityMemoryOperationType memoryOperationType; + + /** + * The memory kind requested by the user, \ref CUpti_ActivityMemoryKind. + */ + CUpti_ActivityMemoryKind memoryKind; + + /** + * The correlation ID of the memory operation. Each memory operation is + * assigned a unique correlation ID that is identical to the + * correlation ID in the driver and runtime API activity record that + * launched the memory operation. + */ + uint32_t correlationId; + + /** + * The virtual address of the allocation. + */ + uint64_t address; + + /** + * The number of bytes of memory allocated. + */ + uint64_t bytes; + + /** + * The start timestamp for the memory operation, in ns. + */ + uint64_t timestamp; + + /** + * The program counter of the memory operation. + */ + uint64_t PC; + + /** + * The ID of the process to which this record belongs to. + */ + uint32_t processId; + + /** + * The ID of the device where the memory operation is taking place. + */ + uint32_t deviceId; + + /** + * The ID of the context. If context is NULL, \p contextId is set to CUPTI_INVALID_CONTEXT_ID. + */ + uint32_t contextId; + + /** + * The ID of the stream. If memory operation is not async, \p streamId is set to CUPTI_INVALID_STREAM_ID. + */ + uint32_t streamId; + + /** + * Variable name. This name is shared across all activity + * records representing the same symbol, and so should not be + * modified. + */ + const char* name; + + /** + * \p isAsync is set if memory operation happens through async memory APIs. + */ + uint32_t isAsync; + +#ifdef CUPTILP64 + /** + * Undefined. Reserved for internal use. + */ + uint32_t pad1; +#endif + + /** + * The memory pool configuration used for the memory operations. + */ + struct { + /** + * The type of the memory pool, \ref CUpti_ActivityMemoryPoolType + */ + CUpti_ActivityMemoryPoolType memoryPoolType; + +#ifdef CUPTILP64 + /** + * Undefined. Reserved for internal use. + */ + uint32_t pad2; +#endif + + /** + * The base address of the memory pool. + */ + uint64_t address; + + /** + * The release threshold of the memory pool in bytes. \p releaseThreshold is + * valid for CUPTI_ACTIVITY_MEMORY_POOL_TYPE_LOCAL, \ref CUpti_ActivityMemoryPoolType. + */ + uint64_t releaseThreshold; + + /** + * The size of the memory pool in bytes and the processID of the memory pool. + * \p size is valid if \p memoryPoolType is + * CUPTI_ACTIVITY_MEMORY_POOL_TYPE_LOCAL, \ref CUpti_ActivityMemoryPoolType. + * \p processId is valid if \p memoryPoolType is + * CUPTI_ACTIVITY_MEMORY_POOL_TYPE_IMPORTED, \ref CUpti_ActivityMemoryPoolType. + */ + union { + uint64_t size; + uint64_t processId; + } pool; + } memoryPoolConfig; + +} CUpti_ActivityMemory2; + +/** + * \brief The activity record for memory pool. + * + * This activity record represents a memory pool creation, destruction and + * trimming (CUPTI_ACTIVITY_KIND_MEMORY_POOL). + * This activity record provides separate records for memory pool creation, + * destruction and trimming operations. + * This allows to correlate the corresponding driver and runtime API + * activity record with the memory pool operation. + * + */ +typedef struct PACKED_ALIGNMENT { + /** + * The activity record kind, must be CUPTI_ACTIVITY_KIND_MEMORY_POOL + */ + CUpti_ActivityKind kind; + + /** + * The memory operation requested by the user, \ref CUpti_ActivityMemoryPoolOperationType. + */ + CUpti_ActivityMemoryPoolOperationType memoryPoolOperationType; + + /** + * The type of the memory pool, \ref CUpti_ActivityMemoryPoolType + */ + CUpti_ActivityMemoryPoolType memoryPoolType; + + /** + * The correlation ID of the memory pool operation. Each memory pool + * operation is assigned a unique correlation ID that is identical to the + * correlation ID in the driver and runtime API activity record that + * launched the memory operation. + */ + uint32_t correlationId; + + /** + * The ID of the process to which this record belongs to. + */ + uint32_t processId; + + /** + * The ID of the device where the memory pool is created. + */ + uint32_t deviceId; + + /** + * The minimum bytes to keep of the memory pool. \p minBytesToKeep is + * valid for CUPTI_ACTIVITY_MEMORY_POOL_OPERATION_TYPE_TRIMMED, + * \ref CUpti_ActivityMemoryPoolOperationType + */ + size_t minBytesToKeep; + +#ifndef CUPTILP64 + /** + * Undefined. Reserved for internal use. + */ + uint32_t pad; +#endif + + /** + * The virtual address of the allocation. + */ + uint64_t address; + + /** + * The size of the memory pool operation in bytes. \p size is + * valid for CUPTI_ACTIVITY_MEMORY_POOL_TYPE_LOCAL, \ref CUpti_ActivityMemoryPoolType. + */ + uint64_t size; + + /** + * The release threshold of the memory pool. \p releaseThreshold is + * valid for CUPTI_ACTIVITY_MEMORY_POOL_TYPE_LOCAL, \ref CUpti_ActivityMemoryPoolType. + */ + uint64_t releaseThreshold; + + /** + * The start timestamp for the memory operation, in ns. + */ + uint64_t timestamp; +} CUpti_ActivityMemoryPool; + +/** + * \brief The activity record providing a marker which is an + * instantaneous point in time. (deprecated in CUDA 8.0) + * + * The marker is specified with a descriptive name and unique id + * (CUPTI_ACTIVITY_KIND_MARKER). + * Marker activity is now reported using the + * CUpti_ActivityMarker2 activity record. + */ +typedef struct PACKED_ALIGNMENT { + /** + * The activity record kind, must be CUPTI_ACTIVITY_KIND_MARKER. + */ + CUpti_ActivityKind kind; + + /** + * The flags associated with the marker. \see CUpti_ActivityFlag + */ + CUpti_ActivityFlag flags; + + /** + * The timestamp for the marker, in ns. A value of 0 indicates that + * timestamp information could not be collected for the marker. + */ + uint64_t timestamp; + + /** + * The marker ID. + */ + uint32_t id; + + /** + * The kind of activity object associated with this marker. + */ + CUpti_ActivityObjectKind objectKind; + + /** + * The identifier for the activity object associated with this + * marker. 'objectKind' indicates which ID is valid for this record. + */ + CUpti_ActivityObjectKindId objectId; + +#ifdef CUPTILP64 + /** + * Undefined. Reserved for internal use. + */ + uint32_t pad; +#endif + + /** + * The marker name for an instantaneous or start marker. This will + * be NULL for an end marker. + */ + const char *name; + +} CUpti_ActivityMarker; + +/** + * \brief The activity record for source-level global + * access. (deprecated) + * + * This activity records the locations of the global + * accesses in the source (CUPTI_ACTIVITY_KIND_GLOBAL_ACCESS). + * Global access activities are now reported using the + * CUpti_ActivityGlobalAccess3 activity record. + */ +typedef struct PACKED_ALIGNMENT { + /** + * The activity record kind, must be CUPTI_ACTIVITY_KIND_GLOBAL_ACCESS. + */ + CUpti_ActivityKind kind; + + /** + * The properties of this global access. + */ + CUpti_ActivityFlag flags; + + /** + * The ID for source locator. + */ + uint32_t sourceLocatorId; + + /** + * The correlation ID of the kernel to which this result is associated. + */ + uint32_t correlationId; + + /** + * The pc offset for the access. + */ + uint32_t pcOffset; + + /** + * The number of times this instruction was executed per warp. It will be incremented + * when at least one of thread among warp is active with predicate and condition code + * evaluating to true. + */ + uint32_t executed; + + /** + * This increments each time when this instruction is executed by number + * of threads that executed this instruction with predicate and condition code evaluating to true. + */ + uint64_t threadsExecuted; + + /** + * The total number of 32 bytes transactions to L2 cache generated by this access + */ + uint64_t l2_transactions; +} CUpti_ActivityGlobalAccess; + +/** + * \brief The activity record for source-level global + * access. (deprecated in CUDA 9.0) + * + * This activity records the locations of the global + * accesses in the source (CUPTI_ACTIVITY_KIND_GLOBAL_ACCESS). + * Global access activities are now reported using the + * CUpti_ActivityGlobalAccess3 activity record. + */ +typedef struct PACKED_ALIGNMENT { + /** + * The activity record kind, must be CUPTI_ACTIVITY_KIND_GLOBAL_ACCESS. + */ + CUpti_ActivityKind kind; + + /** + * The properties of this global access. + */ + CUpti_ActivityFlag flags; + + /** + * The ID for source locator. + */ + uint32_t sourceLocatorId; + + /** + * The correlation ID of the kernel to which this result is associated. + */ + uint32_t correlationId; + + /** + * Correlation ID with global/device function name + */ + uint32_t functionId; + + /** + * The pc offset for the access. + */ + uint32_t pcOffset; + + /** + * This increments each time when this instruction is executed by number + * of threads that executed this instruction with predicate and condition code evaluating to true. + */ + uint64_t threadsExecuted; + + /** + * The total number of 32 bytes transactions to L2 cache generated by this access + */ + uint64_t l2_transactions; + + /** + * The minimum number of L2 transactions possible based on the access pattern. + */ + uint64_t theoreticalL2Transactions; + + /** + * The number of times this instruction was executed per warp. It will be incremented + * when at least one of thread among warp is active with predicate and condition code + * evaluating to true. + */ + uint32_t executed; + + /** + * Undefined. Reserved for internal use. + */ + uint32_t pad; +} CUpti_ActivityGlobalAccess2; + +/** + * \brief The activity record for source level result + * branch. (deprecated) + * + * This activity record the locations of the branches in the + * source (CUPTI_ACTIVITY_KIND_BRANCH). + * Branch activities are now reported using the + * CUpti_ActivityBranch2 activity record. + */ +typedef struct PACKED_ALIGNMENT { + /** + * The activity record kind, must be CUPTI_ACTIVITY_KIND_BRANCH. + */ + CUpti_ActivityKind kind; + + /** + * The ID for source locator. + */ + uint32_t sourceLocatorId; + + /** + * The correlation ID of the kernel to which this result is associated. + */ + uint32_t correlationId; + + /** + * The pc offset for the branch. + */ + uint32_t pcOffset; + + /** + * The number of times this instruction was executed per warp. It will be incremented + * regardless of predicate or condition code. + */ + uint32_t executed; + + /** + * Number of times this branch diverged + */ + uint32_t diverged; + + /** + * This increments each time when this instruction is executed by number + * of threads that executed this instruction + */ + uint64_t threadsExecuted; +} CUpti_ActivityBranch; + +/** + * \brief The activity record for PC sampling. (deprecated in CUDA 8.0) + * + * This activity records information obtained by sampling PC + * (CUPTI_ACTIVITY_KIND_PC_SAMPLING). + * PC sampling activities are now reported using the + * CUpti_ActivityPCSampling2 activity record. + */ +typedef struct PACKED_ALIGNMENT { + /** + * The activity record kind, must be CUPTI_ACTIVITY_KIND_PC_SAMPLING. + */ + CUpti_ActivityKind kind; + + /** + * The properties of this instruction. + */ + CUpti_ActivityFlag flags; + + /** + * The ID for source locator. + */ + uint32_t sourceLocatorId; + + /** + * The correlation ID of the kernel to which this result is associated. + */ + uint32_t correlationId; + + /** + * Correlation ID with global/device function name + */ + uint32_t functionId; + + /** + * The pc offset for the instruction. + */ + uint32_t pcOffset; + + /** + * Number of times the PC was sampled with the stallReason in the record. + * The same PC can be sampled with different stall reasons. + */ + uint32_t samples; + + /** + * Current stall reason. Includes one of the reasons from + * \ref CUpti_ActivityPCSamplingStallReason + */ + CUpti_ActivityPCSamplingStallReason stallReason; +} CUpti_ActivityPCSampling; + +/** + * \brief The activity record for PC sampling. (deprecated in CUDA 9.0) + * + * This activity records information obtained by sampling PC + * (CUPTI_ACTIVITY_KIND_PC_SAMPLING). + * PC sampling activities are now reported using the + * CUpti_ActivityPCSampling3 activity record. + */ +typedef struct PACKED_ALIGNMENT { + /** + * The activity record kind, must be CUPTI_ACTIVITY_KIND_PC_SAMPLING. + */ + CUpti_ActivityKind kind; + + /** + * The properties of this instruction. + */ + CUpti_ActivityFlag flags; + + /** + * The ID for source locator. + */ + uint32_t sourceLocatorId; + + /** + * The correlation ID of the kernel to which this result is associated. + */ + uint32_t correlationId; + + /** + * Correlation ID with global/device function name + */ + uint32_t functionId; + + /** + * The pc offset for the instruction. + */ + uint32_t pcOffset; + + /** + * Number of times the PC was sampled with the stallReason in the record. + * These samples indicate that no instruction was issued in that cycle from + * the warp scheduler from where the warp was sampled. + * Field is valid for devices with compute capability 6.0 and higher + */ + uint32_t latencySamples; + + /** + * Number of times the PC was sampled with the stallReason in the record. + * The same PC can be sampled with different stall reasons. The count includes + * latencySamples. + */ + uint32_t samples; + + /** + * Current stall reason. Includes one of the reasons from + * \ref CUpti_ActivityPCSamplingStallReason + */ + CUpti_ActivityPCSamplingStallReason stallReason; + + uint32_t pad; +} CUpti_ActivityPCSampling2; + +/** + * \brief The activity record for Unified Memory counters (deprecated in CUDA 7.0) + * + * This activity record represents a Unified Memory counter + * (CUPTI_ACTIVITY_KIND_UNIFIED_MEMORY_COUNTER). + */ +typedef struct PACKED_ALIGNMENT { + /** + * The activity record kind, must be CUPTI_ACTIVITY_KIND_UNIFIED_MEMORY_COUNTER + */ + CUpti_ActivityKind kind; + + /** + * The Unified Memory counter kind. See \ref CUpti_ActivityUnifiedMemoryCounterKind + */ + CUpti_ActivityUnifiedMemoryCounterKind counterKind; + + /** + * Scope of the Unified Memory counter. See \ref CUpti_ActivityUnifiedMemoryCounterScope + */ + CUpti_ActivityUnifiedMemoryCounterScope scope; + + /** + * The ID of the device involved in the memory transfer operation. + * It is not relevant if the scope of the counter is global (all devices). + */ + uint32_t deviceId; + + /** + * Value of the counter + * + */ + uint64_t value; + + /** + * The timestamp when this sample was retrieved, in ns. A value of 0 + * indicates that timestamp information could not be collected + */ + uint64_t timestamp; + + /** + * The ID of the process to which this record belongs to. In case of + * global scope, processId is undefined. + */ + uint32_t processId; + + /** + * Undefined. Reserved for internal use. + */ + uint32_t pad; +} CUpti_ActivityUnifiedMemoryCounter; + +/** +* \brief NVLink information. (deprecated in CUDA 9.0) +* +* This structure gives capabilities of each logical NVLink connection between two devices, +* gpu<->gpu or gpu<->CPU which can be used to understand the topology. +* NVLink information are now reported using the +* CUpti_ActivityNvLink2 activity record. +*/ +typedef struct PACKED_ALIGNMENT { + /** + * The activity record kind, must be CUPTI_ACTIVITY_KIND_NVLINK. + */ + CUpti_ActivityKind kind; + + /** + * NVLink version. + */ + uint32_t nvlinkVersion; + + /** + * Type of device 0 \ref CUpti_DevType + */ + CUpti_DevType typeDev0; + + /** + * Type of device 1 \ref CUpti_DevType + */ + CUpti_DevType typeDev1; + + /** + * If typeDev0 is CUPTI_DEV_TYPE_GPU, UUID for device 0. \ref CUpti_ActivityDevice5. + * If typeDev0 is CUPTI_DEV_TYPE_NPU, struct npu for NPU. + */ + union { + CUuuid uuidDev; + struct { + /** + * Index of the NPU. First index will always be zero. + */ + uint32_t index; + + /** + * Domain ID of NPU. On Linux, this can be queried using lspci. + */ + uint32_t domainId; + } npu; + } idDev0; + + /** + * If typeDev1 is CUPTI_DEV_TYPE_GPU, UUID for device 1. \ref CUpti_ActivityDevice5. + * If typeDev1 is CUPTI_DEV_TYPE_NPU, struct npu for NPU. + */ + union { + CUuuid uuidDev; + struct { + /** + * Index of the NPU. First index will always be zero. + */ + uint32_t index; + + /** + * Domain ID of NPU. On Linux, this can be queried using lspci. + */ + uint32_t domainId; + } npu; + } idDev1; + + /** + * Flag gives capabilities of the link \see CUpti_LinkFlag + */ + uint32_t flag; + + /** + * Number of physical NVLinks present between two devices. + */ + uint32_t physicalNvLinkCount; + + /** + * Port numbers for maximum 4 NVLinks connected to device 0. + * If typeDev0 is CUPTI_DEV_TYPE_NPU, ignore this field. + * In case of invalid/unknown port number, this field will be set + * to value CUPTI_NVLINK_INVALID_PORT. + * This will be used to correlate the metric values to individual + * physical link and attribute traffic to the logical NVLink in + * the topology. + */ + int8_t portDev0[4]; + + /** + * Port numbers for maximum 4 NVLinks connected to device 1. + * If typeDev1 is CUPTI_DEV_TYPE_NPU, ignore this field. + * In case of invalid/unknown port number, this field will be set + * to value CUPTI_NVLINK_INVALID_PORT. + * This will be used to correlate the metric values to individual + * physical link and attribute traffic to the logical NVLink in + * the topology. + */ + int8_t portDev1[4]; + + /** + * Bandwidth of NVLink in kbytes/sec + */ + uint64_t bandwidth; +} CUpti_ActivityNvLink; + +/** +* \brief NVLink information. (deprecated in CUDA 10.0) +* +* This structure gives capabilities of each logical NVLink connection between two devices, +* gpu<->gpu or gpu<->CPU which can be used to understand the topology. +* NvLink information are now reported using the +* CUpti_ActivityNvLink4 activity record. +*/ +typedef struct PACKED_ALIGNMENT { + /** + * The activity record kind, must be CUPTI_ACTIVITY_KIND_NVLINK. + */ + CUpti_ActivityKind kind; + + /** + * NvLink version. + */ + uint32_t nvlinkVersion; + + /** + * Type of device 0 \ref CUpti_DevType + */ + CUpti_DevType typeDev0; + + /** + * Type of device 1 \ref CUpti_DevType + */ + CUpti_DevType typeDev1; + + /** + * If typeDev0 is CUPTI_DEV_TYPE_GPU, UUID for device 0. \ref CUpti_ActivityDevice5. + * If typeDev0 is CUPTI_DEV_TYPE_NPU, struct npu for NPU. + */ + union { + CUuuid uuidDev; + struct { + /** + * Index of the NPU. First index will always be zero. + */ + uint32_t index; + + /** + * Domain ID of NPU. On Linux, this can be queried using lspci. + */ + uint32_t domainId; + } npu; + } idDev0; + + /** + * If typeDev1 is CUPTI_DEV_TYPE_GPU, UUID for device 1. \ref CUpti_ActivityDevice5. + * If typeDev1 is CUPTI_DEV_TYPE_NPU, struct npu for NPU. + */ + union { + CUuuid uuidDev; + struct { + /** + * Index of the NPU. First index will always be zero. + */ + uint32_t index; + + /** + * Domain ID of NPU. On Linux, this can be queried using lspci. + */ + uint32_t domainId; + } npu; + } idDev1; + + /** + * Flag gives capabilities of the link \see CUpti_LinkFlag + */ + uint32_t flag; + + /** + * Number of physical NVLinks present between two devices. + */ + uint32_t physicalNvLinkCount; + + /** + * Port numbers for maximum 16 NVLinks connected to device 0. + * If typeDev0 is CUPTI_DEV_TYPE_NPU, ignore this field. + * In case of invalid/unknown port number, this field will be set + * to value CUPTI_NVLINK_INVALID_PORT. + * This will be used to correlate the metric values to individual + * physical link and attribute traffic to the logical NVLink in + * the topology. + */ + int8_t portDev0[CUPTI_MAX_NVLINK_PORTS]; + + /** + * Port numbers for maximum 16 NVLinks connected to device 1. + * If typeDev1 is CUPTI_DEV_TYPE_NPU, ignore this field. + * In case of invalid/unknown port number, this field will be set + * to value CUPTI_NVLINK_INVALID_PORT. + * This will be used to correlate the metric values to individual + * physical link and attribute traffic to the logical NVLink in + * the topology. + */ + int8_t portDev1[CUPTI_MAX_NVLINK_PORTS]; + + /** + * Bandwidth of NVLink in kbytes/sec + */ + uint64_t bandwidth; +} CUpti_ActivityNvLink2; + +/** +* \brief NVLink information. +* +* This structure gives capabilities of each logical NVLink connection between two devices, +* gpu<->gpu or gpu<->CPU which can be used to understand the topology. +* NvLink information are now reported using the +* CUpti_ActivityNvLink4 activity record. +*/ +typedef struct PACKED_ALIGNMENT { + /** + * The activity record kind, must be CUPTI_ACTIVITY_KIND_NVLINK. + */ + CUpti_ActivityKind kind; + /** + * NvLink version. + */ + uint32_t nvlinkVersion; + + /** + * Type of device 0 \ref CUpti_DevType + */ + CUpti_DevType typeDev0; + + /** + * Type of device 1 \ref CUpti_DevType + */ + CUpti_DevType typeDev1; + + /** + * If typeDev0 is CUPTI_DEV_TYPE_GPU, UUID for device 0. \ref CUpti_ActivityDevice5. + * If typeDev0 is CUPTI_DEV_TYPE_NPU, struct npu for NPU. + */ + union { + CUuuid uuidDev; + struct { + /** + * Index of the NPU. First index will always be zero. + */ + uint32_t index; + + /** + * Domain ID of NPU. On Linux, this can be queried using lspci. + */ + uint32_t domainId; + } npu; + } idDev0; + + /** + * If typeDev1 is CUPTI_DEV_TYPE_GPU, UUID for device 1. \ref CUpti_ActivityDevice5. + * If typeDev1 is CUPTI_DEV_TYPE_NPU, struct npu for NPU. + */ + union { + CUuuid uuidDev; + struct { + /** + * Index of the NPU. First index will always be zero. + */ + uint32_t index; + + /** + * Domain ID of NPU. On Linux, this can be queried using lspci. + */ + uint32_t domainId; + } npu; + } idDev1; + + /** + * Flag gives capabilities of the link \see CUpti_LinkFlag + */ + uint32_t flag; + + /** + * Number of physical NVLinks present between two devices. + */ + uint32_t physicalNvLinkCount; + + /** + * Port numbers for maximum 16 NVLinks connected to device 0. + * If typeDev0 is CUPTI_DEV_TYPE_NPU, ignore this field. + * In case of invalid/unknown port number, this field will be set + * to value CUPTI_NVLINK_INVALID_PORT. + * This will be used to correlate the metric values to individual + * physical link and attribute traffic to the logical NVLink in + * the topology. + */ + int8_t portDev0[CUPTI_MAX_NVLINK_PORTS]; + + /** + * Port numbers for maximum 16 NVLinks connected to device 1. + * If typeDev1 is CUPTI_DEV_TYPE_NPU, ignore this field. + * In case of invalid/unknown port number, this field will be set + * to value CUPTI_NVLINK_INVALID_PORT. + * This will be used to correlate the metric values to individual + * physical link and attribute traffic to the logical NVLink in + * the topology. + */ + int8_t portDev1[CUPTI_MAX_NVLINK_PORTS]; + + /** + * Bandwidth of NVLink in kbytes/sec + */ + uint64_t bandwidth; + + /** + * NVSwitch is connected as an intermediate node. + */ + uint8_t nvswitchConnected; + + /** + * Undefined. reserved for internal use + */ + uint8_t pad[7]; +} CUpti_ActivityNvLink3; + +/** + * \brief The activity record for trace of graph execution. + * + * This activity record represents execution for a graph without giving visibility + * about the execution of its nodes. This is intended to reduce overheads in tracing + * each node. The activity kind is CUPTI_ACTIVITY_KIND_GRAPH_TRACE + * Graph trace activity is now reported using CUpti_ActivityGraphTrace2 record. + */ +typedef struct { + /** + * The activity record kind, must be CUPTI_ACTIVITY_KIND_GRAPH_TRACE + */ + CUpti_ActivityKind kind; + + /** + * The correlation ID of the graph launch. Each graph launch is + * assigned a unique correlation ID that is identical to the + * correlation ID in the driver API activity record that launched + * the graph. + */ + uint32_t correlationId; + + /** + * The start timestamp for the graph execution, in ns. A value of 0 + * for both the start and end timestamps indicates that timestamp + * information could not be collected for the graph. + */ + uint64_t start; + + /** + * The end timestamp for the graph execution, in ns. A value of 0 + * for both the start and end timestamps indicates that timestamp + * information could not be collected for the graph. + */ + uint64_t end; + + /** + * The ID of the device where the graph execution is occurring. + */ + uint32_t deviceId; + + /** + * The unique ID of the graph that is launched. + */ + uint32_t graphId; + + /** + * The ID of the context where the graph is being launched. + */ + uint32_t contextId; + + /** + * The ID of the stream where the graph is being launched. + */ + uint32_t streamId; + + /** + * This field is reserved for internal use + */ + void *reserved; +} CUpti_ActivityGraphTrace; + +/** + * \brief The activity record for a context. + * + * This activity record represents information about a context + * (CUPTI_ACTIVITY_KIND_CONTEXT). + * Context activity is now reported using CUpti_ActivityContext2 record + */ +typedef struct PACKED_ALIGNMENT { + /** + * The activity record kind, must be CUPTI_ACTIVITY_KIND_CONTEXT. + */ + CUpti_ActivityKind kind; + + /** + * The context ID. + */ + uint32_t contextId; + + /** + * The device ID. + */ + uint32_t deviceId; + + /** + * The compute API kind. \see CUpti_ActivityComputeApiKind + */ + uint16_t computeApiKind; + + /** + * The ID for the NULL stream in this context + */ + uint16_t nullStreamId; +} CUpti_ActivityContext; + +/** + * \brief The activity record for JIT operations. + * This activity represents the JIT operations (compile, load, store) of a CUmodule + * from the Compute Cache. + * Gives the exact hashed path of where the cached module is loaded from, + * or where the module will be stored after Just-In-Time (JIT) compilation. + * + * JIT activity is now reported using CUpti_ActivityJit2 record + */ +typedef struct PACKED_ALIGNMENT { + /** + * The activity record kind must be CUPTI_ACTIVITY_KIND_JIT. + */ + CUpti_ActivityKind kind; + + /** + * The JIT entry type. + */ + CUpti_ActivityJitEntryType jitEntryType; + + /** + * The JIT operation type. + */ + CUpti_ActivityJitOperationType jitOperationType; + + /** + * The device ID. + */ + uint32_t deviceId; + + /** + * The start timestamp for the JIT operation, in ns. A value of 0 for + * both the start and end timestamps indicates that timestamp + * information could not be collected for the JIT operation. + */ + uint64_t start; + + /** + * The end timestamp for the JIT operation, in ns. A value of 0 for both + * the start and end timestamps indicates that timestamp information + * could not be collected for the JIT operation. + */ + uint64_t end; + + /** + * The correlation ID of the JIT operation to which + * records belong to. Each JIT operation is + * assigned a unique correlation ID that is identical to the + * correlation ID in the driver or runtime API activity record that + * launched the JIT operation. + */ + uint32_t correlationId; + + /** + * Internal use. + */ + uint32_t padding; + + /** + * The correlation ID to correlate JIT compilation, load and store operations. + * Each JIT compilation unit is assigned a unique correlation ID + * at the time of the JIT compilation. This correlation id can be used + * to find the matching JIT cache load/store records. + */ + uint64_t jitOperationCorrelationId; + + /** + * The size of compute cache. + */ + uint64_t cacheSize; + + /** + * The path where the fat binary is cached. + */ + const char* cachePath; +} CUpti_ActivityJit; + + +#if defined(__GNUC__) && defined(CUPTI_LIB) + #pragma GCC visibility pop +#endif + +#if defined(__cplusplus) +} +#endif + +#endif /*_CUPTI_ACTIVITY_DEPRECATED_H_*/