diff --git "a/.venv/lib/python3.11/site-packages/nvidia/cuda_cupti/include/cupti_activity_deprecated.h" "b/.venv/lib/python3.11/site-packages/nvidia/cuda_cupti/include/cupti_activity_deprecated.h"
new file mode 100644--- /dev/null
+++ "b/.venv/lib/python3.11/site-packages/nvidia/cuda_cupti/include/cupti_activity_deprecated.h"
@@ -0,0 +1,4784 @@
+/*
+ * Copyright 2011-2023 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#if !defined(_CUPTI_ACTIVITY_DEPRECATED_H_)
+#define _CUPTI_ACTIVITY_DEPRECATED_H_
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+#if defined(__GNUC__) && defined(CUPTI_LIB)
+    #pragma GCC visibility push(default)
+#endif
+
+/**
+ * \brief The kinds of activity records.
+ *
+ * Each activity record kind represents information about a GPU or an
+ * activity occurring on a CPU or GPU. Each kind is associated with an
+ * activity record structure that holds the information associated
+ * with the kind.
+ * \see CUpti_ActivityOverhead
+ * \see CUpti_ActivityOverhead2
+ * \see CUpti_ActivityDevice
+ * \see CUpti_ActivityDevice2
+ * \see CUpti_ActivityDevice3
+ * \see CUpti_ActivityDevice4
+ * \see CUpti_ActivityKernel
+ * \see CUpti_ActivityKernel2
+ * \see CUpti_ActivityKernel3
+ * \see CUpti_ActivityKernel4
+ * \see CUpti_ActivityKernel5
+ * \see CUpti_ActivityKernel6
+ * \see CUpti_ActivityKernel7
+ * \see CUpti_ActivityKernel8
+ * \see CUpti_ActivityMemcpy
+ * \see CUpti_ActivityMemcpy3
+ * \see CUpti_ActivityMemcpy4
+ * \see CUpti_ActivityMemcpyPtoP
+ * \see CUpti_ActivityMemcpyPtoP2
+ * \see CUpti_ActivityMemcpyPtoP3
+ * \see CUpti_ActivityMemset
+ * \see CUpti_ActivityMemset2
+ * \see CUpti_ActivityMemset3
+ * \see CUpti_ActivityMemory2
+ * \see CUpti_ActivityMemoryPool
+ * \see CUpti_ActivityMarker
+ * \see CUpti_ActivityGlobalAccess
+ * \see CUpti_ActivityGlobalAccess2
+ * \see CUpti_ActivityBranch
+ * \see CUpti_ActivityPCSampling
+ * \see CUpti_ActivityPCSampling2
+ * \see CUpti_ActivityUnifiedMemoryCounter
+ * \see CUpti_ActivityNvLink
+ * \see CUpti_ActivityNvLink2
+ * \see CUpti_ActivityNvLink3
+ */
+
+/**
+ * \brief The activity record for CUPTI and driver overheads.
+ * (Deprecated in CUDA 12.2)
+ *
+ * This activity record provides CUPTI and driver overhead information
+ * (CUPTI_ACTIVITY_OVERHEAD). These records are now reported using
+ * CUpti_ActivityOverhead3.
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_OVERHEAD.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The kind of overhead: CUPTI, DRIVER, COMPILER, etc.
+   */
+  CUpti_ActivityOverheadKind overheadKind;
+
+  /**
+   * The kind of activity object that the overhead is associated with.
+   */
+  CUpti_ActivityObjectKind objectKind;
+
+  /**
+   * The identifier for the activity object. 'objectKind' indicates
+   * which ID is valid for this record.
+   */
+  CUpti_ActivityObjectKindId objectId;
+
+  /**
+   * The start timestamp for the overhead, in ns. A value of 0 for
+   * both the start and end timestamps indicates that timestamp
+   * information could not be collected for the overhead.
+   */
+  uint64_t start;
+
+  /**
+   * The end timestamp for the overhead, in ns. A value of 0 for both
+   * the start and end timestamps indicates that timestamp information
+   * could not be collected for the overhead.
+   */
+  uint64_t end;
+} CUpti_ActivityOverhead;
+
+/**
+ * \brief The activity record for CUPTI and driver overheads.
+ *
+ * This activity record provides CUPTI and driver overhead information
+ * (CUPTI_ACTIVITY_OVERHEAD). It carries the same leading fields as
+ * CUpti_ActivityOverhead, followed by a correlation ID and a reserved
+ * word.
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_OVERHEAD.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The kind of overhead: CUPTI, DRIVER, COMPILER, etc.
+   */
+  CUpti_ActivityOverheadKind overheadKind;
+
+  /**
+   * The kind of activity object that the overhead is associated with.
+   */
+  CUpti_ActivityObjectKind objectKind;
+
+  /**
+   * The identifier for the activity object. 'objectKind' indicates
+   * which ID is valid for this record.
+   */
+  CUpti_ActivityObjectKindId objectId;
+
+  /**
+   * The start timestamp for the overhead, in ns. A value of 0 for
+   * both the start and end timestamps indicates that timestamp
+   * information could not be collected for the overhead.
+   */
+  uint64_t start;
+
+  /**
+   * The end timestamp for the overhead, in ns. A value of 0 for both
+   * the start and end timestamps indicates that timestamp information
+   * could not be collected for the overhead.
+   */
+  uint64_t end;
+
+  /**
+   * The correlation ID of the overhead operation to which the
+   * records belong. This ID is identical to the correlation ID in
+   * the driver or runtime API activity record that launched the
+   * overhead operation.
+   * In some cases it can be zero, such as for
+   * CUPTI_ACTIVITY_OVERHEAD_CUPTI_BUFFER_FLUSH records.
+   */
+  uint32_t correlationId;
+
+  /**
+   * Reserved for internal use.
+   */
+  uint32_t reserved0;
+} CUpti_ActivityOverhead2;
+
+/**
+ * \brief The activity record for a device. (deprecated)
+ *
+ * This activity record represents information about a GPU device
+ * (CUPTI_ACTIVITY_KIND_DEVICE).
+ * Device activity is now reported using the
+ * CUpti_ActivityDevice5 activity record.
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_DEVICE.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The flags associated with the device. \see CUpti_ActivityFlag
+   */
+  CUpti_ActivityFlag flags;
+
+  /**
+   * The global memory bandwidth available on the device, in
+   * kBytes/sec.
+   */
+  uint64_t globalMemoryBandwidth;
+
+  /**
+   * The amount of global memory on the device, in bytes.
+   */
+  uint64_t globalMemorySize;
+
+  /**
+   * The amount of constant memory on the device, in bytes.
+   */
+  uint32_t constantMemorySize;
+
+  /**
+   * The size of the L2 cache on the device, in bytes.
+   */
+  uint32_t l2CacheSize;
+
+  /**
+   * The number of threads per warp on the device.
+   */
+  uint32_t numThreadsPerWarp;
+
+  /**
+   * The core clock rate of the device, in kHz.
+   */
+  uint32_t coreClockRate;
+
+  /**
+   * The number of memory copy engines on the device.
+   */
+  uint32_t numMemcpyEngines;
+
+  /**
+   * The number of multiprocessors on the device.
+   */
+  uint32_t numMultiprocessors;
+
+  /**
+   * The maximum "instructions per cycle" possible on each device
+   * multiprocessor.
+   */
+  uint32_t maxIPC;
+
+  /**
+   * The maximum number of warps that can be present on a
+   * multiprocessor at any given time.
+   */
+  uint32_t maxWarpsPerMultiprocessor;
+
+  /**
+   * The maximum number of blocks that can be present on a
+   * multiprocessor at any given time.
+   */
+  uint32_t maxBlocksPerMultiprocessor;
+
+  /**
+   * The maximum number of registers that can be allocated to a block.
+   */
+  uint32_t maxRegistersPerBlock;
+
+  /**
+   * The maximum amount of shared memory that can be assigned to a
+   * block, in bytes.
+   */
+  uint32_t maxSharedMemoryPerBlock;
+
+  /**
+   * The maximum number of threads allowed in a block.
+   */
+  uint32_t maxThreadsPerBlock;
+
+  /**
+   * The maximum allowed X dimension for a block.
+   */
+  uint32_t maxBlockDimX;
+
+  /**
+   * The maximum allowed Y dimension for a block.
+   */
+  uint32_t maxBlockDimY;
+
+  /**
+   * The maximum allowed Z dimension for a block.
+   */
+  uint32_t maxBlockDimZ;
+
+  /**
+   * The maximum allowed X dimension for a grid.
+   */
+  uint32_t maxGridDimX;
+
+  /**
+   * The maximum allowed Y dimension for a grid.
+   */
+  uint32_t maxGridDimY;
+
+  /**
+   * The maximum allowed Z dimension for a grid.
+   */
+  uint32_t maxGridDimZ;
+
+  /**
+   * Compute capability for the device, major number.
+   */
+  uint32_t computeCapabilityMajor;
+
+  /**
+   * Compute capability for the device, minor number.
+   */
+  uint32_t computeCapabilityMinor;
+
+  /**
+   * The device ID.
+   */
+  uint32_t id;
+
+#ifdef CUPTILP64
+  /**
+   * Undefined. Reserved for internal use.
+   * This member exists only when CUPTILP64 is defined.
+   * NOTE(review): presumably layout padding ahead of the 'name'
+   * pointer - confirm.
+   */
+  uint32_t pad;
+#endif
+
+  /**
+   * The device name. This name is shared across all activity records
+   * representing instances of the device, and so should not be
+   * modified.
+   */
+  const char *name;
+} CUpti_ActivityDevice;
+
+/**
+ * \brief The activity record for a device. (deprecated)
+ *
+ * This activity record represents information about a GPU device
+ * (CUPTI_ACTIVITY_KIND_DEVICE).
+ * Device activity is now reported using the
+ * CUpti_ActivityDevice5 activity record.
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_DEVICE.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The flags associated with the device. \see CUpti_ActivityFlag
+   */
+  CUpti_ActivityFlag flags;
+
+  /**
+   * The global memory bandwidth available on the device, in
+   * kBytes/sec.
+   */
+  uint64_t globalMemoryBandwidth;
+
+  /**
+   * The amount of global memory on the device, in bytes.
+   */
+  uint64_t globalMemorySize;
+
+  /**
+   * The amount of constant memory on the device, in bytes.
+   */
+  uint32_t constantMemorySize;
+
+  /**
+   * The size of the L2 cache on the device, in bytes.
+   */
+  uint32_t l2CacheSize;
+
+  /**
+   * The number of threads per warp on the device.
+   */
+  uint32_t numThreadsPerWarp;
+
+  /**
+   * The core clock rate of the device, in kHz.
+   */
+  uint32_t coreClockRate;
+
+  /**
+   * The number of memory copy engines on the device.
+   */
+  uint32_t numMemcpyEngines;
+
+  /**
+   * The number of multiprocessors on the device.
+   */
+  uint32_t numMultiprocessors;
+
+  /**
+   * The maximum "instructions per cycle" possible on each device
+   * multiprocessor.
+   */
+  uint32_t maxIPC;
+
+  /**
+   * The maximum number of warps that can be present on a
+   * multiprocessor at any given time.
+   */
+  uint32_t maxWarpsPerMultiprocessor;
+
+  /**
+   * The maximum number of blocks that can be present on a
+   * multiprocessor at any given time.
+   */
+  uint32_t maxBlocksPerMultiprocessor;
+
+  /**
+   * The maximum amount of shared memory available per multiprocessor,
+   * in bytes.
+   */
+  uint32_t maxSharedMemoryPerMultiprocessor;
+
+  /**
+   * The maximum number of 32-bit registers available per
+   * multiprocessor.
+   */
+  uint32_t maxRegistersPerMultiprocessor;
+
+  /**
+   * The maximum number of registers that can be allocated to a block.
+   */
+  uint32_t maxRegistersPerBlock;
+
+  /**
+   * The maximum amount of shared memory that can be assigned to a
+   * block, in bytes.
+   */
+  uint32_t maxSharedMemoryPerBlock;
+
+  /**
+   * The maximum number of threads allowed in a block.
+   */
+  uint32_t maxThreadsPerBlock;
+
+  /**
+   * The maximum allowed X dimension for a block.
+   */
+  uint32_t maxBlockDimX;
+
+  /**
+   * The maximum allowed Y dimension for a block.
+   */
+  uint32_t maxBlockDimY;
+
+  /**
+   * The maximum allowed Z dimension for a block.
+   */
+  uint32_t maxBlockDimZ;
+
+  /**
+   * The maximum allowed X dimension for a grid.
+   */
+  uint32_t maxGridDimX;
+
+  /**
+   * The maximum allowed Y dimension for a grid.
+   */
+  uint32_t maxGridDimY;
+
+  /**
+   * The maximum allowed Z dimension for a grid.
+   */
+  uint32_t maxGridDimZ;
+
+  /**
+   * Compute capability for the device, major number.
+   */
+  uint32_t computeCapabilityMajor;
+
+  /**
+   * Compute capability for the device, minor number.
+   */
+  uint32_t computeCapabilityMinor;
+
+  /**
+   * The device ID.
+   */
+  uint32_t id;
+
+  /**
+   * ECC enabled flag for the device.
+   */
+  uint32_t eccEnabled;
+
+  /**
+   * The device UUID. This value is the globally unique immutable
+   * alphanumeric identifier of the device.
+   */
+  CUuuid uuid;
+
+#ifndef CUPTILP64
+  /**
+   * Undefined. Reserved for internal use.
+   * This member exists only when CUPTILP64 is NOT defined (the
+   * opposite condition to the 'pad' member of CUpti_ActivityDevice).
+   */
+  uint32_t pad;
+#endif
+
+  /**
+   * The device name. This name is shared across all activity records
+   * representing instances of the device, and so should not be
+   * modified.
+   */
+  const char *name;
+} CUpti_ActivityDevice2;
+
+/**
+ * \brief The activity record for a device. (CUDA 7.0 onwards)
+ *
+ * This activity record represents information about a GPU device
+ * (CUPTI_ACTIVITY_KIND_DEVICE).
+ * Device activity is now reported using the
+ * CUpti_ActivityDevice5 activity record.
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_DEVICE.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The flags associated with the device. \see CUpti_ActivityFlag
+   */
+  CUpti_ActivityFlag flags;
+
+  /**
+   * The global memory bandwidth available on the device, in
+   * kBytes/sec.
+   */
+  uint64_t globalMemoryBandwidth;
+
+  /**
+   * The amount of global memory on the device, in bytes.
+   */
+  uint64_t globalMemorySize;
+
+  /**
+   * The amount of constant memory on the device, in bytes.
+   */
+  uint32_t constantMemorySize;
+
+  /**
+   * The size of the L2 cache on the device, in bytes.
+   */
+  uint32_t l2CacheSize;
+
+  /**
+   * The number of threads per warp on the device.
+   */
+  uint32_t numThreadsPerWarp;
+
+  /**
+   * The core clock rate of the device, in kHz.
+   */
+  uint32_t coreClockRate;
+
+  /**
+   * The number of memory copy engines on the device.
+   */
+  uint32_t numMemcpyEngines;
+
+  /**
+   * The number of multiprocessors on the device.
+   */
+  uint32_t numMultiprocessors;
+
+  /**
+   * The maximum "instructions per cycle" possible on each device
+   * multiprocessor.
+   */
+  uint32_t maxIPC;
+
+  /**
+   * The maximum number of warps that can be present on a
+   * multiprocessor at any given time.
+   */
+  uint32_t maxWarpsPerMultiprocessor;
+
+  /**
+   * The maximum number of blocks that can be present on a
+   * multiprocessor at any given time.
+   */
+  uint32_t maxBlocksPerMultiprocessor;
+
+  /**
+   * The maximum amount of shared memory available per multiprocessor,
+   * in bytes.
+   */
+  uint32_t maxSharedMemoryPerMultiprocessor;
+
+  /**
+   * The maximum number of 32-bit registers available per
+   * multiprocessor.
+   */
+  uint32_t maxRegistersPerMultiprocessor;
+
+  /**
+   * The maximum number of registers that can be allocated to a block.
+   */
+  uint32_t maxRegistersPerBlock;
+
+  /**
+   * The maximum amount of shared memory that can be assigned to a
+   * block, in bytes.
+   */
+  uint32_t maxSharedMemoryPerBlock;
+
+  /**
+   * The maximum number of threads allowed in a block.
+   */
+  uint32_t maxThreadsPerBlock;
+
+  /**
+   * The maximum allowed X dimension for a block.
+   */
+  uint32_t maxBlockDimX;
+
+  /**
+   * The maximum allowed Y dimension for a block.
+   */
+  uint32_t maxBlockDimY;
+
+  /**
+   * The maximum allowed Z dimension for a block.
+   */
+  uint32_t maxBlockDimZ;
+
+  /**
+   * The maximum allowed X dimension for a grid.
+   */
+  uint32_t maxGridDimX;
+
+  /**
+   * The maximum allowed Y dimension for a grid.
+   */
+  uint32_t maxGridDimY;
+
+  /**
+   * The maximum allowed Z dimension for a grid.
+   */
+  uint32_t maxGridDimZ;
+
+  /**
+   * Compute capability for the device, major number.
+   */
+  uint32_t computeCapabilityMajor;
+
+  /**
+   * Compute capability for the device, minor number.
+   */
+  uint32_t computeCapabilityMinor;
+
+  /**
+   * The device ID.
+   */
+  uint32_t id;
+
+  /**
+   * ECC enabled flag for the device.
+   */
+  uint32_t eccEnabled;
+
+  /**
+   * The device UUID. This value is the globally unique immutable
+   * alphanumeric identifier of the device.
+   */
+  CUuuid uuid;
+
+#ifndef CUPTILP64
+  /**
+   * Undefined. Reserved for internal use.
+   * This member exists only when CUPTILP64 is NOT defined.
+   */
+  uint32_t pad;
+#endif
+
+  /**
+   * The device name. This name is shared across all activity records
+   * representing instances of the device, and so should not be
+   * modified.
+   */
+  const char *name;
+
+  /**
+   * Flag to indicate whether the device is visible to CUDA. Users can
+   * set the device visibility using the CUDA_VISIBLE_DEVICES
+   * environment variable.
+   */
+  uint8_t isCudaVisible;
+
+  uint8_t reserved[7]; /**< Reserved bytes; together with isCudaVisible they occupy 8 bytes. */
+} CUpti_ActivityDevice3;
+
+/**
+ * \brief The activity record for a device. (CUDA 11.6 onwards)
+ *
+ * This activity record represents information about a GPU device
+ * (CUPTI_ACTIVITY_KIND_DEVICE).
+ * Device activity is now reported using the
+ * CUpti_ActivityDevice5 activity record.
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_DEVICE.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The flags associated with the device. \see CUpti_ActivityFlag
+   */
+  CUpti_ActivityFlag flags;
+
+  /**
+   * The global memory bandwidth available on the device, in
+   * kBytes/sec.
+   */
+  uint64_t globalMemoryBandwidth;
+
+  /**
+   * The amount of global memory on the device, in bytes.
+   */
+  uint64_t globalMemorySize;
+
+  /**
+   * The amount of constant memory on the device, in bytes.
+   */
+  uint32_t constantMemorySize;
+
+  /**
+   * The size of the L2 cache on the device, in bytes.
+   */
+  uint32_t l2CacheSize;
+
+  /**
+   * The number of threads per warp on the device.
+   */
+  uint32_t numThreadsPerWarp;
+
+  /**
+   * The core clock rate of the device, in kHz.
+   */
+  uint32_t coreClockRate;
+
+  /**
+   * The number of memory copy engines on the device.
+   */
+  uint32_t numMemcpyEngines;
+
+  /**
+   * The number of multiprocessors on the device.
+   */
+  uint32_t numMultiprocessors;
+
+  /**
+   * The maximum "instructions per cycle" possible on each device
+   * multiprocessor.
+   */
+  uint32_t maxIPC;
+
+  /**
+   * The maximum number of warps that can be present on a
+   * multiprocessor at any given time.
+   */
+  uint32_t maxWarpsPerMultiprocessor;
+
+  /**
+   * The maximum number of blocks that can be present on a
+   * multiprocessor at any given time.
+   */
+  uint32_t maxBlocksPerMultiprocessor;
+
+  /**
+   * The maximum amount of shared memory available per multiprocessor,
+   * in bytes.
+   */
+  uint32_t maxSharedMemoryPerMultiprocessor;
+
+  /**
+   * The maximum number of 32-bit registers available per
+   * multiprocessor.
+   */
+  uint32_t maxRegistersPerMultiprocessor;
+
+  /**
+   * The maximum number of registers that can be allocated to a block.
+   */
+  uint32_t maxRegistersPerBlock;
+
+  /**
+   * The maximum amount of shared memory that can be assigned to a
+   * block, in bytes.
+   */
+  uint32_t maxSharedMemoryPerBlock;
+
+  /**
+   * The maximum number of threads allowed in a block.
+   */
+  uint32_t maxThreadsPerBlock;
+
+  /**
+   * The maximum allowed X dimension for a block.
+   */
+  uint32_t maxBlockDimX;
+
+  /**
+   * The maximum allowed Y dimension for a block.
+   */
+  uint32_t maxBlockDimY;
+
+  /**
+   * The maximum allowed Z dimension for a block.
+   */
+  uint32_t maxBlockDimZ;
+
+  /**
+   * The maximum allowed X dimension for a grid.
+   */
+  uint32_t maxGridDimX;
+
+  /**
+   * The maximum allowed Y dimension for a grid.
+   */
+  uint32_t maxGridDimY;
+
+  /**
+   * The maximum allowed Z dimension for a grid.
+   */
+  uint32_t maxGridDimZ;
+
+  /**
+   * Compute capability for the device, major number.
+   */
+  uint32_t computeCapabilityMajor;
+
+  /**
+   * Compute capability for the device, minor number.
+   */
+  uint32_t computeCapabilityMinor;
+
+  /**
+   * The device ID.
+   */
+  uint32_t id;
+
+  /**
+   * ECC enabled flag for the device.
+   */
+  uint32_t eccEnabled;
+
+  /**
+   * The device UUID. This value is the globally unique immutable
+   * alphanumeric identifier of the device.
+   */
+  CUuuid uuid;
+
+#ifndef CUPTILP64
+  /**
+   * Undefined. Reserved for internal use.
+   * This member exists only when CUPTILP64 is NOT defined.
+   */
+  uint32_t pad;
+#endif
+
+  /**
+   * The device name. This name is shared across all activity records
+   * representing instances of the device, and so should not be
+   * modified.
+   */
+  const char *name;
+
+  /**
+   * Flag to indicate whether the device is visible to CUDA. Users can
+   * set the device visibility using the CUDA_VISIBLE_DEVICES
+   * environment variable.
+   */
+  uint8_t isCudaVisible;
+
+  /**
+   * MIG enabled flag for the device.
+   */
+  uint8_t isMigEnabled;
+
+  uint8_t reserved[6]; /**< Reserved bytes; together with isCudaVisible and isMigEnabled they occupy 8 bytes. */
+
+  /**
+   * GPU Instance ID for MIG-enabled devices.
+   * If MIG mode is disabled, the value is set to UINT32_MAX.
+   */
+  uint32_t gpuInstanceId;
+
+  /**
+   * Compute Instance ID for MIG-enabled devices.
+   * If MIG mode is disabled, the value is set to UINT32_MAX.
+   */
+  uint32_t computeInstanceId;
+
+  /**
+   * The MIG UUID. This value is the globally unique immutable
+   * alphanumeric identifier of the device.
+   */
+  CUuuid migUuid;
+
+} CUpti_ActivityDevice4;
+
+/**
+ * \brief The activity record for kernel. (deprecated)
+ *
+ * This activity record represents a kernel execution
+ * (CUPTI_ACTIVITY_KIND_KERNEL and
+ * CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL) but is no longer generated
+ * by CUPTI. Kernel activities are now reported using the
+ * CUpti_ActivityKernel9 activity record.
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_KERNEL
+   * or CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The cache configuration requested by the kernel. The value is one
+   * of the CUfunc_cache enumeration values from cuda.h.
+   */
+  uint8_t cacheConfigRequested;
+
+  /**
+   * The cache configuration used for the kernel. The value is one of
+   * the CUfunc_cache enumeration values from cuda.h.
+   */
+  uint8_t cacheConfigExecuted;
+
+  /**
+   * The number of registers required for each thread executing the
+   * kernel.
+   */
+  uint16_t registersPerThread;
+
+  /**
+   * The start timestamp for the kernel execution, in ns. A value of 0
+   * for both the start and end timestamps indicates that timestamp
+   * information could not be collected for the kernel.
+   */
+  uint64_t start;
+
+  /**
+   * The end timestamp for the kernel execution, in ns. A value of 0
+   * for both the start and end timestamps indicates that timestamp
+   * information could not be collected for the kernel.
+   */
+  uint64_t end;
+
+  /**
+   * The ID of the device where the kernel is executing.
+   */
+  uint32_t deviceId;
+
+  /**
+   * The ID of the context where the kernel is executing.
+   */
+  uint32_t contextId;
+
+  /**
+   * The ID of the stream where the kernel is executing.
+   */
+  uint32_t streamId;
+
+  /**
+   * The X-dimension grid size for the kernel.
+   */
+  int32_t gridX;
+
+  /**
+   * The Y-dimension grid size for the kernel.
+   */
+  int32_t gridY;
+
+  /**
+   * The Z-dimension grid size for the kernel.
+   */
+  int32_t gridZ;
+
+  /**
+   * The X-dimension block size for the kernel.
+   */
+  int32_t blockX;
+
+  /**
+   * The Y-dimension block size for the kernel.
+   */
+  int32_t blockY;
+
+  /**
+   * The Z-dimension block size for the kernel.
+   */
+  int32_t blockZ;
+
+  /**
+   * The static shared memory allocated for the kernel, in bytes.
+   */
+  int32_t staticSharedMemory;
+
+  /**
+   * The dynamic shared memory reserved for the kernel, in bytes.
+   */
+  int32_t dynamicSharedMemory;
+
+  /**
+   * The amount of local memory reserved for each thread, in bytes.
+   */
+  uint32_t localMemoryPerThread;
+
+  /**
+   * The total amount of local memory reserved for the kernel, in
+   * bytes.
+   */
+  uint32_t localMemoryTotal;
+
+  /**
+   * The correlation ID of the kernel. Each kernel execution is
+   * assigned a unique correlation ID that is identical to the
+   * correlation ID in the driver API activity record that launched
+   * the kernel.
+   */
+  uint32_t correlationId;
+
+  /**
+   * The runtime correlation ID of the kernel. Each kernel execution
+   * is assigned a unique runtime correlation ID that is identical to
+   * the correlation ID in the runtime API activity record that
+   * launched the kernel.
+   */
+  uint32_t runtimeCorrelationId;
+
+  /**
+   * Undefined. Reserved for internal use.
+   */
+  uint32_t pad;
+
+  /**
+   * The name of the kernel. This name is shared across all activity
+   * records representing the same kernel, and so should not be
+   * modified.
+   */
+  const char *name;
+
+  /**
+   * Undefined. Reserved for internal use.
+   */
+  void *reserved0;
+} CUpti_ActivityKernel;
+
+/**
+ * \brief The activity record for kernel. (deprecated)
+ *
+ * This activity record represents a kernel execution
+ * (CUPTI_ACTIVITY_KIND_KERNEL and
+ * CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL) but is no longer generated
+ * by CUPTI. Kernel activities are now reported using the
+ * CUpti_ActivityKernel9 activity record.
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_KERNEL or
+   * CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL.
+   */
+  CUpti_ActivityKind kind;
+
+  union {
+    uint8_t both; /**< The 'requested' and 'executed' bit-fields of 'config' viewed as a single byte. */
+    struct {
+      /**
+       * The cache configuration requested by the kernel. The value is one
+       * of the CUfunc_cache enumeration values from cuda.h.
+       * Stored in a 4-bit field.
+       */
+      uint8_t requested:4;
+
+      /**
+       * The cache configuration used for the kernel. The value is one of
+       * the CUfunc_cache enumeration values from cuda.h.
+       * Stored in a 4-bit field.
+       */
+      uint8_t executed:4;
+    } config;
+  } cacheConfig;
+
+  /**
+   * The shared memory configuration used for the kernel. The value is one of
+   * the CUsharedconfig enumeration values from cuda.h.
+   */
+  uint8_t sharedMemoryConfig;
+
+  /**
+   * The number of registers required for each thread executing the
+   * kernel.
+   */
+  uint16_t registersPerThread;
+
+  /**
+   * The start timestamp for the kernel execution, in ns. A value of 0
+   * for both the start and end timestamps indicates that timestamp
+   * information could not be collected for the kernel.
+   */
+  uint64_t start;
+
+  /**
+   * The end timestamp for the kernel execution, in ns. A value of 0
+   * for both the start and end timestamps indicates that timestamp
+   * information could not be collected for the kernel.
+   */
+  uint64_t end;
+
+  /**
+   * The completed timestamp for the kernel execution, in ns. It
+   * represents the completion of all its child kernels and the
+   * kernel itself. A value of CUPTI_TIMESTAMP_UNKNOWN indicates that
+   * the completion time is unknown.
+   */
+  uint64_t completed;
+
+  /**
+   * The ID of the device where the kernel is executing.
+   */
+  uint32_t deviceId;
+
+  /**
+   * The ID of the context where the kernel is executing.
+   */
+  uint32_t contextId;
+
+  /**
+   * The ID of the stream where the kernel is executing.
+   */
+  uint32_t streamId;
+
+  /**
+   * The X-dimension grid size for the kernel.
+   */
+  int32_t gridX;
+
+  /**
+   * The Y-dimension grid size for the kernel.
+   */
+  int32_t gridY;
+
+  /**
+   * The Z-dimension grid size for the kernel.
+   */
+  int32_t gridZ;
+
+  /**
+   * The X-dimension block size for the kernel.
+   */
+  int32_t blockX;
+
+  /**
+   * The Y-dimension block size for the kernel.
+   */
+  int32_t blockY;
+
+  /**
+   * The Z-dimension block size for the kernel.
+   */
+  int32_t blockZ;
+
+  /**
+   * The static shared memory allocated for the kernel, in bytes.
+   */
+  int32_t staticSharedMemory;
+
+  /**
+   * The dynamic shared memory reserved for the kernel, in bytes.
+   */
+  int32_t dynamicSharedMemory;
+
+  /**
+   * The amount of local memory reserved for each thread, in bytes.
+   */
+  uint32_t localMemoryPerThread;
+
+  /**
+   * The total amount of local memory reserved for the kernel, in
+   * bytes.
+   */
+  uint32_t localMemoryTotal;
+
+  /**
+   * The correlation ID of the kernel. Each kernel execution is
+   * assigned a unique correlation ID that is identical to the
+   * correlation ID in the driver or runtime API activity record that
+   * launched the kernel.
+   */
+  uint32_t correlationId;
+
+  /**
+   * The grid ID of the kernel. Each kernel is assigned a unique
+   * grid ID at runtime.
+   */
+  int64_t gridId;
+
+  /**
+   * The name of the kernel. This name is shared across all activity
+   * records representing the same kernel, and so should not be
+   * modified.
+   */
+  const char *name;
+
+  /**
+   * Undefined. Reserved for internal use.
+   */
+  void *reserved0;
+} CUpti_ActivityKernel2;
+
+/**
+ * \brief The activity record for a kernel (CUDA 6.5 (with sm_52 support) onwards).
+ * (deprecated in CUDA 9.0)
+ *
+ * This activity record represents a kernel execution
+ * (CUPTI_ACTIVITY_KIND_KERNEL and
+ * CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL).
+ * Kernel activities are now reported using the CUpti_ActivityKernel9 activity
+ * record.
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_KERNEL or
+   * CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL.
+   */
+  CUpti_ActivityKind kind;
+
+  union {
+    uint8_t both;
+    struct {
+      /**
+       * The cache configuration requested by the kernel. The value is one
+       * of the CUfunc_cache enumeration values from cuda.h.
+       */
+      uint8_t requested:4;
+
+      /**
+       * The cache configuration used for the kernel. The value is one of
+       * the CUfunc_cache enumeration values from cuda.h.
+       */
+      uint8_t executed:4;
+    } config;
+  } cacheConfig;
+
+  /**
+   * The shared memory configuration used for the kernel. The value is one of
+   * the CUsharedconfig enumeration values from cuda.h.
+   */
+  uint8_t sharedMemoryConfig;
+
+  /**
+   * The number of registers required for each thread executing the
+   * kernel.
+   */
+  uint16_t registersPerThread;
+
+  /**
+   * The partitioned global caching requested for the kernel. Partitioned
+   * global caching is required to enable caching on certain chips, such as
+   * devices with compute capability 5.2.
+   */
+  CUpti_ActivityPartitionedGlobalCacheConfig partitionedGlobalCacheRequested;
+
+  /**
+   * The partitioned global caching executed for the kernel. Partitioned
+   * global caching is required to enable caching on certain chips, such as
+   * devices with compute capability 5.2. Partitioned global caching can be
+   * automatically disabled if the occupancy requirement of the launch cannot
+   * support caching.
+   */
+  CUpti_ActivityPartitionedGlobalCacheConfig partitionedGlobalCacheExecuted;
+
+  /**
+   * The start timestamp for the kernel execution, in ns. A value of 0
+   * for both the start and end timestamps indicates that timestamp
+   * information could not be collected for the kernel.
+   */
+  uint64_t start;
+
+  /**
+   * The end timestamp for the kernel execution, in ns. A value of 0
+   * for both the start and end timestamps indicates that timestamp
+   * information could not be collected for the kernel.
+   */
+  uint64_t end;
+
+  /**
+   * The completed timestamp for the kernel execution, in ns. It
+   * represents the completion of all its child kernels and the
+   * kernel itself. A value of CUPTI_TIMESTAMP_UNKNOWN indicates that
+   * the completion time is unknown.
+   */
+  uint64_t completed;
+
+  /**
+   * The ID of the device where the kernel is executing.
+   */
+  uint32_t deviceId;
+
+  /**
+   * The ID of the context where the kernel is executing.
+   */
+  uint32_t contextId;
+
+  /**
+   * The ID of the stream where the kernel is executing.
+   */
+  uint32_t streamId;
+
+  /**
+   * The X-dimension grid size for the kernel.
+   */
+  int32_t gridX;
+
+  /**
+   * The Y-dimension grid size for the kernel.
+   */
+  int32_t gridY;
+
+  /**
+   * The Z-dimension grid size for the kernel.
+   */
+  int32_t gridZ;
+
+  /**
+   * The X-dimension block size for the kernel.
+   */
+  int32_t blockX;
+
+  /**
+   * The Y-dimension block size for the kernel.
+   */
+  int32_t blockY;
+
+  /**
+   * The Z-dimension block size for the kernel.
+   */
+  int32_t blockZ;
+
+  /**
+   * The static shared memory allocated for the kernel, in bytes.
+   */
+  int32_t staticSharedMemory;
+
+  /**
+   * The dynamic shared memory reserved for the kernel, in bytes.
+   */
+  int32_t dynamicSharedMemory;
+
+  /**
+   * The amount of local memory reserved for each thread, in bytes.
+   */
+  uint32_t localMemoryPerThread;
+
+  /**
+   * The total amount of local memory reserved for the kernel, in
+   * bytes.
+   */
+  uint32_t localMemoryTotal;
+
+  /**
+   * The correlation ID of the kernel. Each kernel execution is
+   * assigned a unique correlation ID that is identical to the
+   * correlation ID in the driver or runtime API activity record that
+   * launched the kernel.
+   */
+  uint32_t correlationId;
+
+  /**
+   * The grid ID of the kernel. Each kernel is assigned a unique
+   * grid ID at runtime.
+   */
+  int64_t gridId;
+
+  /**
+   * The name of the kernel. This name is shared across all activity
+   * records representing the same kernel, and so should not be
+   * modified.
+   */
+  const char *name;
+
+  /**
+   * Undefined. Reserved for internal use.
+   */
+  void *reserved0;
+} CUpti_ActivityKernel3;
+
+/**
+ * \brief The activity record for a kernel (CUDA 9.0 (with sm_70 support) onwards).
+ * (deprecated in CUDA 11.0)
+ *
+ * This activity record represents a kernel execution
+ * (CUPTI_ACTIVITY_KIND_KERNEL and
+ * CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL).
+ * Kernel activities are now reported using the CUpti_ActivityKernel9 activity
+ * record.
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_KERNEL or
+   * CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * For devices with compute capability 7.0+, cacheConfig values are not
+   * updated when the field isSharedMemoryCarveoutRequested is set.
+   */
+  union {
+    uint8_t both;
+    struct {
+      /**
+       * The cache configuration requested by the kernel. The value is one
+       * of the CUfunc_cache enumeration values from cuda.h.
+       */
+      uint8_t requested:4;
+
+      /**
+       * The cache configuration used for the kernel. The value is one of
+       * the CUfunc_cache enumeration values from cuda.h.
+       */
+      uint8_t executed:4;
+    } config;
+  } cacheConfig;
+
+  /**
+   * The shared memory configuration used for the kernel. The value is one of
+   * the CUsharedconfig enumeration values from cuda.h.
+   */
+  uint8_t sharedMemoryConfig;
+
+  /**
+   * The number of registers required for each thread executing the
+   * kernel.
+   */
+  uint16_t registersPerThread;
+
+  /**
+   * The partitioned global caching requested for the kernel. Partitioned
+   * global caching is required to enable caching on certain chips, such as
+   * devices with compute capability 5.2.
+   */
+  CUpti_ActivityPartitionedGlobalCacheConfig partitionedGlobalCacheRequested;
+
+  /**
+   * The partitioned global caching executed for the kernel. Partitioned
+   * global caching is required to enable caching on certain chips, such as
+   * devices with compute capability 5.2. Partitioned global caching can be
+   * automatically disabled if the occupancy requirement of the launch cannot
+   * support caching.
+   */
+  CUpti_ActivityPartitionedGlobalCacheConfig partitionedGlobalCacheExecuted;
+
+  /**
+   * The start timestamp for the kernel execution, in ns. A value of 0
+   * for both the start and end timestamps indicates that timestamp
+   * information could not be collected for the kernel.
+   */
+  uint64_t start;
+
+  /**
+   * The end timestamp for the kernel execution, in ns. A value of 0
+   * for both the start and end timestamps indicates that timestamp
+   * information could not be collected for the kernel.
+   */
+  uint64_t end;
+
+  /**
+   * The completed timestamp for the kernel execution, in ns. It
+   * represents the completion of all its child kernels and the
+   * kernel itself. A value of CUPTI_TIMESTAMP_UNKNOWN indicates that
+   * the completion time is unknown.
+   */
+  uint64_t completed;
+
+  /**
+   * The ID of the device where the kernel is executing.
+   */
+  uint32_t deviceId;
+
+  /**
+   * The ID of the context where the kernel is executing.
+   */
+  uint32_t contextId;
+
+  /**
+   * The ID of the stream where the kernel is executing.
+   */
+  uint32_t streamId;
+
+  /**
+   * The X-dimension grid size for the kernel.
+   */
+  int32_t gridX;
+
+  /**
+   * The Y-dimension grid size for the kernel.
+   */
+  int32_t gridY;
+
+  /**
+   * The Z-dimension grid size for the kernel.
+   */
+  int32_t gridZ;
+
+  /**
+   * The X-dimension block size for the kernel.
+   */
+  int32_t blockX;
+
+  /**
+   * The Y-dimension block size for the kernel.
+   */
+  int32_t blockY;
+
+  /**
+   * The Z-dimension block size for the kernel.
+   */
+  int32_t blockZ;
+
+  /**
+   * The static shared memory allocated for the kernel, in bytes.
+   */
+  int32_t staticSharedMemory;
+
+  /**
+   * The dynamic shared memory reserved for the kernel, in bytes.
+   */
+  int32_t dynamicSharedMemory;
+
+  /**
+   * The amount of local memory reserved for each thread, in bytes.
+   */
+  uint32_t localMemoryPerThread;
+
+  /**
+   * The total amount of local memory reserved for the kernel, in
+   * bytes.
+   */
+  uint32_t localMemoryTotal;
+
+  /**
+   * The correlation ID of the kernel. Each kernel execution is
+   * assigned a unique correlation ID that is identical to the
+   * correlation ID in the driver or runtime API activity record that
+   * launched the kernel.
+   */
+  uint32_t correlationId;
+
+  /**
+   * The grid ID of the kernel. Each kernel is assigned a unique
+   * grid ID at runtime.
+   */
+  int64_t gridId;
+
+  /**
+   * The name of the kernel. This name is shared across all activity
+   * records representing the same kernel, and so should not be
+   * modified.
+   */
+  const char *name;
+
+  /**
+   * Undefined. Reserved for internal use.
+   */
+  void *reserved0;
+
+  /**
+   * The timestamp when the kernel is queued up in the command buffer, in ns.
+   * A value of CUPTI_TIMESTAMP_UNKNOWN indicates that the queued time
+   * could not be collected for the kernel. This timestamp is not collected
+   * by default. Use API \ref cuptiActivityEnableLatencyTimestamps() to
+   * enable collection.
+   *
+   * Command buffer is a buffer written by CUDA driver to send commands
+   * like kernel launch, memory copy, etc. to the GPU. All launches of CUDA
+   * kernels are asynchronous with respect to the host: the host requests
+   * the launch by writing commands into the command buffer, then returns
+   * without checking the GPU's progress.
+   */
+  uint64_t queued;
+
+  /**
+   * The timestamp when the command buffer containing the kernel launch
+   * is submitted to the GPU, in ns. A value of CUPTI_TIMESTAMP_UNKNOWN
+   * indicates that the submitted time could not be collected for the kernel.
+   * This timestamp is not collected by default. Use API \ref
+   * cuptiActivityEnableLatencyTimestamps() to enable collection.
+   */
+  uint64_t submitted;
+
+  /**
+   * This indicates if the kernel was executed via a regular launch or via a
+   * single/multi device cooperative launch. \see CUpti_ActivityLaunchType
+   */
+  uint8_t launchType;
+
+  /**
+   * This indicates if CU_FUNC_ATTRIBUTE_PREFERRED_SHARED_MEMORY_CARVEOUT was
+   * updated for the kernel launch.
+   */
+  uint8_t isSharedMemoryCarveoutRequested;
+
+  /**
+   * Shared memory carveout value requested for the function in percentage of
+   * the total resource. The value will be updated only if field
+   * isSharedMemoryCarveoutRequested is set.
+   */
+  uint8_t sharedMemoryCarveoutRequested;
+
+  /**
+   * Undefined. Reserved for internal use.
+   */
+  uint8_t padding;
+
+ /**
+  * Shared memory size set by the driver.
+  */
+  uint32_t sharedMemoryExecuted;
+} CUpti_ActivityKernel4;
+
+/**
+ * \brief The activity record for a kernel (CUDA 11.0 (with sm_80 support) onwards).
+ * (deprecated in CUDA 11.2)
+ * This activity record represents a kernel execution
+ * (CUPTI_ACTIVITY_KIND_KERNEL and
+ * CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL) but is no longer generated
+ * by CUPTI. Kernel activities are now reported using the
+ * CUpti_ActivityKernel9 activity record.
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_KERNEL or
+   * CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * For devices with compute capability 7.0+, cacheConfig values are not
+   * updated when the field isSharedMemoryCarveoutRequested is set.
+   */
+  union {
+    uint8_t both;
+    struct {
+      /**
+       * The cache configuration requested by the kernel. The value is one
+       * of the CUfunc_cache enumeration values from cuda.h.
+       */
+      uint8_t requested:4;
+
+      /**
+       * The cache configuration used for the kernel. The value is one of
+       * the CUfunc_cache enumeration values from cuda.h.
+       */
+      uint8_t executed:4;
+    } config;
+  } cacheConfig;
+
+  /**
+   * The shared memory configuration used for the kernel. The value is one of
+   * the CUsharedconfig enumeration values from cuda.h.
+   */
+  uint8_t sharedMemoryConfig;
+
+  /**
+   * The number of registers required for each thread executing the
+   * kernel.
+   */
+  uint16_t registersPerThread;
+
+  /**
+   * The partitioned global caching requested for the kernel. Partitioned
+   * global caching is required to enable caching on certain chips, such as
+   * devices with compute capability 5.2.
+   */
+  CUpti_ActivityPartitionedGlobalCacheConfig partitionedGlobalCacheRequested;
+
+  /**
+   * The partitioned global caching executed for the kernel. Partitioned
+   * global caching is required to enable caching on certain chips, such as
+   * devices with compute capability 5.2. Partitioned global caching can be
+   * automatically disabled if the occupancy requirement of the launch cannot
+   * support caching.
+   */
+  CUpti_ActivityPartitionedGlobalCacheConfig partitionedGlobalCacheExecuted;
+
+  /**
+   * The start timestamp for the kernel execution, in ns. A value of 0
+   * for both the start and end timestamps indicates that timestamp
+   * information could not be collected for the kernel.
+   */
+  uint64_t start;
+
+  /**
+   * The end timestamp for the kernel execution, in ns. A value of 0
+   * for both the start and end timestamps indicates that timestamp
+   * information could not be collected for the kernel.
+   */
+  uint64_t end;
+
+  /**
+   * The completed timestamp for the kernel execution, in ns. It
+   * represents the completion of all its child kernels and the
+   * kernel itself. A value of CUPTI_TIMESTAMP_UNKNOWN indicates that
+   * the completion time is unknown.
+   */
+  uint64_t completed;
+
+  /**
+   * The ID of the device where the kernel is executing.
+   */
+  uint32_t deviceId;
+
+  /**
+   * The ID of the context where the kernel is executing.
+   */
+  uint32_t contextId;
+
+  /**
+   * The ID of the stream where the kernel is executing.
+   */
+  uint32_t streamId;
+
+  /**
+   * The X-dimension grid size for the kernel.
+   */
+  int32_t gridX;
+
+  /**
+   * The Y-dimension grid size for the kernel.
+   */
+  int32_t gridY;
+
+  /**
+   * The Z-dimension grid size for the kernel.
+   */
+  int32_t gridZ;
+
+  /**
+   * The X-dimension block size for the kernel.
+   */
+  int32_t blockX;
+
+  /**
+   * The Y-dimension block size for the kernel.
+   */
+  int32_t blockY;
+
+  /**
+   * The Z-dimension block size for the kernel.
+   */
+  int32_t blockZ;
+
+  /**
+   * The static shared memory allocated for the kernel, in bytes.
+   */
+  int32_t staticSharedMemory;
+
+  /**
+   * The dynamic shared memory reserved for the kernel, in bytes.
+   */
+  int32_t dynamicSharedMemory;
+
+  /**
+   * The amount of local memory reserved for each thread, in bytes.
+   */
+  uint32_t localMemoryPerThread;
+
+  /**
+   * The total amount of local memory reserved for the kernel, in
+   * bytes.
+   */
+  uint32_t localMemoryTotal;
+
+  /**
+   * The correlation ID of the kernel. Each kernel execution is
+   * assigned a unique correlation ID that is identical to the
+   * correlation ID in the driver or runtime API activity record that
+   * launched the kernel.
+   */
+  uint32_t correlationId;
+
+  /**
+   * The grid ID of the kernel. Each kernel is assigned a unique
+   * grid ID at runtime.
+   */
+  int64_t gridId;
+
+  /**
+   * The name of the kernel. This name is shared across all activity
+   * records representing the same kernel, and so should not be
+   * modified.
+   */
+  const char *name;
+
+  /**
+   * Undefined. Reserved for internal use.
+   */
+  void *reserved0;
+
+  /**
+   * The timestamp when the kernel is queued up in the command buffer, in ns.
+   * A value of CUPTI_TIMESTAMP_UNKNOWN indicates that the queued time
+   * could not be collected for the kernel. This timestamp is not collected
+   * by default. Use API \ref cuptiActivityEnableLatencyTimestamps() to
+   * enable collection.
+   *
+   * Command buffer is a buffer written by CUDA driver to send commands
+   * like kernel launch, memory copy, etc. to the GPU. All launches of CUDA
+   * kernels are asynchronous with respect to the host: the host requests
+   * the launch by writing commands into the command buffer, then returns
+   * without checking the GPU's progress.
+   */
+  uint64_t queued;
+
+  /**
+   * The timestamp when the command buffer containing the kernel launch
+   * is submitted to the GPU, in ns. A value of CUPTI_TIMESTAMP_UNKNOWN
+   * indicates that the submitted time could not be collected for the kernel.
+   * This timestamp is not collected by default. Use API \ref
+   * cuptiActivityEnableLatencyTimestamps() to enable collection.
+   */
+  uint64_t submitted;
+
+  /**
+   * This indicates if the kernel was executed via a regular launch or via a
+   * single/multi device cooperative launch. \see CUpti_ActivityLaunchType
+   */
+  uint8_t launchType;
+
+  /**
+   * This indicates if CU_FUNC_ATTRIBUTE_PREFERRED_SHARED_MEMORY_CARVEOUT was
+   * updated for the kernel launch.
+   */
+  uint8_t isSharedMemoryCarveoutRequested;
+
+  /**
+   * Shared memory carveout value requested for the function in percentage of
+   * the total resource. The value will be updated only if field
+   * isSharedMemoryCarveoutRequested is set.
+   */
+  uint8_t sharedMemoryCarveoutRequested;
+
+  /**
+   * Undefined. Reserved for internal use.
+   */
+  uint8_t padding;
+
+ /**
+  * Shared memory size set by the driver.
+  */
+  uint32_t sharedMemoryExecuted;
+
+  /**
+   * The unique ID of the graph node that launched this kernel through graph launch APIs.
+   * This field will be 0 if the kernel is not launched through graph launch APIs.
+   */
+  uint64_t graphNodeId;
+
+  /**
+   * The shared memory limit config for the kernel. This field shows whether the user has
+   * opted for a higher per-block limit of dynamic shared memory.
+   */
+  CUpti_FuncShmemLimitConfig shmemLimitConfig;
+
+  /**
+   * The unique ID of the graph that launched this kernel through graph launch APIs.
+   * This field will be 0 if the kernel is not launched through graph launch APIs.
+   */
+  uint32_t graphId;
+} CUpti_ActivityKernel5;
+
+/**
+ * \brief The activity record for a kernel. (deprecated in CUDA 11.6)
+ *
+ * This activity record represents a kernel execution
+ * (CUPTI_ACTIVITY_KIND_KERNEL and
+ * CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL) but is no longer generated
+ * by CUPTI. Kernel activities are now reported using the
+ * CUpti_ActivityKernel9 activity record.
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_KERNEL or
+   * CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * For devices with compute capability 7.0+, cacheConfig values are not
+   * updated when the field isSharedMemoryCarveoutRequested is set.
+   */
+  union {
+    uint8_t both;
+    struct {
+      /**
+       * The cache configuration requested by the kernel. The value is one
+       * of the CUfunc_cache enumeration values from cuda.h.
+       */
+      uint8_t requested:4;
+
+      /**
+       * The cache configuration used for the kernel. The value is one of
+       * the CUfunc_cache enumeration values from cuda.h.
+       */
+      uint8_t executed:4;
+    } config;
+  } cacheConfig;
+
+  /**
+   * The shared memory configuration used for the kernel. The value is one of
+   * the CUsharedconfig enumeration values from cuda.h.
+   */
+  uint8_t sharedMemoryConfig;
+
+  /**
+   * The number of registers required for each thread executing the
+   * kernel.
+   */
+  uint16_t registersPerThread;
+
+  /**
+   * The partitioned global caching requested for the kernel. Partitioned
+   * global caching is required to enable caching on certain chips, such as
+   * devices with compute capability 5.2.
+   */
+  CUpti_ActivityPartitionedGlobalCacheConfig partitionedGlobalCacheRequested;
+
+  /**
+   * The partitioned global caching executed for the kernel. Partitioned
+   * global caching is required to enable caching on certain chips, such as
+   * devices with compute capability 5.2. Partitioned global caching can be
+   * automatically disabled if the occupancy requirement of the launch cannot
+   * support caching.
+   */
+  CUpti_ActivityPartitionedGlobalCacheConfig partitionedGlobalCacheExecuted;
+
+  /**
+   * The start timestamp for the kernel execution, in ns. A value of 0
+   * for both the start and end timestamps indicates that timestamp
+   * information could not be collected for the kernel.
+   */
+  uint64_t start;
+
+  /**
+   * The end timestamp for the kernel execution, in ns. A value of 0
+   * for both the start and end timestamps indicates that timestamp
+   * information could not be collected for the kernel.
+   */
+  uint64_t end;
+
+  /**
+   * The completed timestamp for the kernel execution, in ns. It
+   * represents the completion of all its child kernels and the
+   * kernel itself. A value of CUPTI_TIMESTAMP_UNKNOWN indicates that
+   * the completion time is unknown.
+   */
+  uint64_t completed;
+
+  /**
+   * The ID of the device where the kernel is executing.
+   */
+  uint32_t deviceId;
+
+  /**
+   * The ID of the context where the kernel is executing.
+   */
+  uint32_t contextId;
+
+  /**
+   * The ID of the stream where the kernel is executing.
+   */
+  uint32_t streamId;
+
+  /**
+   * The X-dimension grid size for the kernel.
+   */
+  int32_t gridX;
+
+  /**
+   * The Y-dimension grid size for the kernel.
+   */
+  int32_t gridY;
+
+  /**
+   * The Z-dimension grid size for the kernel.
+   */
+  int32_t gridZ;
+
+  /**
+   * The X-dimension block size for the kernel.
+   */
+  int32_t blockX;
+
+  /**
+   * The Y-dimension block size for the kernel.
+   */
+  int32_t blockY;
+
+  /**
+   * The Z-dimension block size for the kernel.
+   */
+  int32_t blockZ;
+
+  /**
+   * The static shared memory allocated for the kernel, in bytes.
+   */
+  int32_t staticSharedMemory;
+
+  /**
+   * The dynamic shared memory reserved for the kernel, in bytes.
+   */
+  int32_t dynamicSharedMemory;
+
+  /**
+   * The amount of local memory reserved for each thread, in bytes.
+   */
+  uint32_t localMemoryPerThread;
+
+  /**
+   * The total amount of local memory reserved for the kernel, in
+   * bytes.
+   */
+  uint32_t localMemoryTotal;
+
+  /**
+   * The correlation ID of the kernel. Each kernel execution is
+   * assigned a unique correlation ID that is identical to the
+   * correlation ID in the driver or runtime API activity record that
+   * launched the kernel.
+   */
+  uint32_t correlationId;
+
+  /**
+   * The grid ID of the kernel. Each kernel is assigned a unique
+   * grid ID at runtime.
+   */
+  int64_t gridId;
+
+  /**
+   * The name of the kernel. This name is shared across all activity
+   * records representing the same kernel, and so should not be
+   * modified.
+   */
+  const char *name;
+
+  /**
+   * Undefined. Reserved for internal use.
+   */
+  void *reserved0;
+
+  /**
+   * The timestamp when the kernel is queued up in the command buffer, in ns.
+   * A value of CUPTI_TIMESTAMP_UNKNOWN indicates that the queued time
+   * could not be collected for the kernel. This timestamp is not collected
+   * by default. Use API \ref cuptiActivityEnableLatencyTimestamps() to
+   * enable collection.
+   *
+   * Command buffer is a buffer written by CUDA driver to send commands
+   * like kernel launch, memory copy, etc. to the GPU. All launches of CUDA
+   * kernels are asynchronous with respect to the host: the host requests
+   * the launch by writing commands into the command buffer, then returns
+   * without checking the GPU's progress.
+   */
+  uint64_t queued;
+
+  /**
+   * The timestamp when the command buffer containing the kernel launch
+   * is submitted to the GPU, in ns. A value of CUPTI_TIMESTAMP_UNKNOWN
+   * indicates that the submitted time could not be collected for the kernel.
+   * This timestamp is not collected by default. Use API \ref
+   * cuptiActivityEnableLatencyTimestamps() to enable collection.
+   */
+  uint64_t submitted;
+
+  /**
+   * This indicates if the kernel was executed via a regular launch or via a
+   * single/multi device cooperative launch. \see CUpti_ActivityLaunchType
+   */
+  uint8_t launchType;
+
+  /**
+   * This indicates if CU_FUNC_ATTRIBUTE_PREFERRED_SHARED_MEMORY_CARVEOUT was
+   * updated for the kernel launch.
+   */
+  uint8_t isSharedMemoryCarveoutRequested;
+
+  /**
+   * Shared memory carveout value requested for the function in percentage of
+   * the total resource. The value will be updated only if field
+   * isSharedMemoryCarveoutRequested is set.
+   */
+  uint8_t sharedMemoryCarveoutRequested;
+
+  /**
+   * Undefined. Reserved for internal use.
+   */
+  uint8_t padding;
+
+ /**
+  * Shared memory size set by the driver.
+  */
+  uint32_t sharedMemoryExecuted;
+
+  /**
+   * The unique ID of the graph node that launched this kernel through graph launch APIs.
+   * This field will be 0 if the kernel is not launched through graph launch APIs.
+   */
+  uint64_t graphNodeId;
+
+  /**
+   * The shared memory limit config for the kernel. This field shows whether the user has
+   * opted for a higher per-block limit of dynamic shared memory.
+   */
+  CUpti_FuncShmemLimitConfig shmemLimitConfig;
+
+  /**
+   * The unique ID of the graph that launched this kernel through graph launch APIs.
+   * This field will be 0 if the kernel is not launched through graph launch APIs.
+   */
+  uint32_t graphId;
+
+  /**
+   * The pointer to the access policy window. The structure CUaccessPolicyWindow is
+   * defined in cuda.h.
+   */
+  CUaccessPolicyWindow *pAccessPolicyWindow;
+} CUpti_ActivityKernel6;
+
+/**
+ * \brief The activity record for a kernel. (deprecated in CUDA 11.8)
+ *
+ * This activity record represents a kernel execution
+ * (CUPTI_ACTIVITY_KIND_KERNEL and
+ * CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL) but is no longer generated
+ * by CUPTI. Kernel activities are now reported using the
+ * CUpti_ActivityKernel9 activity record.
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_KERNEL or
+   * CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * For devices with compute capability 7.0+, cacheConfig values are not
+   * updated when the field isSharedMemoryCarveoutRequested is set.
+   */
+  union {
+    uint8_t both;
+    struct {
+      /**
+       * The cache configuration requested by the kernel. The value is one
+       * of the CUfunc_cache enumeration values from cuda.h.
+       */
+      uint8_t requested:4;
+
+      /**
+       * The cache configuration used for the kernel. The value is one of
+       * the CUfunc_cache enumeration values from cuda.h.
+       */
+      uint8_t executed:4;
+    } config;
+  } cacheConfig;
+
+  /**
+   * The shared memory configuration used for the kernel. The value is one of
+   * the CUsharedconfig enumeration values from cuda.h.
+   */
+  uint8_t sharedMemoryConfig;
+
+  /**
+   * The number of registers required for each thread executing the
+   * kernel.
+   */
+  uint16_t registersPerThread;
+
+  /**
+   * The partitioned global caching requested for the kernel. Partitioned
+   * global caching is required to enable caching on certain chips, such as
+   * devices with compute capability 5.2.
+   */
+  CUpti_ActivityPartitionedGlobalCacheConfig partitionedGlobalCacheRequested;
+
+  /**
+   * The partitioned global caching executed for the kernel. Partitioned
+   * global caching is required to enable caching on certain chips, such as
+   * devices with compute capability 5.2. Partitioned global caching can be
+   * automatically disabled if the occupancy requirement of the launch cannot
+   * support caching.
+   */
+  CUpti_ActivityPartitionedGlobalCacheConfig partitionedGlobalCacheExecuted;
+
+  /**
+   * The start timestamp for the kernel execution, in ns. A value of 0
+   * for both the start and end timestamps indicates that timestamp
+   * information could not be collected for the kernel.
+   */
+  uint64_t start;
+
+  /**
+   * The end timestamp for the kernel execution, in ns. A value of 0
+   * for both the start and end timestamps indicates that timestamp
+   * information could not be collected for the kernel.
+   */
+  uint64_t end;
+
+  /**
+   * The completed timestamp for the kernel execution, in ns. It
+   * represents the completion of all its child kernels and the
+   * kernel itself. A value of CUPTI_TIMESTAMP_UNKNOWN indicates that
+   * the completion time is unknown.
+   */
+  uint64_t completed;
+
+  /**
+   * The ID of the device where the kernel is executing.
+   */
+  uint32_t deviceId;
+
+  /**
+   * The ID of the context where the kernel is executing.
+   */
+  uint32_t contextId;
+
+  /**
+   * The ID of the stream where the kernel is executing.
+   */
+  uint32_t streamId;
+
+  /**
+   * The X-dimension grid size for the kernel.
+   */
+  int32_t gridX;
+
+  /**
+   * The Y-dimension grid size for the kernel.
+   */
+  int32_t gridY;
+
+  /**
+   * The Z-dimension grid size for the kernel.
+   */
+  int32_t gridZ;
+
+  /**
+   * The X-dimension block size for the kernel.
+   */
+  int32_t blockX;
+
+  /**
+   * The Y-dimension block size for the kernel.
+   */
+  int32_t blockY;
+
+  /**
+   * The Z-dimension block size for the kernel.
+   */
+  int32_t blockZ;
+
+  /**
+   * The static shared memory allocated for the kernel, in bytes.
+   */
+  int32_t staticSharedMemory;
+
+  /**
+   * The dynamic shared memory reserved for the kernel, in bytes.
+   */
+  int32_t dynamicSharedMemory;
+
+  /**
+   * The amount of local memory reserved for each thread, in bytes.
+   */
+  uint32_t localMemoryPerThread;
+
+  /**
+   * The total amount of local memory reserved for the kernel, in
+   * bytes.
+   */
+  uint32_t localMemoryTotal;
+
+  /**
+   * The correlation ID of the kernel. Each kernel execution is
+   * assigned a unique correlation ID that is identical to the
+   * correlation ID in the driver or runtime API activity record that
+   * launched the kernel.
+   */
+  uint32_t correlationId;
+
+  /**
+   * The grid ID of the kernel. Each kernel is assigned a unique
+   * grid ID at runtime.
+   */
+  int64_t gridId;
+
+  /**
+   * The name of the kernel. This name is shared across all activity
+   * records representing the same kernel, and so should not be
+   * modified.
+   */
+  const char *name;
+
+  /**
+   * Undefined. Reserved for internal use.
+   */
+  void *reserved0;
+
+  /**
+   * The timestamp when the kernel is queued up in the command buffer, in ns.
+   * A value of CUPTI_TIMESTAMP_UNKNOWN indicates that the queued time
+   * could not be collected for the kernel. This timestamp is not collected
+   * by default. Use API \ref cuptiActivityEnableLatencyTimestamps() to
+   * enable collection.
+   *
+   * Command buffer is a buffer written by CUDA driver to send commands
+   * like kernel launch, memory copy, etc. to the GPU. All launches of CUDA
+   * kernels are asynchronous with respect to the host: the host requests
+   * the launch by writing commands into the command buffer, then returns
+   * without checking the GPU's progress.
+   */
+  uint64_t queued;
+
+  /**
+   * The timestamp when the command buffer containing the kernel launch
+   * is submitted to the GPU, in ns. A value of CUPTI_TIMESTAMP_UNKNOWN
+   * indicates that the submitted time could not be collected for the kernel.
+   * This timestamp is not collected by default. Use API \ref
+   * cuptiActivityEnableLatencyTimestamps() to enable collection.
+   */
+  uint64_t submitted;
+
+  /**
+   * This indicates if the kernel was executed via a regular launch or via a
+   * single/multi device cooperative launch. \see CUpti_ActivityLaunchType
+   */
+  uint8_t launchType;
+
+  /**
+   * This indicates if CU_FUNC_ATTRIBUTE_PREFERRED_SHARED_MEMORY_CARVEOUT was
+   * updated for the kernel launch.
+   */
+  uint8_t isSharedMemoryCarveoutRequested;
+
+  /**
+   * Shared memory carveout value requested for the function in percentage of
+   * the total resource. The value will be updated only if field
+   * isSharedMemoryCarveoutRequested is set.
+   */
+  uint8_t sharedMemoryCarveoutRequested;
+
+  /**
+   * Undefined. Reserved for internal use.
+   */
+  uint8_t padding;
+
+ /**
+  * Shared memory size set by the driver.
+  */
+  uint32_t sharedMemoryExecuted;
+
+  /**
+   * The unique ID of the graph node that launched this kernel through graph launch APIs.
+   * This field will be 0 if the kernel is not launched through graph launch APIs.
+   */
+  uint64_t graphNodeId;
+
+  /**
+   * The shared memory limit config for the kernel. This field shows whether the user has
+   * opted for a higher per-block limit of dynamic shared memory.
+   */
+  CUpti_FuncShmemLimitConfig shmemLimitConfig;
+
+  /**
+   * The unique ID of the graph that launched this kernel through graph launch APIs.
+   * This field will be 0 if the kernel is not launched through graph launch APIs.
+   */
+  uint32_t graphId;
+
+  /**
+   * The pointer to the access policy window. The structure CUaccessPolicyWindow is
+   * defined in cuda.h.
+   */
+  CUaccessPolicyWindow *pAccessPolicyWindow;
+
+  /**
+   * The ID of the HW channel on which the kernel is launched.
+   */
+  uint32_t channelID;
+
+  /**
+   * The type of the channel. \see CUpti_ChannelType
+   */
+  CUpti_ChannelType channelType;
+} CUpti_ActivityKernel7;
+
+/**
+ * \brief The activity record for kernel.
+ *
+ * This activity record represents a kernel execution
+ * (CUPTI_ACTIVITY_KIND_KERNEL and
+ * CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL).
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_KERNEL or
+   * CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * For devices with compute capability 7.0+, cacheConfig values are not
+   * updated if field isSharedMemoryCarveoutRequested is set.
+   */
+  union {
+    uint8_t both;
+    struct {
+      /**
+       * The cache configuration requested by the kernel. The value is one
+       * of the CUfunc_cache enumeration values from cuda.h.
+       */
+      uint8_t requested:4;
+
+      /**
+       * The cache configuration used for the kernel. The value is one of
+       * the CUfunc_cache enumeration values from cuda.h.
+       */
+      uint8_t executed:4;
+    } config;
+  } cacheConfig;
+
+  /**
+   * The shared memory configuration used for the kernel. The value is one of
+   * the CUsharedconfig enumeration values from cuda.h.
+   */
+  uint8_t sharedMemoryConfig;
+
+  /**
+   * The number of registers required for each thread executing the
+   * kernel.
+   */
+  uint16_t registersPerThread;
+
+  /**
+   * The partitioned global caching requested for the kernel. Partitioned
+   * global caching is required to enable caching on certain chips, such as
+   * devices with compute capability 5.2.
+   */
+  CUpti_ActivityPartitionedGlobalCacheConfig partitionedGlobalCacheRequested;
+
+  /**
+   * The partitioned global caching executed for the kernel. Partitioned
+   * global caching is required to enable caching on certain chips, such as
+   * devices with compute capability 5.2. Partitioned global caching can be
+   * automatically disabled if the occupancy requirement of the launch cannot
+   * support caching.
+   */
+  CUpti_ActivityPartitionedGlobalCacheConfig partitionedGlobalCacheExecuted;
+
+  /**
+   * The start timestamp for the kernel execution, in ns. A value of 0
+   * for both the start and end timestamps indicates that timestamp
+   * information could not be collected for the kernel.
+   */
+  uint64_t start;
+
+  /**
+   * The end timestamp for the kernel execution, in ns. A value of 0
+   * for both the start and end timestamps indicates that timestamp
+   * information could not be collected for the kernel.
+   */
+  uint64_t end;
+
+  /**
+   * The completed timestamp for the kernel execution, in ns. It
+   * represents the completion of all its child kernels and the
+   * kernel itself. A value of CUPTI_TIMESTAMP_UNKNOWN indicates that
+   * the completion time is unknown.
+   */
+  uint64_t completed;
+
+  /**
+   * The ID of the device where the kernel is executing.
+   */
+  uint32_t deviceId;
+
+  /**
+   * The ID of the context where the kernel is executing.
+   */
+  uint32_t contextId;
+
+  /**
+   * The ID of the stream where the kernel is executing.
+   */
+  uint32_t streamId;
+
+  /**
+   * The X-dimension grid size for the kernel.
+   */
+  int32_t gridX;
+
+  /**
+   * The Y-dimension grid size for the kernel.
+   */
+  int32_t gridY;
+
+  /**
+   * The Z-dimension grid size for the kernel.
+   */
+  int32_t gridZ;
+
+  /**
+   * The X-dimension block size for the kernel.
+   */
+  int32_t blockX;
+
+  /**
+   * The Y-dimension block size for the kernel.
+   */
+  int32_t blockY;
+
+  /**
+   * The Z-dimension block size for the kernel.
+   */
+  int32_t blockZ;
+
+  /**
+   * The static shared memory allocated for the kernel, in bytes.
+   */
+  int32_t staticSharedMemory;
+
+  /**
+   * The dynamic shared memory reserved for the kernel, in bytes.
+   */
+  int32_t dynamicSharedMemory;
+
+  /**
+   * The amount of local memory reserved for each thread, in bytes.
+   */
+  uint32_t localMemoryPerThread;
+
+  /**
+   * The total amount of local memory reserved for the kernel, in
+   * bytes (deprecated in CUDA 11.8).
+   * Refer to field localMemoryTotal_v2.
+   */
+  uint32_t localMemoryTotal;
+
+  /**
+   * The correlation ID of the kernel. Each kernel execution is
+   * assigned a unique correlation ID that is identical to the
+   * correlation ID in the driver or runtime API activity record that
+   * launched the kernel.
+   */
+  uint32_t correlationId;
+
+  /**
+   * The grid ID of the kernel. Each kernel is assigned a unique
+   * grid ID at runtime.
+   */
+  int64_t gridId;
+
+  /**
+   * The name of the kernel. This name is shared across all activity
+   * records representing the same kernel, and so should not be
+   * modified.
+   */
+  const char *name;
+
+  /**
+   * Undefined. Reserved for internal use.
+   */
+  void *reserved0;
+
+  /**
+   * The timestamp when the kernel is queued up in the command buffer, in ns.
+   * A value of CUPTI_TIMESTAMP_UNKNOWN indicates that the queued time
+   * could not be collected for the kernel. This timestamp is not collected
+   * by default. Use API \ref cuptiActivityEnableLatencyTimestamps() to
+   * enable collection.
+   *
+   * The command buffer is a buffer written by the CUDA driver to send
+   * commands such as kernel launch, memory copy, etc. to the GPU. All
+   * launches of CUDA kernels are asynchronous with respect to the host: the
+   * host requests the launch by writing commands into the command buffer,
+   * then returns without checking the GPU's progress.
+   */
+  uint64_t queued;
+
+  /**
+   * The timestamp when the command buffer containing the kernel launch
+   * is submitted to the GPU, in ns. A value of CUPTI_TIMESTAMP_UNKNOWN
+   * indicates that the submitted time could not be collected for the kernel.
+   * This timestamp is not collected by default. Use API \ref
+   * cuptiActivityEnableLatencyTimestamps() to enable collection.
+   */
+  uint64_t submitted;
+
+  /**
+   * This indicates whether the kernel was executed via a regular launch or
+   * via a single/multi device cooperative launch.
+   * \see CUpti_ActivityLaunchType
+   */
+  uint8_t launchType;
+
+  /**
+   * This indicates if CU_FUNC_ATTRIBUTE_PREFERRED_SHARED_MEMORY_CARVEOUT was
+   * updated for the kernel launch.
+   */
+  uint8_t isSharedMemoryCarveoutRequested;
+
+  /**
+   * Shared memory carveout value requested for the function in percentage of
+   * the total resource. The value will be updated only if field
+   * isSharedMemoryCarveoutRequested is set.
+   */
+  uint8_t sharedMemoryCarveoutRequested;
+
+  /**
+   * Undefined. Reserved for internal use.
+   */
+  uint8_t padding;
+
+ /**
+  * Shared memory size set by the driver.
+  */
+  uint32_t sharedMemoryExecuted;
+
+  /**
+   * The unique ID of the graph node that launched this kernel through graph launch APIs.
+   * This field will be 0 if the kernel is not launched through graph launch APIs.
+   */
+  uint64_t graphNodeId;
+
+  /**
+   * The shared memory limit config for the kernel. This field shows whether the user has
+   * opted for a higher per-block limit of dynamic shared memory.
+   */
+  CUpti_FuncShmemLimitConfig shmemLimitConfig;
+
+  /**
+   * The unique ID of the graph that launched this kernel through graph launch APIs.
+   * This field will be 0 if the kernel is not launched through graph launch APIs.
+   */
+  uint32_t graphId;
+
+  /**
+   * The pointer to the access policy window. The structure CUaccessPolicyWindow is
+   * defined in cuda.h.
+   */
+  CUaccessPolicyWindow *pAccessPolicyWindow;
+
+  /**
+   * The ID of the HW channel on which the kernel is launched.
+   */
+  uint32_t channelID;
+
+  /**
+   * The type of the channel.
+   */
+  CUpti_ChannelType channelType;
+
+  /**
+   * The X-dimension cluster size for the kernel.
+   * Field is valid for devices with compute capability 9.0 and higher.
+   */
+  uint32_t clusterX;
+
+  /**
+   * The Y-dimension cluster size for the kernel.
+   * Field is valid for devices with compute capability 9.0 and higher.
+   */
+  uint32_t clusterY;
+
+  /**
+   * The Z-dimension cluster size for the kernel.
+   * Field is valid for devices with compute capability 9.0 and higher.
+   */
+  uint32_t clusterZ;
+
+  /**
+   * The cluster scheduling policy for the kernel. Refer to CUclusterSchedulingPolicy.
+   * Field is valid for devices with compute capability 9.0 and higher.
+   */
+  uint32_t clusterSchedulingPolicy;
+
+  /**
+   * The total amount of local memory reserved for the kernel, in
+   * bytes.
+   */
+  uint64_t localMemoryTotal_v2;
+} CUpti_ActivityKernel8;
+
+/**
+ * \brief The activity record for memory copies. (deprecated)
+ *
+ * This activity record represents a memory copy
+ * (CUPTI_ACTIVITY_KIND_MEMCPY).
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_MEMCPY.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The kind of the memory copy, stored as a byte to reduce record
+   * size. \see CUpti_ActivityMemcpyKind
+   */
+  uint8_t copyKind;
+
+  /**
+   * The source memory kind read by the memory copy, stored as a byte
+   * to reduce record size. \see CUpti_ActivityMemoryKind
+   */
+  uint8_t srcKind;
+
+  /**
+   * The destination memory kind written by the memory copy, stored as a
+   * byte to reduce record size. \see CUpti_ActivityMemoryKind
+   */
+  uint8_t dstKind;
+
+  /**
+   * The flags associated with the memory copy. \see CUpti_ActivityFlag
+   */
+  uint8_t flags;
+
+  /**
+   * The number of bytes transferred by the memory copy.
+   */
+  uint64_t bytes;
+
+  /**
+   * The start timestamp for the memory copy, in ns. A value of 0 for
+   * both the start and end timestamps indicates that timestamp
+   * information could not be collected for the memory copy.
+   */
+  uint64_t start;
+
+  /**
+   * The end timestamp for the memory copy, in ns. A value of 0 for
+   * both the start and end timestamps indicates that timestamp
+   * information could not be collected for the memory copy.
+   */
+  uint64_t end;
+
+  /**
+   * The ID of the device where the memory copy is occurring.
+   */
+  uint32_t deviceId;
+
+  /**
+   * The ID of the context where the memory copy is occurring.
+   */
+  uint32_t contextId;
+
+  /**
+   * The ID of the stream where the memory copy is occurring.
+   */
+  uint32_t streamId;
+
+  /**
+   * The correlation ID of the memory copy. Each memory copy is
+   * assigned a unique correlation ID that is identical to the
+   * correlation ID in the driver API activity record that launched
+   * the memory copy.
+   */
+  uint32_t correlationId;
+
+  /**
+   * The runtime correlation ID of the memory copy. Each memory copy
+   * is assigned a unique runtime correlation ID that is identical to
+   * the correlation ID in the runtime API activity record that
+   * launched the memory copy.
+   */
+  uint32_t runtimeCorrelationId;
+
+#ifdef CUPTILP64
+  /**
+   * Undefined. Reserved for internal use.
+   * This field is present only when CUPTILP64 is defined.
+   */
+  uint32_t pad;
+#endif
+
+  /**
+   * Undefined. Reserved for internal use.
+   */
+  void *reserved0;
+} CUpti_ActivityMemcpy;
+
+/**
+ * \brief The activity record for memory copies. (deprecated in CUDA 11.1)
+ *
+ * This activity record represents a memory copy
+ * (CUPTI_ACTIVITY_KIND_MEMCPY).
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_MEMCPY.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The kind of the memory copy, stored as a byte to reduce record
+   * size. \see CUpti_ActivityMemcpyKind
+   */
+  uint8_t copyKind;
+
+  /**
+   * The source memory kind read by the memory copy, stored as a byte
+   * to reduce record size. \see CUpti_ActivityMemoryKind
+   */
+  uint8_t srcKind;
+
+  /**
+   * The destination memory kind written by the memory copy, stored as a
+   * byte to reduce record size. \see CUpti_ActivityMemoryKind
+   */
+  uint8_t dstKind;
+
+  /**
+   * The flags associated with the memory copy. \see CUpti_ActivityFlag
+   */
+  uint8_t flags;
+
+  /**
+   * The number of bytes transferred by the memory copy.
+   */
+  uint64_t bytes;
+
+  /**
+   * The start timestamp for the memory copy, in ns. A value of 0 for
+   * both the start and end timestamps indicates that timestamp
+   * information could not be collected for the memory copy.
+   */
+  uint64_t start;
+
+  /**
+   * The end timestamp for the memory copy, in ns. A value of 0 for
+   * both the start and end timestamps indicates that timestamp
+   * information could not be collected for the memory copy.
+   */
+  uint64_t end;
+
+  /**
+   * The ID of the device where the memory copy is occurring.
+   */
+  uint32_t deviceId;
+
+  /**
+   * The ID of the context where the memory copy is occurring.
+   */
+  uint32_t contextId;
+
+  /**
+   * The ID of the stream where the memory copy is occurring.
+   */
+  uint32_t streamId;
+
+  /**
+   * The correlation ID of the memory copy. Each memory copy is
+   * assigned a unique correlation ID that is identical to the
+   * correlation ID in the driver API activity record that launched
+   * the memory copy.
+   */
+  uint32_t correlationId;
+
+  /**
+   * The runtime correlation ID of the memory copy. Each memory copy
+   * is assigned a unique runtime correlation ID that is identical to
+   * the correlation ID in the runtime API activity record that
+   * launched the memory copy.
+   */
+  uint32_t runtimeCorrelationId;
+
+#ifdef CUPTILP64
+  /**
+   * Undefined. Reserved for internal use.
+   * This field is present only when CUPTILP64 is defined.
+   */
+  uint32_t pad;
+#endif
+
+  /**
+   * Undefined. Reserved for internal use.
+   */
+  void *reserved0;
+
+  /**
+   * The unique ID of the graph node that executed this memcpy through graph launch.
+   * This field will be 0 if the memcpy is not done through graph launch.
+   */
+  uint64_t graphNodeId;
+} CUpti_ActivityMemcpy3;
+
+/**
+ * \brief The activity record for memory copies. (deprecated in CUDA 11.6)
+ *
+ * This activity record represents a memory copy
+ * (CUPTI_ACTIVITY_KIND_MEMCPY).
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_MEMCPY.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The kind of the memory copy, stored as a byte to reduce record
+   * size. \see CUpti_ActivityMemcpyKind
+   */
+  uint8_t copyKind;
+
+  /**
+   * The source memory kind read by the memory copy, stored as a byte
+   * to reduce record size. \see CUpti_ActivityMemoryKind
+   */
+  uint8_t srcKind;
+
+  /**
+   * The destination memory kind written by the memory copy, stored as a
+   * byte to reduce record size. \see CUpti_ActivityMemoryKind
+   */
+  uint8_t dstKind;
+
+  /**
+   * The flags associated with the memory copy. \see CUpti_ActivityFlag
+   */
+  uint8_t flags;
+
+  /**
+   * The number of bytes transferred by the memory copy.
+   */
+  uint64_t bytes;
+
+  /**
+   * The start timestamp for the memory copy, in ns. A value of 0 for
+   * both the start and end timestamps indicates that timestamp
+   * information could not be collected for the memory copy.
+   */
+  uint64_t start;
+
+  /**
+   * The end timestamp for the memory copy, in ns. A value of 0 for
+   * both the start and end timestamps indicates that timestamp
+   * information could not be collected for the memory copy.
+   */
+  uint64_t end;
+
+  /**
+   * The ID of the device where the memory copy is occurring.
+   */
+  uint32_t deviceId;
+
+  /**
+   * The ID of the context where the memory copy is occurring.
+   */
+  uint32_t contextId;
+
+  /**
+   * The ID of the stream where the memory copy is occurring.
+   */
+  uint32_t streamId;
+
+  /**
+   * The correlation ID of the memory copy. Each memory copy is
+   * assigned a unique correlation ID that is identical to the
+   * correlation ID in the driver API activity record that launched
+   * the memory copy.
+   */
+  uint32_t correlationId;
+
+  /**
+   * The runtime correlation ID of the memory copy. Each memory copy
+   * is assigned a unique runtime correlation ID that is identical to
+   * the correlation ID in the runtime API activity record that
+   * launched the memory copy.
+   */
+  uint32_t runtimeCorrelationId;
+
+#ifdef CUPTILP64
+  /**
+   * Undefined. Reserved for internal use.
+   * This field is present only when CUPTILP64 is defined.
+   */
+  uint32_t pad;
+#endif
+
+  /**
+   * Undefined. Reserved for internal use.
+   */
+  void *reserved0;
+
+  /**
+   * The unique ID of the graph node that executed this memcpy through graph launch.
+   * This field will be 0 if the memcpy is not done through graph launch.
+   */
+  uint64_t graphNodeId;
+
+  /**
+   * The unique ID of the graph that executed this memcpy through graph launch.
+   * This field will be 0 if the memcpy is not done through graph launch.
+   */
+  uint32_t graphId;
+
+  /**
+   * Undefined. Reserved for internal use.
+   */
+  uint32_t padding;
+} CUpti_ActivityMemcpy4;
+
+/**
+ * \brief The activity record for peer-to-peer memory copies.
+ *
+ * This activity record represents a peer-to-peer memory copy
+ * (CUPTI_ACTIVITY_KIND_MEMCPY2) but is no longer generated
+ * by CUPTI. Peer-to-peer memory copy activities are now reported using the
+ * CUpti_ActivityMemcpyPtoP2 activity record.
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_MEMCPY2.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The kind of the memory copy, stored as a byte to reduce record
+   * size. \see CUpti_ActivityMemcpyKind
+   */
+  uint8_t copyKind;
+
+  /**
+   * The source memory kind read by the memory copy, stored as a byte
+   * to reduce record size. \see CUpti_ActivityMemoryKind
+   */
+  uint8_t srcKind;
+
+  /**
+   * The destination memory kind written by the memory copy, stored as a
+   * byte to reduce record size. \see CUpti_ActivityMemoryKind
+   */
+  uint8_t dstKind;
+
+  /**
+   * The flags associated with the memory copy. \see
+   * CUpti_ActivityFlag
+   */
+  uint8_t flags;
+
+  /**
+   * The number of bytes transferred by the memory copy.
+   */
+  uint64_t bytes;
+
+  /**
+   * The start timestamp for the memory copy, in ns. A value of 0 for
+   * both the start and end timestamps indicates that timestamp
+   * information could not be collected for the memory copy.
+   */
+  uint64_t start;
+
+  /**
+   * The end timestamp for the memory copy, in ns. A value of 0 for
+   * both the start and end timestamps indicates that timestamp
+   * information could not be collected for the memory copy.
+   */
+  uint64_t end;
+
+  /**
+   * The ID of the device where the memory copy is occurring.
+   */
+  uint32_t deviceId;
+
+  /**
+   * The ID of the context where the memory copy is occurring.
+   */
+  uint32_t contextId;
+
+  /**
+   * The ID of the stream where the memory copy is occurring.
+   */
+  uint32_t streamId;
+
+  /**
+   * The ID of the device where memory is being copied from.
+   */
+  uint32_t srcDeviceId;
+
+  /**
+   * The ID of the context owning the memory being copied from.
+   */
+  uint32_t srcContextId;
+
+  /**
+   * The ID of the device where memory is being copied to.
+   */
+  uint32_t dstDeviceId;
+
+  /**
+   * The ID of the context owning the memory being copied to.
+   */
+  uint32_t dstContextId;
+
+  /**
+   * The correlation ID of the memory copy. Each memory copy is
+   * assigned a unique correlation ID that is identical to the
+   * correlation ID in the driver and runtime API activity record that
+   * launched the memory copy.
+   */
+  uint32_t correlationId;
+
+#ifndef CUPTILP64
+  /**
+   * Undefined. Reserved for internal use.
+   * This field is present only when CUPTILP64 is not defined.
+   */
+  uint32_t pad;
+#endif
+
+  /**
+   * Undefined. Reserved for internal use.
+   */
+  void *reserved0;
+} CUpti_ActivityMemcpyPtoP;
+
+/* CUpti_ActivityMemcpy2 is an alias for CUpti_ActivityMemcpyPtoP. */
+typedef CUpti_ActivityMemcpyPtoP CUpti_ActivityMemcpy2;
+
+/**
+ * \brief The activity record for peer-to-peer memory copies.
+ * (deprecated in CUDA 11.1)
+ *
+ * This activity record represents a peer-to-peer memory copy
+ * (CUPTI_ACTIVITY_KIND_MEMCPY2).
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_MEMCPY2.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The kind of the memory copy, stored as a byte to reduce record
+   * size. \see CUpti_ActivityMemcpyKind
+   */
+  uint8_t copyKind;
+
+  /**
+   * The source memory kind read by the memory copy, stored as a byte
+   * to reduce record size. \see CUpti_ActivityMemoryKind
+   */
+  uint8_t srcKind;
+
+  /**
+   * The destination memory kind written by the memory copy, stored as a
+   * byte to reduce record size. \see CUpti_ActivityMemoryKind
+   */
+  uint8_t dstKind;
+
+  /**
+   * The flags associated with the memory copy. \see
+   * CUpti_ActivityFlag
+   */
+  uint8_t flags;
+
+  /**
+   * The number of bytes transferred by the memory copy.
+   */
+  uint64_t bytes;
+
+  /**
+   * The start timestamp for the memory copy, in ns. A value of 0 for
+   * both the start and end timestamps indicates that timestamp
+   * information could not be collected for the memory copy.
+   */
+  uint64_t start;
+
+  /**
+   * The end timestamp for the memory copy, in ns. A value of 0 for
+   * both the start and end timestamps indicates that timestamp
+   * information could not be collected for the memory copy.
+   */
+  uint64_t end;
+
+  /**
+   * The ID of the device where the memory copy is occurring.
+   */
+  uint32_t deviceId;
+
+  /**
+   * The ID of the context where the memory copy is occurring.
+   */
+  uint32_t contextId;
+
+  /**
+   * The ID of the stream where the memory copy is occurring.
+   */
+  uint32_t streamId;
+
+  /**
+   * The ID of the device where memory is being copied from.
+   */
+  uint32_t srcDeviceId;
+
+  /**
+   * The ID of the context owning the memory being copied from.
+   */
+  uint32_t srcContextId;
+
+  /**
+   * The ID of the device where memory is being copied to.
+   */
+  uint32_t dstDeviceId;
+
+  /**
+   * The ID of the context owning the memory being copied to.
+   */
+  uint32_t dstContextId;
+
+  /**
+   * The correlation ID of the memory copy. Each memory copy is
+   * assigned a unique correlation ID that is identical to the
+   * correlation ID in the driver and runtime API activity record that
+   * launched the memory copy.
+   */
+  uint32_t correlationId;
+
+#ifndef CUPTILP64
+  /**
+   * Undefined. Reserved for internal use.
+   * This field is present only when CUPTILP64 is not defined.
+   */
+  uint32_t pad;
+#endif
+
+  /**
+   * Undefined. Reserved for internal use.
+   */
+  void *reserved0;
+
+  /**
+   * The unique ID of the graph node that executed the memcpy through graph launch.
+   * This field will be 0 if the memcpy is not done using graph launch.
+   */
+  uint64_t graphNodeId;
+} CUpti_ActivityMemcpyPtoP2;
+
+/**
+ * \brief The activity record for peer-to-peer memory copies.
+ * (deprecated in CUDA 11.6)
+ *
+ * This activity record represents a peer-to-peer memory copy
+ * (CUPTI_ACTIVITY_KIND_MEMCPY2).
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_MEMCPY2.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The kind of the memory copy, stored as a byte to reduce record
+   * size. \see CUpti_ActivityMemcpyKind
+   */
+  uint8_t copyKind;
+
+  /**
+   * The source memory kind read by the memory copy, stored as a byte
+   * to reduce record size. \see CUpti_ActivityMemoryKind
+   */
+  uint8_t srcKind;
+
+  /**
+   * The destination memory kind written by the memory copy, stored as a
+   * byte to reduce record size. \see CUpti_ActivityMemoryKind
+   */
+  uint8_t dstKind;
+
+  /**
+   * The flags associated with the memory copy. \see
+   * CUpti_ActivityFlag
+   */
+  uint8_t flags;
+
+  /**
+   * The number of bytes transferred by the memory copy.
+   */
+  uint64_t bytes;
+
+  /**
+   * The start timestamp for the memory copy, in ns. A value of 0 for
+   * both the start and end timestamps indicates that timestamp
+   * information could not be collected for the memory copy.
+   */
+  uint64_t start;
+
+  /**
+   * The end timestamp for the memory copy, in ns. A value of 0 for
+   * both the start and end timestamps indicates that timestamp
+   * information could not be collected for the memory copy.
+   */
+  uint64_t end;
+
+  /**
+   * The ID of the device where the memory copy is occurring.
+   */
+  uint32_t deviceId;
+
+  /**
+   * The ID of the context where the memory copy is occurring.
+   */
+  uint32_t contextId;
+
+  /**
+   * The ID of the stream where the memory copy is occurring.
+   */
+  uint32_t streamId;
+
+  /**
+   * The ID of the device where memory is being copied from.
+   */
+  uint32_t srcDeviceId;
+
+  /**
+   * The ID of the context owning the memory being copied from.
+   */
+  uint32_t srcContextId;
+
+  /**
+   * The ID of the device where memory is being copied to.
+   */
+  uint32_t dstDeviceId;
+
+  /**
+   * The ID of the context owning the memory being copied to.
+   */
+  uint32_t dstContextId;
+
+  /**
+   * The correlation ID of the memory copy. Each memory copy is
+   * assigned a unique correlation ID that is identical to the
+   * correlation ID in the driver and runtime API activity record that
+   * launched the memory copy.
+   */
+  uint32_t correlationId;
+
+#ifndef CUPTILP64
+  /**
+   * Undefined. Reserved for internal use.
+   * This field is present only when CUPTILP64 is not defined.
+   */
+  uint32_t pad;
+#endif
+
+  /**
+   * Undefined. Reserved for internal use.
+   */
+  void *reserved0;
+
+  /**
+   * The unique ID of the graph node that executed the memcpy through graph launch.
+   * This field will be 0 if the memcpy is not done using graph launch.
+   */
+  uint64_t graphNodeId;
+
+  /**
+   * The unique ID of the graph that executed this memcpy through graph launch.
+   * This field will be 0 if the memcpy is not done through graph launch.
+   */
+  uint32_t graphId;
+
+  /**
+   * Undefined. Reserved for internal use.
+   */
+  uint32_t padding;
+} CUpti_ActivityMemcpyPtoP3;
+
+/**
+ * \brief The activity record for memset. (deprecated)
+ *
+ * This activity record represents a memory set operation
+ * (CUPTI_ACTIVITY_KIND_MEMSET).
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_MEMSET.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The value being assigned to memory by the memory set.
+   */
+  uint32_t value;
+
+  /**
+   * The number of bytes being set by the memory set.
+   */
+  uint64_t bytes;
+
+  /**
+   * The start timestamp for the memory set, in ns. A value of 0 for
+   * both the start and end timestamps indicates that timestamp
+   * information could not be collected for the memory set.
+   */
+  uint64_t start;
+
+  /**
+   * The end timestamp for the memory set, in ns. A value of 0 for
+   * both the start and end timestamps indicates that timestamp
+   * information could not be collected for the memory set.
+   */
+  uint64_t end;
+
+  /**
+   * The ID of the device where the memory set is occurring.
+   */
+  uint32_t deviceId;
+
+  /**
+   * The ID of the context where the memory set is occurring.
+   */
+  uint32_t contextId;
+
+  /**
+   * The ID of the stream where the memory set is occurring.
+   */
+  uint32_t streamId;
+
+  /**
+   * The correlation ID of the memory set. Each memory set is assigned
+   * a unique correlation ID that is identical to the correlation ID
+   * in the driver API activity record that launched the memory set.
+   */
+  uint32_t correlationId;
+
+  /**
+   * The flags associated with the memset. \see CUpti_ActivityFlag
+   */
+  uint16_t flags;
+
+  /**
+   * The memory kind of the memory set. \see CUpti_ActivityMemoryKind
+   */
+  uint16_t memoryKind;
+
+#ifdef CUPTILP64
+  /**
+   * Undefined. Reserved for internal use.
+   * This field is present only when CUPTILP64 is defined.
+   */
+  uint32_t pad;
+#endif
+
+  /**
+   * Undefined. Reserved for internal use.
+   */
+  void *reserved0;
+} CUpti_ActivityMemset;
+
+/**
+ * \brief The activity record for memset. (deprecated in CUDA 11.1)
+ *
+ * This activity record represents a memory set operation
+ * (CUPTI_ACTIVITY_KIND_MEMSET).
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_MEMSET.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The value being assigned to memory by the memory set.
+   */
+  uint32_t value;
+
+  /**
+   * The number of bytes being set by the memory set.
+   */
+  uint64_t bytes;
+
+  /**
+   * The start timestamp for the memory set, in ns. A value of 0 for
+   * both the start and end timestamps indicates that timestamp
+   * information could not be collected for the memory set.
+   */
+  uint64_t start;
+
+  /**
+   * The end timestamp for the memory set, in ns. A value of 0 for
+   * both the start and end timestamps indicates that timestamp
+   * information could not be collected for the memory set.
+   */
+  uint64_t end;
+
+  /**
+   * The ID of the device where the memory set is occurring.
+   */
+  uint32_t deviceId;
+
+  /**
+   * The ID of the context where the memory set is occurring.
+   */
+  uint32_t contextId;
+
+  /**
+   * The ID of the stream where the memory set is occurring.
+   */
+  uint32_t streamId;
+
+  /**
+   * The correlation ID of the memory set. Each memory set is assigned
+   * a unique correlation ID that is identical to the correlation ID
+   * in the driver API activity record that launched the memory set.
+   */
+  uint32_t correlationId;
+
+  /**
+   * The flags associated with the memset. \see CUpti_ActivityFlag
+   */
+  uint16_t flags;
+
+  /**
+   * The memory kind of the memory set. \see CUpti_ActivityMemoryKind
+   */
+  uint16_t memoryKind;
+
+#ifdef CUPTILP64
+  /**
+   * Undefined. Reserved for internal use.
+   * This field is present only when CUPTILP64 is defined.
+   */
+  uint32_t pad;
+#endif
+
+  /**
+   * Undefined. Reserved for internal use.
+   */
+  void *reserved0;
+
+  /**
+   * The unique ID of the graph node that executed this memset through graph launch.
+   * This field will be 0 if the memset is not executed through graph launch.
+   */
+  uint64_t graphNodeId;
+} CUpti_ActivityMemset2;
+
+/**
+ * \brief The activity record for memset. (deprecated in CUDA 11.6)
+ *
+ * This activity record represents a memory set operation
+ * (CUPTI_ACTIVITY_KIND_MEMSET).
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_MEMSET.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The value being assigned to memory by the memory set.
+   */
+  uint32_t value;
+
+  /**
+   * The number of bytes being set by the memory set.
+   */
+  uint64_t bytes;
+
+  /**
+   * The start timestamp for the memory set, in ns. A value of 0 for
+   * both the start and end timestamps indicates that timestamp
+   * information could not be collected for the memory set.
+   */
+  uint64_t start;
+
+  /**
+   * The end timestamp for the memory set, in ns. A value of 0 for
+   * both the start and end timestamps indicates that timestamp
+   * information could not be collected for the memory set.
+   */
+  uint64_t end;
+
+  /**
+   * The ID of the device where the memory set is occurring.
+   */
+  uint32_t deviceId;
+
+  /**
+   * The ID of the context where the memory set is occurring.
+   */
+  uint32_t contextId;
+
+  /**
+   * The ID of the stream where the memory set is occurring.
+   */
+  uint32_t streamId;
+
+  /**
+   * The correlation ID of the memory set. Each memory set is assigned
+   * a unique correlation ID that is identical to the correlation ID
+   * in the driver API activity record that launched the memory set.
+   */
+  uint32_t correlationId;
+
+  /**
+   * The flags associated with the memset. \see CUpti_ActivityFlag
+   */
+  uint16_t flags;
+
+  /**
+   * The memory kind of the memory set. \see CUpti_ActivityMemoryKind
+   */
+  uint16_t memoryKind;
+
+#ifdef CUPTILP64
+  /**
+   * Undefined. Reserved for internal use.
+   * This field is present only when CUPTILP64 is defined.
+   */
+  uint32_t pad;
+#endif
+
+  /**
+   * Undefined. Reserved for internal use.
+   */
+  void *reserved0;
+
+  /**
+   * The unique ID of the graph node that executed this memset through graph launch.
+   * This field will be 0 if the memset is not executed through graph launch.
+   */
+  uint64_t graphNodeId;
+
+  /**
+   * The unique ID of the graph that executed this memset through graph launch.
+   * This field will be 0 if the memset is not executed through graph launch.
+   */
+  uint32_t graphId;
+
+  /**
+   * Undefined. Reserved for internal use.
+   */
+  uint32_t padding;
+} CUpti_ActivityMemset3;
+
+/**
+ * \brief The activity record for memory.
+ *
+ * This activity record represents a memory allocation or free operation
+ * (CUPTI_ACTIVITY_KIND_MEMORY2).
+ * This activity record provides separate records for memory allocation and
+ * memory release operations.
+ * This makes it possible to correlate the corresponding driver and runtime API
+ * activity record with the memory operation.
+ *
+ * Note: This activity record is an upgrade over \ref CUpti_ActivityMemory
+ * enabled using the kind \ref CUPTI_ACTIVITY_KIND_MEMORY.
+ * \ref CUpti_ActivityMemory provides a single record for the memory
+ * allocation and memory release operations.
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_MEMORY2.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The memory operation requested by the user, \ref CUpti_ActivityMemoryOperationType.
+   */
+  CUpti_ActivityMemoryOperationType memoryOperationType;
+
+  /**
+   * The memory kind requested by the user, \ref CUpti_ActivityMemoryKind.
+   */
+  CUpti_ActivityMemoryKind memoryKind;
+
+  /**
+   * The correlation ID of the memory operation. Each memory operation is
+   * assigned a unique correlation ID that is identical to the
+   * correlation ID in the driver and runtime API activity record that
+   * launched the memory operation.
+   */
+  uint32_t correlationId;
+
+  /**
+   * The virtual address of the allocation.
+   */
+  uint64_t address;
+
+  /**
+   * The number of bytes of memory allocated.
+   */
+  uint64_t bytes;
+
+  /**
+   * The start timestamp for the memory operation, in ns.
+   */
+  uint64_t timestamp;
+
+  /**
+   * The program counter of the memory operation.
+   */
+  uint64_t PC;
+
+  /**
+   * The ID of the process to which this record belongs.
+   */
+  uint32_t processId;
+
+  /**
+   * The ID of the device where the memory operation is taking place.
+   */
+  uint32_t deviceId;
+
+  /**
+   * The ID of the context. If the context is NULL, \p contextId is set to CUPTI_INVALID_CONTEXT_ID.
+   */
+  uint32_t contextId;
+
+  /**
+   * The ID of the stream. If the memory operation is not async, \p streamId is set to CUPTI_INVALID_STREAM_ID.
+   */
+  uint32_t streamId;
+
+  /**
+   * Variable name. This name is shared across all activity
+   * records representing the same symbol, and so should not be
+   * modified.
+   */
+  const char* name;
+
+  /**
+   * \p isAsync is set if the memory operation happens through async memory APIs.
+   */
+  uint32_t isAsync;
+
+#ifdef CUPTILP64
+  /**
+   * Undefined. Reserved for internal use. Present only when CUPTILP64 is defined.
+   */
+  uint32_t pad1;
+#endif
+
+  /**
+   * The memory pool configuration used for the memory operations.
+   */
+  struct {
+    /**
+     * The type of the memory pool, \ref CUpti_ActivityMemoryPoolType.
+     */
+    CUpti_ActivityMemoryPoolType memoryPoolType;
+
+#ifdef CUPTILP64
+    /**
+     * Undefined. Reserved for internal use. Present only when CUPTILP64 is defined.
+     */
+    uint32_t pad2;
+#endif
+
+    /**
+     * The base address of the memory pool.
+     */
+    uint64_t address;
+
+    /**
+     * The release threshold of the memory pool in bytes. \p releaseThreshold is
+     * valid for CUPTI_ACTIVITY_MEMORY_POOL_TYPE_LOCAL, \ref CUpti_ActivityMemoryPoolType.
+     */
+    uint64_t releaseThreshold;
+
+   /**
+    * The size of the memory pool in bytes, or the process ID of the memory pool.
+    * \p size is valid if \p memoryPoolType is
+    * CUPTI_ACTIVITY_MEMORY_POOL_TYPE_LOCAL, \ref CUpti_ActivityMemoryPoolType.
+    * \p processId is valid if \p memoryPoolType is
+    * CUPTI_ACTIVITY_MEMORY_POOL_TYPE_IMPORTED, \ref CUpti_ActivityMemoryPoolType.
+    */
+   union {
+      uint64_t size;
+      uint64_t processId;
+    } pool;
+  } memoryPoolConfig;
+
+} CUpti_ActivityMemory2;
+
+/**
+ * \brief The activity record for memory pool.
+ *
+ * This activity record represents a memory pool creation, destruction or
+ * trimming operation (CUPTI_ACTIVITY_KIND_MEMORY_POOL).
+ * This activity record provides separate records for memory pool creation,
+ * destruction and trimming operations.
+ * This makes it possible to correlate the corresponding driver and runtime API
+ * activity record with the memory pool operation.
+ *
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_MEMORY_POOL.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The memory operation requested by the user, \ref CUpti_ActivityMemoryPoolOperationType.
+   */
+  CUpti_ActivityMemoryPoolOperationType memoryPoolOperationType;
+
+  /**
+   * The type of the memory pool, \ref CUpti_ActivityMemoryPoolType.
+   */
+  CUpti_ActivityMemoryPoolType memoryPoolType;
+
+  /**
+   * The correlation ID of the memory pool operation. Each memory pool
+   * operation is assigned a unique correlation ID that is identical to the
+   * correlation ID in the driver and runtime API activity record that
+   * launched the memory operation.
+   */
+  uint32_t correlationId;
+
+  /**
+   * The ID of the process to which this record belongs.
+   */
+  uint32_t processId;
+
+  /**
+   * The ID of the device where the memory pool is created.
+   */
+  uint32_t deviceId;
+
+  /**
+   * The minimum number of bytes to keep in the memory pool. \p minBytesToKeep is
+   * valid for CUPTI_ACTIVITY_MEMORY_POOL_OPERATION_TYPE_TRIMMED,
+   * \ref CUpti_ActivityMemoryPoolOperationType.
+   */
+  size_t minBytesToKeep;
+
+#ifndef CUPTILP64
+  /**
+   * Undefined. Reserved for internal use. Present only when CUPTILP64 is not defined.
+   */
+  uint32_t pad;
+#endif
+
+  /**
+   * The virtual address of the allocation.
+   */
+  uint64_t address;
+
+  /**
+   * The size of the memory pool operation in bytes. \p size is
+   * valid for CUPTI_ACTIVITY_MEMORY_POOL_TYPE_LOCAL, \ref CUpti_ActivityMemoryPoolType.
+   */
+  uint64_t size;
+
+  /**
+   * The release threshold of the memory pool. \p releaseThreshold is
+   * valid for CUPTI_ACTIVITY_MEMORY_POOL_TYPE_LOCAL, \ref CUpti_ActivityMemoryPoolType.
+   */
+  uint64_t releaseThreshold;
+
+  /**
+   * The start timestamp for the memory operation, in ns.
+   */
+  uint64_t timestamp;
+} CUpti_ActivityMemoryPool;
+
+/**
+ * \brief The activity record providing a marker which is an
+ * instantaneous point in time. (deprecated in CUDA 8.0)
+ *
+ * The marker is specified with a descriptive name and unique ID
+ * (CUPTI_ACTIVITY_KIND_MARKER).
+ * Marker activity is now reported using the
+ * CUpti_ActivityMarker2 activity record.
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_MARKER.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The flags associated with the marker. \see CUpti_ActivityFlag
+   */
+  CUpti_ActivityFlag flags;
+
+  /**
+   * The timestamp for the marker, in ns. A value of 0 indicates that
+   * timestamp information could not be collected for the marker.
+   */
+  uint64_t timestamp;
+
+  /**
+   * The marker ID.
+   */
+  uint32_t id;
+
+  /**
+   * The kind of activity object associated with this marker.
+   */
+  CUpti_ActivityObjectKind objectKind;
+
+  /**
+   * The identifier for the activity object associated with this
+   * marker. \p objectKind indicates which ID is valid for this record.
+   */
+  CUpti_ActivityObjectKindId objectId;
+
+#ifdef CUPTILP64
+  /**
+   * Undefined. Reserved for internal use. Present only when CUPTILP64 is defined.
+   */
+  uint32_t pad;
+#endif
+
+  /**
+   * The marker name for an instantaneous or start marker. This will
+   * be NULL for an end marker.
+   */
+  const char *name;
+
+} CUpti_ActivityMarker;
+
+/**
+ * \brief The activity record for source-level global
+ * access. (deprecated)
+ *
+ * This activity records the locations of the global
+ * accesses in the source (CUPTI_ACTIVITY_KIND_GLOBAL_ACCESS).
+ * Global access activities are now reported using the
+ * CUpti_ActivityGlobalAccess3 activity record.
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_GLOBAL_ACCESS.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The properties of this global access.
+   */
+  CUpti_ActivityFlag flags;
+
+  /**
+   * The ID for the source locator.
+   */
+  uint32_t sourceLocatorId;
+
+  /**
+   * The correlation ID of the kernel with which this result is associated.
+   */
+  uint32_t correlationId;
+
+  /**
+   * The pc offset for the access.
+   */
+  uint32_t pcOffset;
+
+  /**
+   * The number of times this instruction was executed per warp. It will be incremented
+   * when at least one thread in the warp is active with predicate and condition code
+   * evaluating to true.
+   */
+  uint32_t executed;
+
+  /**
+   * Each time this instruction is executed, this is incremented by the number
+   * of threads that executed it with predicate and condition code evaluating to true.
+   */
+  uint64_t threadsExecuted;
+
+  /**
+   * The total number of 32-byte transactions to L2 cache generated by this access.
+   */
+  uint64_t l2_transactions;
+} CUpti_ActivityGlobalAccess;
+
+/**
+ * \brief The activity record for source-level global
+ * access. (deprecated in CUDA 9.0)
+ *
+ * This activity records the locations of the global
+ * accesses in the source (CUPTI_ACTIVITY_KIND_GLOBAL_ACCESS).
+ * Global access activities are now reported using the
+ * CUpti_ActivityGlobalAccess3 activity record.
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_GLOBAL_ACCESS.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The properties of this global access.
+   */
+  CUpti_ActivityFlag flags;
+
+  /**
+   * The ID for the source locator.
+   */
+  uint32_t sourceLocatorId;
+
+  /**
+   * The correlation ID of the kernel with which this result is associated.
+   */
+  uint32_t correlationId;
+
+  /**
+   * Correlation ID with global/device function name.
+   */
+  uint32_t functionId;
+
+  /**
+   * The pc offset for the access.
+   */
+  uint32_t pcOffset;
+
+  /**
+   * Each time this instruction is executed, this is incremented by the number
+   * of threads that executed it with predicate and condition code evaluating to true.
+   */
+  uint64_t threadsExecuted;
+
+  /**
+   * The total number of 32-byte transactions to L2 cache generated by this access.
+   */
+  uint64_t l2_transactions;
+
+  /**
+   * The minimum number of L2 transactions possible based on the access pattern.
+   */
+  uint64_t theoreticalL2Transactions;
+
+  /**
+   * The number of times this instruction was executed per warp. It will be incremented
+   * when at least one thread in the warp is active with predicate and condition code
+   * evaluating to true.
+   */
+  uint32_t executed;
+
+  /**
+   * Undefined. Reserved for internal use.
+   */
+  uint32_t pad;
+} CUpti_ActivityGlobalAccess2;
+
+/**
+ * \brief The activity record for source level result
+ * branch. (deprecated)
+ *
+ * This activity records the locations of the branches in the
+ * source (CUPTI_ACTIVITY_KIND_BRANCH).
+ * Branch activities are now reported using the
+ * CUpti_ActivityBranch2 activity record.
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_BRANCH.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The ID for the source locator.
+   */
+  uint32_t sourceLocatorId;
+
+  /**
+   * The correlation ID of the kernel with which this result is associated.
+   */
+  uint32_t correlationId;
+
+  /**
+   * The pc offset for the branch.
+   */
+  uint32_t pcOffset;
+
+  /**
+   * The number of times this instruction was executed per warp. It will be incremented
+   * regardless of predicate or condition code.
+   */
+  uint32_t executed;
+
+  /**
+   * The number of times this branch diverged.
+   */
+  uint32_t diverged;
+
+  /**
+   * Each time this instruction is executed, this is incremented by the number
+   * of threads that executed it.
+   */
+  uint64_t threadsExecuted;
+} CUpti_ActivityBranch;
+
+/**
+ * \brief The activity record for PC sampling. (deprecated in CUDA 8.0)
+ *
+ * This activity records information obtained by sampling the PC
+ * (CUPTI_ACTIVITY_KIND_PC_SAMPLING).
+ * PC sampling activities are now reported using the
+ * CUpti_ActivityPCSampling2 activity record.
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_PC_SAMPLING.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The properties of this instruction.
+   */
+  CUpti_ActivityFlag flags;
+
+  /**
+   * The ID for the source locator.
+   */
+  uint32_t sourceLocatorId;
+
+  /**
+   * The correlation ID of the kernel with which this result is associated.
+   */
+  uint32_t correlationId;
+
+  /**
+   * Correlation ID with global/device function name.
+   */
+  uint32_t functionId;
+
+  /**
+   * The pc offset for the instruction.
+   */
+  uint32_t pcOffset;
+
+  /**
+   * The number of times the PC was sampled with the stallReason in the record.
+   * The same PC can be sampled with different stall reasons.
+   */
+  uint32_t samples;
+
+  /**
+   * The current stall reason. Includes one of the reasons from
+   * \ref CUpti_ActivityPCSamplingStallReason.
+   */
+  CUpti_ActivityPCSamplingStallReason stallReason;
+} CUpti_ActivityPCSampling;
+
+/**
+ * \brief The activity record for PC sampling. (deprecated in CUDA 9.0)
+ *
+ * This activity records information obtained by sampling the PC
+ * (CUPTI_ACTIVITY_KIND_PC_SAMPLING).
+ * PC sampling activities are now reported using the
+ * CUpti_ActivityPCSampling3 activity record.
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_PC_SAMPLING.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The properties of this instruction.
+   */
+  CUpti_ActivityFlag flags;
+
+  /**
+   * The ID for the source locator.
+   */
+  uint32_t sourceLocatorId;
+
+  /**
+   * The correlation ID of the kernel with which this result is associated.
+   */
+  uint32_t correlationId;
+
+  /**
+   * Correlation ID with global/device function name.
+   */
+  uint32_t functionId;
+
+  /**
+   * The pc offset for the instruction.
+   */
+  uint32_t pcOffset;
+
+  /**
+   * The number of times the PC was sampled with the stallReason in the record.
+   * These samples indicate that no instruction was issued in that cycle from
+   * the warp scheduler from where the warp was sampled.
+   * This field is valid for devices with compute capability 6.0 and higher.
+   */
+  uint32_t latencySamples;
+
+  /**
+   * The number of times the PC was sampled with the stallReason in the record.
+   * The same PC can be sampled with different stall reasons. The count includes
+   * latencySamples.
+   */
+  uint32_t samples;
+
+  /**
+   * The current stall reason. Includes one of the reasons from
+   * \ref CUpti_ActivityPCSamplingStallReason.
+   */
+  CUpti_ActivityPCSamplingStallReason stallReason;
+
+  uint32_t pad;
+} CUpti_ActivityPCSampling2;
+
+/**
+ * \brief The activity record for Unified Memory counters. (deprecated in CUDA 7.0)
+ *
+ * This activity record represents a Unified Memory counter
+ * (CUPTI_ACTIVITY_KIND_UNIFIED_MEMORY_COUNTER).
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_UNIFIED_MEMORY_COUNTER.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The Unified Memory counter kind. See \ref CUpti_ActivityUnifiedMemoryCounterKind.
+   */
+  CUpti_ActivityUnifiedMemoryCounterKind counterKind;
+
+  /**
+   * The scope of the Unified Memory counter. See \ref CUpti_ActivityUnifiedMemoryCounterScope.
+   */
+  CUpti_ActivityUnifiedMemoryCounterScope scope;
+
+  /**
+   * The ID of the device involved in the memory transfer operation.
+   * It is not relevant if the scope of the counter is global (all devices).
+   */
+  uint32_t deviceId;
+
+  /**
+   * The value of the counter.
+   *
+   */
+  uint64_t value;
+
+  /**
+   * The timestamp when this sample was retrieved, in ns. A value of 0
+   * indicates that timestamp information could not be collected.
+   */
+  uint64_t timestamp;
+
+  /**
+   * The ID of the process to which this record belongs. In case of
+   * global scope, processId is undefined.
+   */
+  uint32_t processId;
+
+  /**
+   * Undefined. Reserved for internal use.
+   */
+  uint32_t pad;
+} CUpti_ActivityUnifiedMemoryCounter;
+
+/**
+ * \brief NVLink information. (deprecated in CUDA 9.0)
+ *
+ * This structure gives capabilities of each logical NVLink connection between two devices,
+ * GPU<->GPU or GPU<->CPU, which can be used to understand the topology.
+ * NVLink information is now reported using the
+ * CUpti_ActivityNvLink2 activity record.
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_NVLINK.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * NVLink version.
+   */
+  uint32_t nvlinkVersion;
+
+  /**
+   * Type of device 0 \ref CUpti_DevType
+   */
+  CUpti_DevType typeDev0;
+
+  /**
+   * Type of device 1 \ref CUpti_DevType
+   */
+  CUpti_DevType typeDev1;
+
+  /**
+   * If typeDev0 is CUPTI_DEV_TYPE_GPU, UUID for device 0. \ref CUpti_ActivityDevice5.
+   * If typeDev0 is CUPTI_DEV_TYPE_NPU, struct npu for NPU.
+   */
+  union {
+    CUuuid uuidDev;
+    struct {
+      /**
+       * Index of the NPU. First index will always be zero.
+       */
+      uint32_t index;
+
+      /**
+       * Domain ID of NPU. On Linux, this can be queried using lspci.
+       */
+      uint32_t domainId;
+    } npu;
+  } idDev0;
+
+  /**
+   * If typeDev1 is CUPTI_DEV_TYPE_GPU, UUID for device 1. \ref CUpti_ActivityDevice5.
+   * If typeDev1 is CUPTI_DEV_TYPE_NPU, struct npu for NPU.
+   */
+  union {
+    CUuuid uuidDev;
+    struct {
+      /**
+       * Index of the NPU. First index will always be zero.
+       */
+      uint32_t index;
+
+      /**
+       * Domain ID of NPU. On Linux, this can be queried using lspci.
+       */
+      uint32_t domainId;
+    } npu;
+  } idDev1;
+
+  /**
+   * Flag gives capabilities of the link \see CUpti_LinkFlag
+   */
+  uint32_t flag;
+
+  /**
+   * Number of physical NVLinks present between two devices.
+   */
+  uint32_t physicalNvLinkCount;
+
+  /**
+   * Port numbers for a maximum of 4 NVLinks connected to device 0.
+   * If typeDev0 is CUPTI_DEV_TYPE_NPU, ignore this field.
+   * In case of invalid/unknown port number, this field will be set
+   * to value CUPTI_NVLINK_INVALID_PORT.
+   * This will be used to correlate the metric values to individual
+   * physical link and attribute traffic to the logical NVLink in
+   * the topology.
+   */
+  int8_t portDev0[4];
+
+  /**
+   * Port numbers for a maximum of 4 NVLinks connected to device 1.
+   * If typeDev1 is CUPTI_DEV_TYPE_NPU, ignore this field.
+   * In case of invalid/unknown port number, this field will be set
+   * to value CUPTI_NVLINK_INVALID_PORT.
+   * This will be used to correlate the metric values to individual
+   * physical link and attribute traffic to the logical NVLink in
+   * the topology.
+   */
+  int8_t portDev1[4];
+
+  /**
+   * Bandwidth of NVLink in kbytes/sec.
+   */
+  uint64_t bandwidth;
+} CUpti_ActivityNvLink;
+
+/**
+ * \brief NVLink information. (deprecated in CUDA 10.0)
+ *
+ * This structure gives capabilities of each logical NVLink connection between two devices,
+ * GPU<->GPU or GPU<->CPU, which can be used to understand the topology.
+ * NVLink information is now reported using the
+ * CUpti_ActivityNvLink4 activity record.
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_NVLINK.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * NVLink version.
+   */
+  uint32_t nvlinkVersion;
+
+  /**
+   * Type of device 0 \ref CUpti_DevType
+   */
+  CUpti_DevType typeDev0;
+
+  /**
+   * Type of device 1 \ref CUpti_DevType
+   */
+  CUpti_DevType typeDev1;
+
+  /**
+   * If typeDev0 is CUPTI_DEV_TYPE_GPU, UUID for device 0. \ref CUpti_ActivityDevice5.
+   * If typeDev0 is CUPTI_DEV_TYPE_NPU, struct npu for NPU.
+   */
+  union {
+    CUuuid uuidDev;
+    struct {
+      /**
+       * Index of the NPU. First index will always be zero.
+       */
+      uint32_t index;
+
+      /**
+       * Domain ID of NPU. On Linux, this can be queried using lspci.
+       */
+      uint32_t domainId;
+    } npu;
+  } idDev0;
+
+  /**
+   * If typeDev1 is CUPTI_DEV_TYPE_GPU, UUID for device 1. \ref CUpti_ActivityDevice5.
+   * If typeDev1 is CUPTI_DEV_TYPE_NPU, struct npu for NPU.
+   */
+  union {
+    CUuuid uuidDev;
+    struct {
+      /**
+       * Index of the NPU. First index will always be zero.
+       */
+      uint32_t index;
+
+      /**
+       * Domain ID of NPU. On Linux, this can be queried using lspci.
+       */
+      uint32_t domainId;
+    } npu;
+  } idDev1;
+
+  /**
+   * Flag gives capabilities of the link \see CUpti_LinkFlag
+   */
+  uint32_t flag;
+
+  /**
+   * Number of physical NVLinks present between two devices.
+   */
+  uint32_t physicalNvLinkCount;
+
+  /**
+   * Port numbers for a maximum of 16 NVLinks connected to device 0.
+   * If typeDev0 is CUPTI_DEV_TYPE_NPU, ignore this field.
+   * In case of invalid/unknown port number, this field will be set
+   * to value CUPTI_NVLINK_INVALID_PORT.
+   * This will be used to correlate the metric values to individual
+   * physical link and attribute traffic to the logical NVLink in
+   * the topology.
+   */
+  int8_t portDev0[CUPTI_MAX_NVLINK_PORTS];
+
+  /**
+   * Port numbers for a maximum of 16 NVLinks connected to device 1.
+   * If typeDev1 is CUPTI_DEV_TYPE_NPU, ignore this field.
+   * In case of invalid/unknown port number, this field will be set
+   * to value CUPTI_NVLINK_INVALID_PORT.
+   * This will be used to correlate the metric values to individual
+   * physical link and attribute traffic to the logical NVLink in
+   * the topology.
+   */
+  int8_t portDev1[CUPTI_MAX_NVLINK_PORTS];
+
+  /**
+   * Bandwidth of NVLink in kbytes/sec.
+   */
+  uint64_t  bandwidth;
+} CUpti_ActivityNvLink2;
+
+/**
+ * \brief NVLink information.
+ *
+ * This structure gives capabilities of each logical NVLink connection between two devices,
+ * GPU<->GPU or GPU<->CPU, which can be used to understand the topology.
+ * NVLink information is now reported using the
+ * CUpti_ActivityNvLink4 activity record.
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_NVLINK.
+   */
+  CUpti_ActivityKind kind;
+  /**
+   * NVLink version.
+   */
+  uint32_t nvlinkVersion;
+
+  /**
+   * Type of device 0 \ref CUpti_DevType
+   */
+  CUpti_DevType typeDev0;
+
+  /**
+   * Type of device 1 \ref CUpti_DevType
+   */
+  CUpti_DevType typeDev1;
+
+  /**
+   * If typeDev0 is CUPTI_DEV_TYPE_GPU, UUID for device 0. \ref CUpti_ActivityDevice5.
+   * If typeDev0 is CUPTI_DEV_TYPE_NPU, struct npu for NPU.
+   */
+  union {
+    CUuuid uuidDev;
+    struct {
+      /**
+       * Index of the NPU. First index will always be zero.
+       */
+      uint32_t index;
+
+      /**
+       * Domain ID of NPU. On Linux, this can be queried using lspci.
+       */
+      uint32_t domainId;
+    } npu;
+  } idDev0;
+
+  /**
+   * If typeDev1 is CUPTI_DEV_TYPE_GPU, UUID for device 1. \ref CUpti_ActivityDevice5.
+   * If typeDev1 is CUPTI_DEV_TYPE_NPU, struct npu for NPU.
+   */
+  union {
+    CUuuid uuidDev;
+    struct {
+      /**
+       * Index of the NPU. First index will always be zero.
+       */
+      uint32_t index;
+
+      /**
+       * Domain ID of NPU. On Linux, this can be queried using lspci.
+       */
+      uint32_t domainId;
+    } npu;
+  } idDev1;
+
+  /**
+   * Flag gives capabilities of the link \see CUpti_LinkFlag
+   */
+  uint32_t flag;
+
+  /**
+   * Number of physical NVLinks present between two devices.
+   */
+  uint32_t physicalNvLinkCount;
+
+  /**
+   * Port numbers for a maximum of 16 NVLinks connected to device 0.
+   * If typeDev0 is CUPTI_DEV_TYPE_NPU, ignore this field.
+   * In case of invalid/unknown port number, this field will be set
+   * to value CUPTI_NVLINK_INVALID_PORT.
+   * This will be used to correlate the metric values to individual
+   * physical link and attribute traffic to the logical NVLink in
+   * the topology.
+   */
+  int8_t portDev0[CUPTI_MAX_NVLINK_PORTS];
+
+  /**
+   * Port numbers for a maximum of 16 NVLinks connected to device 1.
+   * If typeDev1 is CUPTI_DEV_TYPE_NPU, ignore this field.
+   * In case of invalid/unknown port number, this field will be set
+   * to value CUPTI_NVLINK_INVALID_PORT.
+   * This will be used to correlate the metric values to individual
+   * physical link and attribute traffic to the logical NVLink in
+   * the topology.
+   */
+  int8_t portDev1[CUPTI_MAX_NVLINK_PORTS];
+
+  /**
+   * Bandwidth of NVLink in kbytes/sec.
+   */
+  uint64_t bandwidth;
+
+  /**
+   * NVSwitch is connected as an intermediate node.
+   */
+  uint8_t nvswitchConnected;
+
+  /**
+   * Undefined. Reserved for internal use.
+   */
+  uint8_t pad[7];
+} CUpti_ActivityNvLink3;
+
+/**
+ * \brief The activity record for trace of graph execution.
+ *
+ * This activity record represents the execution of a graph without giving visibility
+ * into the execution of its nodes; this is intended to reduce per-node tracing overhead.
+ * The activity kind is CUPTI_ACTIVITY_KIND_GRAPH_TRACE. Graph trace activity is now
+ * reported using the CUpti_ActivityGraphTrace2 record. Note: this struct is declared
+ * without PACKED_ALIGNMENT, unlike the neighboring activity records.
+ */
+typedef struct {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_GRAPH_TRACE.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The correlation ID of the graph launch. Each graph launch is
+   * assigned a unique correlation ID that is identical to the
+   * correlation ID in the driver API activity record that launched
+   * the graph.
+   */
+  uint32_t correlationId;
+
+  /**
+   * The start timestamp for the graph execution, in ns. A value of 0
+   * for both the start and end timestamps indicates that timestamp
+   * information could not be collected for the graph.
+   */
+  uint64_t start;
+
+  /**
+   * The end timestamp for the graph execution, in ns. A value of 0
+   * for both the start and end timestamps indicates that timestamp
+   * information could not be collected for the graph.
+   */
+  uint64_t end;
+
+  /**
+   * The ID of the device where the graph execution is occurring.
+   */
+  uint32_t deviceId;
+
+  /**
+   * The unique ID of the graph that is launched.
+   */
+  uint32_t graphId;
+
+  /**
+   * The ID of the context where the graph is being launched.
+   */
+  uint32_t contextId;
+
+  /**
+   * The ID of the stream where the graph is being launched.
+   */
+  uint32_t streamId;
+
+  /**
+   * This field is reserved for internal use.
+   */
+  void *reserved;
+} CUpti_ActivityGraphTrace;
+
+/**
+ * \brief The activity record for a context.
+ *
+ * This activity record represents information about a context
+ * (CUPTI_ACTIVITY_KIND_CONTEXT).
+ * Context activity is now reported using the CUpti_ActivityContext2 record.
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_CONTEXT.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The context ID.
+   */
+  uint32_t contextId;
+
+  /**
+   * The device ID.
+   */
+  uint32_t deviceId;
+
+  /**
+   * The compute API kind. \see CUpti_ActivityComputeApiKind
+   */
+  uint16_t computeApiKind;
+
+  /**
+   * The ID for the NULL stream in this context.
+   */
+  uint16_t nullStreamId;
+} CUpti_ActivityContext;
+
+/**
+ * \brief The activity record for JIT operations.
+ * This activity represents the JIT operations (compile, load, store) of a CUmodule
+ * from the Compute Cache.
+ * It gives the exact hashed path from which the cached module is loaded,
+ * or where the module will be stored after Just-In-Time (JIT) compilation.
+ *
+ * JIT activity is now reported using the CUpti_ActivityJit2 record.
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_JIT.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The JIT entry type.
+   */
+  CUpti_ActivityJitEntryType jitEntryType;
+
+  /**
+   * The JIT operation type.
+   */
+  CUpti_ActivityJitOperationType jitOperationType;
+
+  /**
+   * The device ID.
+   */
+  uint32_t deviceId;
+
+  /**
+   * The start timestamp for the JIT operation, in ns. A value of 0 for
+   * both the start and end timestamps indicates that timestamp
+   * information could not be collected for the JIT operation.
+   */
+  uint64_t start;
+
+  /**
+   * The end timestamp for the JIT operation, in ns. A value of 0 for both
+   * the start and end timestamps indicates that timestamp information
+   * could not be collected for the JIT operation.
+   */
+  uint64_t end;
+
+  /**
+   * The correlation ID of the JIT operation to which
+   * the records belong. Each JIT operation is
+   * assigned a unique correlation ID that is identical to the
+   * correlation ID in the driver or runtime API activity record that
+   * launched the JIT operation.
+   */
+  uint32_t correlationId;
+
+  /**
+   * Internal use.
+   */
+  uint32_t padding;
+
+  /**
+   * The correlation ID to correlate JIT compilation, load and store operations.
+   * Each JIT compilation unit is assigned a unique correlation ID
+   * at the time of the JIT compilation. This correlation ID can be used
+   * to find the matching JIT cache load/store records.
+   */
+  uint64_t jitOperationCorrelationId;
+
+  /**
+   * The size of the compute cache.
+   */
+  uint64_t cacheSize;
+
+  /**
+   * The path where the fat binary is cached.
+   */
+  const char* cachePath;
+} CUpti_ActivityJit;
+
+
+#if defined(__GNUC__) && defined(CUPTI_LIB)
+    #pragma GCC visibility pop
+#endif
+
+#if defined(__cplusplus)
+}
+#endif
+
+#endif /*_CUPTI_ACTIVITY_DEPRECATED_H_*/