diff --git a/.gitattributes b/.gitattributes
index a775da618208b36c6270e405ed3b309c9ff56403..91b626d9ae9e420a922728173caa998b7a279581 100644
--- a/.gitattributes
+++ b/.gitattributes
@@ -409,3 +409,6 @@ tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cudnn/lib/
 .venv/lib/python3.11/site-packages/aiohttp/_websocket/reader_c.cpython-311-x86_64-linux-gnu.so filter=lfs diff=lfs merge=lfs -text
 .venv/lib/python3.11/site-packages/opencv_python_headless.libs/libssl-28bef1ac.so.1.1 filter=lfs diff=lfs merge=lfs -text
 .venv/lib/python3.11/site-packages/opencv_python_headless.libs/libavutil-734d06dd.so.57.28.100 filter=lfs diff=lfs merge=lfs -text
+.venv/lib/python3.11/site-packages/opencv_python_headless.libs/libavcodec-76c43bf0.so.59.37.100 filter=lfs diff=lfs merge=lfs -text
+.venv/lib/python3.11/site-packages/nvidia/cuda_nvrtc/lib/libnvrtc-builtins.so.12.4 filter=lfs diff=lfs merge=lfs -text
+.venv/lib/python3.11/site-packages/cv2/cv2.abi3.so filter=lfs diff=lfs merge=lfs -text
diff --git a/.venv/lib/python3.11/site-packages/charset_normalizer/__init__.py b/.venv/lib/python3.11/site-packages/charset_normalizer/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..0d3a37990145e94ad85406166dbaf52f4c311e5e
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/charset_normalizer/__init__.py
@@ -0,0 +1,48 @@
+"""
+Charset-Normalizer
+~~~~~~~~~~~~~~~~~~
+The Real First Universal Charset Detector.
+A library that helps you read text from an unknown charset encoding.
+Motivated by chardet, this package tries to solve the same problem by taking a new approach.
+All IANA character set names for which the Python core library provides codecs are supported.
+
+Basic usage:
+   >>> from charset_normalizer import from_bytes
+   >>> results = from_bytes('Bсеки човек има право на образование. Oбразованието!'.encode('utf_8'))
+   >>> best_guess = results.best()
+   >>> str(best_guess)
+   'Bсеки човек има право на образование. Oбразованието!'
+
+Other methods and usages are available - see the full documentation
+at <https://github.com/Ousret/charset_normalizer>.
+:copyright: (c) 2021 by Ahmed TAHRI
+:license: MIT, see LICENSE for more details.
+"""
+
+from __future__ import annotations
+
+import logging
+
+from .api import from_bytes, from_fp, from_path, is_binary
+from .legacy import detect
+from .models import CharsetMatch, CharsetMatches
+from .utils import set_logging_handler
+from .version import VERSION, __version__
+
+__all__ = (  # public names re-exported from the sub-modules imported above
+    "from_fp",
+    "from_path",
+    "from_bytes",
+    "is_binary",
+    "detect",
+    "CharsetMatch",
+    "CharsetMatches",
+    "__version__",
+    "VERSION",
+    "set_logging_handler",
+)
+
+# Attach a NullHandler to the package's top level logger by default, so that merely
+# importing the library installs no handler other than this no-op one. See:
+# https://docs.python.org/3.3/howto/logging.html#configuring-logging-for-a-library
+logging.getLogger("charset_normalizer").addHandler(logging.NullHandler())
diff --git a/.venv/lib/python3.11/site-packages/charset_normalizer/__main__.py b/.venv/lib/python3.11/site-packages/charset_normalizer/__main__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e0e76f7bfbb411d4424d3a1834b0ea803d80ea7e
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/charset_normalizer/__main__.py
@@ -0,0 +1,6 @@
+from __future__ import annotations
+
+from .cli import cli_detect
+
+if __name__ == "__main__":  # true when the package is run as `python -m charset_normalizer`
+    cli_detect()
diff --git a/.venv/lib/python3.11/site-packages/charset_normalizer/api.py b/.venv/lib/python3.11/site-packages/charset_normalizer/api.py
new file mode 100644
index 0000000000000000000000000000000000000000..2c8c0618cc5ad2e92380edb194a5d9b8b2d977c2
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/charset_normalizer/api.py
@@ -0,0 +1,668 @@
+from __future__ import annotations
+
+import logging
+from os import PathLike
+from typing import BinaryIO
+
+from .cd import (
+    coherence_ratio,
+    encoding_languages,
+    mb_encoding_languages,
+    merge_coherence_ratios,
+)
+from .constant import IANA_SUPPORTED, TOO_BIG_SEQUENCE, TOO_SMALL_SEQUENCE, TRACE
+from .md import mess_ratio
+from .models import CharsetMatch, CharsetMatches
+from .utils import (
+    any_specified_encoding,
+    cut_sequence_chunks,
+    iana_name,
+    identify_sig_or_bom,
+    is_cp_similar,
+    is_multi_byte_encoding,
+    should_strip_sig_or_bom,
+)
+
+logger = logging.getLogger("charset_normalizer")  # logger shared by the functions of this module
+explain_handler = logging.StreamHandler()  # attached to `logger` only by from_bytes(explain=True)
+explain_handler.setFormatter(
+    logging.Formatter("%(asctime)s | %(levelname)s | %(message)s")
+)
+
+
+def from_bytes(
+    sequences: bytes | bytearray,
+    steps: int = 5,
+    chunk_size: int = 512,
+    threshold: float = 0.2,
+    cp_isolation: list[str] | None = None,
+    cp_exclusion: list[str] | None = None,
+    preemptive_behaviour: bool = True,
+    explain: bool = False,
+    language_threshold: float = 0.1,
+    enable_fallback: bool = True,
+) -> CharsetMatches:
+    """
+    Given a raw bytes sequence, return the best possible charsets usable to render str objects.
+    If there are no results, it is a strong indicator that the source is binary/not text.
+    By default, the process extracts 5 blocks of 512 bytes each to assess the mess and coherence of a given sequence,
+    and gives up on a particular code page after 20% of measured mess. Those criteria are customizable at will.
+
+    The preemptive behavior DOES NOT replace the traditional detection workflow; it prioritizes a particular code
+    page but never takes it for granted. It can improve the performance.
+
+    To restrict the search to some code pages and/or rule others out, use cp_isolation and cp_exclusion
+    respectively. Raises TypeError when sequences is neither bytes nor bytearray.
+
+    This function will strip the SIG in the payload/sequence every time except on UTF-16, UTF-32.
+    By default the library does not set up any handler other than the NullHandler. If you set the 'explain'
+    toggle to True, a StreamHandler suitable for debugging is added to the logger and removed before returning.
+    Custom logging format and handler can be set manually.
+    """
+
+    if not isinstance(sequences, (bytearray, bytes)):
+        raise TypeError(
+            "Expected object of type bytes or bytearray, got: {}".format(
+                type(sequences)
+            )
+        )
+
+    if explain:
+        previous_logger_level: int = logger.level
+        logger.addHandler(explain_handler)
+        logger.setLevel(TRACE)
+
+    length: int = len(sequences)
+
+    if length == 0:
+        logger.debug("Encoding detection on empty bytes, assuming utf_8 intention.")
+        if explain:  # Defensive: ensure exit path clean handler
+            logger.removeHandler(explain_handler)
+            logger.setLevel(previous_logger_level or logging.WARNING)  # NOTE(review): only this exit falls back to WARNING
+        return CharsetMatches([CharsetMatch(sequences, "utf_8", 0.0, False, [], "")])
+
+    if cp_isolation is not None:
+        logger.log(
+            TRACE,
+            "cp_isolation is set. use this flag for debugging purpose. "
+            "limited list of encoding allowed : %s.",
+            ", ".join(cp_isolation),
+        )
+        cp_isolation = [iana_name(cp, False) for cp in cp_isolation]
+    else:
+        cp_isolation = []
+
+    if cp_exclusion is not None:
+        logger.log(
+            TRACE,
+            "cp_exclusion is set. use this flag for debugging purpose. "
+            "limited list of encoding excluded : %s.",
+            ", ".join(cp_exclusion),
+        )
+        cp_exclusion = [iana_name(cp, False) for cp in cp_exclusion]
+    else:
+        cp_exclusion = []
+
+    if length <= (chunk_size * steps):
+        logger.log(
+            TRACE,
+            "override steps (%i) and chunk_size (%i) as content does not fit (%i byte(s) given) parameters.",
+            steps,
+            chunk_size,
+            length,
+        )
+        steps = 1
+        chunk_size = length
+
+    if steps > 1 and length / steps < chunk_size:
+        chunk_size = int(length / steps)
+
+    is_too_small_sequence: bool = len(sequences) < TOO_SMALL_SEQUENCE
+    is_too_large_sequence: bool = len(sequences) >= TOO_BIG_SEQUENCE
+
+    if is_too_small_sequence:
+        logger.log(
+            TRACE,
+            "Trying to detect encoding from a tiny portion of ({}) byte(s).".format(
+                length
+            ),
+        )
+    elif is_too_large_sequence:
+        logger.log(
+            TRACE,
+            "Using lazy str decoding because the payload is quite large, ({}) byte(s).".format(
+                length
+            ),
+        )
+
+    prioritized_encodings: list[str] = []
+
+    specified_encoding: str | None = (
+        any_specified_encoding(sequences) if preemptive_behaviour else None
+    )
+
+    if specified_encoding is not None:
+        prioritized_encodings.append(specified_encoding)
+        logger.log(
+            TRACE,
+            "Detected declarative mark in sequence. Priority +1 given for %s.",
+            specified_encoding,
+        )
+
+    tested: set[str] = set()
+    tested_but_hard_failure: list[str] = []
+    tested_but_soft_failure: list[str] = []
+
+    fallback_ascii: CharsetMatch | None = None
+    fallback_u8: CharsetMatch | None = None
+    fallback_specified: CharsetMatch | None = None
+
+    results: CharsetMatches = CharsetMatches()
+
+    early_stop_results: CharsetMatches = CharsetMatches()
+
+    sig_encoding, sig_payload = identify_sig_or_bom(sequences)
+
+    if sig_encoding is not None:
+        prioritized_encodings.append(sig_encoding)
+        logger.log(
+            TRACE,
+            "Detected a SIG or BOM mark on first %i byte(s). Priority +1 given for %s.",
+            len(sig_payload),
+            sig_encoding,
+        )
+
+    prioritized_encodings.append("ascii")  # ascii and utf_8 are always queued ahead of IANA_SUPPORTED
+
+    if "utf_8" not in prioritized_encodings:
+        prioritized_encodings.append("utf_8")
+
+    for encoding_iana in prioritized_encodings + IANA_SUPPORTED:
+        if cp_isolation and encoding_iana not in cp_isolation:
+            continue
+
+        if cp_exclusion and encoding_iana in cp_exclusion:
+            continue
+
+        if encoding_iana in tested:
+            continue
+
+        tested.add(encoding_iana)
+
+        decoded_payload: str | None = None
+        bom_or_sig_available: bool = sig_encoding == encoding_iana
+        strip_sig_or_bom: bool = bom_or_sig_available and should_strip_sig_or_bom(
+            encoding_iana
+        )
+
+        if encoding_iana in {"utf_16", "utf_32"} and not bom_or_sig_available:
+            logger.log(
+                TRACE,
+                "Encoding %s won't be tested as-is because it require a BOM. Will try some sub-encoder LE/BE.",
+                encoding_iana,
+            )
+            continue
+        if encoding_iana in {"utf_7"} and not bom_or_sig_available:
+            logger.log(
+                TRACE,
+                "Encoding %s won't be tested as-is because detection is unreliable without BOM/SIG.",
+                encoding_iana,
+            )
+            continue
+
+        try:
+            is_multi_byte_decoder: bool = is_multi_byte_encoding(encoding_iana)
+        except (ModuleNotFoundError, ImportError):
+            logger.log(
+                TRACE,
+                "Encoding %s does not provide an IncrementalDecoder",
+                encoding_iana,
+            )
+            continue
+
+        try:
+            if is_too_large_sequence and is_multi_byte_decoder is False:
+                str(
+                    (
+                        sequences[: int(50e4)]  # lazy path: only the first 500,000 bytes are decoded here
+                        if strip_sig_or_bom is False
+                        else sequences[len(sig_payload) : int(50e4)]
+                    ),
+                    encoding=encoding_iana,
+                )
+            else:
+                decoded_payload = str(
+                    (
+                        sequences
+                        if strip_sig_or_bom is False
+                        else sequences[len(sig_payload) :]
+                    ),
+                    encoding=encoding_iana,
+                )
+        except (UnicodeDecodeError, LookupError) as e:
+            if not isinstance(e, LookupError):
+                logger.log(
+                    TRACE,
+                    "Code page %s does not fit given bytes sequence at ALL. %s",
+                    encoding_iana,
+                    str(e),
+                )
+            tested_but_hard_failure.append(encoding_iana)
+            continue
+
+        similar_soft_failure_test: bool = False
+
+        for encoding_soft_failed in tested_but_soft_failure:
+            if is_cp_similar(encoding_iana, encoding_soft_failed):
+                similar_soft_failure_test = True
+                break
+
+        if similar_soft_failure_test:
+            logger.log(
+                TRACE,
+                "%s is deemed too similar to code page %s and was consider unsuited already. Continuing!",
+                encoding_iana,
+                encoding_soft_failed,
+            )
+            continue
+
+        r_ = range(
+            0 if not bom_or_sig_available else len(sig_payload),
+            length,
+            int(length / steps),
+        )
+
+        multi_byte_bonus: bool = (
+            is_multi_byte_decoder
+            and decoded_payload is not None
+            and len(decoded_payload) < length
+        )
+
+        if multi_byte_bonus:
+            logger.log(
+                TRACE,
+                "Code page %s is a multi byte encoding table and it appear that at least one character "
+                "was encoded using n-bytes.",
+                encoding_iana,
+            )
+
+        max_chunk_gave_up: int = int(len(r_) / 4)
+
+        max_chunk_gave_up = max(max_chunk_gave_up, 2)  # give up no sooner than the 2nd messy chunk
+        early_stop_count: int = 0
+        lazy_str_hard_failure = False
+
+        md_chunks: list[str] = []
+        md_ratios = []
+
+        try:
+            for chunk in cut_sequence_chunks(
+                sequences,
+                encoding_iana,
+                r_,
+                chunk_size,
+                bom_or_sig_available,
+                strip_sig_or_bom,
+                sig_payload,
+                is_multi_byte_decoder,
+                decoded_payload,
+            ):
+                md_chunks.append(chunk)
+
+                md_ratios.append(
+                    mess_ratio(
+                        chunk,
+                        threshold,
+                        explain is True and 1 <= len(cp_isolation) <= 2,
+                    )
+                )
+
+                if md_ratios[-1] >= threshold:
+                    early_stop_count += 1
+
+                if (early_stop_count >= max_chunk_gave_up) or (
+                    bom_or_sig_available and strip_sig_or_bom is False
+                ):
+                    break
+        except (
+            UnicodeDecodeError
+        ) as e:  # Lazy str loading may have missed something there
+            logger.log(
+                TRACE,
+                "LazyStr Loading: After MD chunk decode, code page %s does not fit given bytes sequence at ALL. %s",
+                encoding_iana,
+                str(e),
+            )
+            early_stop_count = max_chunk_gave_up
+            lazy_str_hard_failure = True
+
+        # Lazy path only: strictly decode the rest of the payload, unless the MD chunk decoding already hard-failed.
+        # NOTE(review): this starts at int(50e3) while the first probe stopped at int(50e4) - confirm overlap is intended.
+        if (
+            not lazy_str_hard_failure
+            and is_too_large_sequence
+            and not is_multi_byte_decoder
+        ):
+            try:
+                sequences[int(50e3) :].decode(encoding_iana, errors="strict")
+            except UnicodeDecodeError as e:
+                logger.log(
+                    TRACE,
+                    "LazyStr Loading: After final lookup, code page %s does not fit given bytes sequence at ALL. %s",
+                    encoding_iana,
+                    str(e),
+                )
+                tested_but_hard_failure.append(encoding_iana)
+                continue
+
+        mean_mess_ratio: float = sum(md_ratios) / len(md_ratios) if md_ratios else 0.0
+        if mean_mess_ratio >= threshold or early_stop_count >= max_chunk_gave_up:
+            tested_but_soft_failure.append(encoding_iana)
+            logger.log(
+                TRACE,
+                "%s was excluded because of initial chaos probing. Gave up %i time(s). "
+                "Computed mean chaos is %f %%.",
+                encoding_iana,
+                early_stop_count,
+                round(mean_mess_ratio * 100, ndigits=3),
+            )
+            # Preparing those fallbacks in case we got nothing.
+            if (
+                enable_fallback
+                and encoding_iana in ["ascii", "utf_8", specified_encoding]
+                and not lazy_str_hard_failure
+            ):
+                fallback_entry = CharsetMatch(
+                    sequences,
+                    encoding_iana,
+                    threshold,
+                    False,
+                    [],
+                    decoded_payload,
+                    preemptive_declaration=specified_encoding,
+                )
+                if encoding_iana == specified_encoding:
+                    fallback_specified = fallback_entry
+                elif encoding_iana == "ascii":
+                    fallback_ascii = fallback_entry
+                else:
+                    fallback_u8 = fallback_entry
+            continue
+
+        logger.log(
+            TRACE,
+            "%s passed initial chaos probing. Mean measured chaos is %f %%",
+            encoding_iana,
+            round(mean_mess_ratio * 100, ndigits=3),
+        )
+
+        if not is_multi_byte_decoder:
+            target_languages: list[str] = encoding_languages(encoding_iana)
+        else:
+            target_languages = mb_encoding_languages(encoding_iana)
+
+        if target_languages:
+            logger.log(
+                TRACE,
+                "{} should target any language(s) of {}".format(
+                    encoding_iana, str(target_languages)
+                ),
+            )
+
+        cd_ratios = []
+
+        # We shall skip the CD when it's about ASCII
+        # Most of the time it's not relevant to run "language-detection" on it.
+        if encoding_iana != "ascii":
+            for chunk in md_chunks:
+                chunk_languages = coherence_ratio(
+                    chunk,
+                    language_threshold,
+                    ",".join(target_languages) if target_languages else None,
+                )
+
+                cd_ratios.append(chunk_languages)
+
+        cd_ratios_merged = merge_coherence_ratios(cd_ratios)
+
+        if cd_ratios_merged:
+            logger.log(
+                TRACE,
+                "We detected language {} using {}".format(
+                    cd_ratios_merged, encoding_iana
+                ),
+            )
+
+        current_match = CharsetMatch(
+            sequences,
+            encoding_iana,
+            mean_mess_ratio,
+            bom_or_sig_available,
+            cd_ratios_merged,
+            (
+                decoded_payload
+                if (
+                    is_too_large_sequence is False
+                    or encoding_iana in [specified_encoding, "ascii", "utf_8"]
+                )
+                else None
+            ),
+            preemptive_declaration=specified_encoding,
+        )
+
+        results.append(current_match)
+
+        if (
+            encoding_iana in [specified_encoding, "ascii", "utf_8"]
+            and mean_mess_ratio < 0.1
+        ):
+            # If md says nothing to worry about, then... stop immediately!
+            if mean_mess_ratio == 0.0:
+                logger.debug(
+                    "Encoding detection: %s is most likely the one.",
+                    current_match.encoding,
+                )
+                if explain:  # Defensive: ensure exit path clean handler
+                    logger.removeHandler(explain_handler)
+                    logger.setLevel(previous_logger_level)
+                return CharsetMatches([current_match])
+
+            early_stop_results.append(current_match)
+
+        if (
+            len(early_stop_results)
+            and (specified_encoding is None or specified_encoding in tested)
+            and "ascii" in tested
+            and "utf_8" in tested
+        ):
+            probable_result: CharsetMatch = early_stop_results.best()  # type: ignore[assignment]
+            logger.debug(
+                "Encoding detection: %s is most likely the one.",
+                probable_result.encoding,
+            )
+            if explain:  # Defensive: ensure exit path clean handler
+                logger.removeHandler(explain_handler)
+                logger.setLevel(previous_logger_level)
+
+            return CharsetMatches([probable_result])
+
+        if encoding_iana == sig_encoding:
+            logger.debug(
+                "Encoding detection: %s is most likely the one as we detected a BOM or SIG within "
+                "the beginning of the sequence.",
+                encoding_iana,
+            )
+            if explain:  # Defensive: ensure exit path clean handler
+                logger.removeHandler(explain_handler)
+                logger.setLevel(previous_logger_level)
+            return CharsetMatches([results[encoding_iana]])
+
+    if len(results) == 0:
+        if fallback_u8 or fallback_ascii or fallback_specified:
+            logger.log(
+                TRACE,
+                "Nothing got out of the detection process. Using ASCII/UTF-8/Specified fallback.",
+            )
+
+        if fallback_specified:
+            logger.debug(
+                "Encoding detection: %s will be used as a fallback match",
+                fallback_specified.encoding,
+            )
+            results.append(fallback_specified)
+        elif (
+            (fallback_u8 and fallback_ascii is None)
+            or (
+                fallback_u8
+                and fallback_ascii
+                and fallback_u8.fingerprint != fallback_ascii.fingerprint
+            )
+            or (fallback_u8 is not None)  # NOTE(review): this clause subsumes the two above
+        ):
+            logger.debug("Encoding detection: utf_8 will be used as a fallback match")
+            results.append(fallback_u8)
+        elif fallback_ascii:
+            logger.debug("Encoding detection: ascii will be used as a fallback match")
+            results.append(fallback_ascii)
+
+    if results:
+        logger.debug(
+            "Encoding detection: Found %s as plausible (best-candidate) for content. With %i alternatives.",
+            results.best().encoding,  # type: ignore
+            len(results) - 1,
+        )
+    else:
+        logger.debug("Encoding detection: Unable to determine any suitable charset.")
+
+    if explain:
+        logger.removeHandler(explain_handler)
+        logger.setLevel(previous_logger_level)
+
+    return results
+
+
+def from_fp(
+    fp: BinaryIO,
+    steps: int = 5,
+    chunk_size: int = 512,
+    threshold: float = 0.20,
+    cp_isolation: list[str] | None = None,
+    cp_exclusion: list[str] | None = None,
+    preemptive_behaviour: bool = True,
+    explain: bool = False,
+    language_threshold: float = 0.1,
+    enable_fallback: bool = True,
+) -> CharsetMatches:
+    """
+    Run the from_bytes detection on everything that can be read from an already opened binary file pointer.
+    Every other argument is forwarded untouched. The file pointer is not closed.
+    """
+    return from_bytes(
+        fp.read(),
+        steps=steps,
+        chunk_size=chunk_size,
+        threshold=threshold,
+        cp_isolation=cp_isolation,
+        cp_exclusion=cp_exclusion,
+        preemptive_behaviour=preemptive_behaviour,
+        explain=explain,
+        language_threshold=language_threshold,
+        enable_fallback=enable_fallback,
+    )
+
+
+def from_path(
+    path: str | bytes | PathLike,  # type: ignore[type-arg]
+    steps: int = 5,
+    chunk_size: int = 512,
+    threshold: float = 0.20,
+    cp_isolation: list[str] | None = None,
+    cp_exclusion: list[str] | None = None,
+    preemptive_behaviour: bool = True,
+    explain: bool = False,
+    language_threshold: float = 0.1,
+    enable_fallback: bool = True,
+) -> CharsetMatches:
+    """
+    Open the file located at the given path in binary mode and run the from_fp detection on it.
+    Every other argument is forwarded untouched. Can raise IOError.
+    """
+    with open(path, "rb") as stream:
+        return from_fp(
+            stream,
+            steps=steps,
+            chunk_size=chunk_size,
+            threshold=threshold,
+            cp_isolation=cp_isolation,
+            cp_exclusion=cp_exclusion,
+            preemptive_behaviour=preemptive_behaviour,
+            explain=explain,
+            language_threshold=language_threshold,
+            enable_fallback=enable_fallback,
+        )
+
+
+def is_binary(
+    fp_or_path_or_payload: PathLike | str | BinaryIO | bytes | bytearray,  # type: ignore[type-arg]
+    steps: int = 5,
+    chunk_size: int = 512,
+    threshold: float = 0.20,
+    cp_isolation: list[str] | None = None,
+    cp_exclusion: list[str] | None = None,
+    preemptive_behaviour: bool = True,
+    explain: bool = False,
+    language_threshold: float = 0.1,
+    enable_fallback: bool = False,
+) -> bool:
+    """
+    Detect if the given input (path, bytes/bytearray payload, or binary file object) is binary, aka. not text.
+    Runs the same detection as from_path/from_bytes/from_fp with the same default kwargs, except that
+    enable_fallback defaults to False to be stricter around ASCII-compatible content unlikely to be a string.
+    """
+    if isinstance(fp_or_path_or_payload, (str, PathLike)):  # a filesystem path
+        guesses = from_path(
+            fp_or_path_or_payload,
+            steps=steps,
+            chunk_size=chunk_size,
+            threshold=threshold,
+            cp_isolation=cp_isolation,
+            cp_exclusion=cp_exclusion,
+            preemptive_behaviour=preemptive_behaviour,
+            explain=explain,
+            language_threshold=language_threshold,
+            enable_fallback=enable_fallback,
+        )
+    elif isinstance(
+        fp_or_path_or_payload,
+        (
+            bytes,
+            bytearray,
+        ),
+    ):
+        guesses = from_bytes(
+            fp_or_path_or_payload,
+            steps=steps,
+            chunk_size=chunk_size,
+            threshold=threshold,
+            cp_isolation=cp_isolation,
+            cp_exclusion=cp_exclusion,
+            preemptive_behaviour=preemptive_behaviour,
+            explain=explain,
+            language_threshold=language_threshold,
+            enable_fallback=enable_fallback,
+        )
+    else:  # anything else is handed to from_fp, which calls .read() on it
+        guesses = from_fp(
+            fp_or_path_or_payload,
+            steps=steps,
+            chunk_size=chunk_size,
+            threshold=threshold,
+            cp_isolation=cp_isolation,
+            cp_exclusion=cp_exclusion,
+            preemptive_behaviour=preemptive_behaviour,
+            explain=explain,
+            language_threshold=language_threshold,
+            enable_fallback=enable_fallback,
+        )
+
+    return not guesses  # binary when `guesses` is falsy; presumably an empty CharsetMatches - verify in models
diff --git a/.venv/lib/python3.11/site-packages/charset_normalizer/cd.py b/.venv/lib/python3.11/site-packages/charset_normalizer/cd.py
new file mode 100644
index 0000000000000000000000000000000000000000..71a3ed5197f788135bb700a15a64594780aa6337
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/charset_normalizer/cd.py
@@ -0,0 +1,395 @@
+from __future__ import annotations
+
+import importlib
+from codecs import IncrementalDecoder
+from collections import Counter
+from functools import lru_cache
+from typing import Counter as TypeCounter
+
+from .constant import (
+    FREQUENCIES,
+    KO_NAMES,
+    LANGUAGE_SUPPORTED_COUNT,
+    TOO_SMALL_SEQUENCE,
+    ZH_NAMES,
+)
+from .md import is_suspiciously_successive_range
+from .models import CoherenceMatches
+from .utils import (
+    is_accentuated,
+    is_latin,
+    is_multi_byte_encoding,
+    is_unicode_range_secondary,
+    unicode_range,
+)
+
+
+def encoding_unicode_range(iana_name: str) -> list[str]:
+    """
+    List the primary unicode ranges a single-byte code page decodes to.
+
+    Bytes 0x40..0xFE are decoded one at a time; a range is kept when it holds
+    at least 15% of the characters that landed in a non-secondary range.
+    Raises OSError when iana_name is a multi-byte code page.
+    """
+    if is_multi_byte_encoding(iana_name):
+        raise OSError("Function not supported on multi-byte code page")
+
+    codec_module = importlib.import_module(f"encodings.{iana_name}")
+    incremental: IncrementalDecoder = codec_module.IncrementalDecoder(errors="ignore")
+    hits_per_range: TypeCounter[str] = Counter()
+
+    for byte_value in range(0x40, 0xFF):
+        decoded: str = incremental.decode(bytes([byte_value]))
+        if not decoded:
+            continue
+
+        found_range: str | None = unicode_range(decoded)
+        if found_range is None:
+            continue
+
+        # Secondary ranges (per is_unicode_range_secondary) are not counted.
+        if is_unicode_range_secondary(found_range) is False:
+            hits_per_range[found_range] += 1
+
+    total_hits: int = sum(hits_per_range.values())
+
+    kept_ranges: list[str] = [
+        name for name, hits in hits_per_range.items() if hits / total_hits >= 0.15
+    ]
+    kept_ranges.sort()
+    return kept_ranges
+
+
+def unicode_range_languages(primary_range: str) -> list[str]:
+    """
+    List every language in FREQUENCIES having at least one character whose
+    unicode range equals primary_range, in FREQUENCIES iteration order.
+    """
+
+    def uses_range(alphabet) -> bool:
+        # any() stops at the first character found in the requested range.
+        return any(unicode_range(letter) == primary_range for letter in alphabet)
+
+    return [
+        name for name, alphabet in FREQUENCIES.items() if uses_range(alphabet)
+    ]
+
+
+@lru_cache()
+def encoding_languages(iana_name: str) -> list[str]:
+    """
+    Map a single-byte code page to the language(s) it is tied to.
+
+    The first range reported by encoding_unicode_range whose name does not
+    contain "Latin" selects the languages; when there is no such range the
+    single placeholder "Latin Based" is returned instead.
+    """
+    non_latin_ranges = (
+        candidate
+        for candidate in encoding_unicode_range(iana_name)
+        if "Latin" not in candidate
+    )
+    first_non_latin: str | None = next(non_latin_ranges, None)
+    if first_non_latin is not None:
+        return unicode_range_languages(first_non_latin)
+    return ["Latin Based"]
+
+
+@lru_cache()
+def mb_encoding_languages(iana_name: str) -> list[str]:
+    """
+    Map a multi-byte code page to the single language it is tied to.
+
+    Returns ["Japanese"], ["Chinese"] or ["Korean"] depending on the name
+    prefix or its membership in ZH_NAMES / KO_NAMES, else an empty list.
+    """
+    japanese_prefixes = ("shift_", "iso2022_jp", "euc_j")
+
+    # Checked in this order: Japanese, then Chinese, then Korean.
+    if iana_name.startswith(japanese_prefixes) or iana_name == "cp932":
+        return ["Japanese"]
+    elif iana_name.startswith("gb") or iana_name in ZH_NAMES:
+        return ["Chinese"]
+    elif iana_name.startswith("iso2022_kr") or iana_name in KO_NAMES:
+        return ["Korean"]
+    else:
+        return []
+
+
+@lru_cache(maxsize=LANGUAGE_SUPPORTED_COUNT)
+def get_target_features(language: str) -> tuple[bool, bool]:
+    """
+    Summarise the alphabet of a supported language as two flags.
+
+    Returns (has_accents, pure_latin): whether any character of
+    FREQUENCIES[language] is accentuated, and whether none is non-Latin.
+    A language absent from FREQUENCIES fails on the lookup itself.
+    """
+    alphabet = FREQUENCIES[language]
+
+    has_accents: bool = any(is_accentuated(letter) for letter in alphabet)
+    has_non_latin: bool = any(is_latin(letter) is False for letter in alphabet)
+
+    return has_accents, not has_non_latin
+
+
+def alphabet_languages(
+    characters: list[str], ignore_non_latin: bool = False
+) -> list[str]:
+    """
+    Return the languages whose alphabet overlaps the given characters, best first.
+
+    Kept when >= 20% of the language's FREQUENCIES characters are present; the
+    accent and ignore_non_latin filters below may discard a language earlier.
+    """
+    languages: list[tuple[str, float]] = []
+
+    # Built once: set membership avoids rescanning the list per character.
+    source_characters = set(characters)
+    source_have_accents = any(is_accentuated(character) for character in characters)
+
+    for language, language_characters in FREQUENCIES.items():
+        target_have_accents, target_pure_latin = get_target_features(language)
+
+        if ignore_non_latin and target_pure_latin is False:
+            continue
+        if target_have_accents is False and source_have_accents:
+            continue
+
+        character_match_count: int = sum(
+            1 for c in language_characters if c in source_characters
+        )
+        ratio: float = character_match_count / len(language_characters)
+
+        if ratio >= 0.2:
+            languages.append((language, ratio))
+
+    languages.sort(key=lambda entry: entry[1], reverse=True)
+    return [language for language, _ in languages]
+
+
+def characters_popularity_compare(
+    language: str, ordered_characters: list[str]
+) -> float:
+    """
+    Score how well characters, ordered from most to least frequent, fit a language.
+
+    Each character is approved when its rank, projected onto the language's
+    frequency table, is close enough to its rank in FREQUENCIES[language], or
+    when enough of its neighbours sit on the same side in both orderings.
+    The match is deliberately loose to ease detection (a close match scores 1.).
+
+    Returns the approved share of ordered_characters, between 0. and 1.;
+    an empty ordered_characters yields 0. instead of dividing by zero.
+    Raises ValueError when language is not a key of FREQUENCIES.
+    """
+    if language not in FREQUENCIES:
+        raise ValueError(f"{language} not available")
+
+    ordered_characters_count: int = len(ordered_characters)
+    if ordered_characters_count == 0:
+        return 0.0
+
+    language_characters: list[str] = FREQUENCIES[language]
+    target_language_characters_count: int = len(language_characters)
+
+    # First-occurrence rank of every character, same result as list.index()
+    # but without rescanning the table for each source character.
+    rank_in_language: dict[str, int] = {}
+    for rank, known_character in enumerate(language_characters):
+        rank_in_language.setdefault(known_character, rank)
+
+    large_alphabet: bool = target_language_characters_count > 26
+    # Loop invariant: scales a source rank onto the language table length.
+    expected_projection_ratio: float = (
+        target_language_characters_count / ordered_characters_count
+    )
+    character_approved_count: int = 0
+
+    for character_rank, character in enumerate(ordered_characters):
+        character_rank_in_language = rank_in_language.get(character)
+        if character_rank_in_language is None:
+            continue
+
+        character_rank_projection: int = int(character_rank * expected_projection_ratio)
+        rank_distance: int = abs(character_rank_projection - character_rank_in_language)
+
+        if large_alphabet is False and rank_distance > 4:
+            continue
+
+        if large_alphabet and rank_distance < target_language_characters_count / 3:
+            character_approved_count += 1
+            continue
+
+        characters_before_source = language_characters[:character_rank_in_language]
+        characters_after_source = language_characters[character_rank_in_language:]
+
+        before_match_count: int = len(
+            set(ordered_characters[:character_rank]) & set(characters_before_source)
+        )
+        after_match_count: int = len(
+            set(ordered_characters[character_rank:]) & set(characters_after_source)
+        )
+
+        # The emptiness guards also protect the divisions just below.
+        if len(characters_before_source) == 0 and before_match_count <= 4:
+            character_approved_count += 1
+            continue
+        if len(characters_after_source) == 0 and after_match_count <= 4:
+            character_approved_count += 1
+            continue
+
+        if (
+            before_match_count / len(characters_before_source) >= 0.4
+            or after_match_count / len(characters_after_source) >= 0.4
+        ):
+            character_approved_count += 1
+
+    return character_approved_count / ordered_characters_count
+
+
+def alpha_unicode_split(decoded_sequence: str) -> list[str]:
+    """
+    Split a decoded text into one lowercase string per alphabet / unicode range.
+
+    Ex. a text containing English/Latin with a bit of Hebrew returns two items:
+    one holding the Latin letters and the other the Hebrew ones.
+    Non-alphabetic characters and characters without a known range are dropped.
+    Layers are returned in order of first appearance.
+    """
+    # Pieces are collected in lists and joined once at the end, which avoids
+    # rebuilding a growing string for every single character.
+    layers: dict[str, list[str]] = {}
+
+    for character in decoded_sequence:
+        if character.isalpha() is False:
+            continue
+
+        character_range: str | None = unicode_range(character)
+
+        if character_range is None:
+            continue
+
+        # Reuse the first existing layer whose range is not a suspicious
+        # neighbour of this character's range; otherwise use its own range.
+        layer_target_range: str = next(
+            (
+                discovered_range
+                for discovered_range in layers
+                if is_suspiciously_successive_range(discovered_range, character_range)
+                is False
+            ),
+            character_range,
+        )
+
+        layers.setdefault(layer_target_range, []).append(character.lower())
+
+    return ["".join(pieces) for pieces in layers.values()]
+
+
+def merge_coherence_ratios(results: list[CoherenceMatches]) -> CoherenceMatches:
+    """
+    Combine several coherence_ratio outputs into a single one.
+
+    Ratios reported for the same language are averaged (rounded to 4 digits)
+    and the merged list is ordered from the highest mean ratio to the lowest.
+    The return type is the same as coherence_ratio.
+    An empty results list gives an empty list.
+    """
+    ratios_by_language: dict[str, list[float]] = {}
+
+    # Group every ratio under its language, preserving first-seen order.
+    for matches in results:
+        for language, ratio in matches:
+            ratios_by_language.setdefault(language, []).append(ratio)
+
+    averaged: list[tuple[str, float]] = []
+
+    for language, ratios in ratios_by_language.items():
+        mean_ratio: float = round(sum(ratios) / len(ratios), 4)
+        averaged.append((language, mean_ratio))
+
+    # list.sort is stable, so equal means keep their first-seen order.
+    averaged.sort(key=lambda entry: entry[1], reverse=True)
+
+    return averaged
+
+
+def filter_alt_coherence_matches(results: CoherenceMatches) -> CoherenceMatches:
+    """
+    We shall NOT return "English—" in CoherenceMatches because it is an alternative
+    of "English". This function keeps the best ratio per language once the
+    em-dash is removed from its name.
+
+    The em-dash is stripped even when a language has a single entry, so an
+    alternative name can never leak out on its own. Languages keep the order
+    in which they first appear in results.
+    """
+    best_ratio_by_language: dict[str, float] = {}
+
+    for language, ratio in results:
+        no_em_name: str = language.replace("—", "")
+        known_ratio: float | None = best_ratio_by_language.get(no_em_name)
+
+        # Only the highest ratio seen for a given clean name is retained.
+        if known_ratio is None or ratio > known_ratio:
+            best_ratio_by_language[no_em_name] = ratio
+
+    filtered_results: CoherenceMatches = [
+        (language, ratio) for language, ratio in best_ratio_by_language.items()
+    ]
+
+    return filtered_results
+
+
+@lru_cache(maxsize=2048)
+def coherence_ratio(
+    decoded_sequence: str, threshold: float = 0.1, lg_inclusion: str | None = None
+) -> CoherenceMatches:
+    """
+    Detect ANY language that can be identified in the given sequence.
+
+    The sequence is analysed layer by layer (a layer = the characters extracted
+    for one alphabet/range); layers of TOO_SMALL_SEQUENCE characters or fewer
+    are ignored. lg_inclusion is a comma separated list of languages to test;
+    "Latin Based" is not a language, it only makes the guess ignore non-Latin.
+    Returns (language, ratio) pairs with ratio >= threshold, best ratio first.
+    """
+    forced_languages: list[str] = []
+    if lg_inclusion is not None:
+        forced_languages = lg_inclusion.split(",")
+
+    latin_only: bool = "Latin Based" in forced_languages
+    if latin_only:
+        forced_languages.remove("Latin Based")
+
+    matches: list[tuple[str, float]] = []
+    strong_match_count: int = 0
+
+    for layer in alpha_unicode_split(decoded_sequence):
+        ranked: list[tuple[str, int]] = Counter(layer).most_common()
+        occurrences: int = sum(count for _, count in ranked)
+
+        if occurrences <= TOO_SMALL_SEQUENCE:
+            continue
+
+        ordered_characters: list[str] = [symbol for symbol, _ in ranked]
+        candidates = forced_languages or alphabet_languages(
+            ordered_characters, latin_only
+        )
+
+        for language in candidates:
+            ratio: float = characters_popularity_compare(language, ordered_characters)
+            if ratio < threshold:
+                continue
+            if ratio >= 0.8:
+                strong_match_count += 1
+            matches.append((language, round(ratio, 4)))
+
+            # Stops this layer only; the counter carries over to later layers.
+            if strong_match_count >= 3:
+                break
+
+    filtered = filter_alt_coherence_matches(matches)
+    return sorted(filtered, key=lambda pair: pair[1], reverse=True)
diff --git a/.venv/lib/python3.11/site-packages/charset_normalizer/constant.py b/.venv/lib/python3.11/site-packages/charset_normalizer/constant.py
new file mode 100644
index 0000000000000000000000000000000000000000..1fb9508d2a28b9a3fa4f409eb15f88fdf1e8d54c
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/charset_normalizer/constant.py
@@ -0,0 +1,1998 @@
+from __future__ import annotations
+
+from codecs import BOM_UTF8, BOM_UTF16_BE, BOM_UTF16_LE, BOM_UTF32_BE, BOM_UTF32_LE
+from encodings.aliases import aliases
+from re import IGNORECASE
+from re import compile as re_compile
+
+# For each eligible encoding, its SIG/BOM byte sequence, or a list of them.
+ENCODING_MARKS: dict[str, bytes | list[bytes]] = {
+    "utf_8": BOM_UTF8,
+    "utf_7": [
+        b"\x2b\x2f\x76\x38",
+        b"\x2b\x2f\x76\x39",
+        b"\x2b\x2f\x76\x2b",
+        b"\x2b\x2f\x76\x2f",
+        b"\x2b\x2f\x76\x38\x2d",
+    ],
+    "gb18030": b"\x84\x31\x95\x33",
+    "utf_32": [BOM_UTF32_BE, BOM_UTF32_LE],
+    "utf_16": [BOM_UTF16_BE, BOM_UTF16_LE],
+}
+
+TOO_SMALL_SEQUENCE: int = 32
+TOO_BIG_SEQUENCE: int = int(10e6)
+
+UTF8_MAXIMAL_ALLOCATION: int = 1_112_064
+
+# Up-to-date Unicode ucd/15.0.0
+UNICODE_RANGES_COMBINED: dict[str, range] = {
+    "Control character": range(32),
+    "Basic Latin": range(32, 128),
+    "Latin-1 Supplement": range(128, 256),
+    "Latin Extended-A": range(256, 384),
+    "Latin Extended-B": range(384, 592),
+    "IPA Extensions": range(592, 688),
+    "Spacing Modifier Letters": range(688, 768),
+    "Combining Diacritical Marks": range(768, 880),
+    "Greek and Coptic": range(880, 1024),
+    "Cyrillic": range(1024, 1280),
+    "Cyrillic Supplement": range(1280, 1328),
+    "Armenian": range(1328, 1424),
+    "Hebrew": range(1424, 1536),
+    "Arabic": range(1536, 1792),
+    "Syriac": range(1792, 1872),
+    "Arabic Supplement": range(1872, 1920),
+    "Thaana": range(1920, 1984),
+    "NKo": range(1984, 2048),
+    "Samaritan": range(2048, 2112),
+    "Mandaic": range(2112, 2144),
+    "Syriac Supplement": range(2144, 2160),
+    "Arabic Extended-B": range(2160, 2208),
+    "Arabic Extended-A": range(2208, 2304),
+    "Devanagari": range(2304, 2432),
+    "Bengali": range(2432, 2560),
+    "Gurmukhi": range(2560, 2688),
+    "Gujarati": range(2688, 2816),
+    "Oriya": range(2816, 2944),
+    "Tamil": range(2944, 3072),
+    "Telugu": range(3072, 3200),
+    "Kannada": range(3200, 3328),
+    "Malayalam": range(3328, 3456),
+    "Sinhala": range(3456, 3584),
+    "Thai": range(3584, 3712),
+    "Lao": range(3712, 3840),
+    "Tibetan": range(3840, 4096),
+    "Myanmar": range(4096, 4256),
+    "Georgian": range(4256, 4352),
+    "Hangul Jamo": range(4352, 4608),
+    "Ethiopic": range(4608, 4992),
+    "Ethiopic Supplement": range(4992, 5024),
+    "Cherokee": range(5024, 5120),
+    "Unified Canadian Aboriginal Syllabics": range(5120, 5760),
+    "Ogham": range(5760, 5792),
+    "Runic": range(5792, 5888),
+    "Tagalog": range(5888, 5920),
+    "Hanunoo": range(5920, 5952),
+    "Buhid": range(5952, 5984),
+    "Tagbanwa": range(5984, 6016),
+    "Khmer": range(6016, 6144),
+    "Mongolian": range(6144, 6320),
+    "Unified Canadian Aboriginal Syllabics Extended": range(6320, 6400),
+    "Limbu": range(6400, 6480),
+    "Tai Le": range(6480, 6528),
+    "New Tai Lue": range(6528, 6624),
+    "Khmer Symbols": range(6624, 6656),
+    "Buginese": range(6656, 6688),
+    "Tai Tham": range(6688, 6832),
+    "Combining Diacritical Marks Extended": range(6832, 6912),
+    "Balinese": range(6912, 7040),
+    "Sundanese": range(7040, 7104),
+    "Batak": range(7104, 7168),
+    "Lepcha": range(7168, 7248),
+    "Ol Chiki": range(7248, 7296),
+    "Cyrillic Extended-C": range(7296, 7312),
+    "Georgian Extended": range(7312, 7360),
+    "Sundanese Supplement": range(7360, 7376),
+    "Vedic Extensions": range(7376, 7424),
+    "Phonetic Extensions": range(7424, 7552),
+    "Phonetic Extensions Supplement": range(7552, 7616),
+    "Combining Diacritical Marks Supplement": range(7616, 7680),
+    "Latin Extended Additional": range(7680, 7936),
+    "Greek Extended": range(7936, 8192),
+    "General Punctuation": range(8192, 8304),
+    "Superscripts and Subscripts": range(8304, 8352),
+    "Currency Symbols": range(8352, 8400),
+    "Combining Diacritical Marks for Symbols": range(8400, 8448),
+    "Letterlike Symbols": range(8448, 8528),
+    "Number Forms": range(8528, 8592),
+    "Arrows": range(8592, 8704),
+    "Mathematical Operators": range(8704, 8960),
+    "Miscellaneous Technical": range(8960, 9216),
+    "Control Pictures": range(9216, 9280),
+    "Optical Character Recognition": range(9280, 9312),
+    "Enclosed Alphanumerics": range(9312, 9472),
+    "Box Drawing": range(9472, 9600),
+    "Block Elements": range(9600, 9632),
+    "Geometric Shapes": range(9632, 9728),
+    "Miscellaneous Symbols": range(9728, 9984),
+    "Dingbats": range(9984, 10176),
+    "Miscellaneous Mathematical Symbols-A": range(10176, 10224),
+    "Supplemental Arrows-A": range(10224, 10240),
+    "Braille Patterns": range(10240, 10496),
+    "Supplemental Arrows-B": range(10496, 10624),
+    "Miscellaneous Mathematical Symbols-B": range(10624, 10752),
+    "Supplemental Mathematical Operators": range(10752, 11008),
+    "Miscellaneous Symbols and Arrows": range(11008, 11264),
+    "Glagolitic": range(11264, 11360),
+    "Latin Extended-C": range(11360, 11392),
+    "Coptic": range(11392, 11520),
+    "Georgian Supplement": range(11520, 11568),
+    "Tifinagh": range(11568, 11648),
+    "Ethiopic Extended": range(11648, 11744),
+    "Cyrillic Extended-A": range(11744, 11776),
+    "Supplemental Punctuation": range(11776, 11904),
+    "CJK Radicals Supplement": range(11904, 12032),
+    "Kangxi Radicals": range(12032, 12256),
+    "Ideographic Description Characters": range(12272, 12288),
+    "CJK Symbols and Punctuation": range(12288, 12352),
+    "Hiragana": range(12352, 12448),
+    "Katakana": range(12448, 12544),
+    "Bopomofo": range(12544, 12592),
+    "Hangul Compatibility Jamo": range(12592, 12688),
+    "Kanbun": range(12688, 12704),
+    "Bopomofo Extended": range(12704, 12736),
+    "CJK Strokes": range(12736, 12784),
+    "Katakana Phonetic Extensions": range(12784, 12800),
+    "Enclosed CJK Letters and Months": range(12800, 13056),
+    "CJK Compatibility": range(13056, 13312),
+    "CJK Unified Ideographs Extension A": range(13312, 19904),
+    "Yijing Hexagram Symbols": range(19904, 19968),
+    "CJK Unified Ideographs": range(19968, 40960),
+    "Yi Syllables": range(40960, 42128),
+    "Yi Radicals": range(42128, 42192),
+    "Lisu": range(42192, 42240),
+    "Vai": range(42240, 42560),
+    "Cyrillic Extended-B": range(42560, 42656),
+    "Bamum": range(42656, 42752),
+    "Modifier Tone Letters": range(42752, 42784),
+    "Latin Extended-D": range(42784, 43008),
+    "Syloti Nagri": range(43008, 43056),
+    "Common Indic Number Forms": range(43056, 43072),
+    "Phags-pa": range(43072, 43136),
+    "Saurashtra": range(43136, 43232),
+    "Devanagari Extended": range(43232, 43264),
+    "Kayah Li": range(43264, 43312),
+    "Rejang": range(43312, 43360),
+    "Hangul Jamo Extended-A": range(43360, 43392),
+    "Javanese": range(43392, 43488),
+    "Myanmar Extended-B": range(43488, 43520),
+    "Cham": range(43520, 43616),
+    "Myanmar Extended-A": range(43616, 43648),
+    "Tai Viet": range(43648, 43744),
+    "Meetei Mayek Extensions": range(43744, 43776),
+    "Ethiopic Extended-A": range(43776, 43824),
+    "Latin Extended-E": range(43824, 43888),
+    "Cherokee Supplement": range(43888, 43968),
+    "Meetei Mayek": range(43968, 44032),
+    "Hangul Syllables": range(44032, 55216),
+    "Hangul Jamo Extended-B": range(55216, 55296),
+    "High Surrogates": range(55296, 56192),
+    "High Private Use Surrogates": range(56192, 56320),
+    "Low Surrogates": range(56320, 57344),
+    "Private Use Area": range(57344, 63744),
+    "CJK Compatibility Ideographs": range(63744, 64256),
+    "Alphabetic Presentation Forms": range(64256, 64336),
+    "Arabic Presentation Forms-A": range(64336, 65024),
+    "Variation Selectors": range(65024, 65040),
+    "Vertical Forms": range(65040, 65056),
+    "Combining Half Marks": range(65056, 65072),
+    "CJK Compatibility Forms": range(65072, 65104),
+    "Small Form Variants": range(65104, 65136),
+    "Arabic Presentation Forms-B": range(65136, 65280),
+    "Halfwidth and Fullwidth Forms": range(65280, 65520),
+    "Specials": range(65520, 65536),
+    "Linear B Syllabary": range(65536, 65664),
+    "Linear B Ideograms": range(65664, 65792),
+    "Aegean Numbers": range(65792, 65856),
+    "Ancient Greek Numbers": range(65856, 65936),
+    "Ancient Symbols": range(65936, 66000),
+    "Phaistos Disc": range(66000, 66048),
+    "Lycian": range(66176, 66208),
+    "Carian": range(66208, 66272),
+    "Coptic Epact Numbers": range(66272, 66304),
+    "Old Italic": range(66304, 66352),
+    "Gothic": range(66352, 66384),
+    "Old Permic": range(66384, 66432),
+    "Ugaritic": range(66432, 66464),
+    "Old Persian": range(66464, 66528),
+    "Deseret": range(66560, 66640),
+    "Shavian": range(66640, 66688),
+    "Osmanya": range(66688, 66736),
+    "Osage": range(66736, 66816),
+    "Elbasan": range(66816, 66864),
+    "Caucasian Albanian": range(66864, 66928),
+    "Vithkuqi": range(66928, 67008),
+    "Linear A": range(67072, 67456),
+    "Latin Extended-F": range(67456, 67520),
+    "Cypriot Syllabary": range(67584, 67648),
+    "Imperial Aramaic": range(67648, 67680),
+    "Palmyrene": range(67680, 67712),
+    "Nabataean": range(67712, 67760),
+    "Hatran": range(67808, 67840),
+    "Phoenician": range(67840, 67872),
+    "Lydian": range(67872, 67904),
+    "Meroitic Hieroglyphs": range(67968, 68000),
+    "Meroitic Cursive": range(68000, 68096),
+    "Kharoshthi": range(68096, 68192),
+    "Old South Arabian": range(68192, 68224),
+    "Old North Arabian": range(68224, 68256),
+    "Manichaean": range(68288, 68352),
+    "Avestan": range(68352, 68416),
+    "Inscriptional Parthian": range(68416, 68448),
+    "Inscriptional Pahlavi": range(68448, 68480),
+    "Psalter Pahlavi": range(68480, 68528),
+    "Old Turkic": range(68608, 68688),
+    "Old Hungarian": range(68736, 68864),
+    "Hanifi Rohingya": range(68864, 68928),
+    "Rumi Numeral Symbols": range(69216, 69248),
+    "Yezidi": range(69248, 69312),
+    "Arabic Extended-C": range(69312, 69376),
+    "Old Sogdian": range(69376, 69424),
+    "Sogdian": range(69424, 69488),
+    "Old Uyghur": range(69488, 69552),
+    "Chorasmian": range(69552, 69600),
+    "Elymaic": range(69600, 69632),
+    "Brahmi": range(69632, 69760),
+    "Kaithi": range(69760, 69840),
+    "Sora Sompeng": range(69840, 69888),
+    "Chakma": range(69888, 69968),
+    "Mahajani": range(69968, 70016),
+    "Sharada": range(70016, 70112),
+    "Sinhala Archaic Numbers": range(70112, 70144),
+    "Khojki": range(70144, 70224),
+    "Multani": range(70272, 70320),
+    "Khudawadi": range(70320, 70400),
+    "Grantha": range(70400, 70528),
+    "Newa": range(70656, 70784),
+    "Tirhuta": range(70784, 70880),
+    "Siddham": range(71040, 71168),
+    "Modi": range(71168, 71264),
+    "Mongolian Supplement": range(71264, 71296),
+    "Takri": range(71296, 71376),
+    "Ahom": range(71424, 71504),
+    "Dogra": range(71680, 71760),
+    "Warang Citi": range(71840, 71936),
+    "Dives Akuru": range(71936, 72032),
+    "Nandinagari": range(72096, 72192),
+    "Zanabazar Square": range(72192, 72272),
+    "Soyombo": range(72272, 72368),
+    "Unified Canadian Aboriginal Syllabics Extended-A": range(72368, 72384),
+    "Pau Cin Hau": range(72384, 72448),
+    "Devanagari Extended-A": range(72448, 72544),
+    "Bhaiksuki": range(72704, 72816),
+    "Marchen": range(72816, 72896),
+    "Masaram Gondi": range(72960, 73056),
+    "Gunjala Gondi": range(73056, 73136),
+    "Makasar": range(73440, 73472),
+    "Kawi": range(73472, 73568),
+    "Lisu Supplement": range(73648, 73664),
+    "Tamil Supplement": range(73664, 73728),
+    "Cuneiform": range(73728, 74752),
+    "Cuneiform Numbers and Punctuation": range(74752, 74880),
+    "Early Dynastic Cuneiform": range(74880, 75088),
+    "Cypro-Minoan": range(77712, 77824),
+    "Egyptian Hieroglyphs": range(77824, 78896),
+    "Egyptian Hieroglyph Format Controls": range(78896, 78944),
+    "Anatolian Hieroglyphs": range(82944, 83584),
+    "Bamum Supplement": range(92160, 92736),
+    "Mro": range(92736, 92784),
+    "Tangsa": range(92784, 92880),
+    "Bassa Vah": range(92880, 92928),
+    "Pahawh Hmong": range(92928, 93072),
+    "Medefaidrin": range(93760, 93856),
+    "Miao": range(93952, 94112),
+    "Ideographic Symbols and Punctuation": range(94176, 94208),
+    "Tangut": range(94208, 100352),
+    "Tangut Components": range(100352, 101120),
+    "Khitan Small Script": range(101120, 101632),
+    "Tangut Supplement": range(101632, 101760),
+    "Kana Extended-B": range(110576, 110592),
+    "Kana Supplement": range(110592, 110848),
+    "Kana Extended-A": range(110848, 110896),
+    "Small Kana Extension": range(110896, 110960),
+    "Nushu": range(110960, 111360),
+    "Duployan": range(113664, 113824),
+    "Shorthand Format Controls": range(113824, 113840),
+    "Znamenny Musical Notation": range(118528, 118736),
+    "Byzantine Musical Symbols": range(118784, 119040),
+    "Musical Symbols": range(119040, 119296),
+    "Ancient Greek Musical Notation": range(119296, 119376),
+    "Kaktovik Numerals": range(119488, 119520),
+    "Mayan Numerals": range(119520, 119552),
+    "Tai Xuan Jing Symbols": range(119552, 119648),
+    "Counting Rod Numerals": range(119648, 119680),
+    "Mathematical Alphanumeric Symbols": range(119808, 120832),
+    "Sutton SignWriting": range(120832, 121520),
+    "Latin Extended-G": range(122624, 122880),
+    "Glagolitic Supplement": range(122880, 122928),
+    "Cyrillic Extended-D": range(122928, 123024),
+    "Nyiakeng Puachue Hmong": range(123136, 123216),
+    "Toto": range(123536, 123584),
+    "Wancho": range(123584, 123648),
+    "Nag Mundari": range(124112, 124160),
+    "Ethiopic Extended-B": range(124896, 124928),
+    "Mende Kikakui": range(124928, 125152),
+    "Adlam": range(125184, 125280),
+    "Indic Siyaq Numbers": range(126064, 126144),
+    "Ottoman Siyaq Numbers": range(126208, 126288),
+    "Arabic Mathematical Alphabetic Symbols": range(126464, 126720),
+    "Mahjong Tiles": range(126976, 127024),
+    "Domino Tiles": range(127024, 127136),
+    "Playing Cards": range(127136, 127232),
+    "Enclosed Alphanumeric Supplement": range(127232, 127488),
+    "Enclosed Ideographic Supplement": range(127488, 127744),
+    "Miscellaneous Symbols and Pictographs": range(127744, 128512),
+    "Emoticons range(Emoji)": range(128512, 128592),
+    "Ornamental Dingbats": range(128592, 128640),
+    "Transport and Map Symbols": range(128640, 128768),
+    "Alchemical Symbols": range(128768, 128896),
+    "Geometric Shapes Extended": range(128896, 129024),
+    "Supplemental Arrows-C": range(129024, 129280),
+    "Supplemental Symbols and Pictographs": range(129280, 129536),
+    "Chess Symbols": range(129536, 129648),
+    "Symbols and Pictographs Extended-A": range(129648, 129792),
+    "Symbols for Legacy Computing": range(129792, 130048),
+    "CJK Unified Ideographs Extension B": range(131072, 173792),
+    "CJK Unified Ideographs Extension C": range(173824, 177984),
+    "CJK Unified Ideographs Extension D": range(177984, 178208),
+    "CJK Unified Ideographs Extension E": range(178208, 183984),
+    "CJK Unified Ideographs Extension F": range(183984, 191472),
+    "CJK Compatibility Ideographs Supplement": range(194560, 195104),
+    "CJK Unified Ideographs Extension G": range(196608, 201552),
+    "CJK Unified Ideographs Extension H": range(201552, 205744),
+    "Tags": range(917504, 917632),
+    "Variation Selectors Supplement": range(917760, 918000),
+    "Supplementary Private Use Area-A": range(983040, 1048576),
+    "Supplementary Private Use Area-B": range(1048576, 1114112),
+}
+
+
+UNICODE_SECONDARY_RANGE_KEYWORD: list[str] = [
+    "Supplement",
+    "Extended",
+    "Extensions",
+    "Modifier",
+    "Marks",
+    "Punctuation",
+    "Symbols",
+    "Forms",
+    "Operators",
+    "Miscellaneous",
+    "Drawing",
+    "Block",
+    "Shapes",
+    "Supplemental",
+    "Tags",
+]
+
+RE_POSSIBLE_ENCODING_INDICATION = re_compile(
+    r"(?:(?:encoding)|(?:charset)|(?:coding))(?:[\:= ]{1,10})(?:[\"\']?)([a-zA-Z0-9\-_]+)(?:[\"\']?)",
+    IGNORECASE,
+)
+
+IANA_NO_ALIASES = [
+    "cp720",
+    "cp737",
+    "cp856",
+    "cp874",
+    "cp875",
+    "cp1006",
+    "koi8_r",
+    "koi8_t",
+    "koi8_u",
+]
+
+# Every alias target plus the names above, minus the "*_codec" entries and
+# rot_13 / tactis / mbcs, in sorted order.
+IANA_SUPPORTED: list[str] = sorted(
+    iana_name
+    for iana_name in list(set(aliases.values())) + IANA_NO_ALIASES
+    if iana_name.endswith("_codec") is False
+    and iana_name not in {"rot_13", "tactis", "mbcs"}
+)
+IANA_SUPPORTED_COUNT: int = len(IANA_SUPPORTED)
+
+# Pre-computed lists of similar code pages, obtained with the function cp_similarity.
+IANA_SUPPORTED_SIMILAR: dict[str, list[str]] = {
+    "cp037": ["cp1026", "cp1140", "cp273", "cp500"],
+    "cp1026": ["cp037", "cp1140", "cp273", "cp500"],
+    "cp1125": ["cp866"],
+    "cp1140": ["cp037", "cp1026", "cp273", "cp500"],
+    "cp1250": ["iso8859_2"],
+    "cp1251": ["kz1048", "ptcp154"],
+    "cp1252": ["iso8859_15", "iso8859_9", "latin_1"],
+    "cp1253": ["iso8859_7"],
+    "cp1254": ["iso8859_15", "iso8859_9", "latin_1"],
+    "cp1257": ["iso8859_13"],
+    "cp273": ["cp037", "cp1026", "cp1140", "cp500"],
+    "cp437": ["cp850", "cp858", "cp860", "cp861", "cp862", "cp863", "cp865"],
+    "cp500": ["cp037", "cp1026", "cp1140", "cp273"],
+    "cp850": ["cp437", "cp857", "cp858", "cp865"],
+    "cp857": ["cp850", "cp858", "cp865"],
+    "cp858": ["cp437", "cp850", "cp857", "cp865"],
+    "cp860": ["cp437", "cp861", "cp862", "cp863", "cp865"],
+    "cp861": ["cp437", "cp860", "cp862", "cp863", "cp865"],
+    "cp862": ["cp437", "cp860", "cp861", "cp863", "cp865"],
+    "cp863": ["cp437", "cp860", "cp861", "cp862", "cp865"],
+    "cp865": ["cp437", "cp850", "cp857", "cp858", "cp860", "cp861", "cp862", "cp863"],
+    "cp866": ["cp1125"],
+    "iso8859_10": ["iso8859_14", "iso8859_15", "iso8859_4", "iso8859_9", "latin_1"],
+    "iso8859_11": ["tis_620"],
+    "iso8859_13": ["cp1257"],
+    "iso8859_14": [
+        "iso8859_10",
+        "iso8859_15",
+        "iso8859_16",
+        "iso8859_3",
+        "iso8859_9",
+        "latin_1",
+    ],
+    "iso8859_15": [
+        "cp1252",
+        "cp1254",
+        "iso8859_10",
+        "iso8859_14",
+        "iso8859_16",
+        "iso8859_3",
+        "iso8859_9",
+        "latin_1",
+    ],
+    "iso8859_16": [
+        "iso8859_14",
+        "iso8859_15",
+        "iso8859_2",
+        "iso8859_3",
+        "iso8859_9",
+        "latin_1",
+    ],
+    "iso8859_2": ["cp1250", "iso8859_16", "iso8859_4"],
+    "iso8859_3": ["iso8859_14", "iso8859_15", "iso8859_16", "iso8859_9", "latin_1"],
+    "iso8859_4": ["iso8859_10", "iso8859_2", "iso8859_9", "latin_1"],
+    "iso8859_7": ["cp1253"],
+    "iso8859_9": [
+        "cp1252",
+        "cp1254",
+        "cp1258",
+        "iso8859_10",
+        "iso8859_14",
+        "iso8859_15",
+        "iso8859_16",
+        "iso8859_3",
+        "iso8859_4",
+        "latin_1",
+    ],
+    "kz1048": ["cp1251", "ptcp154"],
+    "latin_1": [
+        "cp1252",
+        "cp1254",
+        "cp1258",
+        "iso8859_10",
+        "iso8859_14",
+        "iso8859_15",
+        "iso8859_16",
+        "iso8859_3",
+        "iso8859_4",
+        "iso8859_9",
+    ],
+    "mac_iceland": ["mac_roman", "mac_turkish"],
+    "mac_roman": ["mac_iceland", "mac_turkish"],
+    "mac_turkish": ["mac_iceland", "mac_roman"],
+    "ptcp154": ["cp1251", "kz1048"],
+    "tis_620": ["iso8859_11"],
+}
+
+
+# Maps a Python codec name (key) to the spelling chardet uses for the same
+# encoding (value). legacy.detect() looks the detected encoding up in this
+# table and returns the mapped name when `should_rename_legacy` is False;
+# encodings that are absent from the table are returned unchanged.
+CHARDET_CORRESPONDENCE: dict[str, str] = {
+    "iso2022_kr": "ISO-2022-KR",
+    "iso2022_jp": "ISO-2022-JP",
+    "euc_kr": "EUC-KR",
+    "tis_620": "TIS-620",
+    "utf_32": "UTF-32",
+    "euc_jp": "EUC-JP",
+    "koi8_r": "KOI8-R",
+    "iso8859_1": "ISO-8859-1",
+    "iso8859_2": "ISO-8859-2",
+    "iso8859_5": "ISO-8859-5",
+    "iso8859_6": "ISO-8859-6",
+    "iso8859_7": "ISO-8859-7",
+    "iso8859_8": "ISO-8859-8",
+    "utf_16": "UTF-16",
+    "cp855": "IBM855",
+    "mac_cyrillic": "MacCyrillic",
+    "gb2312": "GB2312",
+    "gb18030": "GB18030",
+    "cp932": "CP932",
+    "cp866": "IBM866",
+    "utf_8": "utf-8",
+    "utf_8_sig": "UTF-8-SIG",
+    "shift_jis": "SHIFT_JIS",
+    "big5": "Big5",
+    "cp1250": "windows-1250",
+    "cp1251": "windows-1251",
+    "cp1252": "Windows-1252",
+    "cp1253": "windows-1253",
+    "cp1255": "windows-1255",
+    "cp1256": "windows-1256",
+    "cp1254": "Windows-1254",
+    "cp949": "CP949",
+}
+
+
+# ASCII characters that the mess-detection plugins in md.py treat as harmless:
+# TooManySymbolOrPunctuationPlugin never counts them as punctuation or symbol,
+# and SuspiciousRange drops its previously seen character when it meets one.
+COMMON_SAFE_ASCII_CHARACTERS: set[str] = {
+    "<",
+    ">",
+    "=",
+    ":",
+    "/",
+    "&",
+    ";",
+    "{",
+    "}",
+    "[",
+    "]",
+    ",",
+    "|",
+    '"',
+    "-",
+    "(",
+    ")",
+}
+
+
+KO_NAMES: set[str] = {"johab", "cp949", "euc_kr"}
+ZH_NAMES: set[str] = {"big5", "cp950", "big5hkscs", "hz"}
+
+# Logging LEVEL below DEBUG
+TRACE: int = 5
+
+
+# Language labels that contain the em dash "—" character
+# are to be considered alternative sequences of the original language.
+FREQUENCIES: dict[str, list[str]] = {
+    "English": [
+        "e",
+        "a",
+        "t",
+        "i",
+        "o",
+        "n",
+        "s",
+        "r",
+        "h",
+        "l",
+        "d",
+        "c",
+        "u",
+        "m",
+        "f",
+        "p",
+        "g",
+        "w",
+        "y",
+        "b",
+        "v",
+        "k",
+        "x",
+        "j",
+        "z",
+        "q",
+    ],
+    "English—": [
+        "e",
+        "a",
+        "t",
+        "i",
+        "o",
+        "n",
+        "s",
+        "r",
+        "h",
+        "l",
+        "d",
+        "c",
+        "m",
+        "u",
+        "f",
+        "p",
+        "g",
+        "w",
+        "b",
+        "y",
+        "v",
+        "k",
+        "j",
+        "x",
+        "z",
+        "q",
+    ],
+    "German": [
+        "e",
+        "n",
+        "i",
+        "r",
+        "s",
+        "t",
+        "a",
+        "d",
+        "h",
+        "u",
+        "l",
+        "g",
+        "o",
+        "c",
+        "m",
+        "b",
+        "f",
+        "k",
+        "w",
+        "z",
+        "p",
+        "v",
+        "ü",
+        "ä",
+        "ö",
+        "j",
+    ],
+    "French": [
+        "e",
+        "a",
+        "s",
+        "n",
+        "i",
+        "t",
+        "r",
+        "l",
+        "u",
+        "o",
+        "d",
+        "c",
+        "p",
+        "m",
+        "é",
+        "v",
+        "g",
+        "f",
+        "b",
+        "h",
+        "q",
+        "à",
+        "x",
+        "è",
+        "y",
+        "j",
+    ],
+    "Dutch": [
+        "e",
+        "n",
+        "a",
+        "i",
+        "r",
+        "t",
+        "o",
+        "d",
+        "s",
+        "l",
+        "g",
+        "h",
+        "v",
+        "m",
+        "u",
+        "k",
+        "c",
+        "p",
+        "b",
+        "w",
+        "j",
+        "z",
+        "f",
+        "y",
+        "x",
+        "ë",
+    ],
+    "Italian": [
+        "e",
+        "i",
+        "a",
+        "o",
+        "n",
+        "l",
+        "t",
+        "r",
+        "s",
+        "c",
+        "d",
+        "u",
+        "p",
+        "m",
+        "g",
+        "v",
+        "f",
+        "b",
+        "z",
+        "h",
+        "q",
+        "è",
+        "à",
+        "k",
+        "y",
+        "ò",
+    ],
+    "Polish": [
+        "a",
+        "i",
+        "o",
+        "e",
+        "n",
+        "r",
+        "z",
+        "w",
+        "s",
+        "c",
+        "t",
+        "k",
+        "y",
+        "d",
+        "p",
+        "m",
+        "u",
+        "l",
+        "j",
+        "ł",
+        "g",
+        "b",
+        "h",
+        "ą",
+        "ę",
+        "ó",
+    ],
+    "Spanish": [
+        "e",
+        "a",
+        "o",
+        "n",
+        "s",
+        "r",
+        "i",
+        "l",
+        "d",
+        "t",
+        "c",
+        "u",
+        "m",
+        "p",
+        "b",
+        "g",
+        "v",
+        "f",
+        "y",
+        "ó",
+        "h",
+        "q",
+        "í",
+        "j",
+        "z",
+        "á",
+    ],
+    "Russian": [
+        "о",
+        "а",
+        "е",
+        "и",
+        "н",
+        "с",
+        "т",
+        "р",
+        "в",
+        "л",
+        "к",
+        "м",
+        "д",
+        "п",
+        "у",
+        "г",
+        "я",
+        "ы",
+        "з",
+        "б",
+        "й",
+        "ь",
+        "ч",
+        "х",
+        "ж",
+        "ц",
+    ],
+    # Jap-Kanji
+    "Japanese": [
+        "人",
+        "一",
+        "大",
+        "亅",
+        "丁",
+        "丨",
+        "竹",
+        "笑",
+        "口",
+        "日",
+        "今",
+        "二",
+        "彳",
+        "行",
+        "十",
+        "土",
+        "丶",
+        "寸",
+        "寺",
+        "時",
+        "乙",
+        "丿",
+        "乂",
+        "气",
+        "気",
+        "冂",
+        "巾",
+        "亠",
+        "市",
+        "目",
+        "儿",
+        "見",
+        "八",
+        "小",
+        "凵",
+        "県",
+        "月",
+        "彐",
+        "門",
+        "間",
+        "木",
+        "東",
+        "山",
+        "出",
+        "本",
+        "中",
+        "刀",
+        "分",
+        "耳",
+        "又",
+        "取",
+        "最",
+        "言",
+        "田",
+        "心",
+        "思",
+        "刂",
+        "前",
+        "京",
+        "尹",
+        "事",
+        "生",
+        "厶",
+        "云",
+        "会",
+        "未",
+        "来",
+        "白",
+        "冫",
+        "楽",
+        "灬",
+        "馬",
+        "尸",
+        "尺",
+        "駅",
+        "明",
+        "耂",
+        "者",
+        "了",
+        "阝",
+        "都",
+        "高",
+        "卜",
+        "占",
+        "厂",
+        "广",
+        "店",
+        "子",
+        "申",
+        "奄",
+        "亻",
+        "俺",
+        "上",
+        "方",
+        "冖",
+        "学",
+        "衣",
+        "艮",
+        "食",
+        "自",
+    ],
+    # Jap-Katakana
+    "Japanese—": [
+        "ー",
+        "ン",
+        "ス",
+        "・",
+        "ル",
+        "ト",
+        "リ",
+        "イ",
+        "ア",
+        "ラ",
+        "ッ",
+        "ク",
+        "ド",
+        "シ",
+        "レ",
+        "ジ",
+        "タ",
+        "フ",
+        "ロ",
+        "カ",
+        "テ",
+        "マ",
+        "ィ",
+        "グ",
+        "バ",
+        "ム",
+        "プ",
+        "オ",
+        "コ",
+        "デ",
+        "ニ",
+        "ウ",
+        "メ",
+        "サ",
+        "ビ",
+        "ナ",
+        "ブ",
+        "ャ",
+        "エ",
+        "ュ",
+        "チ",
+        "キ",
+        "ズ",
+        "ダ",
+        "パ",
+        "ミ",
+        "ェ",
+        "ョ",
+        "ハ",
+        "セ",
+        "ベ",
+        "ガ",
+        "モ",
+        "ツ",
+        "ネ",
+        "ボ",
+        "ソ",
+        "ノ",
+        "ァ",
+        "ヴ",
+        "ワ",
+        "ポ",
+        "ペ",
+        "ピ",
+        "ケ",
+        "ゴ",
+        "ギ",
+        "ザ",
+        "ホ",
+        "ゲ",
+        "ォ",
+        "ヤ",
+        "ヒ",
+        "ユ",
+        "ヨ",
+        "ヘ",
+        "ゼ",
+        "ヌ",
+        "ゥ",
+        "ゾ",
+        "ヶ",
+        "ヂ",
+        "ヲ",
+        "ヅ",
+        "ヵ",
+        "ヱ",
+        "ヰ",
+        "ヮ",
+        "ヽ",
+        "゠",
+        "ヾ",
+        "ヷ",
+        "ヿ",
+        "ヸ",
+        "ヹ",
+        "ヺ",
+    ],
+    # Jap-Hiragana
+    "Japanese——": [
+        "の",
+        "に",
+        "る",
+        "た",
+        "と",
+        "は",
+        "し",
+        "い",
+        "を",
+        "で",
+        "て",
+        "が",
+        "な",
+        "れ",
+        "か",
+        "ら",
+        "さ",
+        "っ",
+        "り",
+        "す",
+        "あ",
+        "も",
+        "こ",
+        "ま",
+        "う",
+        "く",
+        "よ",
+        "き",
+        "ん",
+        "め",
+        "お",
+        "け",
+        "そ",
+        "つ",
+        "だ",
+        "や",
+        "え",
+        "ど",
+        "わ",
+        "ち",
+        "み",
+        "せ",
+        "じ",
+        "ば",
+        "へ",
+        "び",
+        "ず",
+        "ろ",
+        "ほ",
+        "げ",
+        "む",
+        "べ",
+        "ひ",
+        "ょ",
+        "ゆ",
+        "ぶ",
+        "ご",
+        "ゃ",
+        "ね",
+        "ふ",
+        "ぐ",
+        "ぎ",
+        "ぼ",
+        "ゅ",
+        "づ",
+        "ざ",
+        "ぞ",
+        "ぬ",
+        "ぜ",
+        "ぱ",
+        "ぽ",
+        "ぷ",
+        "ぴ",
+        "ぃ",
+        "ぁ",
+        "ぇ",
+        "ぺ",
+        "ゞ",
+        "ぢ",
+        "ぉ",
+        "ぅ",
+        "ゐ",
+        "ゝ",
+        "ゑ",
+        "゛",
+        "゜",
+        "ゎ",
+        "ゔ",
+        "゚",
+        "ゟ",
+        "゙",
+        "ゕ",
+        "ゖ",
+    ],
+    "Portuguese": [
+        "a",
+        "e",
+        "o",
+        "s",
+        "i",
+        "r",
+        "d",
+        "n",
+        "t",
+        "m",
+        "u",
+        "c",
+        "l",
+        "p",
+        "g",
+        "v",
+        "b",
+        "f",
+        "h",
+        "ã",
+        "q",
+        "é",
+        "ç",
+        "á",
+        "z",
+        "í",
+    ],
+    "Swedish": [
+        "e",
+        "a",
+        "n",
+        "r",
+        "t",
+        "s",
+        "i",
+        "l",
+        "d",
+        "o",
+        "m",
+        "k",
+        "g",
+        "v",
+        "h",
+        "f",
+        "u",
+        "p",
+        "ä",
+        "c",
+        "b",
+        "ö",
+        "å",
+        "y",
+        "j",
+        "x",
+    ],
+    "Chinese": [
+        "的",
+        "一",
+        "是",
+        "不",
+        "了",
+        "在",
+        "人",
+        "有",
+        "我",
+        "他",
+        "这",
+        "个",
+        "们",
+        "中",
+        "来",
+        "上",
+        "大",
+        "为",
+        "和",
+        "国",
+        "地",
+        "到",
+        "以",
+        "说",
+        "时",
+        "要",
+        "就",
+        "出",
+        "会",
+        "可",
+        "也",
+        "你",
+        "对",
+        "生",
+        "能",
+        "而",
+        "子",
+        "那",
+        "得",
+        "于",
+        "着",
+        "下",
+        "自",
+        "之",
+        "年",
+        "过",
+        "发",
+        "后",
+        "作",
+        "里",
+        "用",
+        "道",
+        "行",
+        "所",
+        "然",
+        "家",
+        "种",
+        "事",
+        "成",
+        "方",
+        "多",
+        "经",
+        "么",
+        "去",
+        "法",
+        "学",
+        "如",
+        "都",
+        "同",
+        "现",
+        "当",
+        "没",
+        "动",
+        "面",
+        "起",
+        "看",
+        "定",
+        "天",
+        "分",
+        "还",
+        "进",
+        "好",
+        "小",
+        "部",
+        "其",
+        "些",
+        "主",
+        "样",
+        "理",
+        "心",
+        "她",
+        "本",
+        "前",
+        "开",
+        "但",
+        "因",
+        "只",
+        "从",
+        "想",
+        "实",
+    ],
+    "Ukrainian": [
+        "о",
+        "а",
+        "н",
+        "і",
+        "и",
+        "р",
+        "в",
+        "т",
+        "е",
+        "с",
+        "к",
+        "л",
+        "у",
+        "д",
+        "м",
+        "п",
+        "з",
+        "я",
+        "ь",
+        "б",
+        "г",
+        "й",
+        "ч",
+        "х",
+        "ц",
+        "ї",
+    ],
+    "Norwegian": [
+        "e",
+        "r",
+        "n",
+        "t",
+        "a",
+        "s",
+        "i",
+        "o",
+        "l",
+        "d",
+        "g",
+        "k",
+        "m",
+        "v",
+        "f",
+        "p",
+        "u",
+        "b",
+        "h",
+        "å",
+        "y",
+        "j",
+        "ø",
+        "c",
+        "æ",
+        "w",
+    ],
+    "Finnish": [
+        "a",
+        "i",
+        "n",
+        "t",
+        "e",
+        "s",
+        "l",
+        "o",
+        "u",
+        "k",
+        "ä",
+        "m",
+        "r",
+        "v",
+        "j",
+        "h",
+        "p",
+        "y",
+        "d",
+        "ö",
+        "g",
+        "c",
+        "b",
+        "f",
+        "w",
+        "z",
+    ],
+    "Vietnamese": [
+        "n",
+        "h",
+        "t",
+        "i",
+        "c",
+        "g",
+        "a",
+        "o",
+        "u",
+        "m",
+        "l",
+        "r",
+        "à",
+        "đ",
+        "s",
+        "e",
+        "v",
+        "p",
+        "b",
+        "y",
+        "ư",
+        "d",
+        "á",
+        "k",
+        "ộ",
+        "ế",
+    ],
+    "Czech": [
+        "o",
+        "e",
+        "a",
+        "n",
+        "t",
+        "s",
+        "i",
+        "l",
+        "v",
+        "r",
+        "k",
+        "d",
+        "u",
+        "m",
+        "p",
+        "í",
+        "c",
+        "h",
+        "z",
+        "á",
+        "y",
+        "j",
+        "b",
+        "ě",
+        "é",
+        "ř",
+    ],
+    "Hungarian": [
+        "e",
+        "a",
+        "t",
+        "l",
+        "s",
+        "n",
+        "k",
+        "r",
+        "i",
+        "o",
+        "z",
+        "á",
+        "é",
+        "g",
+        "m",
+        "b",
+        "y",
+        "v",
+        "d",
+        "h",
+        "u",
+        "p",
+        "j",
+        "ö",
+        "f",
+        "c",
+    ],
+    "Korean": [
+        "이",
+        "다",
+        "에",
+        "의",
+        "는",
+        "로",
+        "하",
+        "을",
+        "가",
+        "고",
+        "지",
+        "서",
+        "한",
+        "은",
+        "기",
+        "으",
+        "년",
+        "대",
+        "사",
+        "시",
+        "를",
+        "리",
+        "도",
+        "인",
+        "스",
+        "일",
+    ],
+    "Indonesian": [
+        "a",
+        "n",
+        "e",
+        "i",
+        "r",
+        "t",
+        "u",
+        "s",
+        "d",
+        "k",
+        "m",
+        "l",
+        "g",
+        "p",
+        "b",
+        "o",
+        "h",
+        "y",
+        "j",
+        "c",
+        "w",
+        "f",
+        "v",
+        "z",
+        "x",
+        "q",
+    ],
+    "Turkish": [
+        "a",
+        "e",
+        "i",
+        "n",
+        "r",
+        "l",
+        "ı",
+        "k",
+        "d",
+        "t",
+        "s",
+        "m",
+        "y",
+        "u",
+        "o",
+        "b",
+        "ü",
+        "ş",
+        "v",
+        "g",
+        "z",
+        "h",
+        "c",
+        "p",
+        "ç",
+        "ğ",
+    ],
+    "Romanian": [
+        "e",
+        "i",
+        "a",
+        "r",
+        "n",
+        "t",
+        "u",
+        "l",
+        "o",
+        "c",
+        "s",
+        "d",
+        "p",
+        "m",
+        "ă",
+        "f",
+        "v",
+        "î",
+        "g",
+        "b",
+        "ș",
+        "ț",
+        "z",
+        "h",
+        "â",
+        "j",
+    ],
+    "Farsi": [
+        "ا",
+        "ی",
+        "ر",
+        "د",
+        "ن",
+        "ه",
+        "و",
+        "م",
+        "ت",
+        "ب",
+        "س",
+        "ل",
+        "ک",
+        "ش",
+        "ز",
+        "ف",
+        "گ",
+        "ع",
+        "خ",
+        "ق",
+        "ج",
+        "آ",
+        "پ",
+        "ح",
+        "ط",
+        "ص",
+    ],
+    "Arabic": [
+        "ا",
+        "ل",
+        "ي",
+        "م",
+        "و",
+        "ن",
+        "ر",
+        "ت",
+        "ب",
+        "ة",
+        "ع",
+        "د",
+        "س",
+        "ف",
+        "ه",
+        "ك",
+        "ق",
+        "أ",
+        "ح",
+        "ج",
+        "ش",
+        "ط",
+        "ص",
+        "ى",
+        "خ",
+        "إ",
+    ],
+    "Danish": [
+        "e",
+        "r",
+        "n",
+        "t",
+        "a",
+        "i",
+        "s",
+        "d",
+        "l",
+        "o",
+        "g",
+        "m",
+        "k",
+        "f",
+        "v",
+        "u",
+        "b",
+        "h",
+        "p",
+        "å",
+        "y",
+        "ø",
+        "æ",
+        "c",
+        "j",
+        "w",
+    ],
+    "Serbian": [
+        "а",
+        "и",
+        "о",
+        "е",
+        "н",
+        "р",
+        "с",
+        "у",
+        "т",
+        "к",
+        "ј",
+        "в",
+        "д",
+        "м",
+        "п",
+        "л",
+        "г",
+        "з",
+        "б",
+        "a",
+        "i",
+        "e",
+        "o",
+        "n",
+        "ц",
+        "ш",
+    ],
+    "Lithuanian": [
+        "i",
+        "a",
+        "s",
+        "o",
+        "r",
+        "e",
+        "t",
+        "n",
+        "u",
+        "k",
+        "m",
+        "l",
+        "p",
+        "v",
+        "d",
+        "j",
+        "g",
+        "ė",
+        "b",
+        "y",
+        "ų",
+        "š",
+        "ž",
+        "c",
+        "ą",
+        "į",
+    ],
+    "Slovene": [
+        "e",
+        "a",
+        "i",
+        "o",
+        "n",
+        "r",
+        "s",
+        "l",
+        "t",
+        "j",
+        "v",
+        "k",
+        "d",
+        "p",
+        "m",
+        "u",
+        "z",
+        "b",
+        "g",
+        "h",
+        "č",
+        "c",
+        "š",
+        "ž",
+        "f",
+        "y",
+    ],
+    "Slovak": [
+        "o",
+        "a",
+        "e",
+        "n",
+        "i",
+        "r",
+        "v",
+        "t",
+        "s",
+        "l",
+        "k",
+        "d",
+        "m",
+        "p",
+        "u",
+        "c",
+        "h",
+        "j",
+        "b",
+        "z",
+        "á",
+        "y",
+        "ý",
+        "í",
+        "č",
+        "é",
+    ],
+    "Hebrew": [
+        "י",
+        "ו",
+        "ה",
+        "ל",
+        "ר",
+        "ב",
+        "ת",
+        "מ",
+        "א",
+        "ש",
+        "נ",
+        "ע",
+        "ם",
+        "ד",
+        "ק",
+        "ח",
+        "פ",
+        "ס",
+        "כ",
+        "ג",
+        "ט",
+        "צ",
+        "ן",
+        "ז",
+        "ך",
+    ],
+    "Bulgarian": [
+        "а",
+        "и",
+        "о",
+        "е",
+        "н",
+        "т",
+        "р",
+        "с",
+        "в",
+        "л",
+        "к",
+        "д",
+        "п",
+        "м",
+        "з",
+        "г",
+        "я",
+        "ъ",
+        "у",
+        "б",
+        "ч",
+        "ц",
+        "й",
+        "ж",
+        "щ",
+        "х",
+    ],
+    "Croatian": [
+        "a",
+        "i",
+        "o",
+        "e",
+        "n",
+        "r",
+        "j",
+        "s",
+        "t",
+        "u",
+        "k",
+        "l",
+        "v",
+        "d",
+        "m",
+        "p",
+        "g",
+        "z",
+        "b",
+        "c",
+        "č",
+        "h",
+        "š",
+        "ž",
+        "ć",
+        "f",
+    ],
+    "Hindi": [
+        "क",
+        "र",
+        "स",
+        "न",
+        "त",
+        "म",
+        "ह",
+        "प",
+        "य",
+        "ल",
+        "व",
+        "ज",
+        "द",
+        "ग",
+        "ब",
+        "श",
+        "ट",
+        "अ",
+        "ए",
+        "थ",
+        "भ",
+        "ड",
+        "च",
+        "ध",
+        "ष",
+        "इ",
+    ],
+    "Estonian": [
+        "a",
+        "i",
+        "e",
+        "s",
+        "t",
+        "l",
+        "u",
+        "n",
+        "o",
+        "k",
+        "r",
+        "d",
+        "m",
+        "v",
+        "g",
+        "p",
+        "j",
+        "h",
+        "ä",
+        "b",
+        "õ",
+        "ü",
+        "f",
+        "c",
+        "ö",
+        "y",
+    ],
+    "Thai": [
+        "า",
+        "น",
+        "ร",
+        "อ",
+        "ก",
+        "เ",
+        "ง",
+        "ม",
+        "ย",
+        "ล",
+        "ว",
+        "ด",
+        "ท",
+        "ส",
+        "ต",
+        "ะ",
+        "ป",
+        "บ",
+        "ค",
+        "ห",
+        "แ",
+        "จ",
+        "พ",
+        "ช",
+        "ข",
+        "ใ",
+    ],
+    "Greek": [
+        "α",
+        "τ",
+        "ο",
+        "ι",
+        "ε",
+        "ν",
+        "ρ",
+        "σ",
+        "κ",
+        "η",
+        "π",
+        "ς",
+        "υ",
+        "μ",
+        "λ",
+        "ί",
+        "ό",
+        "ά",
+        "γ",
+        "έ",
+        "δ",
+        "ή",
+        "ω",
+        "χ",
+        "θ",
+        "ύ",
+    ],
+    "Tamil": [
+        "க",
+        "த",
+        "ப",
+        "ட",
+        "ர",
+        "ம",
+        "ல",
+        "ன",
+        "வ",
+        "ற",
+        "ய",
+        "ள",
+        "ச",
+        "ந",
+        "இ",
+        "ண",
+        "அ",
+        "ஆ",
+        "ழ",
+        "ங",
+        "எ",
+        "உ",
+        "ஒ",
+        "ஸ",
+    ],
+    "Kazakh": [
+        "а",
+        "ы",
+        "е",
+        "н",
+        "т",
+        "р",
+        "л",
+        "і",
+        "д",
+        "с",
+        "м",
+        "қ",
+        "к",
+        "о",
+        "б",
+        "и",
+        "у",
+        "ғ",
+        "ж",
+        "ң",
+        "з",
+        "ш",
+        "й",
+        "п",
+        "г",
+        "ө",
+    ],
+}
+
+LANGUAGE_SUPPORTED_COUNT: int = len(FREQUENCIES)
diff --git a/.venv/lib/python3.11/site-packages/charset_normalizer/legacy.py b/.venv/lib/python3.11/site-packages/charset_normalizer/legacy.py
new file mode 100644
index 0000000000000000000000000000000000000000..a2f534514120df5ed1170bc298f81250405655db
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/charset_normalizer/legacy.py
@@ -0,0 +1,66 @@
+from __future__ import annotations
+
+from typing import TYPE_CHECKING, Any
+from warnings import warn
+
+from .api import from_bytes
+from .constant import CHARDET_CORRESPONDENCE
+
+# TODO: remove this check when dropping Python 3.7 support
+# ResultDict is only declared for static type checkers: the import and the
+# class live under TYPE_CHECKING, and the module's
+# `from __future__ import annotations` keeps the `-> ResultDict` annotation
+# on detect() from being evaluated when the function is defined.
+if TYPE_CHECKING:
+    from typing_extensions import TypedDict
+
+    # Shape of the dictionary returned by detect().
+    class ResultDict(TypedDict):
+        # Detected encoding name, or None when no best match was found.
+        encoding: str | None
+        # Detected language; "" when there is no match or it is "Unknown".
+        language: str
+        # 1.0 minus the chaos of the best match, or None when there is none.
+        confidence: float | None
+
+
+def detect(
+    byte_str: bytes, should_rename_legacy: bool = False, **kwargs: Any
+) -> ResultDict:
+    """
+    chardet legacy method
+    Detect the encoding of the given byte string. It should be mostly backward-compatible.
+    Encoding name will match Chardet own writing whenever possible. (Not on encoding name unsupported by it)
+    This function is deprecated and should be used to migrate your project easily, consult the documentation for
+    further information. Not planned for removal.
+
+    :param byte_str:     The byte sequence to examine.
+    :param should_rename_legacy:  Should we rename legacy encodings
+                                  to their more modern equivalents?
+    """
+    if len(kwargs):
+        warn(
+            f"charset-normalizer disregard arguments '{','.join(list(kwargs.keys()))}' in legacy function detect()"
+        )
+
+    if not isinstance(byte_str, (bytearray, bytes)):
+        raise TypeError(  # pragma: nocover
+            "Expected object of type bytes or bytearray, got: " "{}".format(
+                type(byte_str)
+            )
+        )
+
+    if isinstance(byte_str, bytearray):
+        byte_str = bytes(byte_str)
+
+    r = from_bytes(byte_str).best()
+
+    encoding = r.encoding if r is not None else None
+    language = r.language if r is not None and r.language != "Unknown" else ""
+    confidence = 1.0 - r.chaos if r is not None else None
+
+    # Note: CharsetNormalizer does not return 'UTF-8-SIG' as the sig get stripped in the detection/normalization process
+    # but chardet does return 'utf-8-sig' and it is a valid codec name.
+    if r is not None and encoding == "utf_8" and r.bom:
+        encoding += "_sig"
+
+    if should_rename_legacy is False and encoding in CHARDET_CORRESPONDENCE:
+        encoding = CHARDET_CORRESPONDENCE[encoding]
+
+    return {
+        "encoding": encoding,
+        "language": language,
+        "confidence": confidence,
+    }
diff --git a/.venv/lib/python3.11/site-packages/charset_normalizer/md.cpython-311-x86_64-linux-gnu.so b/.venv/lib/python3.11/site-packages/charset_normalizer/md.cpython-311-x86_64-linux-gnu.so
new file mode 100644
index 0000000000000000000000000000000000000000..3824a428ffd621958e1f1f22dfd105c58417ffd0
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/charset_normalizer/md.cpython-311-x86_64-linux-gnu.so differ
diff --git a/.venv/lib/python3.11/site-packages/charset_normalizer/md.py b/.venv/lib/python3.11/site-packages/charset_normalizer/md.py
new file mode 100644
index 0000000000000000000000000000000000000000..9ed59a868dafeebdf07132d79f48fe28e202ec0a
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/charset_normalizer/md.py
@@ -0,0 +1,630 @@
+from __future__ import annotations
+
+from functools import lru_cache
+from logging import getLogger
+
+from .constant import (
+    COMMON_SAFE_ASCII_CHARACTERS,
+    TRACE,
+    UNICODE_SECONDARY_RANGE_KEYWORD,
+)
+from .utils import (
+    is_accentuated,
+    is_arabic,
+    is_arabic_isolated_form,
+    is_case_variable,
+    is_cjk,
+    is_emoticon,
+    is_hangul,
+    is_hiragana,
+    is_katakana,
+    is_latin,
+    is_punctuation,
+    is_separator,
+    is_symbol,
+    is_thai,
+    is_unprintable,
+    remove_accent,
+    unicode_range,
+)
+
+
+class MessDetectorPlugin:
+    """
+    Base abstract class used for mess detection plugins.
+    All detectors MUST extend it and implement the methods below; every
+    method here raises NotImplementedError.
+    """
+
+    def eligible(self, character: str) -> bool:
+        """
+        Determine whether the given character should be passed to feed().
+
+        :param character: The character under consideration.
+        :return: True when the plugin wants to be fed this character.
+        """
+        raise NotImplementedError  # pragma: nocover
+
+    def feed(self, character: str) -> None:
+        """
+        The main routine to be executed upon each character.
+        Insert the logic by which the text would be considered chaotic.
+
+        :param character: The character to analyse.
+        """
+        raise NotImplementedError  # pragma: nocover
+
+    def reset(self) -> None:  # pragma: no cover
+        """
+        Reset the plugin to its initial state.
+        """
+        raise NotImplementedError
+
+    @property
+    def ratio(self) -> float:
+        """
+        Compute the chaos ratio based on what feed() has seen so far.
+        Must NOT be lower than 0.0; there is no upper bound.
+        """
+        raise NotImplementedError  # pragma: nocover
+
+
+class TooManySymbolOrPunctuationPlugin(MessDetectorPlugin):
+    def __init__(self) -> None:
+        self._punctuation_count: int = 0
+        self._symbol_count: int = 0
+        self._character_count: int = 0
+
+        self._last_printable_char: str | None = None
+        self._frenzy_symbol_in_word: bool = False
+
+    def eligible(self, character: str) -> bool:
+        return character.isprintable()
+
+    def feed(self, character: str) -> None:
+        self._character_count += 1
+
+        if (
+            character != self._last_printable_char
+            and character not in COMMON_SAFE_ASCII_CHARACTERS
+        ):
+            if is_punctuation(character):
+                self._punctuation_count += 1
+            elif (
+                character.isdigit() is False
+                and is_symbol(character)
+                and is_emoticon(character) is False
+            ):
+                self._symbol_count += 2
+
+        self._last_printable_char = character
+
+    def reset(self) -> None:  # Abstract
+        self._punctuation_count = 0
+        self._character_count = 0
+        self._symbol_count = 0
+
+    @property
+    def ratio(self) -> float:
+        if self._character_count == 0:
+            return 0.0
+
+        ratio_of_punctuation: float = (
+            self._punctuation_count + self._symbol_count
+        ) / self._character_count
+
+        return ratio_of_punctuation if ratio_of_punctuation >= 0.3 else 0.0
+
+
+class TooManyAccentuatedPlugin(MessDetectorPlugin):
+    def __init__(self) -> None:
+        self._character_count: int = 0
+        self._accentuated_count: int = 0
+
+    def eligible(self, character: str) -> bool:
+        return character.isalpha()
+
+    def feed(self, character: str) -> None:
+        self._character_count += 1
+
+        if is_accentuated(character):
+            self._accentuated_count += 1
+
+    def reset(self) -> None:  # Abstract
+        self._character_count = 0
+        self._accentuated_count = 0
+
+    @property
+    def ratio(self) -> float:
+        if self._character_count < 8:
+            return 0.0
+
+        ratio_of_accentuation: float = self._accentuated_count / self._character_count
+        return ratio_of_accentuation if ratio_of_accentuation >= 0.35 else 0.0
+
+
+class UnprintablePlugin(MessDetectorPlugin):
+    def __init__(self) -> None:
+        self._unprintable_count: int = 0
+        self._character_count: int = 0
+
+    def eligible(self, character: str) -> bool:
+        return True
+
+    def feed(self, character: str) -> None:
+        if is_unprintable(character):
+            self._unprintable_count += 1
+        self._character_count += 1
+
+    def reset(self) -> None:  # Abstract
+        self._unprintable_count = 0
+
+    @property
+    def ratio(self) -> float:
+        if self._character_count == 0:
+            return 0.0
+
+        return (self._unprintable_count * 8) / self._character_count
+
+
+class SuspiciousDuplicateAccentPlugin(MessDetectorPlugin):
+    def __init__(self) -> None:
+        self._successive_count: int = 0
+        self._character_count: int = 0
+
+        self._last_latin_character: str | None = None
+
+    def eligible(self, character: str) -> bool:
+        return character.isalpha() and is_latin(character)
+
+    def feed(self, character: str) -> None:
+        self._character_count += 1
+        if (
+            self._last_latin_character is not None
+            and is_accentuated(character)
+            and is_accentuated(self._last_latin_character)
+        ):
+            if character.isupper() and self._last_latin_character.isupper():
+                self._successive_count += 1
+            # Worse if its the same char duplicated with different accent.
+            if remove_accent(character) == remove_accent(self._last_latin_character):
+                self._successive_count += 1
+        self._last_latin_character = character
+
+    def reset(self) -> None:  # Abstract
+        self._successive_count = 0
+        self._character_count = 0
+        self._last_latin_character = None
+
+    @property
+    def ratio(self) -> float:
+        if self._character_count == 0:
+            return 0.0
+
+        return (self._successive_count * 2) / self._character_count
+
+
+class SuspiciousRange(MessDetectorPlugin):
+    def __init__(self) -> None:
+        self._suspicious_successive_range_count: int = 0
+        self._character_count: int = 0
+        self._last_printable_seen: str | None = None
+
+    def eligible(self, character: str) -> bool:
+        return character.isprintable()
+
+    def feed(self, character: str) -> None:
+        self._character_count += 1
+
+        if (
+            character.isspace()
+            or is_punctuation(character)
+            or character in COMMON_SAFE_ASCII_CHARACTERS
+        ):
+            self._last_printable_seen = None
+            return
+
+        if self._last_printable_seen is None:
+            self._last_printable_seen = character
+            return
+
+        unicode_range_a: str | None = unicode_range(self._last_printable_seen)
+        unicode_range_b: str | None = unicode_range(character)
+
+        if is_suspiciously_successive_range(unicode_range_a, unicode_range_b):
+            self._suspicious_successive_range_count += 1
+
+        self._last_printable_seen = character
+
+    def reset(self) -> None:  # Abstract
+        self._character_count = 0
+        self._suspicious_successive_range_count = 0
+        self._last_printable_seen = None
+
+    @property
+    def ratio(self) -> float:
+        if self._character_count <= 13:
+            return 0.0
+
+        ratio_of_suspicious_range_usage: float = (
+            self._suspicious_successive_range_count * 2
+        ) / self._character_count
+
+        return ratio_of_suspicious_range_usage
+
+
+class SuperWeirdWordPlugin(MessDetectorPlugin):
+    def __init__(self) -> None:
+        self._word_count: int = 0
+        self._bad_word_count: int = 0
+        self._foreign_long_count: int = 0
+
+        self._is_current_word_bad: bool = False
+        self._foreign_long_watch: bool = False
+
+        self._character_count: int = 0
+        self._bad_character_count: int = 0
+
+        self._buffer: str = ""
+        self._buffer_accent_count: int = 0
+        self._buffer_glyph_count: int = 0
+
+    def eligible(self, character: str) -> bool:
+        return True
+
+    def feed(self, character: str) -> None:
+        if character.isalpha():
+            self._buffer += character
+            if is_accentuated(character):
+                self._buffer_accent_count += 1
+            if (
+                self._foreign_long_watch is False
+                and (is_latin(character) is False or is_accentuated(character))
+                and is_cjk(character) is False
+                and is_hangul(character) is False
+                and is_katakana(character) is False
+                and is_hiragana(character) is False
+                and is_thai(character) is False
+            ):
+                self._foreign_long_watch = True
+            if (
+                is_cjk(character)
+                or is_hangul(character)
+                or is_katakana(character)
+                or is_hiragana(character)
+                or is_thai(character)
+            ):
+                self._buffer_glyph_count += 1
+            return
+        if not self._buffer:
+            return
+        if (
+            character.isspace() or is_punctuation(character) or is_separator(character)
+        ) and self._buffer:
+            self._word_count += 1
+            buffer_length: int = len(self._buffer)
+
+            self._character_count += buffer_length
+
+            if buffer_length >= 4:
+                if self._buffer_accent_count / buffer_length >= 0.5:
+                    self._is_current_word_bad = True
+                # Words/buffers ending with an upper-case accentuated letter are so rare
+                # that we consider them all suspicious. Same weight as a foreign_long suspicion.
+                elif (
+                    is_accentuated(self._buffer[-1])
+                    and self._buffer[-1].isupper()
+                    and all(_.isupper() for _ in self._buffer) is False
+                ):
+                    self._foreign_long_count += 1
+                    self._is_current_word_bad = True
+                elif self._buffer_glyph_count == 1:
+                    self._is_current_word_bad = True
+                    self._foreign_long_count += 1
+            if buffer_length >= 24 and self._foreign_long_watch:
+                camel_case_dst = [
+                    i
+                    for c, i in zip(self._buffer, range(0, buffer_length))
+                    if c.isupper()
+                ]
+                probable_camel_cased: bool = False
+
+                if camel_case_dst and (len(camel_case_dst) / buffer_length <= 0.3):
+                    probable_camel_cased = True
+
+                if not probable_camel_cased:
+                    self._foreign_long_count += 1
+                    self._is_current_word_bad = True
+
+            if self._is_current_word_bad:
+                self._bad_word_count += 1
+                self._bad_character_count += len(self._buffer)
+                self._is_current_word_bad = False
+
+            self._foreign_long_watch = False
+            self._buffer = ""
+            self._buffer_accent_count = 0
+            self._buffer_glyph_count = 0
+        elif (
+            character not in {"<", ">", "-", "=", "~", "|", "_"}
+            and character.isdigit() is False
+            and is_symbol(character)
+        ):
+            self._is_current_word_bad = True
+            self._buffer += character
+
+    def reset(self) -> None:  # Abstract; restores every field to its __init__ value
+        self._buffer = ""
+        # Per-word tallies must be cleared with the buffer or they leak into the next word.
+        self._buffer_accent_count = self._buffer_glyph_count = 0
+        self._is_current_word_bad = False
+        self._foreign_long_watch = False
+        self._word_count = self._bad_word_count = 0
+        self._character_count = self._bad_character_count = 0
+        self._foreign_long_count = 0
+
+    @property
+    def ratio(self) -> float:
+        if self._word_count <= 10 and self._foreign_long_count == 0:
+            return 0.0
+
+        return self._bad_character_count / self._character_count
+
+
+class CjkInvalidStopPlugin(MessDetectorPlugin):
+    """
+    GB(Chinese) based encoding often render the stop incorrectly when the content does not fit and
+    can be easily detected. Searching for the overuse of '丅' and '丄'.
+    """
+
+    def __init__(self) -> None:
+        self._wrong_stop_count: int = 0
+        self._cjk_character_count: int = 0
+
+    def eligible(self, character: str) -> bool:
+        return True
+
+    def feed(self, character: str) -> None:
+        if character in {"丅", "丄"}:
+            self._wrong_stop_count += 1
+            return
+        if is_cjk(character):
+            self._cjk_character_count += 1
+
+    def reset(self) -> None:  # Abstract
+        self._wrong_stop_count = 0
+        self._cjk_character_count = 0
+
+    @property
+    def ratio(self) -> float:
+        if self._cjk_character_count < 16:
+            return 0.0
+        return self._wrong_stop_count / self._cjk_character_count
+
+
+class ArchaicUpperLowerPlugin(MessDetectorPlugin):
+    def __init__(self) -> None:
+        self._buf: bool = False
+
+        self._character_count_since_last_sep: int = 0
+
+        self._successive_upper_lower_count: int = 0
+        self._successive_upper_lower_count_final: int = 0
+
+        self._character_count: int = 0
+
+        self._last_alpha_seen: str | None = None
+        self._current_ascii_only: bool = True
+
+    def eligible(self, character: str) -> bool:
+        return True
+
+    def feed(self, character: str) -> None:
+        is_concerned = character.isalpha() and is_case_variable(character)
+        chunk_sep = is_concerned is False
+
+        if chunk_sep and self._character_count_since_last_sep > 0:
+            if (
+                self._character_count_since_last_sep <= 64
+                and character.isdigit() is False
+                and self._current_ascii_only is False
+            ):
+                self._successive_upper_lower_count_final += (
+                    self._successive_upper_lower_count
+                )
+
+            self._successive_upper_lower_count = 0
+            self._character_count_since_last_sep = 0
+            self._last_alpha_seen = None
+            self._buf = False
+            self._character_count += 1
+            self._current_ascii_only = True
+
+            return
+
+        if self._current_ascii_only is True and character.isascii() is False:
+            self._current_ascii_only = False
+
+        if self._last_alpha_seen is not None:
+            if (character.isupper() and self._last_alpha_seen.islower()) or (
+                character.islower() and self._last_alpha_seen.isupper()
+            ):
+                if self._buf is True:
+                    self._successive_upper_lower_count += 2
+                    self._buf = False
+                else:
+                    self._buf = True
+            else:
+                self._buf = False
+
+        self._character_count += 1
+        self._character_count_since_last_sep += 1
+        self._last_alpha_seen = character
+
+    def reset(self) -> None:  # Abstract
+        self._character_count = 0
+        self._character_count_since_last_sep = 0
+        self._successive_upper_lower_count = 0
+        self._successive_upper_lower_count_final = 0
+        self._last_alpha_seen = None
+        self._buf = False
+        self._current_ascii_only = True
+
+    @property
+    def ratio(self) -> float:
+        if self._character_count == 0:
+            return 0.0
+
+        return self._successive_upper_lower_count_final / self._character_count
+
+
+class ArabicIsolatedFormPlugin(MessDetectorPlugin):
+    def __init__(self) -> None:
+        self._character_count: int = 0
+        self._isolated_form_count: int = 0
+
+    def reset(self) -> None:  # Abstract
+        self._character_count = 0
+        self._isolated_form_count = 0
+
+    def eligible(self, character: str) -> bool:
+        return is_arabic(character)
+
+    def feed(self, character: str) -> None:
+        self._character_count += 1
+
+        if is_arabic_isolated_form(character):
+            self._isolated_form_count += 1
+
+    @property
+    def ratio(self) -> float:
+        if self._character_count < 8:
+            return 0.0
+
+        isolated_form_usage: float = self._isolated_form_count / self._character_count
+
+        return isolated_form_usage
+
+
+@lru_cache(maxsize=1024)
+def is_suspiciously_successive_range(
+    unicode_range_a: str | None, unicode_range_b: str | None
+) -> bool:
+    """
+    Determine if two Unicode range seen next to each other can be considered as suspicious.
+    """
+    if unicode_range_a is None or unicode_range_b is None:
+        return True
+
+    if unicode_range_a == unicode_range_b:
+        return False
+
+    if "Latin" in unicode_range_a and "Latin" in unicode_range_b:
+        return False
+
+    if "Emoticons" in unicode_range_a or "Emoticons" in unicode_range_b:
+        return False
+
+    # Latin characters can be accompanied with a combining diacritical mark
+    # eg. Vietnamese.
+    if ("Latin" in unicode_range_a or "Latin" in unicode_range_b) and (
+        "Combining" in unicode_range_a or "Combining" in unicode_range_b
+    ):
+        return False
+
+    keywords_range_a, keywords_range_b = (
+        unicode_range_a.split(" "),
+        unicode_range_b.split(" "),
+    )
+
+    for el in keywords_range_a:
+        if el in UNICODE_SECONDARY_RANGE_KEYWORD:
+            continue
+        if el in keywords_range_b:
+            return False
+
+    # Japanese Exception
+    range_a_jp_chars, range_b_jp_chars = (
+        unicode_range_a
+        in (
+            "Hiragana",
+            "Katakana",
+        ),
+        unicode_range_b in ("Hiragana", "Katakana"),
+    )
+    if (range_a_jp_chars or range_b_jp_chars) and (
+        "CJK" in unicode_range_a or "CJK" in unicode_range_b
+    ):
+        return False
+    if range_a_jp_chars and range_b_jp_chars:
+        return False
+
+    if "Hangul" in unicode_range_a or "Hangul" in unicode_range_b:
+        if "CJK" in unicode_range_a or "CJK" in unicode_range_b:
+            return False
+        if unicode_range_a == "Basic Latin" or unicode_range_b == "Basic Latin":
+            return False
+
+    # Chinese/Japanese use dedicated range for punctuation and/or separators.
+    if ("CJK" in unicode_range_a or "CJK" in unicode_range_b) or (
+        unicode_range_a in ["Katakana", "Hiragana"]
+        and unicode_range_b in ["Katakana", "Hiragana"]
+    ):
+        if "Punctuation" in unicode_range_a or "Punctuation" in unicode_range_b:
+            return False
+        if "Forms" in unicode_range_a or "Forms" in unicode_range_b:
+            return False
+        if unicode_range_a == "Basic Latin" or unicode_range_b == "Basic Latin":
+            return False
+
+    return True
+
+
+@lru_cache(maxsize=2048)
+def mess_ratio(
+    decoded_sequence: str, maximum_threshold: float = 0.2, debug: bool = False
+) -> float:
+    """
+    Sum the ratio of every mess detector plugin over a decoded sequence; stops early once maximum_threshold is reached.
+    """
+
+    detectors: list[MessDetectorPlugin] = [
+        md_class() for md_class in MessDetectorPlugin.__subclasses__()
+    ]
+
+    length: int = len(decoded_sequence) + 1  # +1 accounts for the "\n" appended in the loop below
+
+    mean_mess_ratio: float = 0.0
+
+    if length < 512:
+        intermediary_mean_mess_ratio_calc: int = 32
+    elif length <= 1024:
+        intermediary_mean_mess_ratio_calc = 64
+    else:
+        intermediary_mean_mess_ratio_calc = 128
+
+    for index, character in enumerate(decoded_sequence + "\n"):
+        for detector in detectors:
+            if detector.eligible(character):
+                detector.feed(character)
+
+        if (
+            index > 0 and index % intermediary_mean_mess_ratio_calc == 0
+        ) or index == length - 1:
+            mean_mess_ratio = sum(dt.ratio for dt in detectors)
+
+            if mean_mess_ratio >= maximum_threshold:
+                break
+
+    if debug:
+        logger = getLogger("charset_normalizer")
+
+        logger.log(
+            TRACE,
+            "Mess-detector extended-analysis start. "
+            f"intermediary_mean_mess_ratio_calc={intermediary_mean_mess_ratio_calc} mean_mess_ratio={mean_mess_ratio} "
+            f"maximum_threshold={maximum_threshold}",
+        )
+
+        if len(decoded_sequence) > 16:
+            logger.log(TRACE, f"Starting with: {decoded_sequence[:16]}")
+            logger.log(TRACE, f"Ending with: {decoded_sequence[-16::]}")
+
+        for dt in detectors:
+            logger.log(TRACE, f"{dt.__class__}: {dt.ratio}")
+
+    return round(mean_mess_ratio, 3)
diff --git a/.venv/lib/python3.11/site-packages/charset_normalizer/models.py b/.venv/lib/python3.11/site-packages/charset_normalizer/models.py
new file mode 100644
index 0000000000000000000000000000000000000000..1042758f873d2a54f7078c5411b17ffc11dca4ee
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/charset_normalizer/models.py
@@ -0,0 +1,360 @@
+from __future__ import annotations
+
+from encodings.aliases import aliases
+from hashlib import sha256
+from json import dumps
+from re import sub
+from typing import Any, Iterator, List, Tuple
+
+from .constant import RE_POSSIBLE_ENCODING_INDICATION, TOO_BIG_SEQUENCE
+from .utils import iana_name, is_multi_byte_encoding, unicode_range
+
+
+class CharsetMatch:
+    def __init__(
+        self,
+        payload: bytes,
+        guessed_encoding: str,
+        mean_mess_ratio: float,
+        has_sig_or_bom: bool,
+        languages: CoherenceMatches,
+        decoded_payload: str | None = None,
+        preemptive_declaration: str | None = None,
+    ):
+        self._payload: bytes = payload
+
+        self._encoding: str = guessed_encoding
+        self._mean_mess_ratio: float = mean_mess_ratio
+        self._languages: CoherenceMatches = languages
+        self._has_sig_or_bom: bool = has_sig_or_bom
+        self._unicode_ranges: list[str] | None = None
+
+        self._leaves: list[CharsetMatch] = []
+        self._mean_coherence_ratio: float = 0.0
+
+        self._output_payload: bytes | None = None
+        self._output_encoding: str | None = None
+
+        self._string: str | None = decoded_payload
+
+        self._preemptive_declaration: str | None = preemptive_declaration
+
+    def __eq__(self, other: object) -> bool:
+        if not isinstance(other, CharsetMatch):
+            if isinstance(other, str):
+                return iana_name(other) == self.encoding
+            return False
+        return self.encoding == other.encoding and self.fingerprint == other.fingerprint
+
+    def __lt__(self, other: object) -> bool:
+        """
+        Implemented to make sorted available upon CharsetMatches items.
+        """
+        if not isinstance(other, CharsetMatch):
+            raise ValueError
+
+        chaos_difference: float = abs(self.chaos - other.chaos)
+        coherence_difference: float = abs(self.coherence - other.coherence)
+
+        # Below 1% difference --> Use Coherence
+        if chaos_difference < 0.01 and coherence_difference > 0.02:
+            return self.coherence > other.coherence
+        elif chaos_difference < 0.01 and coherence_difference <= 0.02:
+            # When having a difficult decision, use the result that decoded as many multi-byte as possible.
+            # preserve RAM usage!
+            if len(self._payload) >= TOO_BIG_SEQUENCE:
+                return self.chaos < other.chaos
+            return self.multi_byte_usage > other.multi_byte_usage
+
+        return self.chaos < other.chaos
+
+    @property
+    def multi_byte_usage(self) -> float:  # 1 - chars/bytes; 0.0 for an empty payload
+        return 1.0 - (len(str(self)) / len(self.raw)) if self.raw else 0.0
+
+    def __str__(self) -> str:
+        # Lazy Str Loading
+        if self._string is None:
+            self._string = str(self._payload, self._encoding, "strict")
+        return self._string
+
+    def __repr__(self) -> str:
+        return f"<CharsetMatch '{self.encoding}' bytes({self.fingerprint})>"
+
+    def add_submatch(self, other: CharsetMatch) -> None:
+        if not isinstance(other, CharsetMatch) or other == self:
+            raise ValueError(
+                "Unable to add instance <{}> as a submatch of a CharsetMatch".format(
+                    other.__class__
+                )
+            )
+
+        other._string = None  # Unload RAM usage; dirty trick.
+        self._leaves.append(other)
+
+    @property
+    def encoding(self) -> str:
+        return self._encoding
+
+    @property
+    def encoding_aliases(self) -> list[str]:
+        """
+        Encoding names are known by many names; using this could help when searching for IBM855 when it's listed as CP855.
+        """
+        also_known_as: list[str] = []
+        for u, p in aliases.items():  # match in either direction of the stdlib alias table
+            if self.encoding == u:
+                also_known_as.append(p)
+            elif self.encoding == p:
+                also_known_as.append(u)
+        return also_known_as
+
+    @property
+    def bom(self) -> bool:
+        return self._has_sig_or_bom
+
+    @property
+    def byte_order_mark(self) -> bool:
+        return self._has_sig_or_bom
+
+    @property
+    def languages(self) -> list[str]:
+        """
+        Return the complete list of possible languages found in decoded sequence.
+        Usually not really useful. Returned list may be empty even if 'language' property return something != 'Unknown'.
+        """
+        return [e[0] for e in self._languages]
+
+    @property
+    def language(self) -> str:
+        """
+        Most probable language found in decoded sequence. If none were detected or inferred, the property will return
+        "Unknown".
+        """
+        if not self._languages:
+            # Trying to infer the language based on the given encoding
+            # Its either English or we should not pronounce ourselves in certain cases.
+            if "ascii" in self.could_be_from_charset:
+                return "English"
+
+            # doing it there to avoid circular import
+            from charset_normalizer.cd import encoding_languages, mb_encoding_languages
+
+            languages = (
+                mb_encoding_languages(self.encoding)
+                if is_multi_byte_encoding(self.encoding)
+                else encoding_languages(self.encoding)
+            )
+
+            if len(languages) == 0 or "Latin Based" in languages:
+                return "Unknown"
+
+            return languages[0]
+
+        return self._languages[0][0]
+
+    @property
+    def chaos(self) -> float:
+        return self._mean_mess_ratio
+
+    @property
+    def coherence(self) -> float:
+        if not self._languages:
+            return 0.0
+        return self._languages[0][1]
+
+    @property
+    def percent_chaos(self) -> float:
+        return round(self.chaos * 100, ndigits=3)
+
+    @property
+    def percent_coherence(self) -> float:
+        return round(self.coherence * 100, ndigits=3)
+
+    @property
+    def raw(self) -> bytes:
+        """
+        Original untouched bytes.
+        """
+        return self._payload
+
+    @property
+    def submatch(self) -> list[CharsetMatch]:
+        return self._leaves
+
+    @property
+    def has_submatch(self) -> bool:
+        return len(self._leaves) > 0
+
+    @property
+    def alphabets(self) -> list[str]:  # sorted, de-duplicated Unicode range names
+        if self._unicode_ranges is None:
+            # Computed once on first access, then served from the cached list.
+            # unicode_range() may return None for a character; those are dropped.
+            detected_ranges: set[str | None] = {unicode_range(c) for c in str(self)}
+            # sorted() accepts any iterable, so no intermediate list is needed.
+            self._unicode_ranges = sorted(r for r in detected_ranges if r)
+        return self._unicode_ranges
+
+    @property
+    def could_be_from_charset(self) -> list[str]:
+        """
+        The complete list of encoding that output the exact SAME str result and therefore could be the originating
+        encoding.
+        This list does include the encoding available in property 'encoding'.
+        """
+        return [self._encoding] + [m.encoding for m in self._leaves]
+
+    def output(self, encoding: str = "utf_8") -> bytes:
+        """
+        Return the decoded payload re-encoded with the given target encoding (UTF-8 by default).
+        Characters the target encoding cannot represent are replaced by the encoder, not dropped.
+        """
+        if self._output_encoding is None or self._output_encoding != encoding:  # re-encode only on a cache miss
+            self._output_encoding = encoding
+            decoded_string = str(self)
+            if (  # a declared encoding other than UTF-8 is rewritten to the output encoding's name
+                self._preemptive_declaration is not None
+                and self._preemptive_declaration.lower()
+                not in ["utf-8", "utf8", "utf_8"]
+            ):
+                patched_header = sub(
+                    RE_POSSIBLE_ENCODING_INDICATION,
+                    lambda m: m.string[m.span()[0] : m.span()[1]].replace(
+                        m.groups()[0],
+                        iana_name(self._output_encoding).replace("_", "-"),  # type: ignore[arg-type]
+                    ),
+                    decoded_string[:8192],  # only the first 8192 characters are searched
+                    count=1,  # rewrite the first match only
+                )
+
+                decoded_string = patched_header + decoded_string[8192:]
+
+            self._output_payload = decoded_string.encode(encoding, "replace")
+
+        return self._output_payload  # type: ignore
+
+    @property
+    def fingerprint(self) -> str:
+        """
+        Retrieve the unique SHA256 computed using the transformed (re-encoded) payload. Not the original one.
+        """
+        return sha256(self.output()).hexdigest()
+
+
+class CharsetMatches:
+    """
+    Container with every CharsetMatch items ordered by default from most probable to the less one.
+    Act like a list(iterable) but does not implements all related methods.
+    """
+
+    def __init__(self, results: list[CharsetMatch] | None = None):
+        self._results: list[CharsetMatch] = sorted(results) if results else []
+
+    def __iter__(self) -> Iterator[CharsetMatch]:
+        yield from self._results
+
+    def __getitem__(self, item: int | str) -> CharsetMatch:
+        """
+        Retrieve a single item either by its position or encoding name (alias may be used here).
+        Raise KeyError when no result matches the encoding name; an out-of-range position raises IndexError.
+        """
+        if isinstance(item, int):
+            return self._results[item]
+        if isinstance(item, str):
+            item = iana_name(item, False)
+            for result in self._results:
+                if item in result.could_be_from_charset:
+                    return result
+        raise KeyError
+
+    def __len__(self) -> int:
+        return len(self._results)
+
+    def __bool__(self) -> bool:
+        return len(self._results) > 0
+
+    def append(self, item: CharsetMatch) -> None:
+        """
+        Insert a single match. Will be inserted accordingly to preserve sort.
+        Can be inserted as a submatch.
+        """
+        if not isinstance(item, CharsetMatch):
+            raise ValueError(
+                "Cannot append instance '{}' to CharsetMatches".format(
+                    str(item.__class__)
+                )
+            )
+        # We should disable the submatch factoring when the input file is too heavy (conserve RAM usage)
+        if len(item.raw) < TOO_BIG_SEQUENCE:
+            for match in self._results:
+                if match.fingerprint == item.fingerprint and match.chaos == item.chaos:
+                    match.add_submatch(item)
+                    return
+        self._results.append(item)
+        self._results = sorted(self._results)
+
+    def best(self) -> CharsetMatch | None:
+        """
+        Simply return the first match. Strict equivalent to matches[0].
+        """
+        if not self._results:
+            return None
+        return self._results[0]
+
+    def first(self) -> CharsetMatch | None:
+        """
+        Redundant method, call the method best(). Kept for BC reasons.
+        """
+        return self.best()
+
+
+CoherenceMatch = Tuple[str, float]
+CoherenceMatches = List[CoherenceMatch]
+
+
+class CliDetectionResult:
+    def __init__(
+        self,
+        path: str,
+        encoding: str | None,
+        encoding_aliases: list[str],
+        alternative_encodings: list[str],
+        language: str,
+        alphabets: list[str],
+        has_sig_or_bom: bool,
+        chaos: float,
+        coherence: float,
+        unicode_path: str | None,
+        is_preferred: bool,
+    ):
+        self.path: str = path
+        self.unicode_path: str | None = unicode_path
+        self.encoding: str | None = encoding
+        self.encoding_aliases: list[str] = encoding_aliases
+        self.alternative_encodings: list[str] = alternative_encodings
+        self.language: str = language
+        self.alphabets: list[str] = alphabets
+        self.has_sig_or_bom: bool = has_sig_or_bom
+        self.chaos: float = chaos
+        self.coherence: float = coherence
+        self.is_preferred: bool = is_preferred
+
+    @property
+    def __dict__(self) -> dict[str, Any]:  # type: ignore
+        return {
+            "path": self.path,
+            "encoding": self.encoding,
+            "encoding_aliases": self.encoding_aliases,
+            "alternative_encodings": self.alternative_encodings,
+            "language": self.language,
+            "alphabets": self.alphabets,
+            "has_sig_or_bom": self.has_sig_or_bom,
+            "chaos": self.chaos,
+            "coherence": self.coherence,
+            "unicode_path": self.unicode_path,
+            "is_preferred": self.is_preferred,
+        }
+
+    def to_json(self) -> str:
+        return dumps(self.__dict__, ensure_ascii=True, indent=4)
diff --git a/.venv/lib/python3.11/site-packages/charset_normalizer/py.typed b/.venv/lib/python3.11/site-packages/charset_normalizer/py.typed
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/.venv/lib/python3.11/site-packages/charset_normalizer/utils.py b/.venv/lib/python3.11/site-packages/charset_normalizer/utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..0175e0a96aa70652d357525649d0404e7e02fc00
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/charset_normalizer/utils.py
@@ -0,0 +1,408 @@
+from __future__ import annotations
+
+import importlib
+import logging
+import unicodedata
+from codecs import IncrementalDecoder
+from encodings.aliases import aliases
+from functools import lru_cache
+from re import findall
+from typing import Generator
+
+from _multibytecodec import (  # type: ignore[import-not-found,import]
+    MultibyteIncrementalDecoder,
+)
+
+from .constant import (
+    ENCODING_MARKS,
+    IANA_SUPPORTED_SIMILAR,
+    RE_POSSIBLE_ENCODING_INDICATION,
+    UNICODE_RANGES_COMBINED,
+    UNICODE_SECONDARY_RANGE_KEYWORD,
+    UTF8_MAXIMAL_ALLOCATION,
+)
+
+
+@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
+def is_accentuated(character: str) -> bool:
+    try:
+        description: str = unicodedata.name(character)
+    except ValueError:  # Defensive: unicode database outdated?
+        return False
+    return (
+        "WITH GRAVE" in description
+        or "WITH ACUTE" in description
+        or "WITH CEDILLA" in description
+        or "WITH DIAERESIS" in description
+        or "WITH CIRCUMFLEX" in description
+        or "WITH TILDE" in description
+        or "WITH MACRON" in description
+        or "WITH RING ABOVE" in description
+    )
+
+
+@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
+def remove_accent(character: str) -> str:
+    decomposed: str = unicodedata.decomposition(character)
+    if not decomposed:
+        return character
+
+    codes: list[str] = decomposed.split(" ")
+
+    return chr(int(codes[0], 16))
+
+
+@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
+def unicode_range(character: str) -> str | None:
+    """
+    Retrieve the Unicode range official name from a single character.
+    """
+    character_ord: int = ord(character)
+
+    for range_name, ord_range in UNICODE_RANGES_COMBINED.items():
+        if character_ord in ord_range:
+            return range_name
+
+    return None
+
+
+@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
+def is_latin(character: str) -> bool:
+    try:
+        description: str = unicodedata.name(character)
+    except ValueError:  # Defensive: unicode database outdated?
+        return False
+    return "LATIN" in description
+
+
+@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
+def is_punctuation(character: str) -> bool:
+    character_category: str = unicodedata.category(character)
+
+    if "P" in character_category:
+        return True
+
+    character_range: str | None = unicode_range(character)
+
+    if character_range is None:
+        return False
+
+    return "Punctuation" in character_range
+
+
+@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
+def is_symbol(character: str) -> bool:
+    character_category: str = unicodedata.category(character)
+
+    if "S" in character_category or "N" in character_category:
+        return True
+
+    character_range: str | None = unicode_range(character)
+
+    if character_range is None:
+        return False
+
+    return "Forms" in character_range and character_category != "Lo"
+
+
+@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
+def is_emoticon(character: str) -> bool:
+    character_range: str | None = unicode_range(character)
+
+    if character_range is None:
+        return False
+
+    return "Emoticons" in character_range or "Pictographs" in character_range
+
+
+@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
+def is_separator(character: str) -> bool:
+    if character.isspace() or character in {"｜", "+", "<", ">"}:
+        return True
+
+    character_category: str = unicodedata.category(character)
+
+    return "Z" in character_category or character_category in {"Po", "Pd", "Pc"}
+
+
+@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
+def is_case_variable(character: str) -> bool:
+    return character.islower() != character.isupper()
+
+
+@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
+def is_cjk(character: str) -> bool:
+    try:
+        character_name = unicodedata.name(character)
+    except ValueError:  # Defensive: unicode database outdated?
+        return False
+
+    return "CJK" in character_name
+
+
+@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
+def is_hiragana(character: str) -> bool:
+    try:
+        character_name = unicodedata.name(character)
+    except ValueError:  # Defensive: unicode database outdated?
+        return False
+
+    return "HIRAGANA" in character_name
+
+
+@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
+def is_katakana(character: str) -> bool:
+    try:
+        character_name = unicodedata.name(character)
+    except ValueError:  # Defensive: unicode database outdated?
+        return False
+
+    return "KATAKANA" in character_name
+
+
+@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
+def is_hangul(character: str) -> bool:
+    try:
+        character_name = unicodedata.name(character)
+    except ValueError:  # Defensive: unicode database outdated?
+        return False
+
+    return "HANGUL" in character_name
+
+
+@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
+def is_thai(character: str) -> bool:
+    try:
+        character_name = unicodedata.name(character)
+    except ValueError:  # Defensive: unicode database outdated?
+        return False
+
+    return "THAI" in character_name
+
+
+@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
+def is_arabic(character: str) -> bool:
+    try:
+        character_name = unicodedata.name(character)
+    except ValueError:  # Defensive: unicode database outdated?
+        return False
+
+    return "ARABIC" in character_name
+
+
+@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
+def is_arabic_isolated_form(character: str) -> bool:
+    try:
+        character_name = unicodedata.name(character)
+    except ValueError:  # Defensive: unicode database outdated?
+        return False
+
+    return "ARABIC" in character_name and "ISOLATED FORM" in character_name
+
+
+@lru_cache(maxsize=len(UNICODE_RANGES_COMBINED))
+def is_unicode_range_secondary(range_name: str) -> bool:
+    return any(keyword in range_name for keyword in UNICODE_SECONDARY_RANGE_KEYWORD)
+
+
+@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
+def is_unprintable(character: str) -> bool:
+    return (
+        character.isspace() is False  # includes \n \t \r \v
+        and character.isprintable() is False
+        and character != "\x1a"  # Why? It's the ASCII substitute character.
+        and character != "\ufeff"  # bug discovered in Python,
+        # Zero Width No-Break Space located in Arabic Presentation Forms-B, Unicode 1.1 not acknowledged as space.
+    )
+
+
+def any_specified_encoding(sequence: bytes, search_zone: int = 8192) -> str | None:
+    """
+    Extract, using an ASCII-only decoder, any encoding specified in the first search_zone bytes.
+    """
+    if not isinstance(sequence, bytes):
+        raise TypeError
+
+    seq_len: int = len(sequence)
+
+    results: list[str] = findall(
+        RE_POSSIBLE_ENCODING_INDICATION,
+        sequence[: min(seq_len, search_zone)].decode("ascii", errors="ignore"),
+    )
+
+    if len(results) == 0:
+        return None
+
+    for specified_encoding in results:
+        specified_encoding = specified_encoding.lower().replace("-", "_")
+
+        encoding_alias: str
+        encoding_iana: str
+
+        for encoding_alias, encoding_iana in aliases.items():
+            if encoding_alias == specified_encoding:
+                return encoding_iana
+            if encoding_iana == specified_encoding:
+                return encoding_iana
+
+    return None
+
+
+@lru_cache(maxsize=128)
+def is_multi_byte_encoding(name: str) -> bool:
+    """
+    Verify whether a specific encoding is a multi-byte one, based on its IANA name.
+    """
+    return name in {
+        "utf_8",
+        "utf_8_sig",
+        "utf_16",
+        "utf_16_be",
+        "utf_16_le",
+        "utf_32",
+        "utf_32_le",
+        "utf_32_be",
+        "utf_7",
+    } or issubclass(
+        importlib.import_module(f"encodings.{name}").IncrementalDecoder,
+        MultibyteIncrementalDecoder,
+    )
+
+
+def identify_sig_or_bom(sequence: bytes) -> tuple[str | None, bytes]:
+    """
+    Identify and extract SIG/BOM in given sequence.
+    """
+
+    for iana_encoding in ENCODING_MARKS:
+        marks: bytes | list[bytes] = ENCODING_MARKS[iana_encoding]
+
+        if isinstance(marks, bytes):
+            marks = [marks]
+
+        for mark in marks:
+            if sequence.startswith(mark):
+                return iana_encoding, mark
+
+    return None, b""
+
+
+def should_strip_sig_or_bom(iana_encoding: str) -> bool:
+    return iana_encoding not in {"utf_16", "utf_32"}
+
+
+def iana_name(cp_name: str, strict: bool = True) -> str:
+    """Returns the Python normalized encoding name (Not the IANA official name)."""
+    cp_name = cp_name.lower().replace("-", "_")
+
+    encoding_alias: str
+    encoding_iana: str
+
+    for encoding_alias, encoding_iana in aliases.items():
+        if cp_name in [encoding_alias, encoding_iana]:
+            return encoding_iana
+
+    if strict:
+        raise ValueError(f"Unable to retrieve IANA for '{cp_name}'")
+
+    return cp_name
+
+
+def cp_similarity(iana_name_a: str, iana_name_b: str) -> float:
+    if is_multi_byte_encoding(iana_name_a) or is_multi_byte_encoding(iana_name_b):
+        return 0.0
+
+    decoder_a = importlib.import_module(f"encodings.{iana_name_a}").IncrementalDecoder
+    decoder_b = importlib.import_module(f"encodings.{iana_name_b}").IncrementalDecoder
+
+    id_a: IncrementalDecoder = decoder_a(errors="ignore")
+    id_b: IncrementalDecoder = decoder_b(errors="ignore")
+
+    character_match_count: int = 0
+
+    for i in range(255):
+        to_be_decoded: bytes = bytes([i])
+        if id_a.decode(to_be_decoded) == id_b.decode(to_be_decoded):
+            character_match_count += 1
+
+    return character_match_count / 254
+
+
+def is_cp_similar(iana_name_a: str, iana_name_b: str) -> bool:
+    """
+    Determine if two code pages are at least 80% similar. The IANA_SUPPORTED_SIMILAR dict was generated
+    using the function cp_similarity.
+    """
+    return (
+        iana_name_a in IANA_SUPPORTED_SIMILAR
+        and iana_name_b in IANA_SUPPORTED_SIMILAR[iana_name_a]
+    )
+
+
+def set_logging_handler(
+    name: str = "charset_normalizer",
+    level: int = logging.INFO,
+    format_string: str = "%(asctime)s | %(levelname)s | %(message)s",
+) -> None:
+    logger = logging.getLogger(name)
+    logger.setLevel(level)
+
+    handler = logging.StreamHandler()
+    handler.setFormatter(logging.Formatter(format_string))
+    logger.addHandler(handler)
+
+
+def cut_sequence_chunks(
+    sequences: bytes,
+    encoding_iana: str,
+    offsets: range,
+    chunk_size: int,
+    bom_or_sig_available: bool,
+    strip_sig_or_bom: bool,
+    sig_payload: bytes,
+    is_multi_byte_decoder: bool,
+    decoded_payload: str | None = None,
+) -> Generator[str, None, None]:
+    if decoded_payload and is_multi_byte_decoder is False:
+        for i in offsets:
+            chunk = decoded_payload[i : i + chunk_size]
+            if not chunk:
+                break
+            yield chunk
+    else:
+        for i in offsets:
+            chunk_end = i + chunk_size
+            if chunk_end > len(sequences) + 8:
+                continue
+
+            cut_sequence = sequences[i : i + chunk_size]
+
+            if bom_or_sig_available and strip_sig_or_bom is False:
+                cut_sequence = sig_payload + cut_sequence
+
+            chunk = cut_sequence.decode(
+                encoding_iana,
+                errors="ignore" if is_multi_byte_decoder else "strict",
+            )
+
+            # multi-byte bad cutting detector and adjustment
+            # not the cleanest way to perform that fix but clever enough for now.
+            if is_multi_byte_decoder and i > 0:
+                chunk_partial_size_chk: int = min(chunk_size, 16)
+
+                if (
+                    decoded_payload
+                    and chunk[:chunk_partial_size_chk] not in decoded_payload
+                ):
+                    for j in range(i, i - 4, -1):
+                        cut_sequence = sequences[j:chunk_end]
+
+                        if bom_or_sig_available and strip_sig_or_bom is False:
+                            cut_sequence = sig_payload + cut_sequence
+
+                        chunk = cut_sequence.decode(encoding_iana, errors="ignore")
+
+                        if chunk[:chunk_partial_size_chk] in decoded_payload:
+                            break
+
+            yield chunk
diff --git a/.venv/lib/python3.11/site-packages/charset_normalizer/version.py b/.venv/lib/python3.11/site-packages/charset_normalizer/version.py
new file mode 100644
index 0000000000000000000000000000000000000000..f85e8929e74cad7b3c0bf95bbc8ac3625b4db1b2
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/charset_normalizer/version.py
@@ -0,0 +1,8 @@
+"""
+Expose version
+"""
+
+from __future__ import annotations
+
+__version__ = "3.4.1"
+VERSION = __version__.split(".")
diff --git a/.venv/lib/python3.11/site-packages/cv2/cv2.abi3.so b/.venv/lib/python3.11/site-packages/cv2/cv2.abi3.so
new file mode 100644
index 0000000000000000000000000000000000000000..a4f1aeaa2b5f12b5083af00923540702b1ac83d4
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/cv2/cv2.abi3.so
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:68fee49d266a95e730c1cb17d913a39a93ab5c50bee1581600f453026f9c7b8d
+size 66106617
diff --git a/.venv/lib/python3.11/site-packages/huggingface_hub/inference/_generated/types/audio_classification.py b/.venv/lib/python3.11/site-packages/huggingface_hub/inference/_generated/types/audio_classification.py
new file mode 100644
index 0000000000000000000000000000000000000000..63ba2e0dee4637a286e2aba7a68b6e7193b017f7
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/huggingface_hub/inference/_generated/types/audio_classification.py
@@ -0,0 +1,44 @@
+# Inference code generated from the JSON schema spec in @huggingface/tasks.
+#
+# See:
+#   - script: https://github.com/huggingface/huggingface.js/blob/main/packages/tasks/scripts/inference-codegen.ts
+#   - specs:  https://github.com/huggingface/huggingface.js/tree/main/packages/tasks/src/tasks.
+from dataclasses import dataclass
+from typing import Literal, Optional
+
+from .base import BaseInferenceType
+
+
+AudioClassificationOutputTransform = Literal["sigmoid", "softmax", "none"]
+
+
+@dataclass
+class AudioClassificationParameters(BaseInferenceType):
+    """Additional inference parameters for Audio Classification"""
+
+    function_to_apply: Optional["AudioClassificationOutputTransform"] = None
+    """The function to apply to the model outputs in order to retrieve the scores."""
+    top_k: Optional[int] = None
+    """When specified, limits the output to the top K most probable classes."""
+
+
+@dataclass
+class AudioClassificationInput(BaseInferenceType):
+    """Inputs for Audio Classification inference"""
+
+    inputs: str
+    """The input audio data as a base64-encoded string. If no `parameters` are provided, you can
+    also provide the audio data as a raw bytes payload.
+    """
+    parameters: Optional[AudioClassificationParameters] = None
+    """Additional inference parameters for Audio Classification"""
+
+
+@dataclass
+class AudioClassificationOutputElement(BaseInferenceType):
+    """Outputs for Audio Classification inference"""
+
+    label: str
+    """The predicted class label."""
+    score: float
+    """The corresponding probability."""
diff --git a/.venv/lib/python3.11/site-packages/huggingface_hub/inference/_generated/types/document_question_answering.py b/.venv/lib/python3.11/site-packages/huggingface_hub/inference/_generated/types/document_question_answering.py
new file mode 100644
index 0000000000000000000000000000000000000000..6c4a0f4e878bbccc689e5d5ca644ad1871b0022e
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/huggingface_hub/inference/_generated/types/document_question_answering.py
@@ -0,0 +1,81 @@
+# Inference code generated from the JSON schema spec in @huggingface/tasks.
+#
+# See:
+#   - script: https://github.com/huggingface/huggingface.js/blob/main/packages/tasks/scripts/inference-codegen.ts
+#   - specs:  https://github.com/huggingface/huggingface.js/tree/main/packages/tasks/src/tasks.
+from dataclasses import dataclass
+from typing import Any, List, Optional, Union
+
+from .base import BaseInferenceType
+
+
+@dataclass
+class DocumentQuestionAnsweringInputData(BaseInferenceType):
+    """One (document, question) pair to answer"""
+
+    image: Any
+    """The image on which the question is asked"""
+    question: str
+    """A question to ask of the document"""
+
+
+@dataclass
+class DocumentQuestionAnsweringParameters(BaseInferenceType):
+    """Additional inference parameters for Document Question Answering"""
+
+    doc_stride: Optional[int] = None
+    """If the words in the document are too long to fit with the question for the model, it will
+    be split in several chunks with some overlap. This argument controls the size of that
+    overlap.
+    """
+    handle_impossible_answer: Optional[bool] = None
+    """Whether to accept impossible as an answer"""
+    lang: Optional[str] = None
+    """Language to use while running OCR. Defaults to English."""
+    max_answer_len: Optional[int] = None
+    """The maximum length of predicted answers (e.g., only answers with a shorter length are
+    considered).
+    """
+    max_question_len: Optional[int] = None
+    """The maximum length of the question after tokenization. It will be truncated if needed."""
+    max_seq_len: Optional[int] = None
+    """The maximum length of the total sentence (context + question) in tokens of each chunk
+    passed to the model. The context will be split in several chunks (using doc_stride as
+    overlap) if needed.
+    """
+    top_k: Optional[int] = None
+    """The number of answers to return (will be chosen by order of likelihood). Can return fewer
+    than top_k answers if there are not enough options available within the context.
+    """
+    word_boxes: Optional[List[Union[List[float], str]]] = None
+    """A list of words and bounding boxes (normalized 0->1000). If provided, the inference will
+    skip the OCR step and use the provided bounding boxes instead.
+    """
+
+
+@dataclass
+class DocumentQuestionAnsweringInput(BaseInferenceType):
+    """Inputs for Document Question Answering inference"""
+
+    inputs: DocumentQuestionAnsweringInputData
+    """One (document, question) pair to answer"""
+    parameters: Optional[DocumentQuestionAnsweringParameters] = None
+    """Additional inference parameters for Document Question Answering"""
+
+
+@dataclass
+class DocumentQuestionAnsweringOutputElement(BaseInferenceType):
+    """Outputs of inference for the Document Question Answering task"""
+
+    answer: str
+    """The answer to the question."""
+    end: int
+    """The end word index of the answer (in the OCR’d version of the input or provided word
+    boxes).
+    """
+    score: float
+    """The probability associated with the answer."""
+    start: int
+    """The start word index of the answer (in the OCR’d version of the input or provided word
+    boxes).
+    """
diff --git a/.venv/lib/python3.11/site-packages/huggingface_hub/inference/_generated/types/feature_extraction.py b/.venv/lib/python3.11/site-packages/huggingface_hub/inference/_generated/types/feature_extraction.py
new file mode 100644
index 0000000000000000000000000000000000000000..2c43e82cc691698b8c2df9ecf642917bc4b1dec0
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/huggingface_hub/inference/_generated/types/feature_extraction.py
@@ -0,0 +1,37 @@
+# Inference code generated from the JSON schema spec in @huggingface/tasks.
+#
+# See:
+#   - script: https://github.com/huggingface/huggingface.js/blob/main/packages/tasks/scripts/inference-codegen.ts
+#   - specs:  https://github.com/huggingface/huggingface.js/tree/main/packages/tasks/src/tasks.
+from dataclasses import dataclass
+from typing import Literal, Optional
+
+from .base import BaseInferenceType
+
+
+FeatureExtractionInputTruncationDirection = Literal["Left", "Right"]
+
+
+@dataclass
+class FeatureExtractionInput(BaseInferenceType):
+    """Feature Extraction Input.
+    Auto-generated from TEI specs.
+    For more details, check out
+    https://github.com/huggingface/huggingface.js/blob/main/packages/tasks/scripts/inference-tei-import.ts.
+    """
+
+    inputs: str
+    """The text to embed."""
+    normalize: Optional[bool] = None
+    prompt_name: Optional[str] = None
+    """The name of the prompt that should be used for encoding. If not set, no prompt
+    will be applied.
+    Must be a key in the `sentence-transformers` configuration `prompts` dictionary.
+    For example if ``prompt_name`` is "query" and the ``prompts`` is {"query": "query: ",
+    ...},
+    then the sentence "What is the capital of France?" will be encoded as
+    "query: What is the capital of France?" because the prompt text will be prepended before
+    any text to encode.
+    """
+    truncate: Optional[bool] = None
+    truncation_direction: Optional["FeatureExtractionInputTruncationDirection"] = None
diff --git a/.venv/lib/python3.11/site-packages/huggingface_hub/inference/_generated/types/image_classification.py b/.venv/lib/python3.11/site-packages/huggingface_hub/inference/_generated/types/image_classification.py
new file mode 100644
index 0000000000000000000000000000000000000000..ff76c74aab616802652bf9a499b73217dbae16d8
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/huggingface_hub/inference/_generated/types/image_classification.py
@@ -0,0 +1,44 @@
+# Inference code generated from the JSON schema spec in @huggingface/tasks.
+#
+# See:
+#   - script: https://github.com/huggingface/huggingface.js/blob/main/packages/tasks/scripts/inference-codegen.ts
+#   - specs:  https://github.com/huggingface/huggingface.js/tree/main/packages/tasks/src/tasks.
+from dataclasses import dataclass
+from typing import Literal, Optional
+
+from .base import BaseInferenceType
+
+
+ImageClassificationOutputTransform = Literal["sigmoid", "softmax", "none"]
+
+
+@dataclass
+class ImageClassificationParameters(BaseInferenceType):
+    """Additional inference parameters for Image Classification"""
+
+    function_to_apply: Optional["ImageClassificationOutputTransform"] = None
+    """The function to apply to the model outputs in order to retrieve the scores."""
+    top_k: Optional[int] = None
+    """When specified, limits the output to the top K most probable classes."""
+
+
+@dataclass
+class ImageClassificationInput(BaseInferenceType):
+    """Inputs for Image Classification inference"""
+
+    inputs: str
+    """The input image data as a base64-encoded string. If no `parameters` are provided, you can
+    also provide the image data as a raw bytes payload.
+    """
+    parameters: Optional[ImageClassificationParameters] = None
+    """Additional inference parameters for Image Classification"""
+
+
+@dataclass
+class ImageClassificationOutputElement(BaseInferenceType):
+    """Outputs of inference for the Image Classification task"""
+
+    label: str
+    """The predicted class label."""
+    score: float
+    """The corresponding probability."""
diff --git a/.venv/lib/python3.11/site-packages/huggingface_hub/inference/_generated/types/sentence_similarity.py b/.venv/lib/python3.11/site-packages/huggingface_hub/inference/_generated/types/sentence_similarity.py
new file mode 100644
index 0000000000000000000000000000000000000000..f3cd884d6ad6642ac937fbbc31802236d0f607f4
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/huggingface_hub/inference/_generated/types/sentence_similarity.py
@@ -0,0 +1,28 @@
+# Inference code generated from the JSON schema spec in @huggingface/tasks.
+#
+# See:
+#   - script: https://github.com/huggingface/huggingface.js/blob/main/packages/tasks/scripts/inference-codegen.ts
+#   - specs:  https://github.com/huggingface/huggingface.js/tree/main/packages/tasks/src/tasks.
+from dataclasses import dataclass
+from typing import Any, Dict, List, Optional
+
+from .base import BaseInferenceType
+
+
+@dataclass
+class SentenceSimilarityInputData(BaseInferenceType):
+    sentences: List[str]
+    """A list of strings which will be compared against the source_sentence."""
+    source_sentence: str
+    """The string that you wish to compare the other strings with. This can be a phrase,
+    sentence, or longer passage, depending on the model being used.
+    """
+
+
+@dataclass
+class SentenceSimilarityInput(BaseInferenceType):
+    """Inputs for Sentence similarity inference"""
+
+    inputs: SentenceSimilarityInputData
+    parameters: Optional[Dict[str, Any]] = None
+    """Additional inference parameters for Sentence Similarity"""
diff --git a/.venv/lib/python3.11/site-packages/huggingface_hub/inference/_generated/types/text_to_video.py b/.venv/lib/python3.11/site-packages/huggingface_hub/inference/_generated/types/text_to_video.py
new file mode 100644
index 0000000000000000000000000000000000000000..5cbb1a9266b074ff47a21de53ca6c1e7091412db
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/huggingface_hub/inference/_generated/types/text_to_video.py
@@ -0,0 +1,47 @@
+# Inference code generated from the JSON schema spec in @huggingface/tasks.
+#
+# See:
+#   - script: https://github.com/huggingface/huggingface.js/blob/main/packages/tasks/scripts/inference-codegen.ts
+#   - specs:  https://github.com/huggingface/huggingface.js/tree/main/packages/tasks/src/tasks.
+from dataclasses import dataclass
+from typing import Any, List, Optional
+
+from .base import BaseInferenceType
+
+
+@dataclass
+class TextToVideoParameters(BaseInferenceType):
+    """Additional inference parameters for Text To Video"""
+
+    guidance_scale: Optional[float] = None
+    """A higher guidance scale value encourages the model to generate videos closely linked to
+    the text prompt, but values too high may cause saturation and other artifacts.
+    """
+    negative_prompt: Optional[List[str]] = None
+    """One or several prompts to guide what NOT to include in video generation."""
+    num_frames: Optional[float] = None
+    """The num_frames parameter determines how many video frames are generated."""
+    num_inference_steps: Optional[int] = None
+    """The number of denoising steps. More denoising steps usually lead to a higher quality
+    video at the expense of slower inference.
+    """
+    seed: Optional[int] = None
+    """Seed for the random number generator."""
+
+
+@dataclass
+class TextToVideoInput(BaseInferenceType):
+    """Inputs for Text To Video inference"""
+
+    inputs: str
+    """The input text data (sometimes called "prompt")"""
+    parameters: Optional[TextToVideoParameters] = None
+    """Additional inference parameters for Text To Video"""
+
+
+@dataclass
+class TextToVideoOutput(BaseInferenceType):
+    """Outputs of inference for the Text To Video task"""
+
+    video: Any
+    """The generated video returned as raw bytes in the payload."""
diff --git a/.venv/lib/python3.11/site-packages/huggingface_hub/inference/_generated/types/translation.py b/.venv/lib/python3.11/site-packages/huggingface_hub/inference/_generated/types/translation.py
new file mode 100644
index 0000000000000000000000000000000000000000..898559c4f3052e62f853ba2d654467cf3a53efbc
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/huggingface_hub/inference/_generated/types/translation.py
@@ -0,0 +1,50 @@
+# Inference code generated from the JSON schema spec in @huggingface/tasks.
+#
+# See:
+#   - script: https://github.com/huggingface/huggingface.js/blob/main/packages/tasks/scripts/inference-codegen.ts
+#   - specs:  https://github.com/huggingface/huggingface.js/tree/main/packages/tasks/src/tasks.
+from dataclasses import dataclass
+from typing import Any, Dict, Literal, Optional
+
+from .base import BaseInferenceType
+
+
+TranslationTruncationStrategy = Literal["do_not_truncate", "longest_first", "only_first", "only_second"]
+
+
+@dataclass
+class TranslationParameters(BaseInferenceType):
+    """Additional inference parameters for Translation"""
+
+    clean_up_tokenization_spaces: Optional[bool] = None
+    """Whether to clean up the potential extra spaces in the text output."""
+    generate_parameters: Optional[Dict[str, Any]] = None
+    """Additional parametrization of the text generation algorithm."""
+    src_lang: Optional[str] = None
+    """The source language of the text. Required for models that can translate from multiple
+    languages.
+    """
+    tgt_lang: Optional[str] = None
+    """Target language to translate to. Required for models that can translate to multiple
+    languages.
+    """
+    truncation: Optional["TranslationTruncationStrategy"] = None
+    """The truncation strategy to use."""
+
+
+@dataclass
+class TranslationInput(BaseInferenceType):
+    """Inputs for Translation inference"""
+
+    inputs: str
+    """The text to translate."""
+    parameters: Optional[TranslationParameters] = None
+    """Additional inference parameters for Translation"""
+
+
+@dataclass
+class TranslationOutput(BaseInferenceType):
+    """Outputs of inference for the Translation task"""
+
+    translation_text: str
+    """The translated text."""
diff --git a/.venv/lib/python3.11/site-packages/huggingface_hub/inference/_generated/types/video_classification.py b/.venv/lib/python3.11/site-packages/huggingface_hub/inference/_generated/types/video_classification.py
new file mode 100644
index 0000000000000000000000000000000000000000..a32312edf45941d4a22a94fba9896a53d3007031
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/huggingface_hub/inference/_generated/types/video_classification.py
@@ -0,0 +1,46 @@
+# Inference code generated from the JSON schema spec in @huggingface/tasks.
+#
+# See:
+#   - script: https://github.com/huggingface/huggingface.js/blob/main/packages/tasks/scripts/inference-codegen.ts
+#   - specs:  https://github.com/huggingface/huggingface.js/tree/main/packages/tasks/src/tasks.
+from dataclasses import dataclass
+from typing import Any, Literal, Optional
+
+from .base import BaseInferenceType
+
+
+VideoClassificationOutputTransform = Literal["sigmoid", "softmax", "none"]
+
+
+@dataclass
+class VideoClassificationParameters(BaseInferenceType):
+    """Additional inference parameters for Video Classification"""
+
+    frame_sampling_rate: Optional[int] = None
+    """The sampling rate used to select frames from the video."""
+    function_to_apply: Optional["VideoClassificationOutputTransform"] = None
+    """The function to apply to the model outputs in order to retrieve the scores."""
+    num_frames: Optional[int] = None
+    """The number of sampled frames to consider for classification."""
+    top_k: Optional[int] = None
+    """When specified, limits the output to the top K most probable classes."""
+
+
+@dataclass
+class VideoClassificationInput(BaseInferenceType):
+    """Inputs for Video Classification inference"""
+
+    inputs: Any
+    """The input video data"""
+    parameters: Optional[VideoClassificationParameters] = None
+    """Additional inference parameters for Video Classification"""
+
+
+@dataclass
+class VideoClassificationOutputElement(BaseInferenceType):
+    """Outputs of inference for the Video Classification task"""
+
+    label: str
+    """The predicted class label."""
+    score: float
+    """The corresponding probability."""
diff --git a/.venv/lib/python3.11/site-packages/huggingface_hub/inference/_generated/types/visual_question_answering.py b/.venv/lib/python3.11/site-packages/huggingface_hub/inference/_generated/types/visual_question_answering.py
new file mode 100644
index 0000000000000000000000000000000000000000..9001b3bd17a7cebfd329b12dc21ff79e980c41fb
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/huggingface_hub/inference/_generated/types/visual_question_answering.py
@@ -0,0 +1,50 @@
+# Inference code generated from the JSON schema spec in @huggingface/tasks.
+#
+# See:
+#   - script: https://github.com/huggingface/huggingface.js/blob/main/packages/tasks/scripts/inference-codegen.ts
+#   - specs:  https://github.com/huggingface/huggingface.js/tree/main/packages/tasks/src/tasks.
+from dataclasses import dataclass
+from typing import Any, Optional
+
+from .base import BaseInferenceType
+
+
+@dataclass
+class VisualQuestionAnsweringInputData(BaseInferenceType):
+    """One (image, question) pair to answer"""
+
+    image: Any
+    """The image."""
+    question: str
+    """The question to answer based on the image."""
+
+
+@dataclass
+class VisualQuestionAnsweringParameters(BaseInferenceType):
+    """Additional inference parameters for Visual Question Answering"""
+
+    top_k: Optional[int] = None
+    """The number of answers to return (will be chosen by order of likelihood). Note that we
+    return fewer than `top_k` answers if there are not enough options available within the
+    context.
+    """
+
+
+@dataclass
+class VisualQuestionAnsweringInput(BaseInferenceType):
+    """Inputs for Visual Question Answering inference"""
+
+    inputs: VisualQuestionAnsweringInputData
+    """One (image, question) pair to answer"""
+    parameters: Optional[VisualQuestionAnsweringParameters] = None
+    """Additional inference parameters for Visual Question Answering"""
+
+
+@dataclass
+class VisualQuestionAnsweringOutputElement(BaseInferenceType):
+    """Outputs of inference for the Visual Question Answering task"""
+
+    score: float
+    """The associated score / probability"""
+    answer: Optional[str] = None
+    """The answer to the question"""
diff --git a/.venv/lib/python3.11/site-packages/huggingface_hub/inference/_generated/types/zero_shot_image_classification.py b/.venv/lib/python3.11/site-packages/huggingface_hub/inference/_generated/types/zero_shot_image_classification.py
new file mode 100644
index 0000000000000000000000000000000000000000..4e340e8c9e98ba208308ae8be780dee41d0ba72c
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/huggingface_hub/inference/_generated/types/zero_shot_image_classification.py
@@ -0,0 +1,41 @@
+# Inference code generated from the JSON schema spec in @huggingface/tasks.
+#
+# See:
+#   - script: https://github.com/huggingface/huggingface.js/blob/main/packages/tasks/scripts/inference-codegen.ts
+#   - specs:  https://github.com/huggingface/huggingface.js/tree/main/packages/tasks/src/tasks.
+from dataclasses import dataclass
+from typing import List, Optional
+
+from .base import BaseInferenceType
+
+
+@dataclass
+class ZeroShotImageClassificationParameters(BaseInferenceType):
+    """Additional inference parameters for Zero Shot Image Classification"""
+
+    candidate_labels: List[str]
+    """The candidate labels for this image"""
+    hypothesis_template: Optional[str] = None
+    """The sentence used in conjunction with `candidate_labels` to attempt the image
+    classification by replacing the placeholder with the candidate labels.
+    """
+
+
+@dataclass
+class ZeroShotImageClassificationInput(BaseInferenceType):
+    """Inputs for Zero Shot Image Classification inference"""
+
+    inputs: str
+    """The input image data to classify as a base64-encoded string."""
+    parameters: ZeroShotImageClassificationParameters
+    """Additional inference parameters for Zero Shot Image Classification"""
+
+
+@dataclass
+class ZeroShotImageClassificationOutputElement(BaseInferenceType):
+    """Outputs of inference for the Zero Shot Image Classification task"""
+
+    label: str
+    """The predicted class label."""
+    score: float
+    """The corresponding probability."""
diff --git a/.venv/lib/python3.11/site-packages/nvidia/cuda_cupti/__init__.py b/.venv/lib/python3.11/site-packages/nvidia/cuda_cupti/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/.venv/lib/python3.11/site-packages/nvidia/cuda_cupti/__pycache__/__init__.cpython-311.pyc b/.venv/lib/python3.11/site-packages/nvidia/cuda_cupti/__pycache__/__init__.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..e8dbbc06834fa169830e174018fd1afe5a2add7a
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/nvidia/cuda_cupti/__pycache__/__init__.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/nvidia/cuda_cupti/include/Openacc/cupti_openacc.h b/.venv/lib/python3.11/site-packages/nvidia/cuda_cupti/include/Openacc/cupti_openacc.h
new file mode 100644
index 0000000000000000000000000000000000000000..b7ea50da7beb2187e77f7606dd70faed0e4b4add
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/nvidia/cuda_cupti/include/Openacc/cupti_openacc.h
@@ -0,0 +1,98 @@
+/*
+ * Copyright 2017 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#include <cuda_stdint.h>
+
+#if !defined(_CUPTI_OPENACC_H_)
+#define _CUPTI_OPENACC_H_
+
+#ifndef CUPTIAPI
+#ifdef _WIN32
+#define CUPTIAPI __stdcall
+#else
+#define CUPTIAPI
+#endif
+#endif
+
+#if defined(__LP64__)
+#define CUPTILP64 1
+#elif defined(_WIN64)
+#define CUPTILP64 1
+#else
+#undef CUPTILP64
+#endif
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+#if defined(__GNUC__) && defined(CUPTI_LIB)
+    #pragma GCC visibility push(default)
+#endif
+
+/**
+ * \brief Initialize OpenACC support
+ *
+ * \param profRegister function of type acc_prof_reg as obtained from acc_register_library
+ * \param profUnregister function of type acc_prof_reg as obtained from acc_register_library
+ * \param profLookup function of type acc_prof_lookup as obtained from acc_register_library
+ */
+CUptiResult CUPTIAPI
+cuptiOpenACCInitialize(void *profRegister, void *profUnregister, void *profLookup);
+
+#if defined(__GNUC__) && defined(CUPTI_LIB)
+    #pragma GCC visibility pop
+#endif
+
+#if defined(__cplusplus)
+}
+#endif
+
+#endif /*_CUPTI_OPENACC_H_*/
+
diff --git a/.venv/lib/python3.11/site-packages/nvidia/cuda_cupti/include/Openmp/cupti_openmp.h b/.venv/lib/python3.11/site-packages/nvidia/cuda_cupti/include/Openmp/cupti_openmp.h
new file mode 100644
index 0000000000000000000000000000000000000000..303dd42878fb02774d872c197ccc27b17f2af69e
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/nvidia/cuda_cupti/include/Openmp/cupti_openmp.h
@@ -0,0 +1,100 @@
+/*
+ * Copyright 2018 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#include <cuda_stdint.h>
+#include "Openmp/omp-tools.h"
+
+#if !defined(_CUPTI_OPENMP_H_)
+#define _CUPTI_OPENMP_H_
+
+#ifndef CUPTIAPI
+#ifdef _WIN32
+#define CUPTIAPI __stdcall
+#else
+#define CUPTIAPI
+#endif
+#endif
+
+#if defined(__LP64__)
+#define CUPTILP64 1
+#elif defined(_WIN64)
+#define CUPTILP64 1
+#else
+#undef CUPTILP64
+#endif
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+#if defined(__GNUC__) && defined(CUPTI_LIB)
+    #pragma GCC visibility push(default)
+#endif
+
+/**
+ * \brief Initialize OPENMP support (deprecated, used before OpenMP 5.0)
+ *
+ */
+int CUPTIAPI cuptiOpenMpInitialize(ompt_function_lookup_t ompt_fn_lookup, const char *runtime_version, unsigned int ompt_version);
+
+/**
+ * \brief Initialize OPENMP support
+ *
+ */
+int CUPTIAPI cuptiOpenMpInitialize_v2(ompt_function_lookup_t lookup, int initial_device_num, ompt_data_t *tool_data);
+
+#if defined(__GNUC__) && defined(CUPTI_LIB)
+    #pragma GCC visibility pop
+#endif
+
+#if defined(__cplusplus)
+}
+#endif
+
+#endif /*_CUPTI_OPENMP_H_*/
diff --git a/.venv/lib/python3.11/site-packages/nvidia/cuda_cupti/include/Openmp/omp-tools.h b/.venv/lib/python3.11/site-packages/nvidia/cuda_cupti/include/Openmp/omp-tools.h
new file mode 100644
index 0000000000000000000000000000000000000000..276967d07e8f8c0f7686e5b3b15151edf2415ae7
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/nvidia/cuda_cupti/include/Openmp/omp-tools.h
@@ -0,0 +1,1083 @@
+/*
+ * include/50/omp-tools.h.var
+ */
+
+//===----------------------------------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is dual licensed under the MIT and the University of Illinois Open
+// Source Licenses. See LICENSE.txt for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef __OMPT__
+#define __OMPT__
+
+/*****************************************************************************
+ * system include files
+ *****************************************************************************/
+
+#include <stdint.h>
+#include <stddef.h>
+
+/*****************************************************************************
+ * iteration macros
+ *****************************************************************************/
+
+#define FOREACH_OMPT_INQUIRY_FN(macro)      \
+    macro (ompt_enumerate_states)           \
+    macro (ompt_enumerate_mutex_impls)      \
+                                            \
+    macro (ompt_set_callback)               \
+    macro (ompt_get_callback)               \
+                                            \
+    macro (ompt_get_state)                  \
+                                            \
+    macro (ompt_get_parallel_info)          \
+    macro (ompt_get_task_info)              \
+    macro (ompt_get_task_memory)            \
+    macro (ompt_get_thread_data)            \
+    macro (ompt_get_unique_id)              \
+    macro (ompt_finalize_tool)              \
+                                            \
+    macro(ompt_get_num_procs)               \
+    macro(ompt_get_num_places)              \
+    macro(ompt_get_place_proc_ids)          \
+    macro(ompt_get_place_num)               \
+    macro(ompt_get_partition_place_nums)    \
+    macro(ompt_get_proc_id)                 \
+                                            \
+    macro(ompt_get_target_info)             \
+    macro(ompt_get_num_devices)
+
+#define FOREACH_OMPT_STATE(macro)                                                                \
+                                                                                                \
+    /* first available state */                                                                 \
+    macro (ompt_state_undefined, 0x102)      /* undefined thread state */                        \
+                                                                                                \
+    /* work states (0..15) */                                                                   \
+    macro (ompt_state_work_serial, 0x000)    /* working outside parallel */                      \
+    macro (ompt_state_work_parallel, 0x001)  /* working within parallel */                       \
+    macro (ompt_state_work_reduction, 0x002) /* performing a reduction */                        \
+                                                                                                \
+    /* barrier wait states (16..31) */                                                          \
+    macro (ompt_state_wait_barrier, 0x010)   /* waiting at a barrier */                          \
+    macro (ompt_state_wait_barrier_implicit_parallel, 0x011)                                     \
+                                            /* implicit barrier at the end of parallel region */\
+    macro (ompt_state_wait_barrier_implicit_workshare, 0x012)                                    \
+                                            /* implicit barrier at the end of worksharing */    \
+    macro (ompt_state_wait_barrier_implicit, 0x013)  /* implicit barrier */                      \
+    macro (ompt_state_wait_barrier_explicit, 0x014)  /* explicit barrier */                      \
+                                                                                                \
+    /* task wait states (32..63) */                                                             \
+    macro (ompt_state_wait_taskwait, 0x020)  /* waiting at a taskwait */                         \
+    macro (ompt_state_wait_taskgroup, 0x021) /* waiting at a taskgroup */                        \
+                                                                                                \
+    /* mutex wait states (64..127) */                                                           \
+    macro (ompt_state_wait_mutex, 0x040)                                                         \
+    macro (ompt_state_wait_lock, 0x041)      /* waiting for lock */                              \
+    macro (ompt_state_wait_critical, 0x042)  /* waiting for critical */                          \
+    macro (ompt_state_wait_atomic, 0x043)    /* waiting for atomic */                            \
+    macro (ompt_state_wait_ordered, 0x044)   /* waiting for ordered */                           \
+                                                                                                \
+    /* target wait states (128..255) */                                                         \
+    macro (ompt_state_wait_target, 0x080)        /* waiting for target region */                 \
+    macro (ompt_state_wait_target_map, 0x081)    /* waiting for target data mapping operation */ \
+    macro (ompt_state_wait_target_update, 0x082) /* waiting for target update operation */       \
+                                                                                                \
+    /* misc (256..511) */                                                                       \
+    macro (ompt_state_idle, 0x100)           /* waiting for work */                              \
+    macro (ompt_state_overhead, 0x101)       /* overhead excluding wait states */                \
+                                                                                                \
+    /* implementation-specific states (512..) */
+
+
+#define FOREACH_KMP_MUTEX_IMPL(macro)                                                \
+    macro (kmp_mutex_impl_none, 0)         /* unknown implementation */              \
+    macro (kmp_mutex_impl_spin, 1)         /* based on spin */                       \
+    macro (kmp_mutex_impl_queuing, 2)      /* based on some fair policy */           \
+    macro (kmp_mutex_impl_speculative, 3)  /* based on HW-supported speculation */
+
+#define FOREACH_OMPT_EVENT(macro)                                                                                        \
+                                                                                                                         \
+    /*--- Mandatory Events ---*/                                                                                         \
+    macro (ompt_callback_thread_begin,      ompt_callback_thread_begin_t,       1) /* thread begin                    */ \
+    macro (ompt_callback_thread_end,        ompt_callback_thread_end_t,         2) /* thread end                      */ \
+                                                                                                                         \
+    macro (ompt_callback_parallel_begin,    ompt_callback_parallel_begin_t,     3) /* parallel begin                  */ \
+    macro (ompt_callback_parallel_end,      ompt_callback_parallel_end_t,       4) /* parallel end                    */ \
+                                                                                                                         \
+    macro (ompt_callback_task_create,       ompt_callback_task_create_t,        5) /* task begin                      */ \
+    macro (ompt_callback_task_schedule,     ompt_callback_task_schedule_t,      6) /* task schedule                   */ \
+    macro (ompt_callback_implicit_task,     ompt_callback_implicit_task_t,      7) /* implicit task                   */ \
+                                                                                                                         \
+    macro (ompt_callback_target,            ompt_callback_target_t,             8) /* target                          */ \
+    macro (ompt_callback_target_data_op,    ompt_callback_target_data_op_t,     9) /* target data op                  */ \
+    macro (ompt_callback_target_submit,     ompt_callback_target_submit_t,     10) /* target  submit                  */ \
+                                                                                                                         \
+    macro (ompt_callback_control_tool,      ompt_callback_control_tool_t,      11) /* control tool                    */ \
+                                                                                                                         \
+    macro (ompt_callback_device_initialize, ompt_callback_device_initialize_t, 12) /* device initialize               */ \
+    macro (ompt_callback_device_finalize,   ompt_callback_device_finalize_t,   13) /* device finalize                 */ \
+                                                                                                                         \
+    macro (ompt_callback_device_load,       ompt_callback_device_load_t,       14) /* device load                     */ \
+    macro (ompt_callback_device_unload,     ompt_callback_device_unload_t,     15) /* device unload                   */ \
+                                                                                                                         \
+    /* Optional Events */                                                                                                \
+    macro (ompt_callback_sync_region_wait,  ompt_callback_sync_region_t,       16) /* sync region wait begin or end   */ \
+                                                                                                                         \
+    macro (ompt_callback_mutex_released,    ompt_callback_mutex_t,             17) /* mutex released                  */ \
+                                                                                                                         \
+    macro (ompt_callback_dependences,       ompt_callback_dependences_t,       18) /* report task dependences         */ \
+    macro (ompt_callback_task_dependence,   ompt_callback_task_dependence_t,   19) /* report task dependence          */ \
+                                                                                                                         \
+    macro (ompt_callback_work,              ompt_callback_work_t,              20) /* task at work begin or end       */ \
+                                                                                                                         \
+    macro (ompt_callback_master,            ompt_callback_master_t,            21) /* task at master begin or end     */ \
+                                                                                                                         \
+    macro (ompt_callback_target_map,        ompt_callback_target_map_t,        22) /* target map                      */ \
+                                                                                                                         \
+    macro (ompt_callback_sync_region,       ompt_callback_sync_region_t,       23) /* sync region begin or end        */ \
+                                                                                                                         \
+    macro (ompt_callback_lock_init,         ompt_callback_mutex_acquire_t,     24) /* lock init                       */ \
+    macro (ompt_callback_lock_destroy,      ompt_callback_mutex_t,             25) /* lock destroy                    */ \
+                                                                                                                         \
+    macro (ompt_callback_mutex_acquire,     ompt_callback_mutex_acquire_t,     26) /* mutex acquire                   */ \
+    macro (ompt_callback_mutex_acquired,    ompt_callback_mutex_t,             27) /* mutex acquired                  */ \
+                                                                                                                         \
+    macro (ompt_callback_nest_lock,         ompt_callback_nest_lock_t,         28) /* nest lock                       */ \
+                                                                                                                         \
+    macro (ompt_callback_flush,             ompt_callback_flush_t,             29) /* after executing flush           */ \
+                                                                                                                         \
+    macro (ompt_callback_cancel,            ompt_callback_cancel_t,            30) /* cancel innermost binding region */ \
+                                                                                                                         \
+    macro (ompt_callback_reduction,         ompt_callback_sync_region_t,       31) /* reduction                       */ \
+                                                                                                                         \
+    macro (ompt_callback_dispatch,          ompt_callback_dispatch_t,          32) /* dispatch of work                */
+
+/*****************************************************************************
+ * implementation specific types
+ *****************************************************************************/
+
+typedef enum kmp_mutex_impl_t { /* enumerators come from FOREACH_KMP_MUTEX_IMPL: kmp_mutex_impl_none=0, _spin=1, _queuing=2, _speculative=3 */
+#define kmp_mutex_impl_macro(impl, code) impl = code, /* turns one (name, value) list entry into "name = value," */
+    FOREACH_KMP_MUTEX_IMPL(kmp_mutex_impl_macro)
+#undef kmp_mutex_impl_macro
+} kmp_mutex_impl_t;
+
+/*****************************************************************************
+ * definitions generated from spec
+ *****************************************************************************/
+
+typedef enum ompt_callbacks_t {
+  ompt_callback_thread_begin             = 1,
+  ompt_callback_thread_end               = 2,
+  ompt_callback_parallel_begin           = 3,
+  ompt_callback_parallel_end             = 4,
+  ompt_callback_task_create              = 5,
+  ompt_callback_task_schedule            = 6,
+  ompt_callback_implicit_task            = 7,
+  ompt_callback_target                   = 8,
+  ompt_callback_target_data_op           = 9,
+  ompt_callback_target_submit            = 10,
+  ompt_callback_control_tool             = 11,
+  ompt_callback_device_initialize        = 12,
+  ompt_callback_device_finalize          = 13,
+  ompt_callback_device_load              = 14,
+  ompt_callback_device_unload            = 15,
+  ompt_callback_sync_region_wait         = 16,
+  ompt_callback_mutex_released           = 17,
+  ompt_callback_dependences              = 18,
+  ompt_callback_task_dependence          = 19,
+  ompt_callback_work                     = 20,
+  ompt_callback_master                   = 21,
+  ompt_callback_target_map               = 22,
+  ompt_callback_sync_region              = 23,
+  ompt_callback_lock_init                = 24,
+  ompt_callback_lock_destroy             = 25,
+  ompt_callback_mutex_acquire            = 26,
+  ompt_callback_mutex_acquired           = 27,
+  ompt_callback_nest_lock                = 28,
+  ompt_callback_flush                    = 29,
+  ompt_callback_cancel                   = 30,
+  ompt_callback_reduction                = 31,
+  ompt_callback_dispatch                 = 32
+} ompt_callbacks_t;
+
+typedef enum ompt_record_t {
+  ompt_record_ompt               = 1,
+  ompt_record_native             = 2,
+  ompt_record_invalid            = 3
+} ompt_record_t;
+
+typedef enum ompt_record_native_t {
+  ompt_record_native_info  = 1,
+  ompt_record_native_event = 2
+} ompt_record_native_t;
+
+typedef enum ompt_set_result_t {
+  ompt_set_error            = 0,
+  ompt_set_never            = 1,
+  ompt_set_impossible       = 2,
+  ompt_set_sometimes        = 3,
+  ompt_set_sometimes_paired = 4,
+  ompt_set_always           = 5
+} ompt_set_result_t;
+
+typedef uint64_t ompt_id_t;
+
+typedef uint64_t ompt_device_time_t;
+
+typedef uint64_t ompt_buffer_cursor_t;
+
+typedef enum ompt_thread_t {
+  ompt_thread_initial                 = 1,
+  ompt_thread_worker                  = 2,
+  ompt_thread_other                   = 3,
+  ompt_thread_unknown                 = 4
+} ompt_thread_t;
+
+typedef enum ompt_scope_endpoint_t {
+  ompt_scope_begin                    = 1,
+  ompt_scope_end                      = 2
+} ompt_scope_endpoint_t;
+
+typedef enum ompt_dispatch_t {
+  ompt_dispatch_iteration             = 1,
+  ompt_dispatch_section               = 2
+} ompt_dispatch_t;
+
+typedef enum ompt_sync_region_t {
+  ompt_sync_region_barrier                = 1,
+  ompt_sync_region_barrier_implicit       = 2,
+  ompt_sync_region_barrier_explicit       = 3,
+  ompt_sync_region_barrier_implementation = 4,
+  ompt_sync_region_taskwait               = 5,
+  ompt_sync_region_taskgroup              = 6,
+  ompt_sync_region_reduction              = 7
+} ompt_sync_region_t;
+
+typedef enum ompt_target_data_op_t {
+  ompt_target_data_alloc                = 1,
+  ompt_target_data_transfer_to_device   = 2,
+  ompt_target_data_transfer_from_device = 3,
+  ompt_target_data_delete               = 4,
+  ompt_target_data_associate            = 5,
+  ompt_target_data_disassociate         = 6
+} ompt_target_data_op_t;
+
+typedef enum ompt_work_t {
+  ompt_work_loop               = 1,
+  ompt_work_sections           = 2,
+  ompt_work_single_executor    = 3,
+  ompt_work_single_other       = 4,
+  ompt_work_workshare          = 5,
+  ompt_work_distribute         = 6,
+  ompt_work_taskloop           = 7
+} ompt_work_t;
+
+typedef enum ompt_mutex_t {
+  ompt_mutex_lock                     = 1,
+  ompt_mutex_test_lock                = 2,
+  ompt_mutex_nest_lock                = 3,
+  ompt_mutex_test_nest_lock           = 4,
+  ompt_mutex_critical                 = 5,
+  ompt_mutex_atomic                   = 6,
+  ompt_mutex_ordered                  = 7
+} ompt_mutex_t;
+
+typedef enum ompt_native_mon_flag_t {
+  ompt_native_data_motion_explicit    = 0x01,
+  ompt_native_data_motion_implicit    = 0x02,
+  ompt_native_kernel_invocation       = 0x04,
+  ompt_native_kernel_execution        = 0x08,
+  ompt_native_driver                  = 0x10,
+  ompt_native_runtime                 = 0x20,
+  ompt_native_overhead                = 0x40,
+  ompt_native_idleness                = 0x80
+} ompt_native_mon_flag_t;
+
+typedef enum ompt_task_flag_t { /* each enumerator below is a distinct single-bit mask */
+  ompt_task_initial                   = 0x00000001,
+  ompt_task_implicit                  = 0x00000002,
+  ompt_task_explicit                  = 0x00000004,
+  ompt_task_target                    = 0x00000008,
+  ompt_task_undeferred                = 0x08000000,
+  ompt_task_untied                    = 0x10000000,
+  ompt_task_final                     = 0x20000000,
+  ompt_task_mergeable                 = 0x40000000,
+  ompt_task_merged                    = 0x80000000 /* NOTE(review): does not fit in a 32-bit int; ISO C before C23 requires enumerator values representable as int, so C builds rely on a compiler extension - confirm for target compilers */
+} ompt_task_flag_t;
+
+typedef enum ompt_task_status_t {
+  ompt_task_complete      = 1,
+  ompt_task_yield         = 2,
+  ompt_task_cancel        = 3,
+  ompt_task_detach        = 4,
+  ompt_task_early_fulfill = 5,
+  ompt_task_late_fulfill  = 6,
+  ompt_task_switch        = 7
+} ompt_task_status_t;
+
+typedef enum ompt_target_t {
+  ompt_target                         = 1,
+  ompt_target_enter_data              = 2,
+  ompt_target_exit_data               = 3,
+  ompt_target_update                  = 4
+} ompt_target_t;
+
+typedef enum ompt_parallel_flag_t {
+  ompt_parallel_invoker_program = 0x00000001,
+  ompt_parallel_invoker_runtime = 0x00000002,
+  ompt_parallel_league          = 0x40000000,
+  ompt_parallel_team            = 0x80000000
+} ompt_parallel_flag_t;
+
+typedef enum ompt_target_map_flag_t {
+  ompt_target_map_flag_to             = 0x01,
+  ompt_target_map_flag_from           = 0x02,
+  ompt_target_map_flag_alloc          = 0x04,
+  ompt_target_map_flag_release        = 0x08,
+  ompt_target_map_flag_delete         = 0x10,
+  ompt_target_map_flag_implicit       = 0x20
+} ompt_target_map_flag_t;
+
+typedef enum ompt_dependence_type_t {
+  ompt_dependence_type_in              = 1,
+  ompt_dependence_type_out             = 2,
+  ompt_dependence_type_inout           = 3,
+  ompt_dependence_type_mutexinoutset   = 4,
+  ompt_dependence_type_source          = 5,
+  ompt_dependence_type_sink            = 6
+} ompt_dependence_type_t;
+
+typedef enum ompt_cancel_flag_t {
+  ompt_cancel_parallel       = 0x01,
+  ompt_cancel_sections       = 0x02,
+  ompt_cancel_loop           = 0x04,
+  ompt_cancel_taskgroup      = 0x08,
+  ompt_cancel_activated      = 0x10,
+  ompt_cancel_detected       = 0x20,
+  ompt_cancel_discarded_task = 0x40
+} ompt_cancel_flag_t;
+
+typedef uint64_t ompt_hwid_t;
+
+typedef uint64_t ompt_wait_id_t;
+
+typedef enum ompt_frame_flag_t {
+  ompt_frame_runtime        = 0x00,
+  ompt_frame_application    = 0x01,
+  ompt_frame_cfa            = 0x10,
+  ompt_frame_framepointer   = 0x20,
+  ompt_frame_stackaddress   = 0x30
+} ompt_frame_flag_t; 
+
+typedef enum ompt_state_t {
+  ompt_state_work_serial                      = 0x000,
+  ompt_state_work_parallel                    = 0x001,
+  ompt_state_work_reduction                   = 0x002,
+
+  ompt_state_wait_barrier                     = 0x010,
+  ompt_state_wait_barrier_implicit_parallel   = 0x011,
+  ompt_state_wait_barrier_implicit_workshare  = 0x012,
+  ompt_state_wait_barrier_implicit            = 0x013,
+  ompt_state_wait_barrier_explicit            = 0x014,
+
+  ompt_state_wait_taskwait                    = 0x020,
+  ompt_state_wait_taskgroup                   = 0x021,
+
+  ompt_state_wait_mutex                       = 0x040,
+  ompt_state_wait_lock                        = 0x041,
+  ompt_state_wait_critical                    = 0x042,
+  ompt_state_wait_atomic                      = 0x043,
+  ompt_state_wait_ordered                     = 0x044,
+
+  ompt_state_wait_target                      = 0x080,
+  ompt_state_wait_target_map                  = 0x081,
+  ompt_state_wait_target_update               = 0x082,
+
+  ompt_state_idle                             = 0x100,
+  ompt_state_overhead                         = 0x101,
+  ompt_state_undefined                        = 0x102
+} ompt_state_t;
+
+typedef uint64_t (*ompt_get_unique_id_t) (void);
+
+typedef uint64_t ompd_size_t;
+
+typedef uint64_t ompd_wait_id_t;
+
+typedef uint64_t ompd_addr_t;
+typedef int64_t  ompd_word_t;
+typedef uint64_t ompd_seg_t;
+
+typedef uint64_t ompd_device_t;
+
+typedef uint64_t ompd_thread_id_t;
+
+typedef enum ompd_scope_t {
+  ompd_scope_global = 1,
+  ompd_scope_address_space = 2,
+  ompd_scope_thread = 3,
+  ompd_scope_parallel = 4,
+  ompd_scope_implicit_task = 5,
+  ompd_scope_task = 6
+} ompd_scope_t;
+
+typedef uint64_t ompd_icv_id_t;
+
+typedef enum ompd_rc_t {
+  ompd_rc_ok = 0,
+  ompd_rc_unavailable = 1,
+  ompd_rc_stale_handle = 2,
+  ompd_rc_bad_input = 3,
+  ompd_rc_error = 4,
+  ompd_rc_unsupported = 5,
+  ompd_rc_needs_state_tracking = 6,
+  ompd_rc_incompatible = 7,
+  ompd_rc_device_read_error = 8,
+  ompd_rc_device_write_error = 9,
+  ompd_rc_nomem = 10,
+} ompd_rc_t;
+
+typedef void (*ompt_interface_fn_t) (void);
+
+typedef ompt_interface_fn_t (*ompt_function_lookup_t) (
+  const char *interface_function_name
+);
+
+typedef union ompt_data_t {
+  uint64_t value;
+  void *ptr;
+} ompt_data_t;
+
+typedef struct ompt_frame_t {
+  ompt_data_t exit_frame;
+  ompt_data_t enter_frame;
+  int exit_frame_flags;
+  int enter_frame_flags;
+} ompt_frame_t;
+
+typedef void (*ompt_callback_t) (void);
+
+typedef void ompt_device_t;
+
+typedef void ompt_buffer_t;
+
+typedef void (*ompt_callback_buffer_request_t) (
+  int device_num,
+  ompt_buffer_t **buffer,
+  size_t *bytes
+);
+
+typedef void (*ompt_callback_buffer_complete_t) (
+  int device_num,
+  ompt_buffer_t *buffer,
+  size_t bytes,
+  ompt_buffer_cursor_t begin,
+  int buffer_owned
+);
+
+typedef void (*ompt_finalize_t) (
+  ompt_data_t *tool_data
+);
+
+typedef int (*ompt_initialize_t) (
+  ompt_function_lookup_t lookup,
+  int initial_device_num,
+  ompt_data_t *tool_data
+);
+
+typedef struct ompt_start_tool_result_t {
+  ompt_initialize_t initialize;
+  ompt_finalize_t finalize;
+  ompt_data_t tool_data;
+} ompt_start_tool_result_t;
+
+typedef struct ompt_record_abstract_t {
+  ompt_record_native_t rclass;
+  const char *type;
+  ompt_device_time_t start_time;
+  ompt_device_time_t end_time;
+  ompt_hwid_t hwid;
+} ompt_record_abstract_t;
+
+typedef struct ompt_dependence_t {
+  ompt_data_t variable;
+  ompt_dependence_type_t dependence_type;
+} ompt_dependence_t;
+
+typedef int (*ompt_enumerate_states_t) (
+  int current_state,
+  int *next_state,
+  const char **next_state_name
+);
+
+typedef int (*ompt_enumerate_mutex_impls_t) (
+  int current_impl,
+  int *next_impl,
+  const char **next_impl_name
+);
+
+typedef ompt_set_result_t (*ompt_set_callback_t) (
+  ompt_callbacks_t event,
+  ompt_callback_t callback
+);
+
+typedef int (*ompt_get_callback_t) (
+  ompt_callbacks_t event,
+  ompt_callback_t *callback
+);
+
+typedef ompt_data_t *(*ompt_get_thread_data_t) (void);
+
+typedef int (*ompt_get_num_procs_t) (void);
+
+typedef int (*ompt_get_num_places_t) (void);
+
+typedef int (*ompt_get_place_proc_ids_t) (
+  int place_num,
+  int ids_size,
+  int *ids
+);
+
+typedef int (*ompt_get_place_num_t) (void);
+
+typedef int (*ompt_get_partition_place_nums_t) (
+  int place_nums_size,
+  int *place_nums
+);
+
+typedef int (*ompt_get_proc_id_t) (void);
+
+typedef int (*ompt_get_state_t) (
+  ompt_wait_id_t *wait_id
+);
+
+typedef int (*ompt_get_parallel_info_t) (
+  int ancestor_level,
+  ompt_data_t **parallel_data,
+  int *team_size
+);
+
+typedef int (*ompt_get_task_info_t) (
+  int ancestor_level,
+  int *flags,
+  ompt_data_t **task_data,
+  ompt_frame_t **task_frame,
+  ompt_data_t **parallel_data,
+  int *thread_num
+);
+
+typedef int (*ompt_get_task_memory_t)(
+  void **addr,
+  size_t *size,
+  int block
+);
+
+typedef int (*ompt_get_target_info_t) (
+  uint64_t *device_num,
+  ompt_id_t *target_id,
+  ompt_id_t *host_op_id
+);
+
+typedef int (*ompt_get_num_devices_t) (void);
+
+typedef void (*ompt_finalize_tool_t) (void);
+
+typedef int (*ompt_get_device_num_procs_t) (
+  ompt_device_t *device
+);
+
+typedef ompt_device_time_t (*ompt_get_device_time_t) (
+  ompt_device_t *device
+);
+
+typedef double (*ompt_translate_time_t) (
+  ompt_device_t *device,
+  ompt_device_time_t time
+);
+
+typedef ompt_set_result_t (*ompt_set_trace_ompt_t) (
+  ompt_device_t *device,
+  unsigned int enable,
+  unsigned int etype
+);
+
+typedef ompt_set_result_t (*ompt_set_trace_native_t) (
+  ompt_device_t *device,
+  int enable,
+  int flags
+);
+
+typedef int (*ompt_start_trace_t) (
+  ompt_device_t *device,
+  ompt_callback_buffer_request_t request,
+  ompt_callback_buffer_complete_t complete
+);
+
+typedef int (*ompt_pause_trace_t) (
+  ompt_device_t *device,
+  int begin_pause
+);
+
+typedef int (*ompt_flush_trace_t) (
+  ompt_device_t *device
+);
+
+typedef int (*ompt_stop_trace_t) (
+  ompt_device_t *device
+);
+
+typedef int (*ompt_advance_buffer_cursor_t) (
+  ompt_device_t *device,
+  ompt_buffer_t *buffer,
+  size_t size,
+  ompt_buffer_cursor_t current,
+  ompt_buffer_cursor_t *next
+);
+
+typedef ompt_record_t (*ompt_get_record_type_t) (
+  ompt_buffer_t *buffer,
+  ompt_buffer_cursor_t current
+);
+
+typedef void *(*ompt_get_record_native_t) (
+  ompt_buffer_t *buffer,
+  ompt_buffer_cursor_t current,
+  ompt_id_t *host_op_id
+);
+
+typedef ompt_record_abstract_t *
+(*ompt_get_record_abstract_t) (
+  void *native_record
+);
+
+typedef void (*ompt_callback_thread_begin_t) (
+  ompt_thread_t thread_type,
+  ompt_data_t *thread_data
+);
+
+typedef struct ompt_record_thread_begin_t {
+  ompt_thread_t thread_type;
+} ompt_record_thread_begin_t;
+
+typedef void (*ompt_callback_thread_end_t) (
+  ompt_data_t *thread_data
+);
+
+typedef void (*ompt_callback_parallel_begin_t) (
+  ompt_data_t *encountering_task_data,
+  const ompt_frame_t *encountering_task_frame,
+  ompt_data_t *parallel_data,
+  unsigned int requested_parallelism,
+  int flags,
+  const void *codeptr_ra
+);
+
+typedef struct ompt_record_parallel_begin_t {
+  ompt_id_t encountering_task_id;
+  ompt_id_t parallel_id;
+  unsigned int requested_parallelism;
+  int flags;
+  const void *codeptr_ra;
+} ompt_record_parallel_begin_t;
+
+typedef void (*ompt_callback_parallel_end_t) (
+  ompt_data_t *parallel_data,
+  ompt_data_t *encountering_task_data,
+  int flags,
+  const void *codeptr_ra
+);
+
+typedef struct ompt_record_parallel_end_t {
+  ompt_id_t parallel_id;
+  ompt_id_t encountering_task_id;
+  int flags;
+  const void *codeptr_ra;
+} ompt_record_parallel_end_t;
+
+typedef void (*ompt_callback_work_t) (
+  ompt_work_t wstype,
+  ompt_scope_endpoint_t endpoint,
+  ompt_data_t *parallel_data,
+  ompt_data_t *task_data,
+  uint64_t count,
+  const void *codeptr_ra
+);
+
+typedef struct ompt_record_work_t {
+  ompt_work_t wstype;
+  ompt_scope_endpoint_t endpoint;
+  ompt_id_t parallel_id;
+  ompt_id_t task_id;
+  uint64_t count;
+  const void *codeptr_ra;
+} ompt_record_work_t;
+
+typedef void (*ompt_callback_dispatch_t) (
+  ompt_data_t *parallel_data,
+  ompt_data_t *task_data,
+  ompt_dispatch_t kind,
+  ompt_data_t instance 
+);
+
+typedef struct ompt_record_dispatch_t {
+  ompt_id_t parallel_id;
+  ompt_id_t task_id;
+  ompt_dispatch_t kind;
+  ompt_data_t instance; 
+} ompt_record_dispatch_t;
+
+typedef void (*ompt_callback_task_create_t) (
+  ompt_data_t *encountering_task_data,
+  const ompt_frame_t *encountering_task_frame,
+  ompt_data_t *new_task_data,
+  int flags,
+  int has_dependences,
+  const void *codeptr_ra
+);
+
+typedef struct ompt_record_task_create_t {
+  ompt_id_t encountering_task_id;
+  ompt_id_t new_task_id;
+  int flags;
+  int has_dependences;
+  const void *codeptr_ra;
+} ompt_record_task_create_t;
+
+typedef void (*ompt_callback_dependences_t) (
+  ompt_data_t *task_data,
+  const ompt_dependence_t *deps,
+  int ndeps
+);
+
+typedef struct ompt_record_dependences_t {
+  ompt_id_t task_id;
+  ompt_dependence_t dep;
+  int ndeps;
+} ompt_record_dependences_t;
+
+typedef void (*ompt_callback_task_dependence_t) (
+  ompt_data_t *src_task_data,
+  ompt_data_t *sink_task_data
+);
+
+typedef struct ompt_record_task_dependence_t {
+  ompt_id_t src_task_id;
+  ompt_id_t sink_task_id;
+} ompt_record_task_dependence_t;
+
+typedef void (*ompt_callback_task_schedule_t) (
+  ompt_data_t *prior_task_data,
+  ompt_task_status_t prior_task_status,
+  ompt_data_t *next_task_data
+);
+
+typedef struct ompt_record_task_schedule_t {
+  ompt_id_t prior_task_id;
+  ompt_task_status_t prior_task_status;
+  ompt_id_t next_task_id;
+} ompt_record_task_schedule_t;
+
+typedef void (*ompt_callback_implicit_task_t) (
+  ompt_scope_endpoint_t endpoint,
+  ompt_data_t *parallel_data,
+  ompt_data_t *task_data,
+  unsigned int actual_parallelism,
+  unsigned int index,
+  int flags
+);
+
+typedef struct ompt_record_implicit_task_t {
+  ompt_scope_endpoint_t endpoint;
+  ompt_id_t parallel_id;
+  ompt_id_t task_id;
+  unsigned int actual_parallelism;
+  unsigned int index;
+  int flags;
+} ompt_record_implicit_task_t;
+
+typedef void (*ompt_callback_master_t) (
+  ompt_scope_endpoint_t endpoint,
+  ompt_data_t *parallel_data,
+  ompt_data_t *task_data,
+  const void *codeptr_ra
+);
+
+typedef struct ompt_record_master_t {
+  ompt_scope_endpoint_t endpoint;
+  ompt_id_t parallel_id;
+  ompt_id_t task_id;
+  const void *codeptr_ra;
+} ompt_record_master_t;
+
+typedef void (*ompt_callback_sync_region_t) (
+  ompt_sync_region_t kind,
+  ompt_scope_endpoint_t endpoint,
+  ompt_data_t *parallel_data,
+  ompt_data_t *task_data,
+  const void *codeptr_ra
+);
+
+typedef struct ompt_record_sync_region_t {
+  ompt_sync_region_t kind;
+  ompt_scope_endpoint_t endpoint;
+  ompt_id_t parallel_id;
+  ompt_id_t task_id;
+  const void *codeptr_ra;
+} ompt_record_sync_region_t;
+
+typedef void (*ompt_callback_mutex_acquire_t) (
+  ompt_mutex_t kind,
+  unsigned int hint,
+  unsigned int impl,
+  ompt_wait_id_t wait_id,
+  const void *codeptr_ra
+);
+
+typedef struct ompt_record_mutex_acquire_t {
+  ompt_mutex_t kind;
+  unsigned int hint;
+  unsigned int impl;
+  ompt_wait_id_t wait_id;
+  const void *codeptr_ra;
+} ompt_record_mutex_acquire_t;
+
+typedef void (*ompt_callback_mutex_t) (
+  ompt_mutex_t kind,
+  ompt_wait_id_t wait_id,
+  const void *codeptr_ra
+);
+
+typedef struct ompt_record_mutex_t {
+  ompt_mutex_t kind;
+  ompt_wait_id_t wait_id;
+  const void *codeptr_ra;
+} ompt_record_mutex_t;
+
+typedef void (*ompt_callback_nest_lock_t) (
+  ompt_scope_endpoint_t endpoint,
+  ompt_wait_id_t wait_id,
+  const void *codeptr_ra
+);
+
+typedef struct ompt_record_nest_lock_t {
+  ompt_scope_endpoint_t endpoint;
+  ompt_wait_id_t wait_id;
+  const void *codeptr_ra;
+} ompt_record_nest_lock_t;
+
+typedef void (*ompt_callback_flush_t) (
+  ompt_data_t *thread_data,
+  const void *codeptr_ra
+);
+
+typedef struct ompt_record_flush_t {
+  const void *codeptr_ra;
+} ompt_record_flush_t;
+
+typedef void (*ompt_callback_cancel_t) (
+  ompt_data_t *task_data,
+  int flags,
+  const void *codeptr_ra
+);
+
+typedef struct ompt_record_cancel_t {
+  ompt_id_t task_id;
+  int flags;
+  const void *codeptr_ra;
+} ompt_record_cancel_t;
+
+typedef void (*ompt_callback_device_initialize_t) (
+  int device_num,
+  const char *type,
+  ompt_device_t *device,
+  ompt_function_lookup_t lookup,
+  const char *documentation
+);
+
+typedef void (*ompt_callback_device_finalize_t) (
+  int device_num
+);
+
+typedef void (*ompt_callback_device_load_t) (
+  int device_num,
+  const char *filename,
+  int64_t offset_in_file,
+  void *vma_in_file,
+  size_t bytes,
+  void *host_addr,
+  void *device_addr,
+  uint64_t module_id
+);
+
+typedef void (*ompt_callback_device_unload_t) (
+  int device_num,
+  uint64_t module_id
+);
+
+typedef void (*ompt_callback_target_data_op_t) (
+  ompt_id_t target_id,
+  ompt_id_t host_op_id,
+  ompt_target_data_op_t optype,
+  void *src_addr,
+  int src_device_num,
+  void *dest_addr,
+  int dest_device_num,
+  size_t bytes,
+  const void *codeptr_ra
+);
+
+typedef struct ompt_record_target_data_op_t {
+  ompt_id_t host_op_id;
+  ompt_target_data_op_t optype;
+  void *src_addr;
+  int src_device_num;
+  void *dest_addr;
+  int dest_device_num;
+  size_t bytes;
+  ompt_device_time_t end_time;
+  const void *codeptr_ra;
+} ompt_record_target_data_op_t;
+
+typedef void (*ompt_callback_target_t) (
+  ompt_target_t kind,
+  ompt_scope_endpoint_t endpoint,
+  int device_num,
+  ompt_data_t *task_data,
+  ompt_id_t target_id,
+  const void *codeptr_ra
+);
+
+typedef struct ompt_record_target_t {
+  ompt_target_t kind;
+  ompt_scope_endpoint_t endpoint;
+  int device_num;
+  ompt_id_t task_id;
+  ompt_id_t target_id;
+  const void *codeptr_ra;
+} ompt_record_target_t;
+
+typedef void (*ompt_callback_target_map_t) (
+  ompt_id_t target_id,
+  unsigned int nitems,
+  void **host_addr,
+  void **device_addr,
+  size_t *bytes,
+  unsigned int *mapping_flags,
+  const void *codeptr_ra
+);
+
+typedef struct ompt_record_target_map_t {
+  ompt_id_t target_id;
+  unsigned int nitems;
+  void **host_addr;
+  void **device_addr;
+  size_t *bytes;
+  unsigned int *mapping_flags;
+  const void *codeptr_ra;
+} ompt_record_target_map_t;
+
+typedef void (*ompt_callback_target_submit_t) (
+  ompt_id_t target_id,
+  ompt_id_t host_op_id,
+  unsigned int requested_num_teams
+);
+
+typedef struct ompt_record_target_kernel_t {
+  ompt_id_t host_op_id;
+  unsigned int requested_num_teams;
+  unsigned int granted_num_teams;
+  ompt_device_time_t end_time;
+} ompt_record_target_kernel_t;
+
+typedef int (*ompt_callback_control_tool_t) (
+  uint64_t command,
+  uint64_t modifier,
+  void *arg,
+  const void *codeptr_ra
+);
+
+typedef struct ompt_record_control_tool_t {
+  uint64_t command;
+  uint64_t modifier;
+  const void *codeptr_ra;
+} ompt_record_control_tool_t;
+
+typedef struct ompd_address_t {
+  ompd_seg_t segment;
+  ompd_addr_t address;
+} ompd_address_t;
+
+typedef struct ompd_frame_info_t {
+  ompd_address_t frame_address;
+  ompd_word_t frame_flag;
+} ompd_frame_info_t;
+
+typedef struct _ompd_aspace_handle ompd_address_space_handle_t;
+typedef struct _ompd_thread_handle ompd_thread_handle_t;
+typedef struct _ompd_parallel_handle ompd_parallel_handle_t;
+typedef struct _ompd_task_handle ompd_task_handle_t;
+
+typedef struct _ompd_aspace_cont ompd_address_space_context_t;
+typedef struct _ompd_thread_cont ompd_thread_context_t;
+
+typedef struct ompd_device_type_sizes_t {
+  uint8_t sizeof_char;
+  uint8_t sizeof_short;
+  uint8_t sizeof_int;
+  uint8_t sizeof_long;
+  uint8_t sizeof_long_long;
+  uint8_t sizeof_pointer;
+} ompd_device_type_sizes_t;
+
+typedef struct ompt_record_ompt_t {
+  ompt_callbacks_t type;
+  ompt_device_time_t time;
+  ompt_id_t thread_id;
+  ompt_id_t target_id;
+  union {
+    ompt_record_thread_begin_t thread_begin;
+    ompt_record_parallel_begin_t parallel_begin;
+    ompt_record_parallel_end_t parallel_end;
+    ompt_record_work_t work;
+    ompt_record_dispatch_t dispatch;
+    ompt_record_task_create_t task_create;
+    ompt_record_dependences_t dependences;
+    ompt_record_task_dependence_t task_dependence;
+    ompt_record_task_schedule_t task_schedule;
+    ompt_record_implicit_task_t implicit_task;
+    ompt_record_master_t master;
+    ompt_record_sync_region_t sync_region;
+    ompt_record_mutex_acquire_t mutex_acquire;
+    ompt_record_mutex_t mutex;
+    ompt_record_nest_lock_t nest_lock;
+    ompt_record_flush_t flush;
+    ompt_record_cancel_t cancel;
+    ompt_record_target_t target;
+    ompt_record_target_data_op_t target_data_op;
+    ompt_record_target_map_t target_map;
+    ompt_record_target_kernel_t target_kernel;
+    ompt_record_control_tool_t control_tool;
+  } record;
+} ompt_record_ompt_t;
+
+typedef ompt_record_ompt_t *(*ompt_get_record_ompt_t) (
+  ompt_buffer_t *buffer,
+  ompt_buffer_cursor_t current
+);
+
+#define ompt_id_none 0
+#define ompt_data_none {0}
+#define ompt_time_none 0
+#define ompt_hwid_none 0
+#define ompt_addr_none ~0
+#define ompt_mutex_impl_none 0
+#define ompt_wait_id_none 0
+
+#define ompd_segment_none 0
+
+#endif /* __OMPT__ */
diff --git a/.venv/lib/python3.11/site-packages/nvidia/cuda_cupti/include/__init__.py b/.venv/lib/python3.11/site-packages/nvidia/cuda_cupti/include/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/.venv/lib/python3.11/site-packages/nvidia/cuda_cupti/include/__pycache__/__init__.cpython-311.pyc b/.venv/lib/python3.11/site-packages/nvidia/cuda_cupti/include/__pycache__/__init__.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..d378f2942c3f37d1c4cedffe6dad3a005e0a19d4
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/nvidia/cuda_cupti/include/__pycache__/__init__.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/nvidia/cuda_cupti/include/cuda_stdint.h b/.venv/lib/python3.11/site-packages/nvidia/cuda_cupti/include/cuda_stdint.h
new file mode 100644
index 0000000000000000000000000000000000000000..8a9814410e4b6fb4f07ad9edc8394e956b77dbcd
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/nvidia/cuda_cupti/include/cuda_stdint.h
@@ -0,0 +1,112 @@
+/*
+ * Copyright 2009-2017 NVIDIA Corporation.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of NVIDIA CORPORATION nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef __cuda_stdint_h__
+#define __cuda_stdint_h__
+
+// Compiler-specific treatment for C99's stdint.h
+//
+// By default, this header will use the standard headers (so it
+// is your responsibility to make sure they are available), except
+// on MSVC before Visual Studio 2010, when they were not provided.
+// To support old MSVC, a few of the commonly-used definitions are
+// provided here.  If more definitions are needed, add them here,
+// or replace these definitions with a complete implementation,
+// such as the ones available from Google, Boost, or MSVC10.  You
+// can prevent the definition of any of these types (in order to
+// use your own) by #defining CU_STDINT_TYPES_ALREADY_DEFINED.
+
+#if !defined(CU_STDINT_TYPES_ALREADY_DEFINED)
+
+// In VS including stdint.h forces the C++ runtime dep - provide an opt-out
+// (CU_STDINT_VS_FORCE_NO_STDINT_H) for users that care (notably static
+// cudart).
+#if defined(_MSC_VER) && ((_MSC_VER < 1600) || defined(CU_STDINT_VS_FORCE_NO_STDINT_H))
+
+// These definitions can be used with MSVC 8 and 9,
+// which don't ship with stdint.h:
+
+typedef unsigned   char   uint8_t;
+
+typedef            short  int16_t;
+typedef unsigned   short uint16_t;
+
+// To keep it consistent with all MSVC build. define those types
+// in the exact same way they are defined with the MSVC headers
+#if defined(_MSC_VER)
+typedef signed     char    int8_t;
+
+typedef            int     int32_t;
+typedef unsigned   int     uint32_t;
+
+typedef long long          int64_t;
+typedef unsigned long long uint64_t;
+#else
+typedef            char    int8_t;
+
+typedef            long   int32_t;
+typedef unsigned   long  uint32_t;
+
+typedef          __int64  int64_t;
+typedef unsigned __int64 uint64_t;
+#endif
+
+#elif defined(__DJGPP__)
+
+// These definitions can be used when compiling
+// C code with DJGPP, which only provides stdint.h
+// when compiling C++ code with TR1 enabled.
+
+typedef               char    int8_t;
+typedef unsigned      char   uint8_t;
+
+typedef               short  int16_t;
+typedef unsigned      short uint16_t;
+
+typedef               long   int32_t;
+typedef unsigned      long  uint32_t;
+
+typedef          long long   int64_t;
+typedef unsigned long long  uint64_t;
+
+#else
+
+// Use standard headers, as specified by C99 and C++ TR1.
+// Known to be provided by:
+// - gcc/glibc, supported by all versions of glibc
+// - djgpp, supported since 2001
+// - MSVC, supported by Visual Studio 2010 and later
+
+#include <stdint.h>
+
+#endif
+
+#endif // !defined(CU_STDINT_TYPES_ALREADY_DEFINED)
+
+
+#endif // file guard
diff --git a/.venv/lib/python3.11/site-packages/nvidia/cuda_cupti/include/cupti.h b/.venv/lib/python3.11/site-packages/nvidia/cuda_cupti/include/cupti.h
new file mode 100644
index 0000000000000000000000000000000000000000..be316531dcfd846bcea8feadf3604437ce2447a1
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/nvidia/cuda_cupti/include/cupti.h
@@ -0,0 +1,123 @@
+/*
+ * Copyright 2010-2017 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#if !defined(_CUPTI_H_)
+#define _CUPTI_H_
+
+#ifdef _WIN32
+#ifndef WIN32_LEAN_AND_MEAN
+#define WIN32_LEAN_AND_MEAN
+#endif
+#ifdef NOMINMAX
+#include <windows.h>
+#else
+#define NOMINMAX
+#include <windows.h>
+#undef NOMINMAX
+#endif
+#endif
+
+#include <cuda.h>
+#include <cupti_result.h>
+#include <cupti_version.h>
+
+/* Activity, callback, event and metric APIs */
+#include <cupti_activity.h>
+#include <cupti_callbacks.h>
+#include <cupti_events.h>
+#include <cupti_metrics.h>
+
+/* Runtime, driver, and nvtx function identifiers */
+#include <cupti_driver_cbid.h>
+#include <cupti_runtime_cbid.h>
+#include <cupti_nvtx_cbid.h>
+
+/* To support function parameter structures for obsoleted API. See
+   cuda.h for the actual definition of these structures. */
+typedef unsigned int CUdeviceptr_v1;
+typedef struct CUDA_MEMCPY2D_v1_st { int dummy; } CUDA_MEMCPY2D_v1;
+typedef struct CUDA_MEMCPY3D_v1_st { int dummy; } CUDA_MEMCPY3D_v1;
+typedef struct CUDA_ARRAY_DESCRIPTOR_v1_st { int dummy; } CUDA_ARRAY_DESCRIPTOR_v1;
+typedef struct CUDA_ARRAY3D_DESCRIPTOR_v1_st { int dummy; } CUDA_ARRAY3D_DESCRIPTOR_v1;
+
+/* Function parameter structures */
+#include <generated_cuda_runtime_api_meta.h>
+#include <generated_cuda_meta.h>
+
+/* The following parameter structures cannot be included unless a
+   header that defines GL_VERSION is included before including them.
+   If these are needed then make sure such a header is included
+   already. */
+#ifdef GL_VERSION
+#include <generated_cuda_gl_interop_meta.h>
+#include <generated_cudaGL_meta.h>
+#endif
+
+//#include <generated_nvtx_meta.h>
+
+/* The following parameter structures cannot be included by default as
+   they are not guaranteed to be available on all systems. Uncomment
+   the includes that are available, or use the include explicitly. */
+#if defined(__linux__)
+//#include <generated_cuda_vdpau_interop_meta.h>
+//#include <generated_cudaVDPAU_meta.h>
+#endif
+
+#ifdef _WIN32
+//#include <generated_cuda_d3d9_interop_meta.h>
+//#include <generated_cuda_d3d10_interop_meta.h>
+//#include <generated_cuda_d3d11_interop_meta.h>
+//#include <generated_cudaD3D9_meta.h>
+//#include <generated_cudaD3D10_meta.h>
+//#include <generated_cudaD3D11_meta.h>
+#endif
+
+#endif /*_CUPTI_H_*/
+
+
diff --git a/.venv/lib/python3.11/site-packages/nvidia/cuda_cupti/include/cupti_activity.h b/.venv/lib/python3.11/site-packages/nvidia/cuda_cupti/include/cupti_activity.h
new file mode 100644
index 0000000000000000000000000000000000000000..4e796b06ce016bc4d69c9c52fbe9b48b13ea0d60
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/nvidia/cuda_cupti/include/cupti_activity.h
@@ -0,0 +1,7545 @@
+/*
+ * Copyright 2011-2023 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#if !defined(_CUPTI_ACTIVITY_H_)
+#define _CUPTI_ACTIVITY_H_
+
+/**
+ * Deprecated APIs and structures have been moved to the
+ * header :doc: `cupti_activity_deprecated.h`, which is included at
+ * the bottom of this file. Header cupti_activity.h contains
+ * only the latest version of APIs and structures.
+ */
+
+#include <cuda.h>
+#include <cupti_callbacks.h>
+#include <cupti_events.h>
+#include <cupti_metrics.h>
+#include <cupti_result.h>
+
+#if defined(CUPTI_DIRECTIVE_SUPPORT)
+#include <Openacc/cupti_openacc.h>
+#include <Openmp/cupti_openmp.h>
+#endif
+
+#include <cupti_common.h>
+
+#define CUPTI_UNIFIED_MEMORY_CPU_DEVICE_ID ((uint32_t) 0xFFFFFFFFU) /* all-ones 32-bit value; presumably the device ID standing for the CPU -- confirm */
+#define CUPTI_INVALID_CONTEXT_ID ((uint32_t) 0xFFFFFFFFU) /* all-ones 32-bit value */
+#define CUPTI_INVALID_STREAM_ID ((uint32_t) 0xFFFFFFFFU) /* all-ones 32-bit value */
+#define CUPTI_INVALID_CHANNEL_ID ((uint32_t) 0xFFFFFFFFU) /* all-ones 32-bit value */
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+#if defined(__GNUC__) && defined(CUPTI_LIB)
+    #pragma GCC visibility push(default)
+#endif
+
+#define invalidNumaId ((uint32_t) 0xFFFFFFFF) /* all-ones 32-bit value (no U suffix, unlike the CUPTI_INVALID_* macros) */
+
+/**
+ * \defgroup CUPTI_ACTIVITY_API CUPTI Activity API
+ * Functions, types, and enums that implement the CUPTI Activity API.
+ * @{
+ */
+
+/**
+ * \brief The kinds of activity records.
+ *
+ * Each activity record kind represents information about a GPU or an
+ * activity occurring on a CPU or GPU. Each kind is associated with an
+ * activity record structure that holds the information associated
+ * with the kind.
+ * \see CUpti_Activity
+ * \see CUpti_ActivityAPI
+ * \see CUpti_ActivityContext
+ * \see CUpti_ActivityContext2
+ * \see CUpti_ActivityDevice
+ * \see CUpti_ActivityDevice2
+ * \see CUpti_ActivityDevice3
+ * \see CUpti_ActivityDevice4
+ * \see CUpti_ActivityDeviceAttribute
+ * \see CUpti_ActivityEvent
+ * \see CUpti_ActivityEventInstance
+ * \see CUpti_ActivityKernel
+ * \see CUpti_ActivityKernel2
+ * \see CUpti_ActivityKernel3
+ * \see CUpti_ActivityKernel4
+ * \see CUpti_ActivityKernel5
+ * \see CUpti_ActivityKernel6
+ * \see CUpti_ActivityKernel7
+ * \see CUpti_ActivityKernel8
+ * \see CUpti_ActivityKernel9
+ * \see CUpti_ActivityCdpKernel
+ * \see CUpti_ActivityPreemption
+ * \see CUpti_ActivityMemcpy
+ * \see CUpti_ActivityMemcpy3
+ * \see CUpti_ActivityMemcpy4
+ * \see CUpti_ActivityMemcpy5
+ * \see CUpti_ActivityMemcpyPtoP
+ * \see CUpti_ActivityMemcpyPtoP2
+ * \see CUpti_ActivityMemcpyPtoP3
+ * \see CUpti_ActivityMemcpyPtoP4
+ * \see CUpti_ActivityMemset
+ * \see CUpti_ActivityMemset2
+ * \see CUpti_ActivityMemset3
+ * \see CUpti_ActivityMemset4
+ * \see CUpti_ActivityMemory
+ * \see CUpti_ActivityMemory2
+ * \see CUpti_ActivityMemory3
+ * \see CUpti_ActivityMemoryPool
+ * \see CUpti_ActivityMemoryPool2
+ * \see CUpti_ActivityMetric
+ * \see CUpti_ActivityMetricInstance
+ * \see CUpti_ActivityName
+ * \see CUpti_ActivityMarker
+ * \see CUpti_ActivityMarker2
+ * \see CUpti_ActivityMarkerData
+ * \see CUpti_ActivitySourceLocator
+ * \see CUpti_ActivityGlobalAccess
+ * \see CUpti_ActivityGlobalAccess2
+ * \see CUpti_ActivityGlobalAccess3
+ * \see CUpti_ActivityBranch
+ * \see CUpti_ActivityBranch2
+ * \see CUpti_ActivityOverhead3
+ * \see CUpti_ActivityEnvironment
+ * \see CUpti_ActivityInstructionExecution
+ * \see CUpti_ActivityUnifiedMemoryCounter
+ * \see CUpti_ActivityFunction
+ * \see CUpti_ActivityModule
+ * \see CUpti_ActivitySharedAccess
+ * \see CUpti_ActivityPCSampling
+ * \see CUpti_ActivityPCSampling2
+ * \see CUpti_ActivityPCSampling3
+ * \see CUpti_ActivityPCSamplingRecordInfo
+ * \see CUpti_ActivityCudaEvent
+ * \see CUpti_ActivityStream
+ * \see CUpti_ActivitySynchronization
+ * \see CUpti_ActivityInstructionCorrelation
+ * \see CUpti_ActivityExternalCorrelation
+ * \see CUpti_ActivityUnifiedMemoryCounter2
+ * \see CUpti_ActivityOpenAccData
+ * \see CUpti_ActivityOpenAccLaunch
+ * \see CUpti_ActivityOpenAccOther
+ * \see CUpti_ActivityOpenMp
+ * \see CUpti_ActivityNvLink
+ * \see CUpti_ActivityNvLink2
+ * \see CUpti_ActivityNvLink3
+ * \see CUpti_ActivityNvLink4
+ * \see CUpti_ActivityPcie
+ */
+
+typedef enum {
+  /**
+   * The activity record is invalid.
+   */
+  CUPTI_ACTIVITY_KIND_INVALID  = 0,
+
+  /**
+   * A host<->host, host<->device, or device<->device memory copy. The
+   * corresponding activity record structure is \ref
+   * CUpti_ActivityMemcpy5.
+   */
+  CUPTI_ACTIVITY_KIND_MEMCPY   = 1,
+
+  /**
+   * A memory set executing on the GPU. The corresponding activity
+   * record structure is \ref CUpti_ActivityMemset4.
+   */
+  CUPTI_ACTIVITY_KIND_MEMSET   = 2,
+
+  /**
+   * A kernel executing on the GPU. This activity kind may significantly change
+   * the overall performance characteristics of the application because all
+   * kernel executions are serialized on the GPU. The alternative kernel kind,
+   * CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL, doesn't break kernel concurrency.
+   * The corresponding activity record structure is \ref CUpti_ActivityKernel9.
+   */
+  CUPTI_ACTIVITY_KIND_KERNEL   = 3,
+
+  /**
+   * A CUDA driver API function execution. The corresponding activity
+   * record structure is \ref CUpti_ActivityAPI.
+   */
+  CUPTI_ACTIVITY_KIND_DRIVER   = 4,
+
+  /**
+   * A CUDA runtime API function execution. The corresponding activity
+   * record structure is \ref CUpti_ActivityAPI.
+   */
+  CUPTI_ACTIVITY_KIND_RUNTIME  = 5,
+
+  /**
+   * An event value. The corresponding activity record structure is
+   * \ref CUpti_ActivityEvent.
+   */
+  CUPTI_ACTIVITY_KIND_EVENT    = 6,
+
+  /**
+   * A metric value. The corresponding activity record structure is
+   * \ref CUpti_ActivityMetric.
+   */
+  CUPTI_ACTIVITY_KIND_METRIC   = 7,
+
+  /**
+   * Information about a device. The corresponding activity record
+   * structure is \ref CUpti_ActivityDevice5.
+   */
+  CUPTI_ACTIVITY_KIND_DEVICE   = 8,
+
+  /**
+   * Information about a context. The corresponding activity record
+   * structure is \ref CUpti_ActivityContext2.
+   */
+  CUPTI_ACTIVITY_KIND_CONTEXT  = 9,
+
+  /**
+   * A kernel executing on the GPU. This activity kind doesn't break
+   * kernel concurrency. The corresponding activity record structure
+   * is \ref CUpti_ActivityKernel9.
+   */
+  CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL = 10,
+
+  /**
+   * Resource naming done via NVTX APIs for thread, device, context, etc.
+   * The corresponding activity record structure is \ref CUpti_ActivityName.
+   */
+  CUPTI_ACTIVITY_KIND_NAME     = 11,
+
+  /**
+   * Instantaneous, start, or end NVTX marker. The corresponding activity
+   * record structure is \ref CUpti_ActivityMarker2.
+   */
+  CUPTI_ACTIVITY_KIND_MARKER = 12,
+
+  /**
+   * Extended, optional, data about a marker. The corresponding
+   * activity record structure is \ref CUpti_ActivityMarkerData.
+   */
+  CUPTI_ACTIVITY_KIND_MARKER_DATA = 13,
+
+  /**
+   * Source information about source level result. The corresponding
+   * activity record structure is \ref CUpti_ActivitySourceLocator.
+   */
+  CUPTI_ACTIVITY_KIND_SOURCE_LOCATOR = 14,
+
+  /**
+   * Results for source-level global access. The
+   * corresponding activity record structure is \ref
+   * CUpti_ActivityGlobalAccess3.
+   */
+  CUPTI_ACTIVITY_KIND_GLOBAL_ACCESS = 15,
+
+  /**
+   * Results for source-level branch. The corresponding
+   * activity record structure is \ref CUpti_ActivityBranch2.
+   */
+  CUPTI_ACTIVITY_KIND_BRANCH = 16,
+
+  /**
+   * Overhead activity records. The
+   * corresponding activity record structure is
+   * \ref CUpti_ActivityOverhead3.
+   */
+  CUPTI_ACTIVITY_KIND_OVERHEAD = 17,
+
+  /**
+   * A CDP (CUDA Dynamic Parallel) kernel executing on the GPU. The
+   * corresponding activity record structure is \ref
+   * CUpti_ActivityCdpKernel.  This activity cannot be directly
+   * enabled or disabled. It is enabled and disabled through concurrent
+   * kernel activity, i.e. CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL.
+   */
+  CUPTI_ACTIVITY_KIND_CDP_KERNEL = 18,
+  /**
+   * Preemption activity record indicating a preemption of a CDP (CUDA
+   * Dynamic Parallel) kernel executing on the GPU. The corresponding
+   * activity record structure is \ref CUpti_ActivityPreemption.
+   */
+  CUPTI_ACTIVITY_KIND_PREEMPTION = 19,
+
+  /**
+   * Environment activity records indicating power, clock, thermal,
+   * etc. levels of the GPU. The corresponding activity record
+   * structure is \ref CUpti_ActivityEnvironment.
+   */
+  CUPTI_ACTIVITY_KIND_ENVIRONMENT = 20,
+
+  /**
+   * An event value associated with a specific event domain
+   * instance. The corresponding activity record structure is \ref
+   * CUpti_ActivityEventInstance.
+   */
+  CUPTI_ACTIVITY_KIND_EVENT_INSTANCE = 21,
+
+  /**
+   * A peer to peer memory copy. The corresponding activity record
+   * structure is \ref CUpti_ActivityMemcpyPtoP4.
+   */
+  CUPTI_ACTIVITY_KIND_MEMCPY2 = 22,
+
+  /**
+   * A metric value associated with a specific metric domain
+   * instance. The corresponding activity record structure is \ref
+   * CUpti_ActivityMetricInstance.
+   */
+  CUPTI_ACTIVITY_KIND_METRIC_INSTANCE = 23,
+
+  /**
+   * Results for source-level instruction execution.
+   * The corresponding activity record structure is \ref
+   * CUpti_ActivityInstructionExecution.
+   */
+  CUPTI_ACTIVITY_KIND_INSTRUCTION_EXECUTION = 24,
+
+  /**
+   * Unified Memory counter record. The corresponding activity
+   * record structure is \ref CUpti_ActivityUnifiedMemoryCounter2.
+   */
+  CUPTI_ACTIVITY_KIND_UNIFIED_MEMORY_COUNTER = 25,
+
+  /**
+   * Device global/function record. The corresponding activity
+   * record structure is \ref CUpti_ActivityFunction.
+   */
+  CUPTI_ACTIVITY_KIND_FUNCTION = 26,
+
+  /**
+   * CUDA Module record. The corresponding activity
+   * record structure is \ref CUpti_ActivityModule.
+   */
+  CUPTI_ACTIVITY_KIND_MODULE = 27,
+
+  /**
+   * A device attribute value. The corresponding activity record
+   * structure is \ref CUpti_ActivityDeviceAttribute.
+   */
+  CUPTI_ACTIVITY_KIND_DEVICE_ATTRIBUTE   = 28,
+
+  /**
+   * Results for source-level shared access. The
+   * corresponding activity record structure is \ref
+   * CUpti_ActivitySharedAccess.
+   */
+  CUPTI_ACTIVITY_KIND_SHARED_ACCESS = 29,
+
+  /**
+   * Enable PC sampling for kernels. This will serialize
+   * kernels. The corresponding activity record structure
+   * is \ref CUpti_ActivityPCSampling3.
+   */
+  CUPTI_ACTIVITY_KIND_PC_SAMPLING = 30,
+
+  /**
+   * Summary information about PC sampling records. The
+   * corresponding activity record structure is \ref
+   * CUpti_ActivityPCSamplingRecordInfo.
+   */
+  CUPTI_ACTIVITY_KIND_PC_SAMPLING_RECORD_INFO = 31,
+
+  /**
+   * SASS/Source line-by-line correlation record.
+   * This will generate sass/source correlation for functions that have source
+   * level analysis or pc sampling results. The records will be generated only
+   * when either of source level analysis or pc sampling activity is enabled.
+   * The corresponding activity record structure is \ref
+   * CUpti_ActivityInstructionCorrelation.
+   */
+  CUPTI_ACTIVITY_KIND_INSTRUCTION_CORRELATION = 32,
+
+  /**
+   * OpenACC data events.
+   * The corresponding activity record structure is \ref
+   * CUpti_ActivityOpenAccData.
+   */
+  CUPTI_ACTIVITY_KIND_OPENACC_DATA = 33,
+
+  /**
+   * OpenACC launch events.
+   * The corresponding activity record structure is \ref
+   * CUpti_ActivityOpenAccLaunch.
+   */
+  CUPTI_ACTIVITY_KIND_OPENACC_LAUNCH = 34,
+
+  /**
+   * OpenACC other events.
+   * The corresponding activity record structure is \ref
+   * CUpti_ActivityOpenAccOther.
+   */
+  CUPTI_ACTIVITY_KIND_OPENACC_OTHER = 35,
+
+  /**
+   * Information about a CUDA event. The
+   * corresponding activity record structure is \ref
+   * CUpti_ActivityCudaEvent.
+   */
+  CUPTI_ACTIVITY_KIND_CUDA_EVENT = 36,
+
+  /**
+   * Information about a CUDA stream. The
+   * corresponding activity record structure is \ref
+   * CUpti_ActivityStream.
+   */
+  CUPTI_ACTIVITY_KIND_STREAM = 37,
+
+  /**
+   * Records for synchronization management. The
+   * corresponding activity record structure is \ref
+   * CUpti_ActivitySynchronization.
+   */
+  CUPTI_ACTIVITY_KIND_SYNCHRONIZATION = 38,
+
+  /**
+   * Records for correlation of different programming APIs. The
+   * corresponding activity record structure is \ref
+   * CUpti_ActivityExternalCorrelation.
+   */
+  CUPTI_ACTIVITY_KIND_EXTERNAL_CORRELATION = 39,
+
+  /**
+   * NVLink information.
+   * The corresponding activity record structure is \ref
+   * CUpti_ActivityNvLink4.
+   */
+  CUPTI_ACTIVITY_KIND_NVLINK = 40,
+
+  /**
+   * Instantaneous Event information.
+   * The corresponding activity record structure is \ref
+   * CUpti_ActivityInstantaneousEvent.
+   */
+  CUPTI_ACTIVITY_KIND_INSTANTANEOUS_EVENT = 41,
+
+  /**
+   * Instantaneous Event information for a specific event
+   * domain instance.
+   * The corresponding activity record structure is \ref
+   * CUpti_ActivityInstantaneousEventInstance.
+   */
+  CUPTI_ACTIVITY_KIND_INSTANTANEOUS_EVENT_INSTANCE = 42,
+
+  /**
+   * Instantaneous Metric information.
+   * The corresponding activity record structure is \ref
+   * CUpti_ActivityInstantaneousMetric.
+   */
+  CUPTI_ACTIVITY_KIND_INSTANTANEOUS_METRIC = 43,
+
+  /**
+   * Instantaneous Metric information for a specific metric
+   * domain instance.
+   * The corresponding activity record structure is \ref
+   * CUpti_ActivityInstantaneousMetricInstance.
+   */
+  CUPTI_ACTIVITY_KIND_INSTANTANEOUS_METRIC_INSTANCE = 44,
+
+  /**
+   * Memory activity tracking allocation and freeing of the memory.
+   * The corresponding activity record structure is \ref
+   * CUpti_ActivityMemory.
+   */
+  CUPTI_ACTIVITY_KIND_MEMORY = 45,
+
+  /**
+   * PCI devices information used for PCI topology.
+   * The corresponding activity record structure is \ref
+   * CUpti_ActivityPcie.
+   */
+  CUPTI_ACTIVITY_KIND_PCIE = 46,
+
+  /**
+   * OpenMP parallel events.
+   * The corresponding activity record structure is \ref
+   * CUpti_ActivityOpenMp.
+   */
+  CUPTI_ACTIVITY_KIND_OPENMP = 47,
+
+  /**
+   * A CUDA driver kernel launch occurring outside of any
+   * public API function execution.  Tools can handle these
+   * like records for driver API launch functions, although
+   * the cbid field is not used here.
+   * The corresponding activity record structure is \ref
+   * CUpti_ActivityAPI.
+   */
+  CUPTI_ACTIVITY_KIND_INTERNAL_LAUNCH_API = 48,
+
+  /**
+   * Memory activity tracking allocation and freeing of the memory.
+   * The corresponding activity record structure is \ref
+   * CUpti_ActivityMemory3.
+   */
+  CUPTI_ACTIVITY_KIND_MEMORY2 = 49,
+
+  /**
+   * Memory pool activity tracking creation, destruction and
+   * trimming of the memory pool.
+   * The corresponding activity record structure is \ref
+   * CUpti_ActivityMemoryPool2.
+   */
+  CUPTI_ACTIVITY_KIND_MEMORY_POOL = 50,
+
+  /**
+   * Graph trace activity. The corresponding activity record
+   * structure is \ref CUpti_ActivityGraphTrace2.
+   */
+  CUPTI_ACTIVITY_KIND_GRAPH_TRACE = 51,
+
+  /**
+   * JIT operation tracking.
+   * The corresponding activity record structure is \ref
+   * CUpti_ActivityJit.
+   */
+  CUPTI_ACTIVITY_KIND_JIT = 52,
+
+  CUPTI_ACTIVITY_KIND_COUNT, /* implicit value 53: one past CUPTI_ACTIVITY_KIND_JIT */
+
+  CUPTI_ACTIVITY_KIND_FORCE_INT     = 0x7fffffff /* max 32-bit signed value; presumably forces an int-sized enum */
+} CUpti_ActivityKind;
+
+/**
+ * \brief The kinds of activity objects.
+ * \see CUpti_ActivityObjectKindId
+ */
+typedef enum {
+  /**
+   * The object kind is not known.
+   */
+  CUPTI_ACTIVITY_OBJECT_UNKNOWN  = 0,
+
+  /**
+   * A process.
+   */
+  CUPTI_ACTIVITY_OBJECT_PROCESS  = 1,
+
+  /**
+   * A thread.
+   */
+  CUPTI_ACTIVITY_OBJECT_THREAD   = 2,
+
+  /**
+   * A device.
+   */
+  CUPTI_ACTIVITY_OBJECT_DEVICE   = 3,
+
+  /**
+   * A context.
+   */
+  CUPTI_ACTIVITY_OBJECT_CONTEXT  = 4,
+
+  /**
+   * A stream.
+   */
+  CUPTI_ACTIVITY_OBJECT_STREAM   = 5,
+
+  CUPTI_ACTIVITY_OBJECT_FORCE_INT = 0x7fffffff /* max 32-bit signed value; presumably forces an int-sized enum */
+} CUpti_ActivityObjectKind;
+
+/**
+ * \brief Identifiers for object kinds as specified by
+ * CUpti_ActivityObjectKind. The two members share storage (union).
+ * \see CUpti_ActivityObjectKind
+ */
+typedef union {
+  /**
+   * A process object requires that we identify the process ID. A
+   * thread object requires that we identify both the process and
+   * thread ID.
+   */
+  struct {
+    uint32_t processId;
+    uint32_t threadId;
+  } pt; /* process/thread identifiers */
+
+  /**
+   * A device object requires that we identify the device ID. A
+   * context object requires that we identify both the device and
+   * context ID. A stream object requires that we identify device,
+   * context, and stream ID.
+   */
+  struct {
+    uint32_t deviceId;
+    uint32_t contextId;
+    uint32_t streamId;
+  } dcs; /* device/context/stream identifiers */
+} CUpti_ActivityObjectKindId;
+
+/**
+ * \brief Additional data provided for overhead kind CUPTI_ACTIVITY_OVERHEAD_COMMAND_BUFFER_FULL.
+ */
+typedef struct {
+  /**
+   * The length of the command buffer.
+   *
+   */
+  uint32_t commandBufferLength;
+  /**
+   * The channel ID of the command buffer.
+   *
+   */
+  uint32_t channelID;
+  /**
+   * The channel type of the command buffer.
+   *
+   */
+  uint32_t channelType;
+} CUpti_ActivityOverheadCommandBufferFullData;
+
+/**
+ * \brief The kinds of activity overhead.
+ */
+typedef enum {
+  /**
+   * The overhead kind is not known.
+   */
+  CUPTI_ACTIVITY_OVERHEAD_UNKNOWN               = 0,
+
+  /**
+   * Compiler overhead.
+   */
+  CUPTI_ACTIVITY_OVERHEAD_DRIVER_COMPILER       = 1,
+
+  /**
+   * Activity buffer flush overhead. This and the following kinds are encoded as n<<16.
+   */
+  CUPTI_ACTIVITY_OVERHEAD_CUPTI_BUFFER_FLUSH               = 1<<16,
+
+  /**
+   * CUPTI instrumentation overhead.
+   */
+  CUPTI_ACTIVITY_OVERHEAD_CUPTI_INSTRUMENTATION            = 2<<16,
+
+  /**
+   * CUPTI resource creation and destruction overhead.
+   */
+  CUPTI_ACTIVITY_OVERHEAD_CUPTI_RESOURCE                   = 3<<16,
+
+  /**
+   * CUDA Runtime triggered module loading overhead.
+   */
+  CUPTI_ACTIVITY_OVERHEAD_RUNTIME_TRIGGERED_MODULE_LOADING = 4<<16,
+
+  /**
+   * Lazy function loading overhead.
+   */
+  CUPTI_ACTIVITY_OVERHEAD_LAZY_FUNCTION_LOADING            = 5<<16,
+
+  /**
+   * Overhead due to lack of command buffer space.
+   * Refer to CUpti_ActivityOverheadCommandBufferFullData for more details.
+   */
+  CUPTI_ACTIVITY_OVERHEAD_COMMAND_BUFFER_FULL              = 6<<16,
+
+  CUPTI_ACTIVITY_OVERHEAD_FORCE_INT             = 0x7fffffff /* max 32-bit signed value; presumably forces an int-sized enum */
+} CUpti_ActivityOverheadKind;
+
+/**
+ * \brief The kind of a compute API.
+ */
+typedef enum {
+  /**
+   * The compute API is not known.
+   */
+  CUPTI_ACTIVITY_COMPUTE_API_UNKNOWN    = 0,
+
+  /**
+   * The compute APIs are for CUDA.
+   */
+  CUPTI_ACTIVITY_COMPUTE_API_CUDA       = 1,
+
+  /**
+   * The compute APIs are for CUDA running
+   * in MPS (Multi-Process Service) environment.
+   */
+  CUPTI_ACTIVITY_COMPUTE_API_CUDA_MPS   = 2,
+
+  CUPTI_ACTIVITY_COMPUTE_API_FORCE_INT  = 0x7fffffff /* max 32-bit signed value; presumably forces an int-sized enum */
+} CUpti_ActivityComputeApiKind;
+
+/**
+ * \brief Flags associated with activity records.
+ *
+ * Activity record flags. Flags can be combined by bitwise OR to associate
+ * multiple flags with an activity record. Each flag is specific to a certain
+ * activity kind, as noted below, and bit values are reused across kinds.
+ */
+typedef enum {
+  /**
+   * Indicates the activity record has no flags.
+   */
+  CUPTI_ACTIVITY_FLAG_NONE          = 0,
+
+  /**
+   * Indicates the activity represents a device that supports
+   * concurrent kernel execution. Valid for
+   * CUPTI_ACTIVITY_KIND_DEVICE.
+   */
+  CUPTI_ACTIVITY_FLAG_DEVICE_CONCURRENT_KERNELS  = 1 << 0,
+
+  /**
+   * Indicates if the activity represents a CUdevice_attribute value
+   * or a CUpti_DeviceAttribute value. Valid for
+   * CUPTI_ACTIVITY_KIND_DEVICE_ATTRIBUTE.
+   */
+  CUPTI_ACTIVITY_FLAG_DEVICE_ATTRIBUTE_CUDEVICE  = 1 << 0,
+
+  /**
+   * Indicates the activity represents an asynchronous memcpy
+   * operation. Valid for CUPTI_ACTIVITY_KIND_MEMCPY.
+   */
+  CUPTI_ACTIVITY_FLAG_MEMCPY_ASYNC  = 1 << 0,
+
+  /**
+   * Indicates the activity represents an instantaneous marker. Valid
+   * for CUPTI_ACTIVITY_KIND_MARKER.
+   */
+  CUPTI_ACTIVITY_FLAG_MARKER_INSTANTANEOUS  = 1 << 0,
+
+  /**
+   * Indicates the activity represents a region start marker. Valid
+   * for CUPTI_ACTIVITY_KIND_MARKER.
+   */
+  CUPTI_ACTIVITY_FLAG_MARKER_START  = 1 << 1,
+
+  /**
+   * Indicates the activity represents a region end marker. Valid for
+   * CUPTI_ACTIVITY_KIND_MARKER.
+   */
+  CUPTI_ACTIVITY_FLAG_MARKER_END  = 1 << 2,
+
+  /**
+   * Indicates the activity represents an attempt to acquire a user
+   * defined synchronization object.
+   * Valid for CUPTI_ACTIVITY_KIND_MARKER.
+   */
+  CUPTI_ACTIVITY_FLAG_MARKER_SYNC_ACQUIRE = 1 << 3,
+
+  /**
+   * Indicates the activity represents success in acquiring the
+   * user defined synchronization object.
+   * Valid for CUPTI_ACTIVITY_KIND_MARKER.
+   */
+  CUPTI_ACTIVITY_FLAG_MARKER_SYNC_ACQUIRE_SUCCESS = 1 << 4,
+
+  /**
+   * Indicates the activity represents failure in acquiring the
+   * user defined synchronization object.
+   * Valid for CUPTI_ACTIVITY_KIND_MARKER.
+   */
+  CUPTI_ACTIVITY_FLAG_MARKER_SYNC_ACQUIRE_FAILED = 1 << 5,
+
+  /**
+   * Indicates the activity represents releasing a reservation on
+   * user defined synchronization object.
+   * Valid for CUPTI_ACTIVITY_KIND_MARKER.
+   */
+  CUPTI_ACTIVITY_FLAG_MARKER_SYNC_RELEASE = 1 << 6,
+
+  /**
+   * Indicates the activity represents a marker that does not specify
+   * a color. Valid for CUPTI_ACTIVITY_KIND_MARKER_DATA.
+   */
+  CUPTI_ACTIVITY_FLAG_MARKER_COLOR_NONE  = 1 << 0,
+
+  /**
+   * Indicates the activity represents a marker that specifies a color
+   * in alpha-red-green-blue format. Valid for
+   * CUPTI_ACTIVITY_KIND_MARKER_DATA.
+   */
+  CUPTI_ACTIVITY_FLAG_MARKER_COLOR_ARGB  = 1 << 1,
+
+  /**
+   * The number of bytes requested by each thread.
+   * Valid for CUpti_ActivityGlobalAccess3.
+   */
+  CUPTI_ACTIVITY_FLAG_GLOBAL_ACCESS_KIND_SIZE_MASK  = 0xFF << 0,
+
+  /**
+   * If this bit in flag is set, the access was load, else it is a
+   * store access. Valid for CUpti_ActivityGlobalAccess3.
+   */
+  CUPTI_ACTIVITY_FLAG_GLOBAL_ACCESS_KIND_LOAD       = 1 << 8,
+
+  /**
+   * If this bit in flag is set, the load access was cached else it is
+   * uncached. Valid for CUpti_ActivityGlobalAccess3.
+   */
+  CUPTI_ACTIVITY_FLAG_GLOBAL_ACCESS_KIND_CACHED     = 1 << 9,
+
+  /**
+   * If this bit in flag is set, the metric value overflowed. Valid
+   * for CUpti_ActivityMetric and CUpti_ActivityMetricInstance.
+   */
+  CUPTI_ACTIVITY_FLAG_METRIC_OVERFLOWED     = 1 << 0,
+
+  /**
+   * If this bit in flag is set, the metric value couldn't be
+   * calculated. This occurs when a value(s) required to calculate the
+   * metric is missing.  Valid for CUpti_ActivityMetric and
+   * CUpti_ActivityMetricInstance.
+   */
+  CUPTI_ACTIVITY_FLAG_METRIC_VALUE_INVALID  = 1 << 1,
+
+  /**
+   * If this bit in flag is set, the source level metric value couldn't be
+   * calculated. This occurs when a value(s) required to calculate the
+   * source level metric cannot be evaluated.
+   * Valid for CUpti_ActivityInstructionExecution.
+   */
+  CUPTI_ACTIVITY_FLAG_INSTRUCTION_VALUE_INVALID  = 1 << 0,
+
+  /**
+   * The mask for the instruction class, \ref CUpti_ActivityInstructionClass.
+   * Valid for CUpti_ActivityInstructionExecution and
+   * CUpti_ActivityInstructionCorrelation.
+   */
+  CUPTI_ACTIVITY_FLAG_INSTRUCTION_CLASS_MASK    = 0xFF << 1,
+
+  /**
+   * When calling cuptiActivityFlushAll, this flag
+   * can be set to force CUPTI to flush all records in the buffer, whether
+   * finished or not.
+   */
+  CUPTI_ACTIVITY_FLAG_FLUSH_FORCED = 1 << 0,
+
+  /**
+   * The number of bytes requested by each thread.
+   * Valid for CUpti_ActivitySharedAccess.
+   */
+  CUPTI_ACTIVITY_FLAG_SHARED_ACCESS_KIND_SIZE_MASK  = 0xFF << 0,
+
+  /**
+   * If this bit in flag is set, the access was load, else it is a
+   * store access.  Valid for CUpti_ActivitySharedAccess.
+   */
+  CUPTI_ACTIVITY_FLAG_SHARED_ACCESS_KIND_LOAD       = 1 << 8,
+
+  /**
+   * Indicates the activity represents an asynchronous memset
+   * operation. Valid for CUPTI_ACTIVITY_KIND_MEMSET.
+   */
+  CUPTI_ACTIVITY_FLAG_MEMSET_ASYNC  = 1 << 0,
+
+  /**
+   * Indicates the activity represents thrashing in CPU.
+   * Valid for counter of kind CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_THRASHING in
+   * CUPTI_ACTIVITY_KIND_UNIFIED_MEMORY_COUNTER.
+   */
+  CUPTI_ACTIVITY_FLAG_THRASHING_IN_CPU = 1 << 0,
+
+  /**
+   * Indicates the activity represents page throttling in CPU.
+   * Valid for counter of kind CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_THROTTLING in
+   * CUPTI_ACTIVITY_KIND_UNIFIED_MEMORY_COUNTER.
+   */
+  CUPTI_ACTIVITY_FLAG_THROTTLING_IN_CPU = 1 << 0,
+
+  CUPTI_ACTIVITY_FLAG_FORCE_INT = 0x7fffffff /* max 32-bit signed value; presumably forces an int-sized enum */
+} CUpti_ActivityFlag;
+
+/**
+ * \brief The stall reason for PC sampling activity.
+ */
+typedef enum {
+  /**
+   * Invalid reason.
+   */
+  CUPTI_ACTIVITY_PC_SAMPLING_STALL_INVALID      = 0,
+
+  /**
+   * No stall, instruction is selected for issue.
+   */
+  CUPTI_ACTIVITY_PC_SAMPLING_STALL_NONE         = 1,
+
+  /**
+   * Warp is blocked because next instruction is not yet available,
+   * because of instruction cache miss, or because of branching effects.
+   */
+  CUPTI_ACTIVITY_PC_SAMPLING_STALL_INST_FETCH   = 2,
+
+  /**
+   * Instruction is waiting on an arithmetic dependency.
+   */
+  CUPTI_ACTIVITY_PC_SAMPLING_STALL_EXEC_DEPENDENCY   = 3,
+
+  /**
+   * Warp is blocked because it is waiting for a memory access to complete.
+   */
+  CUPTI_ACTIVITY_PC_SAMPLING_STALL_MEMORY_DEPENDENCY   = 4,
+
+  /**
+   * Texture sub-system is fully utilized or has too many outstanding requests.
+   */
+  CUPTI_ACTIVITY_PC_SAMPLING_STALL_TEXTURE   = 5,
+
+  /**
+   * Warp is blocked as it is waiting at __syncthreads() or at memory barrier.
+   */
+  CUPTI_ACTIVITY_PC_SAMPLING_STALL_SYNC   = 6,
+
+  /**
+   * Warp is blocked waiting for __constant__ memory and immediate memory access to complete.
+   */
+  CUPTI_ACTIVITY_PC_SAMPLING_STALL_CONSTANT_MEMORY_DEPENDENCY   = 7,
+
+  /**
+   * Compute operation cannot be performed due to the required resources not
+   * being available.
+   */
+  CUPTI_ACTIVITY_PC_SAMPLING_STALL_PIPE_BUSY   = 8,
+
+  /**
+   * Warp is blocked because there are too many pending memory operations.
+   * In Kepler architecture it often indicates high number of memory replays.
+   */
+  CUPTI_ACTIVITY_PC_SAMPLING_STALL_MEMORY_THROTTLE   = 9,
+
+  /**
+   * Warp was ready to issue, but some other warp issued instead.
+   */
+  CUPTI_ACTIVITY_PC_SAMPLING_STALL_NOT_SELECTED   = 10,
+
+  /**
+   * Miscellaneous reasons.
+   */
+  CUPTI_ACTIVITY_PC_SAMPLING_STALL_OTHER   = 11,
+
+  /**
+   * Sleeping.
+   */
+  CUPTI_ACTIVITY_PC_SAMPLING_STALL_SLEEPING   = 12,
+
+  CUPTI_ACTIVITY_PC_SAMPLING_STALL_FORCE_INT  = 0x7fffffff /* max 32-bit signed value; presumably forces an int-sized enum */
+} CUpti_ActivityPCSamplingStallReason;
+
+/**
+ * \brief Sampling period for PC sampling method.
+ *
+ * Sampling period can be set using \ref cuptiActivityConfigurePCSampling.
+ */
+typedef enum {
+  /**
+   * The PC sampling period is not set.
+   */
+  CUPTI_ACTIVITY_PC_SAMPLING_PERIOD_INVALID = 0,
+
+  /**
+   * Minimum sampling period available on the device.
+   */
+  CUPTI_ACTIVITY_PC_SAMPLING_PERIOD_MIN = 1,
+
+  /**
+   * Sampling period in lower range.
+   */
+  CUPTI_ACTIVITY_PC_SAMPLING_PERIOD_LOW = 2,
+
+  /**
+   * Medium sampling period.
+   */
+  CUPTI_ACTIVITY_PC_SAMPLING_PERIOD_MID = 3,
+
+  /**
+   * Sampling period in higher range.
+   */
+  CUPTI_ACTIVITY_PC_SAMPLING_PERIOD_HIGH = 4,
+
+  /**
+   * Maximum sampling period available on the device.
+   */
+  CUPTI_ACTIVITY_PC_SAMPLING_PERIOD_MAX = 5,
+
+  CUPTI_ACTIVITY_PC_SAMPLING_PERIOD_FORCE_INT = 0x7fffffff /* max 32-bit signed value; presumably forces an int-sized enum */
+} CUpti_ActivityPCSamplingPeriod;
+
+/**
+ * \brief The kind of a memory copy, indicating the source and
+ * destination targets of the copy.
+ *
+ * Each kind represents the source and destination targets of a memory
+ * copy. Targets are host (H), device (D), and device array (A); P is a peer.
+ */
+typedef enum {
+  /**
+   * The memory copy kind is not known.
+   */
+  CUPTI_ACTIVITY_MEMCPY_KIND_UNKNOWN = 0,
+
+  /**
+   * A host to device memory copy.
+   */
+  CUPTI_ACTIVITY_MEMCPY_KIND_HTOD    = 1,
+
+  /**
+   * A device to host memory copy.
+   */
+  CUPTI_ACTIVITY_MEMCPY_KIND_DTOH    = 2,
+
+  /**
+   * A host to device array memory copy.
+   */
+  CUPTI_ACTIVITY_MEMCPY_KIND_HTOA    = 3,
+
+  /**
+   * A device array to host memory copy.
+   */
+  CUPTI_ACTIVITY_MEMCPY_KIND_ATOH    = 4,
+
+  /**
+   * A device array to device array memory copy.
+   */
+  CUPTI_ACTIVITY_MEMCPY_KIND_ATOA    = 5,
+
+  /**
+   * A device array to device memory copy.
+   */
+  CUPTI_ACTIVITY_MEMCPY_KIND_ATOD    = 6,
+
+  /**
+   * A device to device array memory copy.
+   */
+  CUPTI_ACTIVITY_MEMCPY_KIND_DTOA    = 7,
+
+  /**
+   * A device to device memory copy on the same device.
+   */
+  CUPTI_ACTIVITY_MEMCPY_KIND_DTOD    = 8,
+
+  /**
+   * A host to host memory copy.
+   */
+  CUPTI_ACTIVITY_MEMCPY_KIND_HTOH    = 9,
+
+  /**
+   * A peer to peer memory copy across different devices.
+   */
+  CUPTI_ACTIVITY_MEMCPY_KIND_PTOP    = 10,
+
+  CUPTI_ACTIVITY_MEMCPY_KIND_FORCE_INT = 0x7fffffff /* max 32-bit signed value; presumably forces an int-sized enum */
+} CUpti_ActivityMemcpyKind;
+
+/**
+ * \brief The kinds of memory accessed by a memory operation/copy.
+ *
+ * Each kind represents the type of the memory
+ * accessed by a memory operation/copy.
+ */
+typedef enum {
+  /**
+   * The memory kind is unknown.
+   */
+  CUPTI_ACTIVITY_MEMORY_KIND_UNKNOWN            = 0,
+
+  /**
+   * The memory is pageable.
+   */
+  CUPTI_ACTIVITY_MEMORY_KIND_PAGEABLE           = 1,
+
+  /**
+   * The memory is pinned.
+   */
+  CUPTI_ACTIVITY_MEMORY_KIND_PINNED             = 2,
+
+  /**
+   * The memory is on the device.
+   */
+  CUPTI_ACTIVITY_MEMORY_KIND_DEVICE             = 3,
+
+  /**
+   * The memory is an array.
+   */
+  CUPTI_ACTIVITY_MEMORY_KIND_ARRAY              = 4,
+
+  /**
+   * The memory is managed.
+   */
+  CUPTI_ACTIVITY_MEMORY_KIND_MANAGED            = 5,
+
+  /**
+   * The memory is device static.
+   */
+  CUPTI_ACTIVITY_MEMORY_KIND_DEVICE_STATIC      = 6,
+
+  /**
+   * The memory is managed static.
+   */
+  CUPTI_ACTIVITY_MEMORY_KIND_MANAGED_STATIC     = 7,
+
+  CUPTI_ACTIVITY_MEMORY_KIND_FORCE_INT          = 0x7fffffff /* max 32-bit signed value; presumably forces an int-sized enum */
+} CUpti_ActivityMemoryKind;
+
+/**
+ * \brief The kind of a preemption activity.
+ */
+typedef enum {
+  /**
+   * The preemption kind is not known.
+   */
+  CUPTI_ACTIVITY_PREEMPTION_KIND_UNKNOWN    = 0,
+
+  /**
+   * Preemption to save a CDP block.
+   */
+  CUPTI_ACTIVITY_PREEMPTION_KIND_SAVE       = 1,
+
+  /**
+   * Preemption to restore a CDP block.
+   */
+  CUPTI_ACTIVITY_PREEMPTION_KIND_RESTORE    = 2,
+
+  CUPTI_ACTIVITY_PREEMPTION_KIND_FORCE_INT  = 0x7fffffff
+} CUpti_ActivityPreemptionKind;
+
+/**
+ * \brief The kind of environment data. Used to indicate what type of
+ * data is being reported by an environment activity record.
+ */
+typedef enum {
+  /**
+   * Unknown data.
+   */
+  CUPTI_ACTIVITY_ENVIRONMENT_UNKNOWN = 0,
+
+  /**
+   * The environment data is related to speed.
+   */
+  CUPTI_ACTIVITY_ENVIRONMENT_SPEED = 1,
+
+  /**
+   * The environment data is related to temperature.
+   */
+  CUPTI_ACTIVITY_ENVIRONMENT_TEMPERATURE = 2,
+
+  /**
+   * The environment data is related to power.
+   */
+  CUPTI_ACTIVITY_ENVIRONMENT_POWER = 3,
+
+  /**
+   * The environment data is related to cooling.
+   */
+  CUPTI_ACTIVITY_ENVIRONMENT_COOLING = 4,
+
+  CUPTI_ACTIVITY_ENVIRONMENT_COUNT, /* Implicitly 5: one more than the last kind above. */
+
+  CUPTI_ACTIVITY_ENVIRONMENT_KIND_FORCE_INT    = 0x7fffffff
+} CUpti_ActivityEnvironmentKind;
+
+/**
+ * \brief Reasons for clock throttling.
+ *
+ * The possible reasons that a clock can be throttled. More than one
+ * reason may apply at the same time, so these values can be combined
+ * with bitwise OR. They are used in the clocksThrottleReason field of
+ * the Environment Activity Record.
+ */
+typedef enum {
+  /**
+   * Nothing is running on the GPU and the clocks are dropping to idle
+   * state.
+   */
+  CUPTI_CLOCKS_THROTTLE_REASON_GPU_IDLE              = 0x00000001,
+
+  /**
+   * The GPU clocks are limited by a user specified limit.
+   */
+  CUPTI_CLOCKS_THROTTLE_REASON_USER_DEFINED_CLOCKS   = 0x00000002,
+
+  /**
+   * A software power scaling algorithm is reducing the clocks below
+   * requested clocks.
+   */
+  CUPTI_CLOCKS_THROTTLE_REASON_SW_POWER_CAP          = 0x00000004,
+
+  /**
+   * Hardware slowdown to reduce the clock by a factor of two or more
+   * is engaged. This is an indicator of one of the following: 1)
+   * Temperature is too high, 2) External power brake assertion is
+   * being triggered (e.g. by the system power supply), 3) Change in
+   * power state.
+   */
+  CUPTI_CLOCKS_THROTTLE_REASON_HW_SLOWDOWN           = 0x00000008,
+
+  /**
+   * Some unspecified factor is reducing the clocks.
+   */
+  CUPTI_CLOCKS_THROTTLE_REASON_UNKNOWN               = 0x80000000,
+
+  /**
+   * Throttle reason is not supported for this GPU.
+   */
+  CUPTI_CLOCKS_THROTTLE_REASON_UNSUPPORTED           = 0x40000000,
+
+  /**
+   * No clock throttling.
+   */
+  CUPTI_CLOCKS_THROTTLE_REASON_NONE                  = 0x00000000,
+
+  CUPTI_CLOCKS_THROTTLE_REASON_FORCE_INT             = 0x7fffffff
+} CUpti_EnvironmentClocksThrottleReason;
+
+/**
+ * \brief Scope of the unified memory counter (deprecated in CUDA 7.0).
+ */
+typedef enum {
+  /**
+   * The unified memory counter scope is not known.
+   */
+  CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_SCOPE_UNKNOWN = 0,
+
+  /**
+   * Collect unified memory counter for a single process on one device.
+   */
+  CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_SCOPE_PROCESS_SINGLE_DEVICE = 1,
+
+  /**
+   * Collect unified memory counter for a single process across all devices.
+   */
+  CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_SCOPE_PROCESS_ALL_DEVICES = 2,
+
+  CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_SCOPE_COUNT, /* Implicitly 3: one more than the last scope above. */
+
+  CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_SCOPE_FORCE_INT = 0x7fffffff
+} CUpti_ActivityUnifiedMemoryCounterScope;
+
+/**
+ * \brief Kind of the Unified Memory counter.
+ *
+ * Many activities are associated with the Unified Memory mechanism; among
+ * them are transfers from host to device, device to host, and page faults
+ * at the host side.
+ */
+typedef enum {
+  /**
+   * The unified memory counter kind is not known.
+   */
+  CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_UNKNOWN = 0,
+
+  /**
+   * Number of bytes transferred from host to device.
+   */
+  CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_BYTES_TRANSFER_HTOD = 1,
+
+  /**
+   * Number of bytes transferred from device to host.
+   */
+  CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_BYTES_TRANSFER_DTOH = 2,
+
+  /**
+   * Number of CPU page faults. This is only supported on 64 bit
+   * Linux and Mac platforms.
+   */
+  CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_CPU_PAGE_FAULT_COUNT = 3,
+
+  /**
+   * Number of GPU page faults. This is only supported on devices with
+   * compute capability 6.0 and higher and 64 bit Linux platforms.
+   */
+  CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_GPU_PAGE_FAULT = 4,
+
+  /**
+   * Thrashing occurs when data is frequently accessed by
+   * multiple processors and has to be constantly migrated around
+   * to achieve data locality. In this case the overhead of migration
+   * may exceed the benefits of locality.
+   * This is only supported on 64 bit Linux platforms.
+   */
+  CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_THRASHING = 5,
+
+  /**
+   * Throttling is a prevention technique used by the driver to avoid
+   * further thrashing. Here, the driver doesn't service the fault for
+   * one of the contending processors for a specific period of time,
+   * so that the other processor can run at full-speed.
+   * This is only supported on 64 bit Linux platforms.
+   */
+  CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_THROTTLING = 6,
+
+  /**
+   * In case throttling does not help, the driver tries to pin the memory
+   * to a processor for a specific period of time. One of the contending
+   * processors will have slow access to the memory, while the other will
+   * have fast access.
+   * This is only supported on 64 bit Linux platforms.
+   */
+  CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_REMOTE_MAP = 7,
+
+  /**
+   * Number of bytes transferred from one device to another device.
+   * This is only supported on 64 bit Linux platforms.
+   */
+  CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_BYTES_TRANSFER_DTOD = 8,
+
+  CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_COUNT, /* Implicitly 9: one more than the last kind above. */
+
+  CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_FORCE_INT = 0x7fffffff
+} CUpti_ActivityUnifiedMemoryCounterKind;
+
+/**
+ * \brief Memory access type for unified memory page faults.
+ *
+ * This is valid for \ref CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_GPU_PAGE_FAULT
+ * and \ref CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_CPU_PAGE_FAULT_COUNT.
+ */
+typedef enum {
+  /**
+   * The unified memory access type is not known.
+   */
+  CUPTI_ACTIVITY_UNIFIED_MEMORY_ACCESS_TYPE_UNKNOWN = 0,
+
+  /**
+   * The page fault was triggered by a read memory instruction.
+   */
+  CUPTI_ACTIVITY_UNIFIED_MEMORY_ACCESS_TYPE_READ = 1,
+
+  /**
+   * The page fault was triggered by a write memory instruction.
+   */
+  CUPTI_ACTIVITY_UNIFIED_MEMORY_ACCESS_TYPE_WRITE = 2,
+
+  /**
+   * The page fault was triggered by an atomic memory instruction.
+   */
+  CUPTI_ACTIVITY_UNIFIED_MEMORY_ACCESS_TYPE_ATOMIC = 3,
+
+  /**
+   * The page fault was triggered by a memory prefetch operation.
+   */
+  CUPTI_ACTIVITY_UNIFIED_MEMORY_ACCESS_TYPE_PREFETCH = 4
+} CUpti_ActivityUnifiedMemoryAccessType;
+
+/**
+ * \brief Migration cause of the Unified Memory counter.
+ *
+ * This is valid for \ref CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_BYTES_TRANSFER_HTOD and
+ * \ref CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_BYTES_TRANSFER_DTOH.
+ */
+typedef enum {
+  /**
+   * The unified memory migration cause is not known.
+   */
+  CUPTI_ACTIVITY_UNIFIED_MEMORY_MIGRATION_CAUSE_UNKNOWN = 0,
+
+  /**
+   * The unified memory migrated due to an explicit call from
+   * the user, e.g. cudaMemPrefetchAsync.
+   */
+  CUPTI_ACTIVITY_UNIFIED_MEMORY_MIGRATION_CAUSE_USER = 1,
+
+  /**
+   * The unified memory migrated to guarantee data coherence,
+   * e.g. CPU/GPU faults on Pascal+ and kernel launch on pre-Pascal GPUs.
+   */
+  CUPTI_ACTIVITY_UNIFIED_MEMORY_MIGRATION_CAUSE_COHERENCE = 2,
+
+  /**
+   * The unified memory was speculatively migrated by the UVM driver
+   * before being accessed by the destination processor to improve
+   * performance.
+   */
+  CUPTI_ACTIVITY_UNIFIED_MEMORY_MIGRATION_CAUSE_PREFETCH = 3,
+
+  /**
+   * The unified memory migrated to the CPU because it was evicted to make
+   * room for another block of memory on the GPU.
+   */
+  CUPTI_ACTIVITY_UNIFIED_MEMORY_MIGRATION_CAUSE_EVICTION = 4,
+
+  /**
+   * The unified memory migrated to another processor because of access counter
+   * notifications. Only frequently accessed pages are migrated between CPU and GPU, or
+   * between peer GPUs.
+   */
+  CUPTI_ACTIVITY_UNIFIED_MEMORY_MIGRATION_CAUSE_ACCESS_COUNTERS = 5,
+} CUpti_ActivityUnifiedMemoryMigrationCause;
+
+/**
+ * \brief Remote memory map cause of the Unified Memory counter.
+ *
+ * This is valid for \ref CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_REMOTE_MAP.
+ */
+typedef enum {
+  /**
+   * The cause of mapping to remote memory was unknown.
+   */
+  CUPTI_ACTIVITY_UNIFIED_MEMORY_REMOTE_MAP_CAUSE_UNKNOWN = 0,
+
+  /**
+   * Mapping to remote memory was added to maintain data coherence.
+   */
+  CUPTI_ACTIVITY_UNIFIED_MEMORY_REMOTE_MAP_CAUSE_COHERENCE = 1,
+
+  /**
+   * Mapping to remote memory was added to prevent further thrashing.
+   */
+  CUPTI_ACTIVITY_UNIFIED_MEMORY_REMOTE_MAP_CAUSE_THRASHING = 2,
+
+  /**
+   * Mapping to remote memory was added to enforce the hints
+   * specified by the programmer or by performance heuristics of the
+   * UVM driver.
+   */
+  CUPTI_ACTIVITY_UNIFIED_MEMORY_REMOTE_MAP_CAUSE_POLICY = 3,
+
+  /**
+   * Mapping to remote memory was added because there is no more
+   * memory available on the processor and eviction was not
+   * possible.
+   */
+  CUPTI_ACTIVITY_UNIFIED_MEMORY_REMOTE_MAP_CAUSE_OUT_OF_MEMORY = 4,
+
+  /**
+   * Mapping to remote memory was added after the memory was
+   * evicted to make room for another block of memory on the GPU.
+   */
+  CUPTI_ACTIVITY_UNIFIED_MEMORY_REMOTE_MAP_CAUSE_EVICTION = 5,
+} CUpti_ActivityUnifiedMemoryRemoteMapCause;
+
+/**
+ * \brief SASS instruction classification.
+ *
+ * SASS instructions are broadly divided into different classes. Each enumerator represents one class.
+ */
+typedef enum {
+  /**
+   * The instruction class is not known.
+   */
+  CUPTI_ACTIVITY_INSTRUCTION_CLASS_UNKNOWN = 0,
+
+  /**
+   * Represents a 32 bit floating point operation.
+   */
+  CUPTI_ACTIVITY_INSTRUCTION_CLASS_FP_32 = 1,
+
+  /**
+   * Represents a 64 bit floating point operation.
+   */
+  CUPTI_ACTIVITY_INSTRUCTION_CLASS_FP_64 = 2,
+
+  /**
+   * Represents an integer operation.
+   */
+  CUPTI_ACTIVITY_INSTRUCTION_CLASS_INTEGER = 3,
+
+  /**
+   * Represents a bit conversion operation.
+   */
+  CUPTI_ACTIVITY_INSTRUCTION_CLASS_BIT_CONVERSION = 4,
+
+  /**
+   * Represents a control flow instruction.
+   */
+  CUPTI_ACTIVITY_INSTRUCTION_CLASS_CONTROL_FLOW = 5,
+
+  /**
+   * Represents a global load-store instruction.
+   */
+  CUPTI_ACTIVITY_INSTRUCTION_CLASS_GLOBAL = 6,
+
+  /**
+   * Represents a shared load-store instruction.
+   */
+  CUPTI_ACTIVITY_INSTRUCTION_CLASS_SHARED = 7,
+
+  /**
+   * Represents a local load-store instruction.
+   */
+  CUPTI_ACTIVITY_INSTRUCTION_CLASS_LOCAL = 8,
+
+  /**
+   * Represents a generic load-store instruction.
+   */
+  CUPTI_ACTIVITY_INSTRUCTION_CLASS_GENERIC = 9,
+
+  /**
+   * Represents a surface load-store instruction.
+   */
+  CUPTI_ACTIVITY_INSTRUCTION_CLASS_SURFACE = 10,
+
+  /**
+   * Represents a constant load instruction.
+   */
+  CUPTI_ACTIVITY_INSTRUCTION_CLASS_CONSTANT = 11,
+
+  /**
+   * Represents a texture load-store instruction.
+   */
+  CUPTI_ACTIVITY_INSTRUCTION_CLASS_TEXTURE = 12,
+
+  /**
+   * Represents a global atomic instruction.
+   */
+  CUPTI_ACTIVITY_INSTRUCTION_CLASS_GLOBAL_ATOMIC = 13,
+
+  /**
+   * Represents a shared atomic instruction.
+   */
+  CUPTI_ACTIVITY_INSTRUCTION_CLASS_SHARED_ATOMIC = 14,
+
+  /**
+   * Represents a surface atomic instruction.
+   */
+  CUPTI_ACTIVITY_INSTRUCTION_CLASS_SURFACE_ATOMIC = 15,
+
+  /**
+   * Represents an inter-thread communication instruction.
+   */
+  CUPTI_ACTIVITY_INSTRUCTION_CLASS_INTER_THREAD_COMMUNICATION = 16,
+
+  /**
+   * Represents a barrier instruction.
+   */
+  CUPTI_ACTIVITY_INSTRUCTION_CLASS_BARRIER = 17,
+
+  /**
+   * Represents miscellaneous instructions which do not fit in the above classification.
+   */
+  CUPTI_ACTIVITY_INSTRUCTION_CLASS_MISCELLANEOUS = 18,
+
+  /**
+   * Represents a 16 bit floating point operation.
+   */
+  CUPTI_ACTIVITY_INSTRUCTION_CLASS_FP_16 = 19,
+
+  /**
+   * Represents a uniform instruction.
+   */
+  CUPTI_ACTIVITY_INSTRUCTION_CLASS_UNIFORM = 20,
+
+  CUPTI_ACTIVITY_INSTRUCTION_CLASS_KIND_FORCE_INT     = 0x7fffffff
+} CUpti_ActivityInstructionClass;
+
+/**
+ * \brief Partitioned global caching option.
+ */
+typedef enum {
+  /**
+   * Partitioned global cache config is not known.
+   */
+  CUPTI_ACTIVITY_PARTITIONED_GLOBAL_CACHE_CONFIG_UNKNOWN       = 0,
+
+  /**
+   * Partitioned global cache is not supported.
+   */
+  CUPTI_ACTIVITY_PARTITIONED_GLOBAL_CACHE_CONFIG_NOT_SUPPORTED = 1,
+
+  /**
+   * Partitioned global cache config is off.
+   */
+  CUPTI_ACTIVITY_PARTITIONED_GLOBAL_CACHE_CONFIG_OFF           = 2,
+
+  /**
+   * Partitioned global cache config is on.
+   */
+  CUPTI_ACTIVITY_PARTITIONED_GLOBAL_CACHE_CONFIG_ON            = 3,
+
+  CUPTI_ACTIVITY_PARTITIONED_GLOBAL_CACHE_CONFIG_FORCE_INT     = 0x7fffffff
+} CUpti_ActivityPartitionedGlobalCacheConfig;
+
+/**
+ * \brief Synchronization type.
+ *
+ * The types of synchronization to be used with CUpti_ActivitySynchronization.
+ */
+
+typedef enum {
+  /**
+   * The synchronization type is not known.
+   */
+  CUPTI_ACTIVITY_SYNCHRONIZATION_TYPE_UNKNOWN             = 0,
+
+  /**
+   * Event synchronize API.
+   */
+  CUPTI_ACTIVITY_SYNCHRONIZATION_TYPE_EVENT_SYNCHRONIZE   = 1,
+
+  /**
+   * Stream wait event API.
+   */
+  CUPTI_ACTIVITY_SYNCHRONIZATION_TYPE_STREAM_WAIT_EVENT   = 2,
+
+  /**
+   * Stream synchronize API.
+   */
+  CUPTI_ACTIVITY_SYNCHRONIZATION_TYPE_STREAM_SYNCHRONIZE  = 3,
+
+  /**
+   * Context synchronize API.
+   */
+  CUPTI_ACTIVITY_SYNCHRONIZATION_TYPE_CONTEXT_SYNCHRONIZE = 4,
+
+  CUPTI_ACTIVITY_SYNCHRONIZATION_TYPE_FORCE_INT           = 0x7fffffff
+} CUpti_ActivitySynchronizationType;
+
+/**
+ * \brief Stream type (stream-creation flag).
+ *
+ * The types of stream to be used with CUpti_ActivityStream.
+ */
+
+typedef enum {
+  /**
+   * The stream-creation flag is not known.
+   */
+  CUPTI_ACTIVITY_STREAM_CREATE_FLAG_UNKNOWN      = 0,
+
+  /**
+   * Default stream.
+   */
+  CUPTI_ACTIVITY_STREAM_CREATE_FLAG_DEFAULT      = 1,
+
+  /**
+   * Non-blocking stream.
+   */
+  CUPTI_ACTIVITY_STREAM_CREATE_FLAG_NON_BLOCKING = 2,
+
+  /**
+   * Null stream.
+   */
+  CUPTI_ACTIVITY_STREAM_CREATE_FLAG_NULL         = 3,
+
+  /**
+   * Stream create mask (the low 16 bits are set).
+   */
+  CUPTI_ACTIVITY_STREAM_CREATE_MASK              = 0xFFFF,
+
+  CUPTI_ACTIVITY_STREAM_CREATE_FLAG_FORCE_INT    = 0x7fffffff
+} CUpti_ActivityStreamFlag;
+
+/**
+ * \brief Link flags.
+ *
+ * Describes link properties, to be used with CUpti_ActivityNvLink.
+ */
+
+typedef enum {
+  /**
+   * The flag is invalid.
+   */
+  CUPTI_LINK_FLAG_INVALID        = 0,
+
+  /**
+   * Peer to peer access is supported by this link. (Flags start at bit 1; bit 0 is not used.)
+   */
+  CUPTI_LINK_FLAG_PEER_ACCESS    = (1 << 1),
+
+  /**
+   * System memory access is supported by this link.
+   */
+  CUPTI_LINK_FLAG_SYSMEM_ACCESS  = (1 << 2),
+
+  /**
+   * Peer atomic access is supported by this link.
+   */
+  CUPTI_LINK_FLAG_PEER_ATOMICS   = (1 << 3),
+
+  /**
+   * System memory atomic access is supported by this link.
+   */
+  CUPTI_LINK_FLAG_SYSMEM_ATOMICS = (1 << 4),
+
+  CUPTI_LINK_FLAG_FORCE_INT = 0x7fffffff
+} CUpti_LinkFlag;
+
+/**
+ * \brief Memory operation types.
+ *
+ * Describes the type of memory operation, to be used with CUpti_ActivityMemory3.
+ */
+
+typedef enum {
+  /**
+   * The operation is invalid.
+   */
+  CUPTI_ACTIVITY_MEMORY_OPERATION_TYPE_INVALID   = 0,
+
+  /**
+   * Memory is allocated.
+   */
+  CUPTI_ACTIVITY_MEMORY_OPERATION_TYPE_ALLOCATION = 1,
+
+  /**
+   * Memory is released.
+   */
+  CUPTI_ACTIVITY_MEMORY_OPERATION_TYPE_RELEASE    = 2,
+
+  CUPTI_ACTIVITY_MEMORY_OPERATION_TYPE_FORCE_INT  = 0x7fffffff
+} CUpti_ActivityMemoryOperationType;
+
+/**
+ * \brief Memory pool types.
+ *
+ * Describes the type of memory pool, to be used with CUpti_ActivityMemory3.
+ */
+
+typedef enum {
+  /**
+   * The memory pool type is invalid.
+   */
+  CUPTI_ACTIVITY_MEMORY_POOL_TYPE_INVALID   = 0,
+
+  /**
+   * Memory pool is local to the process.
+   */
+  CUPTI_ACTIVITY_MEMORY_POOL_TYPE_LOCAL     = 1,
+
+  /**
+   * Memory pool is imported by the process.
+   */
+  CUPTI_ACTIVITY_MEMORY_POOL_TYPE_IMPORTED  = 2,
+
+  CUPTI_ACTIVITY_MEMORY_POOL_TYPE_FORCE_INT = 0x7fffffff
+} CUpti_ActivityMemoryPoolType;
+
+/**
+ * \brief Memory pool operation types.
+ *
+ * Describes the type of memory pool operation, to be used with CUpti_ActivityMemoryPool2.
+ */
+
+typedef enum {
+  /**
+   * The operation is invalid.
+   */
+  CUPTI_ACTIVITY_MEMORY_POOL_OPERATION_TYPE_INVALID   = 0,
+
+  /**
+   * Memory pool is created.
+   */
+  CUPTI_ACTIVITY_MEMORY_POOL_OPERATION_TYPE_CREATED   = 1,
+
+  /**
+   * Memory pool is destroyed.
+   */
+  CUPTI_ACTIVITY_MEMORY_POOL_OPERATION_TYPE_DESTROYED = 2,
+
+  /**
+   * Memory pool is trimmed.
+   */
+  CUPTI_ACTIVITY_MEMORY_POOL_OPERATION_TYPE_TRIMMED   = 3,
+
+  CUPTI_ACTIVITY_MEMORY_POOL_OPERATION_TYPE_FORCE_INT = 0x7fffffff
+} CUpti_ActivityMemoryPoolOperationType;
+
+typedef enum { /* Type of HW channel, as reported in the channelType field of the memcpy/memset records. */
+  CUPTI_CHANNEL_TYPE_INVALID      = 0, /* The channel type is invalid. */
+
+  CUPTI_CHANNEL_TYPE_COMPUTE      = 1, /* Compute channel (presumably kernel work; confirm against CUPTI docs). */
+
+  CUPTI_CHANNEL_TYPE_ASYNC_MEMCPY = 2 /* Asynchronous memcpy channel (presumably copy-engine work; confirm). */
+} CUpti_ChannelType;
+
+/**
+ * The source-locator ID that indicates an unknown source
+ * location. There is not an actual CUpti_ActivitySourceLocator object
+ * corresponding to this value.
+ */
+#define CUPTI_SOURCE_LOCATOR_ID_UNKNOWN 0
+
+/**
+ * An invalid function index ID.
+ */
+#define CUPTI_FUNCTION_INDEX_ID_INVALID 0
+
+/**
+ * An invalid/unknown correlation ID. A correlation ID of this value
+ * indicates that there is no correlation for the activity record.
+ */
+#define CUPTI_CORRELATION_ID_UNKNOWN 0
+
+/**
+ * An invalid/unknown grid ID.
+ */
+#define CUPTI_GRID_ID_UNKNOWN 0LL
+
+/**
+ * An invalid/unknown timestamp for a start, end, queued, submitted,
+ * or completed time.
+ */
+#define CUPTI_TIMESTAMP_UNKNOWN 0LL
+
+/**
+ * An invalid/unknown synchronization value.
+ */
+#define CUPTI_SYNCHRONIZATION_INVALID_VALUE -1
+
+/**
+ * An invalid/unknown process id.
+ */
+#define CUPTI_AUTO_BOOST_INVALID_CLIENT_PID 0
+
+/**
+ * Invalid/unknown NVLink port number.
+ */
+#define CUPTI_NVLINK_INVALID_PORT -1
+
+/**
+ * Maximum number of NVLink ports.
+ */
+#define CUPTI_MAX_NVLINK_PORTS 32
+
+START_PACKED_ALIGNMENT
+/**
+ * \brief Unified Memory counters configuration structure
+ *
+ * This structure controls the enable/disable of the various
+ * Unified Memory counters consisting of scope, kind and other parameters.
+ * See function \ref cuptiActivityConfigureUnifiedMemoryCounter
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * Unified Memory counter scope. (deprecated in CUDA 7.0)
+   */
+  CUpti_ActivityUnifiedMemoryCounterScope scope;
+
+  /**
+   * Unified Memory counter kind.
+   */
+  CUpti_ActivityUnifiedMemoryCounterKind kind;
+
+  /**
+   * Device id of the target device. This is relevant only
+   * for single device scopes. (deprecated in CUDA 7.0)
+   */
+  uint32_t deviceId;
+
+  /**
+   * Control to enable/disable the counter. To enable the counter,
+   * set it to a non-zero value; zero disables it.
+   */
+  uint32_t enable;
+} CUpti_ActivityUnifiedMemoryCounterConfig;
+
+/**
+ * \brief Device auto boost state structure
+ *
+ * This structure defines auto boost state for a device.
+ * See function \ref cuptiGetAutoBoostState
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * Returned auto boost state. 1 is returned if auto boost is enabled, 0
+   * otherwise.
+   */
+  uint32_t enabled;
+
+  /**
+   * Id of the process that has set the current boost state. The value will be
+   * CUPTI_AUTO_BOOST_INVALID_CLIENT_PID if the user does not have the
+   * permission to query process ids or there is an error in querying the
+   * process id.
+   */
+  uint32_t pid;
+
+} CUpti_ActivityAutoBoostState;
+
+/**
+ * \brief PC sampling configuration structure
+ *
+ * This structure defines the pc sampling configuration.
+ *
+ * See function \ref cuptiActivityConfigurePCSampling
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * Size of configuration structure.
+   * The CUPTI client should set the size of the structure. It will be used in CUPTI to check what fields are
+   * available in the structure. Used to preserve backward compatibility.
+   */
+  uint32_t size;
+
+  /**
+   * There are 5 levels provided for the sampling period. The level
+   * internally maps to a period in terms of cycles. The same level can
+   * map to a different number of cycles on different GPUs. The number
+   * of cycles will be chosen to minimize information loss. The period
+   * chosen will be given by samplingPeriodInCycles in
+   * \ref CUpti_ActivityPCSamplingRecordInfo for each kernel instance.
+   */
+  CUpti_ActivityPCSamplingPeriod samplingPeriod;
+
+  /**
+   * This will override the period set by samplingPeriod. A value of 0 in samplingPeriod2
+   * means that samplingPeriod2 should not be used and samplingPeriod should be used instead.
+   * Valid values for samplingPeriod2 are between 5 and 31, both inclusive.
+   * This will set the sampling period to (2^samplingPeriod2) cycles.
+   */
+  uint32_t samplingPeriod2;
+} CUpti_ActivityPCSamplingConfig;
+
+/**
+ * \brief The base activity record.
+ *
+ * The activity API uses a CUpti_Activity as a generic representation
+ * for any activity. The 'kind' field is used to determine the
+ * specific activity kind, and from that the CUpti_Activity object can
+ * be cast to the specific activity record type appropriate for that kind.
+ *
+ * Note that all activity record types are padded and aligned to
+ * ensure that each member of the record is naturally aligned.
+ *
+ * \see CUpti_ActivityKind
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The kind of this activity. Specific record types (e.g. CUpti_ActivityMemcpy5) begin with this same field.
+   */
+  CUpti_ActivityKind kind;
+} CUpti_Activity;
+
+/**
+ * \brief The activity record for memory copies.
+ *
+ * This activity record represents a memory copy
+ * (CUPTI_ACTIVITY_KIND_MEMCPY).
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_MEMCPY.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The kind of the memory copy, stored as a byte to reduce record
+   * size. \see CUpti_ActivityMemcpyKind
+   */
+  uint8_t copyKind;
+
+  /**
+   * The source memory kind read by the memory copy, stored as a byte
+   * to reduce record size. \see CUpti_ActivityMemoryKind
+   */
+  uint8_t srcKind;
+
+  /**
+   * The destination memory kind written by the memory copy, stored as a
+   * byte to reduce record size. \see CUpti_ActivityMemoryKind
+   */
+  uint8_t dstKind;
+
+  /**
+   * The flags associated with the memory copy. \see CUpti_ActivityFlag
+   */
+  uint8_t flags;
+
+  /**
+   * The number of bytes transferred by the memory copy.
+   */
+  uint64_t bytes;
+
+  /**
+   * The start timestamp for the memory copy, in ns. A value of 0 for
+   * both the start and end timestamps indicates that timestamp
+   * information could not be collected for the memory copy.
+   */
+  uint64_t start;
+
+  /**
+   * The end timestamp for the memory copy, in ns. A value of 0 for
+   * both the start and end timestamps indicates that timestamp
+   * information could not be collected for the memory copy.
+   */
+  uint64_t end;
+
+  /**
+   * The ID of the device where the memory copy is occurring.
+   */
+  uint32_t deviceId;
+
+  /**
+   * The ID of the context where the memory copy is occurring.
+   */
+  uint32_t contextId;
+
+  /**
+   * The ID of the stream where the memory copy is occurring.
+   */
+  uint32_t streamId;
+
+  /**
+   * The correlation ID of the memory copy. Each memory copy is
+   * assigned a unique correlation ID that is identical to the
+   * correlation ID in the driver API activity record that launched
+   * the memory copy.
+   */
+  uint32_t correlationId;
+
+  /**
+   * The runtime correlation ID of the memory copy. Each memory copy
+   * is assigned a unique runtime correlation ID that is identical to
+   * the correlation ID in the runtime API activity record that
+   * launched the memory copy.
+   */
+  uint32_t runtimeCorrelationId;
+
+#ifdef CUPTILP64
+  /**
+   * Undefined. Reserved for internal use. Compiled in only when CUPTILP64 is defined.
+   */
+  uint32_t pad;
+#endif
+
+  /**
+   * Undefined. Reserved for internal use.
+   */
+  void *reserved0;
+
+  /**
+   * The unique ID of the graph node that executed this memcpy through graph launch.
+   * This field will be 0 if the memcpy is not done through graph launch.
+   */
+  uint64_t graphNodeId;
+
+  /**
+   * The unique ID of the graph that executed this memcpy through graph launch.
+   * This field will be 0 if the memcpy is not done through graph launch.
+   */
+  uint32_t graphId;
+
+  /**
+   * The ID of the HW channel on which the memory copy is occurring.
+   */
+  uint32_t channelID;
+
+  /**
+   * The type of the channel. \see CUpti_ChannelType
+   */
+  CUpti_ChannelType channelType;
+
+  /**
+   * Reserved for internal use.
+   */
+  uint32_t pad2;
+} CUpti_ActivityMemcpy5;
+
+/**
+ * \brief The activity record for peer-to-peer memory copies.
+ *
+ * This activity record represents a peer-to-peer memory copy
+ * (CUPTI_ACTIVITY_KIND_MEMCPY2).
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_MEMCPY2.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The kind of the memory copy, stored as a byte to reduce record
+   * size. \see CUpti_ActivityMemcpyKind
+   */
+  uint8_t copyKind;
+
+  /**
+   * The source memory kind read by the memory copy, stored as a byte
+   * to reduce record size. \see CUpti_ActivityMemoryKind
+   */
+  uint8_t srcKind;
+
+  /**
+   * The destination memory kind written by the memory copy, stored as a
+   * byte to reduce record size. \see CUpti_ActivityMemoryKind
+   */
+  uint8_t dstKind;
+
+  /**
+   * The flags associated with the memory copy. \see
+   * CUpti_ActivityFlag
+   */
+  uint8_t flags;
+
+  /**
+   * The number of bytes transferred by the memory copy.
+   */
+  uint64_t bytes;
+
+  /**
+   * The start timestamp for the memory copy, in ns. A value of 0 for
+   * both the start and end timestamps indicates that timestamp
+   * information could not be collected for the memory copy.
+   */
+  uint64_t start;
+
+  /**
+   * The end timestamp for the memory copy, in ns. A value of 0 for
+   * both the start and end timestamps indicates that timestamp
+   * information could not be collected for the memory copy.
+   */
+  uint64_t end;
+
+  /**
+   * The ID of the device where the memory copy is occurring.
+   */
+  uint32_t deviceId;
+
+  /**
+   * The ID of the context where the memory copy is occurring.
+   */
+  uint32_t contextId;
+
+  /**
+   * The ID of the stream where the memory copy is occurring.
+   */
+  uint32_t streamId;
+
+  /**
+   * The ID of the device where memory is being copied from.
+   */
+  uint32_t srcDeviceId;
+
+  /**
+   * The ID of the context owning the memory being copied from.
+   */
+  uint32_t srcContextId;
+
+  /**
+   * The ID of the device where memory is being copied to.
+   */
+  uint32_t dstDeviceId;
+
+  /**
+   * The ID of the context owning the memory being copied to.
+   */
+  uint32_t dstContextId;
+
+  /**
+   * The correlation ID of the memory copy. Each memory copy is
+   * assigned a unique correlation ID that is identical to the
+   * correlation ID in the driver and runtime API activity record that
+   * launched the memory copy.
+   */
+  uint32_t correlationId;
+
+#ifndef CUPTILP64
+  /**
+   * Undefined. Reserved for internal use. Compiled in only when CUPTILP64 is NOT defined (note: #ifndef).
+   */
+  uint32_t pad;
+#endif
+
+  /**
+   * Undefined. Reserved for internal use.
+   */
+  void *reserved0;
+
+  /**
+   * The unique ID of the graph node that executed the memcpy through graph launch.
+   * This field will be 0 if memcpy is not done using graph launch.
+   */
+  uint64_t graphNodeId;
+
+  /**
+   * The unique ID of the graph that executed this memcpy through graph launch.
+   * This field will be 0 if the memcpy is not done through graph launch.
+   */
+  uint32_t graphId;
+
+  /**
+   * The ID of the HW channel on which the memory copy is occurring.
+   */
+  uint32_t channelID;
+
+  /**
+   * The type of the channel. \see CUpti_ChannelType
+   */
+  CUpti_ChannelType channelType;
+} CUpti_ActivityMemcpyPtoP4;
+
+/**
+ * \brief The activity record for memset.
+ *
+ * This activity record represents a memory set operation
+ * (CUPTI_ACTIVITY_KIND_MEMSET).
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_MEMSET.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The value being assigned to memory by the memory set.
+   */
+  uint32_t value;
+
+  /**
+   * The number of bytes being set by the memory set.
+   */
+  uint64_t bytes;
+
+  /**
+   * The start timestamp for the memory set, in ns. A value of 0 for
+   * both the start and end timestamps indicates that timestamp
+   * information could not be collected for the memory set.
+   */
+  uint64_t start;
+
+  /**
+   * The end timestamp for the memory set, in ns. A value of 0 for
+   * both the start and end timestamps indicates that timestamp
+   * information could not be collected for the memory set.
+   */
+  uint64_t end;
+
+  /**
+   * The ID of the device where the memory set is occurring.
+   */
+  uint32_t deviceId;
+
+  /**
+   * The ID of the context where the memory set is occurring.
+   */
+  uint32_t contextId;
+
+  /**
+   * The ID of the stream where the memory set is occurring.
+   */
+  uint32_t streamId;
+
+  /**
+   * The correlation ID of the memory set. Each memory set is assigned
+   * a unique correlation ID that is identical to the correlation ID
+   * in the driver API activity record that launched the memory set.
+   */
+  uint32_t correlationId;
+
+  /**
+   * The flags associated with the memset. \see CUpti_ActivityFlag
+   */
+  uint16_t flags;
+
+  /**
+   * The memory kind of the memory set. \see CUpti_ActivityMemoryKind
+   */
+  uint16_t memoryKind;
+
+#ifdef CUPTILP64
+  /**
+   * Undefined. Reserved for internal use. Compiled in only when CUPTILP64 is defined.
+   */
+  uint32_t pad;
+#endif
+
+  /**
+   * Undefined. Reserved for internal use.
+   */
+  void *reserved0;
+
+  /**
+   * The unique ID of the graph node that executed this memset through graph launch.
+   * This field will be 0 if the memset is not executed through graph launch.
+   */
+  uint64_t graphNodeId;
+
+  /**
+   * The unique ID of the graph that executed this memset through graph launch.
+   * This field will be 0 if the memset is not executed through graph launch.
+   */
+  uint32_t graphId;
+
+  /**
+   * The ID of the HW channel on which the memory set is occurring.
+   */
+  uint32_t channelID;
+
+  /**
+   * The type of the channel. \see CUpti_ChannelType
+   */
+  CUpti_ChannelType channelType;
+
+  /**
+   * Undefined. Reserved for internal use.
+   */
+  uint32_t pad2;
+} CUpti_ActivityMemset4;
+
+/**
+ * \brief The activity record for memory.
+ *
+ * This activity record represents a memory allocation and free operation
+ * (CUPTI_ACTIVITY_KIND_MEMORY).
+ * This activity record provides a single record for the memory
+ * allocation and memory release operations.
+ *
+ * Note: It is recommended to move to the new activity record \ref CUpti_ActivityMemory3
+ * enabled using the kind \ref CUPTI_ACTIVITY_KIND_MEMORY2.
+ * \ref CUpti_ActivityMemory3 provides separate records for memory
+ * allocation and memory release operations. This makes it possible to correlate the
+ * corresponding driver and runtime API activity record with the memory operation.
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_MEMORY.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The memory kind requested by the user, \ref CUpti_ActivityMemoryKind.
+   */
+  CUpti_ActivityMemoryKind memoryKind;
+
+  /**
+   * The virtual address of the allocation.
+   */
+  uint64_t address;
+
+  /**
+   * The number of bytes of memory allocated.
+   */
+  uint64_t bytes;
+
+  /**
+   * The start timestamp for the memory operation, i.e.
+   * the time when memory was allocated, in ns.
+   */
+  uint64_t start;
+
+  /**
+   * The end timestamp for the memory operation, i.e.
+   * the time when memory was freed, in ns.
+   * This will be 0 if memory is not freed in the application.
+   */
+  uint64_t end;
+
+  /**
+   * The program counter of the allocation of memory.
+   */
+  uint64_t allocPC;
+
+  /**
+   * The program counter of the freeing of memory. This will
+   * be 0 if memory is not freed in the application.
+   */
+  uint64_t freePC;
+
+  /**
+   * The ID of the process to which this record belongs.
+   */
+  uint32_t processId;
+
+  /**
+   * The ID of the device where the memory allocation is taking place.
+   */
+  uint32_t deviceId;
+
+  /**
+   * The ID of the context. If context is NULL, \p contextId is set to CUPTI_INVALID_CONTEXT_ID.
+   */
+  uint32_t contextId;
+
+#ifdef CUPTILP64
+  /**
+   * Undefined. Reserved for internal use.
+   * Only present when CUPTILP64 is defined.
+   */
+  uint32_t pad;
+#endif
+
+  /**
+   * Variable name. This name is shared across all activity
+   * records representing the same symbol, and so should not be
+   * modified.
+   */
+  const char* name;
+} CUpti_ActivityMemory;
+
+/**
+ * \brief The activity record for memory.
+ *
+ * This activity record represents a memory allocation and free operation
+ * (CUPTI_ACTIVITY_KIND_MEMORY2).
+ * This activity record provides separate records for memory allocation and
+ * memory release operations.
+ * This makes it possible to correlate the corresponding driver and runtime API
+ * activity record with the memory operation.
+ *
+ * Note: This activity record is an upgrade over \ref CUpti_ActivityMemory
+ * enabled using the kind \ref CUPTI_ACTIVITY_KIND_MEMORY.
+ * \ref CUpti_ActivityMemory provides a single record for the memory
+ * allocation and memory release operations.
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_MEMORY2.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The memory operation requested by the user, \ref CUpti_ActivityMemoryOperationType.
+   */
+  CUpti_ActivityMemoryOperationType memoryOperationType;
+
+  /**
+   * The memory kind requested by the user, \ref CUpti_ActivityMemoryKind.
+   */
+  CUpti_ActivityMemoryKind memoryKind;
+
+  /**
+   * The correlation ID of the memory operation. Each memory operation is
+   * assigned a unique correlation ID that is identical to the
+   * correlation ID in the driver and runtime API activity record that
+   * launched the memory operation.
+   */
+  uint32_t correlationId;
+
+  /**
+   * The virtual address of the allocation.
+   */
+  uint64_t address;
+
+  /**
+   * The number of bytes of memory allocated.
+   */
+  uint64_t bytes;
+
+  /**
+   * The start timestamp for the memory operation, in ns.
+   */
+  uint64_t timestamp;
+
+  /**
+   * The program counter of the memory operation.
+   */
+  uint64_t PC;
+
+  /**
+   * The ID of the process to which this record belongs.
+   */
+  uint32_t processId;
+
+  /**
+   * The ID of the device where the memory operation is taking place.
+   */
+  uint32_t deviceId;
+
+  /**
+   * The ID of the context. If context is NULL, \p contextId is set to CUPTI_INVALID_CONTEXT_ID.
+   */
+  uint32_t contextId;
+
+  /**
+   * The ID of the stream. If the memory operation is not async, \p streamId is set to
+   * CUPTI_INVALID_STREAM_ID.
+   */
+  uint32_t streamId;
+
+  /**
+   * Variable name. This name is shared across all activity
+   * records representing the same symbol, and so should not be
+   * modified.
+   */
+  const char* name;
+
+  /**
+   * \p isAsync is set if the memory operation happens through async memory APIs.
+   */
+  uint32_t isAsync;
+
+#ifdef CUPTILP64
+  /**
+   * Undefined. Reserved for internal use.
+   * Only present when CUPTILP64 is defined.
+   */
+  uint32_t pad1;
+#endif
+
+  /**
+   * The memory pool configuration used for the memory operations.
+   */
+  struct PACKED_ALIGNMENT {
+    /**
+     * The type of the memory pool, \ref CUpti_ActivityMemoryPoolType.
+     */
+    CUpti_ActivityMemoryPoolType memoryPoolType;
+
+#ifdef CUPTILP64
+    /**
+     * Undefined. Reserved for internal use.
+     * Only present when CUPTILP64 is defined.
+     */
+    uint32_t pad2;
+#endif
+
+    /**
+     * The base address of the memory pool.
+     */
+    uint64_t address;
+
+    /**
+     * The release threshold of the memory pool in bytes. \p releaseThreshold is
+     * valid for CUPTI_ACTIVITY_MEMORY_POOL_TYPE_LOCAL, \ref CUpti_ActivityMemoryPoolType.
+     */
+    uint64_t releaseThreshold;
+
+    /**
+     * Either the size of the memory pool in bytes or the ID of the process
+     * owning the memory pool; the two members share the same storage.
+     * \p size is valid if \p memoryPoolType is
+     * CUPTI_ACTIVITY_MEMORY_POOL_TYPE_LOCAL, \ref CUpti_ActivityMemoryPoolType.
+     * \p processId is valid if \p memoryPoolType is
+     * CUPTI_ACTIVITY_MEMORY_POOL_TYPE_IMPORTED, \ref CUpti_ActivityMemoryPoolType.
+     */
+    union {
+      uint64_t size;
+      uint64_t processId;
+    } pool;
+
+    /**
+     * The utilized size of the memory pool. \p utilizedSize is
+     * valid for CUPTI_ACTIVITY_MEMORY_POOL_TYPE_LOCAL, \ref CUpti_ActivityMemoryPoolType.
+     */
+    uint64_t utilizedSize;
+  } memoryPoolConfig;
+
+} CUpti_ActivityMemory3;
+
+/**
+ * \brief The activity record for memory pool.
+ *
+ * This activity record represents a memory pool creation, destruction and
+ * trimming (CUPTI_ACTIVITY_KIND_MEMORY_POOL).
+ * This activity record provides separate records for memory pool creation,
+ * destruction and trimming operations.
+ * This makes it possible to correlate the corresponding driver and runtime API
+ * activity record with the memory pool operation.
+ *
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_MEMORY_POOL.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The memory pool operation requested by the user,
+   * \ref CUpti_ActivityMemoryPoolOperationType.
+   */
+  CUpti_ActivityMemoryPoolOperationType memoryPoolOperationType;
+
+  /**
+   * The type of the memory pool, \ref CUpti_ActivityMemoryPoolType.
+   */
+  CUpti_ActivityMemoryPoolType memoryPoolType;
+
+  /**
+   * The correlation ID of the memory pool operation. Each memory pool
+   * operation is assigned a unique correlation ID that is identical to the
+   * correlation ID in the driver and runtime API activity record that
+   * launched the memory operation.
+   */
+  uint32_t correlationId;
+
+  /**
+   * The ID of the process to which this record belongs.
+   */
+  uint32_t processId;
+
+  /**
+   * The ID of the device where the memory pool is created.
+   */
+  uint32_t deviceId;
+
+  /**
+   * The minimum bytes to keep of the memory pool. \p minBytesToKeep is
+   * valid for CUPTI_ACTIVITY_MEMORY_POOL_OPERATION_TYPE_TRIMMED,
+   * \ref CUpti_ActivityMemoryPoolOperationType.
+   */
+  size_t minBytesToKeep;
+
+#ifndef CUPTILP64
+  /**
+   * Undefined. Reserved for internal use.
+   * Note that this field is only present when CUPTILP64 is NOT defined
+   * (the guard is #ifndef); presumably it compensates for a narrower
+   * size_t in \p minBytesToKeep on such builds - TODO confirm.
+   */
+  uint32_t pad;
+#endif
+
+  /**
+   * The virtual address of the allocation.
+   */
+  uint64_t address;
+
+  /**
+   * The size of the memory pool operation in bytes. \p size is
+   * valid for CUPTI_ACTIVITY_MEMORY_POOL_TYPE_LOCAL, \ref CUpti_ActivityMemoryPoolType.
+   */
+  uint64_t size;
+
+  /**
+   * The release threshold of the memory pool. \p releaseThreshold is
+   * valid for CUPTI_ACTIVITY_MEMORY_POOL_TYPE_LOCAL, \ref CUpti_ActivityMemoryPoolType.
+   */
+  uint64_t releaseThreshold;
+
+  /**
+   * The start timestamp for the memory operation, in ns.
+   */
+  uint64_t timestamp;
+
+  /**
+   * The utilized size of the memory pool. \p utilizedSize is
+   * valid for CUPTI_ACTIVITY_MEMORY_POOL_TYPE_LOCAL, \ref CUpti_ActivityMemoryPoolType.
+   */
+  uint64_t utilizedSize;
+} CUpti_ActivityMemoryPool2;
+
+/**
+ * \brief The type of the CUDA kernel launch.
+ *
+ * The \p launchType field of the kernel activity record refers to this
+ * enumeration.
+ */
+typedef enum {
+  /**
+   * The kernel was launched via a regular kernel call.
+   */
+  CUPTI_ACTIVITY_LAUNCH_TYPE_REGULAR = 0,
+
+  /**
+   * The kernel was launched via API \ref cudaLaunchCooperativeKernel() or
+   * \ref cuLaunchCooperativeKernel().
+   */
+  CUPTI_ACTIVITY_LAUNCH_TYPE_COOPERATIVE_SINGLE_DEVICE = 1,
+
+  /**
+   * The kernel was launched via API \ref cudaLaunchCooperativeKernelMultiDevice() or
+   * \ref cuLaunchCooperativeKernelMultiDevice().
+   */
+  CUPTI_ACTIVITY_LAUNCH_TYPE_COOPERATIVE_MULTI_DEVICE = 2,
+
+  /**
+   * The kernel was launched as a CBL commandlist.
+   */
+  CUPTI_ACTIVITY_LAUNCH_TYPE_CBL_COMMANDLIST = 3,
+} CUpti_ActivityLaunchType;
+
+/**
+ * \brief The shared memory limit per block config for a kernel.
+ *
+ * This should be used to set the 'cudaOccFuncShmemConfig' field in the
+ * occupancy calculator API.
+ *
+ * CUPTI_FUNC_SHMEM_LIMIT_FORCE_INT (0x7fffffff) is not described as a
+ * configuration of its own; presumably it only forces the enumeration to
+ * the size of an int - TODO confirm.
+ */
+typedef enum  {
+    /** The shared memory limit config is default.
+     */
+    CUPTI_FUNC_SHMEM_LIMIT_DEFAULT              = 0x00,
+
+    /** User has opted for a higher dynamic shared memory limit using function attribute
+     * 'cudaFuncAttributeMaxDynamicSharedMemorySize' for runtime API or
+     * CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES for driver API.
+     */
+    CUPTI_FUNC_SHMEM_LIMIT_OPTIN                = 0x01,
+
+    CUPTI_FUNC_SHMEM_LIMIT_FORCE_INT            = 0x7fffffff
+} CUpti_FuncShmemLimitConfig;
+
+/**
+ * \brief The activity record for kernel.
+ *
+ * This activity record represents a kernel execution
+ * (CUPTI_ACTIVITY_KIND_KERNEL and
+ * CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL).
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_KERNEL or
+   * CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The requested and executed cache configurations, packed into one byte
+   * (\p both) as two 4-bit fields.
+   * For devices with compute capability 7.0+ cacheConfig values are not updated
+   * if field isSharedMemoryCarveoutRequested is set.
+   */
+  union {
+    uint8_t both;
+    struct {
+      /**
+       * The cache configuration requested by the kernel. The value is one
+       * of the CUfunc_cache enumeration values from cuda.h.
+       */
+      uint8_t requested:4;
+
+      /**
+       * The cache configuration used for the kernel. The value is one of
+       * the CUfunc_cache enumeration values from cuda.h.
+       */
+      uint8_t executed:4;
+    } config;
+  } cacheConfig;
+
+  /**
+   * The shared memory configuration used for the kernel. The value is one of
+   * the CUsharedconfig enumeration values from cuda.h.
+   */
+  uint8_t sharedMemoryConfig;
+
+  /**
+   * The number of registers required for each thread executing the
+   * kernel.
+   */
+  uint16_t registersPerThread;
+
+  /**
+   * The partitioned global caching requested for the kernel. Partitioned
+   * global caching is required to enable caching on certain chips, such as
+   * devices with compute capability 5.2.
+   */
+  CUpti_ActivityPartitionedGlobalCacheConfig partitionedGlobalCacheRequested;
+
+  /**
+   * The partitioned global caching executed for the kernel. Partitioned
+   * global caching is required to enable caching on certain chips, such as
+   * devices with compute capability 5.2. Partitioned global caching can be
+   * automatically disabled if the occupancy requirement of the launch cannot
+   * support caching.
+   */
+  CUpti_ActivityPartitionedGlobalCacheConfig partitionedGlobalCacheExecuted;
+
+  /**
+   * The start timestamp for the kernel execution, in ns. A value of 0
+   * for both the start and end timestamps indicates that timestamp
+   * information could not be collected for the kernel.
+   */
+  uint64_t start;
+
+  /**
+   * The end timestamp for the kernel execution, in ns. A value of 0
+   * for both the start and end timestamps indicates that timestamp
+   * information could not be collected for the kernel.
+   */
+  uint64_t end;
+
+  /**
+   * The completed timestamp for the kernel execution, in ns.  It
+   * represents the completion of all its child kernels and the
+   * kernel itself. A value of CUPTI_TIMESTAMP_UNKNOWN indicates that
+   * the completion time is unknown.
+   */
+  uint64_t completed;
+
+  /**
+   * The ID of the device where the kernel is executing.
+   */
+  uint32_t deviceId;
+
+  /**
+   * The ID of the context where the kernel is executing.
+   */
+  uint32_t contextId;
+
+  /**
+   * The ID of the stream where the kernel is executing.
+   */
+  uint32_t streamId;
+
+  /**
+   * The X-dimension grid size for the kernel.
+   */
+  int32_t gridX;
+
+  /**
+   * The Y-dimension grid size for the kernel.
+   */
+  int32_t gridY;
+
+  /**
+   * The Z-dimension grid size for the kernel.
+   */
+  int32_t gridZ;
+
+  /**
+   * The X-dimension block size for the kernel.
+   */
+  int32_t blockX;
+
+  /**
+   * The Y-dimension block size for the kernel.
+   */
+  int32_t blockY;
+
+  /**
+   * The Z-dimension block size for the kernel.
+   */
+  int32_t blockZ;
+
+  /**
+   * The static shared memory allocated for the kernel, in bytes.
+   */
+  int32_t staticSharedMemory;
+
+  /**
+   * The dynamic shared memory reserved for the kernel, in bytes.
+   */
+  int32_t dynamicSharedMemory;
+
+  /**
+   * The amount of local memory reserved for each thread, in bytes.
+   */
+  uint32_t localMemoryPerThread;
+
+  /**
+   * The total amount of local memory reserved for the kernel, in
+   * bytes (deprecated in CUDA 11.8).
+   * Refer to field localMemoryTotal_v2 instead.
+   */
+  uint32_t localMemoryTotal;
+
+  /**
+   * The correlation ID of the kernel. Each kernel execution is
+   * assigned a unique correlation ID that is identical to the
+   * correlation ID in the driver or runtime API activity record that
+   * launched the kernel.
+   */
+  uint32_t correlationId;
+
+  /**
+   * The grid ID of the kernel. Each kernel is assigned a unique
+   * grid ID at runtime.
+   */
+  int64_t gridId;
+
+  /**
+   * The name of the kernel. This name is shared across all activity
+   * records representing the same kernel, and so should not be
+   * modified.
+   */
+  const char *name;
+
+  /**
+   * Undefined. Reserved for internal use.
+   */
+  void *reserved0;
+
+  /**
+   * The timestamp when the kernel is queued up in the command buffer, in ns.
+   * A value of CUPTI_TIMESTAMP_UNKNOWN indicates that the queued time
+   * could not be collected for the kernel. This timestamp is not collected
+   * by default. Use API \ref cuptiActivityEnableLatencyTimestamps() to
+   * enable collection.
+   *
+   * Command buffer is a buffer written by CUDA driver to send commands
+   * like kernel launch, memory copy etc to the GPU. All launches of CUDA
+   * kernels are asynchronous with respect to the host, the host requests
+   * the launch by writing commands into the command buffer, then returns
+   * without checking the GPU's progress.
+   */
+  uint64_t queued;
+
+  /**
+   * The timestamp when the command buffer containing the kernel launch
+   * is submitted to the GPU, in ns. A value of CUPTI_TIMESTAMP_UNKNOWN
+   * indicates that the submitted time could not be collected for the kernel.
+   * This timestamp is not collected by default. Use API \ref
+   * cuptiActivityEnableLatencyTimestamps() to enable collection.
+   */
+  uint64_t submitted;
+
+  /**
+   * This indicates whether the kernel was executed via a regular launch or via a
+   * single/multi device cooperative launch. \see CUpti_ActivityLaunchType
+   */
+  uint8_t launchType;
+
+  /**
+   * This indicates if CU_FUNC_ATTRIBUTE_PREFERRED_SHARED_MEMORY_CARVEOUT was
+   * updated for the kernel launch.
+   */
+  uint8_t isSharedMemoryCarveoutRequested;
+
+  /**
+   * Shared memory carveout value requested for the function in percentage of
+   * the total resource. The value will be updated only if field
+   * isSharedMemoryCarveoutRequested is set.
+   */
+  uint8_t sharedMemoryCarveoutRequested;
+
+  /**
+   * Undefined. Reserved for internal use.
+   */
+  uint8_t padding;
+
+ /**
+  * Shared memory size set by the driver.
+  */
+  uint32_t sharedMemoryExecuted;
+
+  /**
+   * The unique ID of the graph node that launched this kernel through graph launch APIs.
+   * This field will be 0 if the kernel is not launched through graph launch APIs.
+   */
+  uint64_t graphNodeId;
+
+  /**
+   * The shared memory limit config for the kernel. This field shows whether the user has
+   * opted for a higher per block limit of dynamic shared memory.
+   */
+  CUpti_FuncShmemLimitConfig shmemLimitConfig;
+
+  /**
+   * The unique ID of the graph that launched this kernel through graph launch APIs.
+   * This field will be 0 if the kernel is not launched through graph launch APIs.
+   */
+  uint32_t graphId;
+
+  /**
+   * The pointer to the access policy window. The structure CUaccessPolicyWindow is
+   * defined in cuda.h.
+   */
+  CUaccessPolicyWindow *pAccessPolicyWindow;
+
+  /**
+   * The ID of the HW channel on which the kernel is launched.
+   */
+  uint32_t channelID;
+
+  /**
+   * The type of the channel.
+   */
+  CUpti_ChannelType channelType;
+
+  /**
+   * The X-dimension cluster size for the kernel.
+   * Field is valid for devices with compute capability 9.0 and higher.
+   */
+  uint32_t clusterX;
+
+  /**
+   * The Y-dimension cluster size for the kernel.
+   * Field is valid for devices with compute capability 9.0 and higher.
+   */
+  uint32_t clusterY;
+
+  /**
+   * The Z-dimension cluster size for the kernel.
+   * Field is valid for devices with compute capability 9.0 and higher.
+   */
+  uint32_t clusterZ;
+
+  /**
+   * The cluster scheduling policy for the kernel. Refer to CUclusterSchedulingPolicy.
+   * Field is valid for devices with compute capability 9.0 and higher.
+   */
+  uint32_t clusterSchedulingPolicy;
+
+  /**
+   * The total amount of local memory reserved for the kernel, in
+   * bytes. 64-bit counterpart of the deprecated \p localMemoryTotal field.
+   */
+  uint64_t localMemoryTotal_v2;
+
+  /**
+   * The maximum cluster size for the kernel.
+   */
+  uint32_t maxPotentialClusterSize;
+
+  /**
+   * The maximum clusters that could co-exist on the target device for the kernel.
+   */
+  uint32_t maxActiveClusters;
+} CUpti_ActivityKernel9;
+
+/**
+ * \brief The activity record for CDP (CUDA Dynamic Parallelism)
+ * kernel.
+ *
+ * This activity record represents a CDP kernel execution
+ * (CUPTI_ACTIVITY_KIND_CDP_KERNEL).
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_CDP_KERNEL.
+   */
+  CUpti_ActivityKind kind;
+
+  union {
+    uint8_t both;
+    struct {
+      /**
+       * The cache configuration requested by the kernel. The value is one
+       * of the CUfunc_cache enumeration values from cuda.h.
+       */
+      uint8_t requested:4;
+
+      /**
+       * The cache configuration used for the kernel. The value is one of
+       * the CUfunc_cache enumeration values from cuda.h.
+       */
+      uint8_t executed:4;
+    } config;
+  } cacheConfig;
+
+  /**
+   * The shared memory configuration used for the kernel. The value is one of
+   * the CUsharedconfig enumeration values from cuda.h.
+   */
+  uint8_t sharedMemoryConfig;
+
+  /**
+   * The number of registers required for each thread executing the
+   * kernel.
+   */
+  uint16_t registersPerThread;
+
+  /**
+   * The start timestamp for the kernel execution, in ns. A value of 0
+   * for both the start and end timestamps indicates that timestamp
+   * information could not be collected for the kernel.
+   */
+  uint64_t start;
+
+  /**
+   * The end timestamp for the kernel execution, in ns. A value of 0
+   * for both the start and end timestamps indicates that timestamp
+   * information could not be collected for the kernel.
+   */
+  uint64_t end;
+
+  /**
+   * The ID of the device where the kernel is executing.
+   */
+  uint32_t deviceId;
+
+  /**
+   * The ID of the context where the kernel is executing.
+   */
+  uint32_t contextId;
+
+  /**
+   * The ID of the stream where the kernel is executing.
+   */
+  uint32_t streamId;
+
+  /**
+   * The X-dimension grid size for the kernel.
+   */
+  int32_t gridX;
+
+  /**
+   * The Y-dimension grid size for the kernel.
+   */
+  int32_t gridY;
+
+  /**
+   * The Z-dimension grid size for the kernel.
+   */
+  int32_t gridZ;
+
+  /**
+   * The X-dimension block size for the kernel.
+   */
+  int32_t blockX;
+
+  /**
+   * The Y-dimension block size for the kernel.
+   */
+  int32_t blockY;
+
+  /**
+   * The Z-dimension block size for the kernel.
+   */
+  int32_t blockZ;
+
+  /**
+   * The static shared memory allocated for the kernel, in bytes.
+   */
+  int32_t staticSharedMemory;
+
+  /**
+   * The dynamic shared memory reserved for the kernel, in bytes.
+   */
+  int32_t dynamicSharedMemory;
+
+  /**
+   * The amount of local memory reserved for each thread, in bytes.
+   */
+  uint32_t localMemoryPerThread;
+
+  /**
+   * The total amount of local memory reserved for the kernel, in
+   * bytes.
+   */
+  uint32_t localMemoryTotal;
+
+  /**
+   * The correlation ID of the kernel. Each kernel execution is
+   * assigned a unique correlation ID that is identical to the
+   * correlation ID in the driver API activity record that launched
+   * the kernel.
+   */
+  uint32_t correlationId;
+
+  /**
+   * The grid ID of the kernel. Each kernel execution
+   * is assigned a unique grid ID.
+   */
+  int64_t gridId;
+
+  /**
+   * The grid ID of the parent kernel.
+   */
+  int64_t parentGridId;
+
+  /**
+   * The timestamp when the kernel is queued up, in ns. A value of
+   * CUPTI_TIMESTAMP_UNKNOWN indicates that the queued time is
+   * unknown.
+   */
+  uint64_t queued;
+
+  /**
+   * The timestamp when the kernel is submitted to the GPU, in ns. A value
+   * of CUPTI_TIMESTAMP_UNKNOWN indicates that the submission time is
+   * unknown.
+   */
+  uint64_t submitted;
+
+  /**
+   * The timestamp when the kernel is marked as completed, in ns. A value
+   * of CUPTI_TIMESTAMP_UNKNOWN indicates that the completion time is
+   * unknown.
+   */
+  uint64_t completed;
+
+  /**
+   * The X-dimension of the parent block.
+   */
+  uint32_t parentBlockX;
+
+  /**
+   * The Y-dimension of the parent block.
+   */
+  uint32_t parentBlockY;
+
+  /**
+   * The Z-dimension of the parent block.
+   */
+  uint32_t parentBlockZ;
+
+#ifdef CUPTILP64
+  /**
+   * Undefined. Reserved for internal use.
+   * Only present when CUPTILP64 is defined.
+   */
+  uint32_t pad;
+#endif
+
+  /**
+   * The name of the kernel. This name is shared across all activity
+   * records representing the same kernel, and so should not be
+   * modified.
+   */
+  const char *name;
+} CUpti_ActivityCdpKernel;
+
+/**
+ * \brief The activity record for a preemption of a CDP kernel.
+ *
+ * This activity record represents a preemption of a CDP kernel
+ * (CUPTI_ACTIVITY_KIND_PREEMPTION).
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_PREEMPTION.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The kind of the preemption, \ref CUpti_ActivityPreemptionKind.
+   */
+  CUpti_ActivityPreemptionKind preemptionKind;
+
+  /**
+   * The timestamp of the preemption, in ns. A value of 0 indicates
+   * that timestamp information could not be collected for the
+   * preemption.
+   */
+  uint64_t timestamp;
+
+  /**
+   * The grid ID of the block that is preempted.
+   */
+  int64_t gridId;
+
+  /**
+   * The X-dimension of the block that is preempted.
+   */
+  uint32_t blockX;
+
+  /**
+   * The Y-dimension of the block that is preempted.
+   */
+  uint32_t blockY;
+
+  /**
+   * The Z-dimension of the block that is preempted.
+   */
+  uint32_t blockZ;
+
+  /**
+   * Undefined. Reserved for internal use.
+   */
+  uint32_t pad;
+} CUpti_ActivityPreemption;
+
+/**
+ * \brief The activity record for a driver or runtime API invocation.
+ *
+ * This activity record represents an invocation of a driver or
+ * runtime API (CUPTI_ACTIVITY_KIND_DRIVER and
+ * CUPTI_ACTIVITY_KIND_RUNTIME). The \p kind field may also be
+ * CUPTI_ACTIVITY_KIND_INTERNAL_LAUNCH_API.
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_DRIVER,
+   * CUPTI_ACTIVITY_KIND_RUNTIME, or CUPTI_ACTIVITY_KIND_INTERNAL_LAUNCH_API.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The ID of the driver or runtime function.
+   */
+  CUpti_CallbackId cbid;
+
+  /**
+   * The start timestamp for the function, in ns. A value of 0 for
+   * both the start and end timestamps indicates that timestamp
+   * information could not be collected for the function.
+   */
+  uint64_t start;
+
+  /**
+   * The end timestamp for the function, in ns. A value of 0 for both
+   * the start and end timestamps indicates that timestamp information
+   * could not be collected for the function.
+   */
+  uint64_t end;
+
+  /**
+   * The ID of the process where the driver or runtime CUDA function
+   * is executing.
+   */
+  uint32_t processId;
+
+  /**
+   * The ID of the thread where the driver or runtime CUDA function is
+   * executing.
+   */
+  uint32_t threadId;
+
+  /**
+   * The correlation ID of the driver or runtime CUDA function. Each
+   * function invocation is assigned a unique correlation ID that is
+   * identical to the correlation ID in the memcpy, memset, or kernel
+   * activity record that is associated with this function.
+   */
+  uint32_t correlationId;
+
+  /**
+   * The return value for the function. For a CUDA driver function
+   * this will be a CUresult value, and for a CUDA runtime function
+   * this will be a cudaError_t value.
+   */
+  uint32_t returnValue;
+} CUpti_ActivityAPI;
+
+/**
+ * \brief The activity record for a CUPTI event.
+ *
+ * This activity record represents a CUPTI event value
+ * (CUPTI_ACTIVITY_KIND_EVENT). This activity record kind is not
+ * produced by the activity API but is included for completeness and
+ * ease-of-use. Profiling frameworks built on top of CUPTI that collect
+ * event data may choose to use this type to store the collected event
+ * data.
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_EVENT.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The event ID.
+   */
+  CUpti_EventID id;
+
+  /**
+   * The event value.
+   */
+  uint64_t value;
+
+  /**
+   * The event domain ID.
+   */
+  CUpti_EventDomainID domain;
+
+  /**
+   * The correlation ID of the event. Use of this ID is user-defined,
+   * but typically this ID value will equal the correlation ID of the
+   * kernel for which the event was gathered.
+   */
+  uint32_t correlationId;
+} CUpti_ActivityEvent;
+
+/**
+ * \brief The activity record for a CUPTI event with instance
+ * information.
+ *
+ * This activity record represents a CUPTI event value for a
+ * specific event domain instance
+ * (CUPTI_ACTIVITY_KIND_EVENT_INSTANCE). This activity record kind is
+ * not produced by the activity API but is included for completeness
+ * and ease-of-use. Profiling frameworks built on top of CUPTI that
+ * collect event data may choose to use this type to store the
+ * collected event data. This activity record should be used when
+ * event domain instance information needs to be associated with the
+ * event.
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be
+   * CUPTI_ACTIVITY_KIND_EVENT_INSTANCE.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The event ID.
+   */
+  CUpti_EventID id;
+
+  /**
+   * The event domain ID.
+   */
+  CUpti_EventDomainID domain;
+
+  /**
+   * The event domain instance.
+   */
+  uint32_t instance;
+
+  /**
+   * The event value.
+   */
+  uint64_t value;
+
+  /**
+   * The correlation ID of the event. Use of this ID is user-defined,
+   * but typically this ID value will equal the correlation ID of the
+   * kernel for which the event was gathered.
+   */
+  uint32_t correlationId;
+
+  /**
+   * Undefined. Reserved for internal use.
+   */
+  uint32_t pad;
+} CUpti_ActivityEventInstance;
+
+/**
+ * \brief The activity record for a CUPTI metric.
+ *
+ * This activity record represents the collection of a CUPTI metric
+ * value (CUPTI_ACTIVITY_KIND_METRIC). This activity record kind is not
+ * produced by the activity API but is included for completeness and
+ * ease-of-use. Profiling frameworks built on top of CUPTI that collect
+ * metric data may choose to use this type to store the collected metric
+ * data.
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_METRIC.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The metric ID.
+   */
+  CUpti_MetricID id;
+
+  /**
+   * The metric value.
+   */
+  CUpti_MetricValue value;
+
+  /**
+   * The correlation ID of the metric. Use of this ID is user-defined,
+   * but typically this ID value will equal the correlation ID of the
+   * kernel for which the metric was gathered.
+   */
+  uint32_t correlationId;
+
+  /**
+   * The properties of this metric. \see CUpti_ActivityFlag
+   */
+  uint8_t flags;
+
+  /**
+   * Undefined. Reserved for internal use.
+   */
+  uint8_t pad[3];
+} CUpti_ActivityMetric;
+
+/**
+ * \brief The activity record for a CUPTI metric with instance
+ * information.
+ *
+ * This activity record represents a CUPTI metric value
+ * for a specific metric domain instance
+ * (CUPTI_ACTIVITY_KIND_METRIC_INSTANCE).  This activity record kind
+ * is not produced by the activity API but is included for
+ * completeness and ease-of-use. Profiling frameworks built on top of
+ * CUPTI that collect metric data may choose to use this type to store
+ * the collected metric data. This activity record should be used when
+ * metric domain instance information needs to be associated with the
+ * metric.
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be
+   * CUPTI_ACTIVITY_KIND_METRIC_INSTANCE.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The metric ID.
+   */
+  CUpti_MetricID id;
+
+  /**
+   * The metric value.
+   */
+  CUpti_MetricValue value;
+
+  /**
+   * The metric domain instance.
+   */
+  uint32_t instance;
+
+  /**
+   * The correlation ID of the metric. Use of this ID is user-defined,
+   * but typically this ID value will equal the correlation ID of the
+   * kernel for which the metric was gathered.
+   */
+  uint32_t correlationId;
+
+  /**
+   * The properties of this metric. \see CUpti_ActivityFlag
+   */
+  uint8_t flags;
+
+  /**
+   * Undefined. Reserved for internal use.
+   */
+  uint8_t pad[7];
+} CUpti_ActivityMetricInstance;
+
+/**
+ * \brief The activity record for source locator.
+ *
+ * This activity record represents a source locator
+ * (CUPTI_ACTIVITY_KIND_SOURCE_LOCATOR).
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_SOURCE_LOCATOR.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The ID for the source path, will be used in all the source level
+   * results.
+   */
+  uint32_t id;
+
+  /**
+   * The line number in the source.
+   */
+  uint32_t lineNumber;
+
+#ifdef CUPTILP64
+  /**
+   * Undefined. Reserved for internal use.
+   * Only present when CUPTILP64 is defined.
+   */
+  uint32_t pad;
+#endif
+
+  /**
+   * The path for the file.
+   */
+  const char *fileName;
+} CUpti_ActivitySourceLocator;
+
+/**
+ * \brief The activity record for source-level global
+ * access.
+ *
+ * This activity records the locations of the global
+ * accesses in the source (CUPTI_ACTIVITY_KIND_GLOBAL_ACCESS).
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_GLOBAL_ACCESS.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The properties of this global access.
+   */
+  CUpti_ActivityFlag flags;
+
+  /**
+   * The ID for source locator.
+   */
+  uint32_t sourceLocatorId;
+
+  /**
+   * The correlation ID of the kernel to which this result is associated.
+   */
+  uint32_t correlationId;
+
+  /**
+   * Correlation ID with global/device function name.
+   */
+  uint32_t functionId;
+
+  /**
+   * The number of times this instruction was executed per warp. It is incremented
+   * when at least one thread in the warp is active with predicate and condition code
+   * evaluating to true.
+   */
+  uint32_t executed;
+
+  /**
+   * The pc offset for the access.
+   */
+  uint64_t pcOffset;
+
+  /**
+   * Each time this instruction is executed, this is incremented by the number of
+   * threads that executed this instruction with predicate and condition code
+   * evaluating to true.
+   */
+  uint64_t threadsExecuted;
+
+  /**
+   * The total number of 32-byte transactions to L2 cache generated by this
+   * access.
+   */
+  uint64_t l2_transactions;
+
+  /**
+   * The minimum number of L2 transactions possible based on the access pattern.
+   */
+  uint64_t theoreticalL2Transactions;
+} CUpti_ActivityGlobalAccess3;
+
+/**
+ * \brief The activity record for source level result
+ * branch.
+ *
+ * This activity records the locations of the branches in the
+ * source (CUPTI_ACTIVITY_KIND_BRANCH).
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_BRANCH.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The ID for source locator.
+   */
+  uint32_t sourceLocatorId;
+
+  /**
+   * The correlation ID of the kernel to which this result is associated.
+   */
+  uint32_t correlationId;
+
+  /**
+   * Correlation ID with global/device function name.
+   */
+  uint32_t functionId;
+
+  /**
+   * The pc offset for the branch.
+   */
+  uint32_t pcOffset;
+
+  /**
+   * Number of times this branch diverged.
+   */
+  uint32_t diverged;
+
+  /**
+   * Each time this instruction is executed, this is incremented by the
+   * number of threads that executed this instruction.
+   */
+  uint64_t threadsExecuted;
+
+  /**
+   * The number of times this instruction was executed per warp. It is incremented
+   * regardless of predicate or condition code.
+   */
+  uint32_t executed;
+
+  /**
+   * Undefined. Reserved for internal use.
+   */
+  uint32_t pad;
+} CUpti_ActivityBranch2;
+
+/**
+ * \brief The activity record for a device. (CUDA 11.6 onwards)
+ *
+ * This activity record represents information about a GPU device
+ * (CUPTI_ACTIVITY_KIND_DEVICE).
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_DEVICE.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The flags associated with the device. \see CUpti_ActivityFlag
+   */
+  CUpti_ActivityFlag flags;
+
+  /**
+   * The global memory bandwidth available on the device, in
+   * kBytes/sec.
+   */
+  uint64_t globalMemoryBandwidth;
+
+  /**
+   * The amount of global memory on the device, in bytes.
+   */
+  uint64_t globalMemorySize;
+
+  /**
+   * The amount of constant memory on the device, in bytes.
+   */
+  uint32_t constantMemorySize;
+
+  /**
+   * The size of the L2 cache on the device, in bytes.
+   */
+  uint32_t l2CacheSize;
+
+  /**
+   * The number of threads per warp on the device.
+   */
+  uint32_t numThreadsPerWarp;
+
+  /**
+   * The core clock rate of the device, in kHz.
+   */
+  uint32_t coreClockRate;
+
+  /**
+   * Number of memory copy engines on the device.
+   */
+  uint32_t numMemcpyEngines;
+
+  /**
+   * Number of multiprocessors on the device.
+   */
+  uint32_t numMultiprocessors;
+
+  /**
+   * The maximum "instructions per cycle" possible on each device
+   * multiprocessor.
+   */
+  uint32_t maxIPC;
+
+  /**
+   * Maximum number of warps that can be present on a multiprocessor
+   * at any given time.
+   */
+  uint32_t maxWarpsPerMultiprocessor;
+
+  /**
+   * Maximum number of blocks that can be present on a multiprocessor
+   * at any given time.
+   */
+  uint32_t maxBlocksPerMultiprocessor;
+
+  /**
+   * Maximum amount of shared memory available per multiprocessor, in bytes.
+   */
+  uint32_t maxSharedMemoryPerMultiprocessor;
+
+  /**
+   * Maximum number of 32-bit registers available per multiprocessor.
+   */
+  uint32_t maxRegistersPerMultiprocessor;
+
+  /**
+   * Maximum number of registers that can be allocated to a block.
+   */
+  uint32_t maxRegistersPerBlock;
+
+  /**
+   * Maximum amount of shared memory that can be assigned to a block,
+   * in bytes.
+   */
+  uint32_t maxSharedMemoryPerBlock;
+
+  /**
+   * Maximum number of threads allowed in a block.
+   */
+  uint32_t maxThreadsPerBlock;
+
+  /**
+   * Maximum allowed X dimension for a block.
+   */
+  uint32_t maxBlockDimX;
+
+  /**
+   * Maximum allowed Y dimension for a block.
+   */
+  uint32_t maxBlockDimY;
+
+  /**
+   * Maximum allowed Z dimension for a block.
+   */
+  uint32_t maxBlockDimZ;
+
+  /**
+   * Maximum allowed X dimension for a grid.
+   */
+  uint32_t maxGridDimX;
+
+  /**
+   * Maximum allowed Y dimension for a grid.
+   */
+  uint32_t maxGridDimY;
+
+  /**
+   * Maximum allowed Z dimension for a grid.
+   */
+  uint32_t maxGridDimZ;
+
+  /**
+   * Compute capability for the device, major number.
+   */
+  uint32_t computeCapabilityMajor;
+
+  /**
+   * Compute capability for the device, minor number.
+   */
+  uint32_t computeCapabilityMinor;
+
+  /**
+   * The device ID.
+   */
+  uint32_t id;
+
+  /**
+   * ECC enabled flag for the device.
+   */
+  uint32_t eccEnabled;
+
+  /**
+   * The device UUID. This value is the globally unique immutable
+   * alphanumeric identifier of the device.
+   */
+  CUuuid uuid;
+
+#ifndef CUPTILP64
+  /**
+   * Undefined. Reserved for internal use. Present only when CUPTILP64 is not defined.
+   */
+  uint32_t pad;
+#endif
+
+  /**
+   * The device name. This name is shared across all activity records
+   * representing instances of the device, and so should not be
+   * modified.
+   */
+  const char *name;
+
+  /**
+   * Flag to indicate whether the device is visible to CUDA. Users can set the
+   * device visibility using the CUDA_VISIBLE_DEVICES environment variable.
+   */
+  uint8_t isCudaVisible;
+
+  /**
+   * MIG enabled flag for the device.
+   */
+  uint8_t isMigEnabled;
+
+  uint8_t reserved[6];
+
+  /**
+   * GPU Instance ID for MIG enabled devices.
+   * If MIG mode is disabled, the value is set to UINT32_MAX.
+   */
+  uint32_t gpuInstanceId;
+
+  /**
+   * Compute Instance ID for MIG enabled devices.
+   * If MIG mode is disabled, the value is set to UINT32_MAX.
+   */
+  uint32_t computeInstanceId;
+
+  /**
+   * The MIG UUID. This value is the globally unique immutable
+   * alphanumeric identifier of the device.
+   */
+  CUuuid migUuid;
+
+  /**
+   * NUMA (non-uniform memory access) information for the device:
+   * whether the GPU is a NUMA node or not.
+   */
+  uint32_t isNumaNode;
+
+  /**
+   * NUMA (non-uniform memory access) information for the device:
+   * the NUMA node ID of the GPU memory.
+   * If the GPU is not a NUMA node, this is set to invalidNumaId.
+   */
+  uint32_t numaId;
+} CUpti_ActivityDevice5;
+
+/**
+ * \brief The activity record for a device attribute.
+ *
+ * This activity record represents information about a GPU device:
+ * either a CUpti_DeviceAttribute or CUdevice_attribute value
+ * (CUPTI_ACTIVITY_KIND_DEVICE_ATTRIBUTE).
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be
+   * CUPTI_ACTIVITY_KIND_DEVICE_ATTRIBUTE.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The flags associated with the device. \see CUpti_ActivityFlag
+   */
+  CUpti_ActivityFlag flags;
+
+  /**
+   * The ID of the device that this attribute applies to.
+   */
+  uint32_t deviceId;
+
+  /**
+   * The attribute, either a CUpti_DeviceAttribute or
+   * CUdevice_attribute. Flag
+   * CUPTI_ACTIVITY_FLAG_DEVICE_ATTRIBUTE_CUDEVICE is used to indicate
+   * what kind of attribute this is. If
+   * CUPTI_ACTIVITY_FLAG_DEVICE_ATTRIBUTE_CUDEVICE is 1 then the
+   * CUdevice_attribute field ('cu') is valid, otherwise the
+   * CUpti_DeviceAttribute field ('cupti') is valid.
+   */
+  union {
+    CUdevice_attribute cu;
+    CUpti_DeviceAttribute cupti;
+  } attribute;
+
+  /**
+   * The value for the attribute. See CUpti_DeviceAttribute and
+   * CUdevice_attribute for the type of the value for a given
+   * attribute.
+   */
+  union {
+    double vDouble;
+    uint32_t vUint32;
+    uint64_t vUint64;
+    int32_t vInt32;
+    int64_t vInt64;
+  } value;
+} CUpti_ActivityDeviceAttribute;
+
+/**
+ * \brief The activity record for a context.
+ *
+ * This activity record represents information about a context
+ * (CUPTI_ACTIVITY_KIND_CONTEXT).
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_CONTEXT.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The context ID.
+   */
+  uint32_t contextId;
+
+  /**
+   * The device ID.
+   */
+  uint32_t deviceId;
+
+  /**
+   * The compute API kind. \see CUpti_ActivityComputeApiKind
+   */
+  uint16_t computeApiKind;
+
+  /**
+   * The ID for the NULL stream in this context.
+   */
+  uint16_t nullStreamId;
+
+  /**
+   * The ID of the parent context. It is 0 if the
+   * context does not have a parent.
+   */
+  uint32_t parentContextId;
+
+  /**
+   * This field indicates whether the context is a green context.
+   */
+  uint8_t isGreenContext;
+
+  uint8_t padding[3];
+} CUpti_ActivityContext2;
+
+/**
+ * \brief The activity record providing a name.
+ *
+ * This activity record provides a name for a device, context, thread,
+ * etc. and other resource naming done via NVTX APIs
+ * (CUPTI_ACTIVITY_KIND_NAME).
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_NAME.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The kind of activity object being named.
+   */
+  CUpti_ActivityObjectKind objectKind;
+
+  /**
+   * The identifier for the activity object being named. 'objectKind'
+   * indicates which ID is valid for this record.
+   */
+  CUpti_ActivityObjectKindId objectId;
+
+#ifdef CUPTILP64
+  /**
+   * Undefined. Reserved for internal use. Present only when CUPTILP64 is defined.
+   */
+  uint32_t pad;
+#endif
+
+  /**
+   * The name given to the activity object.
+   */
+  const char *name;
+
+} CUpti_ActivityName;
+
+/**
+ * \brief The activity record providing a marker which is an
+ * instantaneous point in time.
+ *
+ * The marker is specified with a descriptive name and unique id
+ * (CUPTI_ACTIVITY_KIND_MARKER).
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_MARKER.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The flags associated with the marker. \see CUpti_ActivityFlag
+   */
+  CUpti_ActivityFlag flags;
+
+  /**
+   * The timestamp for the marker, in ns. A value of 0 indicates that
+   * timestamp information could not be collected for the marker.
+   */
+  uint64_t timestamp;
+
+  /**
+   * The marker ID.
+   */
+  uint32_t id;
+
+  /**
+   * The kind of activity object associated with this marker.
+   */
+  CUpti_ActivityObjectKind objectKind;
+
+  /**
+   * The identifier for the activity object associated with this
+   * marker. 'objectKind' indicates which ID is valid for this record.
+   */
+  CUpti_ActivityObjectKindId objectId;
+
+  /**
+   * Undefined. Reserved for internal use.
+   */
+  uint32_t pad;
+
+
+  /**
+   * The marker name for an instantaneous or start marker. This will
+   * be NULL for an end marker.
+   */
+  const char *name;
+
+  /**
+   * The name of the domain to which this marker belongs.
+   * This will be NULL for the default domain.
+   */
+  const char *domain;
+
+} CUpti_ActivityMarker2;
+
+/**
+ * \brief The activity record providing detailed information for a marker.
+ *
+ * The marker data contains color, payload, and category.
+ * (CUPTI_ACTIVITY_KIND_MARKER_DATA).
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be
+   * CUPTI_ACTIVITY_KIND_MARKER_DATA.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The flags associated with the marker. \see CUpti_ActivityFlag
+   */
+  CUpti_ActivityFlag flags;
+
+  /**
+   * The marker ID.
+   */
+  uint32_t id;
+
+  /**
+   * Defines the payload format for the value stored in 'payload'.
+   */
+  CUpti_MetricValueKind payloadKind;
+
+  /**
+   * The payload value; its format is given by 'payloadKind'.
+   */
+  CUpti_MetricValue payload;
+
+  /**
+   * The color for the marker.
+   */
+  uint32_t color;
+
+  /**
+   * The category for the marker.
+   */
+  uint32_t category;
+
+} CUpti_ActivityMarkerData;
+
+/**
+ * \brief The activity record for CUPTI and driver overheads.
+ *
+ * This activity record provides CUPTI and driver overhead information
+ * (CUPTI_ACTIVITY_KIND_OVERHEAD).
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_OVERHEAD.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The kind of overhead: CUPTI, DRIVER, COMPILER, etc.
+   */
+  CUpti_ActivityOverheadKind overheadKind;
+
+  /**
+   * The kind of activity object that the overhead is associated with.
+   */
+  CUpti_ActivityObjectKind objectKind;
+
+  /**
+   * The identifier for the activity object. 'objectKind' indicates
+   * which ID is valid for this record.
+   */
+  CUpti_ActivityObjectKindId objectId;
+
+  /**
+   * The start timestamp for the overhead, in ns. A value of 0 for
+   * both the start and end timestamps indicates that timestamp
+   * information could not be collected for the overhead.
+   */
+  uint64_t start;
+
+  /**
+   * The end timestamp for the overhead, in ns. A value of 0 for both
+   * the start and end timestamps indicates that timestamp information
+   * could not be collected for the overhead.
+   */
+  uint64_t end;
+
+  /**
+   * The correlation ID of the overhead operation to which
+   * this record belongs. This ID is identical to the
+   * correlation ID in the driver or runtime API activity record that
+   * launched the overhead operation.
+   * In some cases, it can be zero, such as for CUPTI_ACTIVITY_OVERHEAD_CUPTI_BUFFER_FLUSH records.
+   */
+  uint32_t correlationId;
+
+  /**
+   * Reserved for internal use.
+   */
+  uint32_t reserved0;
+
+  /**
+   * Pointer to the struct with additional details about the overhead.
+   * Refer to the CUpti_ActivityOverheadKind enum and the corresponding structure to typecast and access additional overhead data.
+   * The client is responsible for freeing this memory using the free function when done.
+   */
+  void *overheadData;
+
+} CUpti_ActivityOverhead3;
+
+/**
+ * \brief The activity record for CUPTI environmental data.
+ *
+ * This activity record provides CUPTI environmental data, including
+ * power, clocks, and thermals.  This information is sampled at
+ * various rates and returned in this activity record.  The consumer
+ * of the record needs to check the environmentKind field to figure
+ * out what kind of environmental record this is.
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_ENVIRONMENT.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The ID of the device.
+   */
+  uint32_t deviceId;
+
+  /**
+   * The timestamp when this sample was retrieved, in ns. A value of 0
+   * indicates that timestamp information could not be collected for
+   * the sample.
+   */
+  uint64_t timestamp;
+
+  /**
+   * The kind of data reported in this record.
+   */
+  CUpti_ActivityEnvironmentKind environmentKind;
+
+  union {
+    /**
+     * Data returned for CUPTI_ACTIVITY_ENVIRONMENT_SPEED environment
+     * kind.
+     */
+    struct {
+      /**
+       * The SM frequency in MHz.
+       */
+      uint32_t smClock;
+
+      /**
+       * The memory frequency in MHz.
+       */
+      uint32_t memoryClock;
+
+      /**
+       * The PCIe link generation.
+       */
+      uint32_t pcieLinkGen;
+
+      /**
+       * The PCIe link width.
+       */
+      uint32_t pcieLinkWidth;
+
+      /**
+       * The clocks throttle reasons.
+       */
+      CUpti_EnvironmentClocksThrottleReason clocksThrottleReasons;
+    } speed;
+
+    /**
+     * Data returned for CUPTI_ACTIVITY_ENVIRONMENT_TEMPERATURE
+     * environment kind.
+     */
+    struct {
+      /**
+       * The GPU temperature in degrees C.
+       */
+      uint32_t gpuTemperature;
+    } temperature;
+
+    /**
+     * Data returned for CUPTI_ACTIVITY_ENVIRONMENT_POWER environment kind.
+     * 'power': the power in milliwatts consumed by the GPU and associated circuitry.
+     * 'powerLimit': the power in milliwatts that will trigger the power management algorithm.
+     */
+    struct {
+
+      uint32_t power;
+      uint32_t powerLimit;
+    } power;
+
+    /**
+     * Data returned for CUPTI_ACTIVITY_ENVIRONMENT_COOLING
+     * environment kind.
+     */
+    struct {
+      /**
+       * The fan speed as percentage of maximum.
+       */
+      uint32_t fanSpeed;
+    } cooling;
+  } data;
+} CUpti_ActivityEnvironment;
+
+/**
+ * \brief The activity record for source-level instruction execution.
+ *
+ * This activity records results for source-level instruction execution
+ * (CUPTI_ACTIVITY_KIND_INSTRUCTION_EXECUTION).
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_INSTRUCTION_EXECUTION.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The properties of this instruction execution.
+   */
+  CUpti_ActivityFlag flags;
+
+  /**
+   * The ID for source locator.
+   */
+  uint32_t sourceLocatorId;
+
+  /**
+   * The correlation ID of the kernel to which this result is associated.
+   */
+  uint32_t correlationId;
+
+  /**
+   * Correlation ID with global/device function name.
+   */
+  uint32_t functionId;
+
+  /**
+   * The pc offset for the instruction.
+   */
+  uint32_t pcOffset;
+
+  /**
+   * Each time this instruction is executed, this is incremented by the number
+   * of threads that executed this instruction, regardless of predicate or condition code.
+   */
+  uint64_t threadsExecuted;
+
+  /**
+   * Each time this instruction is executed, this is incremented by the number
+   * of threads that executed this instruction with predicate and condition code evaluating to true.
+   */
+  uint64_t notPredOffThreadsExecuted;
+
+  /**
+   * The number of times this instruction was executed per warp. It is incremented
+   * regardless of predicate or condition code.
+   */
+  uint32_t executed;
+
+  /**
+   * Undefined. Reserved for internal use.
+   */
+  uint32_t pad;
+} CUpti_ActivityInstructionExecution;
+
+/**
+ * \brief The activity record for PC sampling.
+ *
+ * This activity records information obtained by sampling PC
+ * (CUPTI_ACTIVITY_KIND_PC_SAMPLING).
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_PC_SAMPLING.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The properties of this instruction.
+   */
+  CUpti_ActivityFlag flags;
+
+  /**
+   * The ID for source locator.
+   */
+  uint32_t sourceLocatorId;
+
+  /**
+   * The correlation ID of the kernel to which this result is associated.
+   */
+  uint32_t correlationId;
+
+  /**
+   * Correlation ID with global/device function name.
+   */
+  uint32_t functionId;
+
+  /**
+   * Number of times the PC was sampled with the stallReason in the record.
+   * These samples indicate that no instruction was issued in that cycle from
+   * the warp scheduler from where the warp was sampled.
+   * This field is valid for devices with compute capability 6.0 and higher.
+   */
+  uint32_t latencySamples;
+
+  /**
+   * Number of times the PC was sampled with the stallReason in the record.
+   * The same PC can be sampled with different stall reasons. The count includes
+   * latencySamples.
+   */
+  uint32_t samples;
+
+  /**
+   * Current stall reason. Includes one of the reasons from
+   * \ref CUpti_ActivityPCSamplingStallReason
+   */
+  CUpti_ActivityPCSamplingStallReason stallReason;
+
+  /**
+   * The pc offset for the instruction.
+   */
+  uint64_t pcOffset;
+} CUpti_ActivityPCSampling3;
+
+/**
+ * \brief The activity record for record status for PC sampling.
+ *
+ * This activity records information obtained by sampling PC
+ * (CUPTI_ACTIVITY_KIND_PC_SAMPLING_RECORD_INFO).
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_PC_SAMPLING_RECORD_INFO.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The correlation ID of the kernel to which this result is associated.
+   */
+  uint32_t correlationId;
+
+  /**
+   * Number of times the PC was sampled for this kernel instance, including all
+   * dropped samples.
+   */
+  uint64_t totalSamples;
+
+  /**
+   * Number of samples that were dropped by hardware due to backpressure/overflow.
+   */
+  uint64_t droppedSamples;
+  /**
+   * Sampling period in terms of number of cycles.
+   */
+  uint64_t samplingPeriodInCycles;
+} CUpti_ActivityPCSamplingRecordInfo;
+
+/**
+ * \brief The activity record for Unified Memory counters (CUDA 7.0 and beyond)
+ *
+ * This activity record represents a Unified Memory counter
+ * (CUPTI_ACTIVITY_KIND_UNIFIED_MEMORY_COUNTER).
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_UNIFIED_MEMORY_COUNTER.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The Unified Memory counter kind.
+   */
+  CUpti_ActivityUnifiedMemoryCounterKind counterKind;
+
+  /**
+   * Value of the counter.
+   * For counterKind CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_BYTES_TRANSFER_HTOD,
+   * CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_BYTES_TRANSFER_DTOH,
+   * CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_THRASHING and
+   * CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_REMOTE_MAP, it is the size of the
+   * memory region in bytes.
+   * For counterKind CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_GPU_PAGE_FAULT, it
+   * is the number of page fault groups for the same page.
+   * For counterKind CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_CPU_PAGE_FAULT_COUNT,
+   * it is the program counter for the instruction that caused the fault.
+   */
+  uint64_t value;
+
+  /**
+   * The start timestamp of the counter, in ns.
+   * For counterKind CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_BYTES_TRANSFER_HTOD and
+   * CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_BYTES_TRANSFER_DTOH, timestamp is
+   * captured when activity starts on GPU.
+   * For counterKind CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_GPU_PAGE_FAULT and
+   * CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_CPU_PAGE_FAULT_COUNT, timestamp is
+   * captured when CUDA driver started processing the fault.
+   * For counterKind CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_THRASHING, timestamp
+   * is captured when CUDA driver detected thrashing of memory region.
+   * For counterKind CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_THROTTLING,
+   * timestamp is captured when throttling operation was started by CUDA driver.
+   * For counterKind CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_REMOTE_MAP,
+   * timestamp is captured when CUDA driver has pushed all required operations
+   * to the processor specified by dstId.
+   */
+  uint64_t start;
+
+  /**
+   * The end timestamp of the counter, in ns.
+   * Ignore this field if counterKind is
+   * CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_CPU_PAGE_FAULT_COUNT or
+   * CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_THRASHING or
+   * CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_REMOTE_MAP.
+   * For counterKind CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_BYTES_TRANSFER_HTOD and
+   * CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_BYTES_TRANSFER_DTOH, timestamp is
+   * captured when activity finishes on GPU.
+   * For counterKind CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_GPU_PAGE_FAULT, timestamp is
+   * captured when CUDA driver queues the replay of faulting memory accesses on the GPU.
+   * For counterKind CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_THROTTLING, timestamp
+   * is captured when throttling operation was finished by CUDA driver.
+   */
+  uint64_t end;
+
+  /**
+   * This is the virtual base address of the page/s being transferred. For CPU and
+   * GPU faults, the virtual address for the page that faulted.
+   */
+  uint64_t address;
+
+  /**
+   * The ID of the source CPU/device involved in the memory transfer, page fault, thrashing,
+   * throttling or remote map operation. For counterKind
+   * CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_THRASHING, it is a bitwise ORing of the
+   * device IDs fighting for the memory region. Ignore this field if counterKind is
+   * CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_CPU_PAGE_FAULT_COUNT.
+   */
+  uint32_t srcId;
+
+  /**
+   * The ID of the destination CPU/device involved in the memory transfer or remote map
+   * operation. Ignore this field if counterKind is
+   * CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_GPU_PAGE_FAULT or
+   * CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_CPU_PAGE_FAULT_COUNT or
+   * CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_THRASHING or
+   * CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_THROTTLING.
+   */
+  uint32_t dstId;
+
+  /**
+   * The ID of the stream causing the transfer.
+   * The value of this field is invalid.
+   */
+  uint32_t streamId;
+
+  /**
+   * The ID of the process to which this record belongs.
+   */
+  uint32_t processId;
+
+  /**
+   * The flags associated with this record. See enums \ref CUpti_ActivityUnifiedMemoryAccessType
+   * if counterKind is CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_GPU_PAGE_FAULT
+   * and \ref CUpti_ActivityUnifiedMemoryMigrationCause if counterKind is
+   * CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_BYTES_TRANSFER_HTOD or
+   * CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_BYTES_TRANSFER_DTOH
+   * and \ref CUpti_ActivityUnifiedMemoryRemoteMapCause if counterKind is
+   * CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_REMOTE_MAP and \ref CUpti_ActivityFlag
+   * if counterKind is CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_THRASHING or
+   * CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_THROTTLING.
+   */
+  uint32_t flags;
+
+  /**
+   * Undefined. Reserved for internal use.
+   */
+  uint32_t pad;
+} CUpti_ActivityUnifiedMemoryCounter2;
+
+/**
+ * \brief The activity record for global/device functions.
+ *
+ * This activity records function name and corresponding module
+ * information.
+ * (CUPTI_ACTIVITY_KIND_FUNCTION).
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_FUNCTION.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * ID to uniquely identify the record.
+   */
+  uint32_t id;
+
+  /**
+   * The ID of the context where the function is launched.
+   */
+  uint32_t contextId;
+
+  /**
+   * The module ID in which this global/device function is present.
+   */
+  uint32_t moduleId;
+
+  /**
+   * The function's unique symbol index in the module.
+   */
+  uint32_t functionIndex;
+
+#ifdef CUPTILP64
+  /**
+   * Undefined. Reserved for internal use. Present only when CUPTILP64 is defined.
+   */
+  uint32_t pad;
+#endif
+
+  /**
+   * The name of the function. This name is shared across all activity
+   * records representing the same kernel, and so should not be
+   * modified.
+   */
+  const char *name;
+} CUpti_ActivityFunction;
+
+/**
+ * \brief The activity record for a CUDA module.
+ *
+ * This activity record represents a CUDA module
+ * (CUPTI_ACTIVITY_KIND_MODULE). This activity record kind is not
+ * produced by the activity API but is included for completeness and
+ * ease-of-use. Profile frameworks built on top of CUPTI that collect
+ * module data from the module callback may choose to use this type to
+ * store the collected module data.
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_MODULE.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The ID of the context where the module is loaded.
+   */
+  uint32_t contextId;
+
+  /**
+   * The module ID.
+   */
+  uint32_t id;
+
+  /**
+   * The size of the cubin referenced by 'cubin' (presumably in bytes; unit not stated here).
+   */
+  uint32_t cubinSize;
+
+#ifndef CUPTILP64
+  /**
+   * Undefined. Reserved for internal use. Present only when CUPTILP64 is not defined.
+   */
+  uint32_t pad;
+#endif
+
+  /**
+   * The pointer to the cubin.
+   */
+  const void *cubin;
+} CUpti_ActivityModule;
+
+/**
+ * \brief The activity record for source-level shared
+ * access.
+ *
+ * This activity records the locations of the shared
+ * accesses in the source
+ * (CUPTI_ACTIVITY_KIND_SHARED_ACCESS).
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_SHARED_ACCESS.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The properties of this shared access.
+   */
+  CUpti_ActivityFlag flags;
+
+  /**
+   * The ID for source locator.
+   */
+  uint32_t sourceLocatorId;
+
+  /**
+   * The correlation ID of the kernel to which this result is associated.
+   */
+  uint32_t correlationId;
+
+ /**
+  * Correlation ID with global/device function name.
+  */
+  uint32_t functionId;
+
+  /**
+   * The pc offset for the access.
+   */
+  uint32_t pcOffset;
+
+  /**
+   * Each time this instruction is executed, this is incremented by the number
+   * of threads that executed this instruction with predicate and condition code evaluating to true.
+   */
+  uint64_t threadsExecuted;
+
+  /**
+   * The total number of shared memory transactions generated by this access.
+   */
+  uint64_t sharedTransactions;
+
+  /**
+   * The minimum number of shared memory transactions possible based on the access pattern.
+   */
+  uint64_t theoreticalSharedTransactions;
+
+  /**
+   * The number of times this instruction was executed per warp. It is incremented
+   * when at least one thread in the warp is active with predicate and condition code
+   * evaluating to true.
+   */
+  uint32_t executed;
+
+  /**
+   * Undefined. Reserved for internal use.
+   */
+  uint32_t pad;
+} CUpti_ActivitySharedAccess;
+
+/**
+ * \brief The activity record for CUDA event.
+ *
+ * This activity is used to track recorded events.
+ * (CUPTI_ACTIVITY_KIND_CUDA_EVENT).
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_CUDA_EVENT.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The correlation ID of the API to which this result is associated.
+   */
+  uint32_t correlationId;
+
+  /**
+   * The ID of the context where the event was recorded.
+   */
+  uint32_t contextId;
+
+  /**
+   * The ID of the compute stream where the event was recorded.
+   */
+  uint32_t streamId;
+
+  /**
+   * A unique event ID that identifies the event record.
+   */
+  uint32_t eventId;
+
+  /**
+   * Undefined. Reserved for internal use.
+   */
+  uint32_t pad;
+} CUpti_ActivityCudaEvent;
+
+/**
+ * \brief The activity record for CUDA stream.
+ *
+ * This activity is used to track created streams.
+ * (CUPTI_ACTIVITY_KIND_STREAM).
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_STREAM.
+   */
+  CUpti_ActivityKind kind;
+  /**
+   * The ID of the context where the stream was created.
+   */
+  uint32_t contextId;
+
+  /**
+   * A unique stream ID that identifies the stream.
+   */
+  uint32_t streamId;
+
+  /**
+   * The clamped priority for the stream.
+   */
+  uint32_t priority;
+
+  /**
+   * Flags associated with the stream. \see CUpti_ActivityStreamFlag
+   */
+  CUpti_ActivityStreamFlag flag;
+
+  /**
+   * The correlation ID of the API to which this result is associated.
+   */
+  uint32_t correlationId;
+} CUpti_ActivityStream;
+
+/**
+ * \brief The activity record for synchronization management.
+ *
+ * This activity is used to track various CUDA synchronization APIs.
+ * (CUPTI_ACTIVITY_KIND_SYNCHRONIZATION).
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_SYNCHRONIZATION.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The type of record.
+   */
+  CUpti_ActivitySynchronizationType type;
+
+  /**
+   * The start timestamp for the function, in ns. A value of 0 for
+   * both the start and end timestamps indicates that timestamp
+   * information could not be collected for the function.
+   */
+  uint64_t start;
+
+  /**
+   * The end timestamp for the function, in ns. A value of 0 for both
+   * the start and end timestamps indicates that timestamp information
+   * could not be collected for the function.
+   */
+  uint64_t end;
+
+  /**
+   * The correlation ID of the API to which this result is associated.
+   */
+  uint32_t correlationId;
+
+  /**
+   * The ID of the context for which the synchronization API is called.
+   * For a context synchronization API, it is the ID of the context for which the API is called.
+   * For stream/event synchronization, it is the ID of the context where the stream/event was created.
+   */
+  uint32_t contextId;
+
+  /**
+   * The compute stream for which the synchronization API is called.
+   * A value of CUPTI_SYNCHRONIZATION_INVALID_VALUE indicates that the field is not applicable for this record.
+   * Not valid for cuCtxSynchronize, cuEventSynchronize.
+   */
+  uint32_t streamId;
+
+  /**
+   * The event ID for which the synchronization API is called.
+   * A value of CUPTI_SYNCHRONIZATION_INVALID_VALUE indicates that the field is not applicable for this record.
+   * Not valid for cuCtxSynchronize, cuStreamSynchronize.
+   */
+  uint32_t cudaEventId;
+} CUpti_ActivitySynchronization;
+
+/**
+ * \brief The activity record for source-level sass/source
+ * line-by-line correlation.
+ *
+ * This activity records source level sass/source correlation
+ * information.
+ * (CUPTI_ACTIVITY_KIND_INSTRUCTION_CORRELATION).
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_INSTRUCTION_CORRELATION.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The properties of this instruction.
+   */
+  CUpti_ActivityFlag flags;
+
+  /**
+   * The ID for source locator.
+   */
+  uint32_t sourceLocatorId;
+
+  /**
+   * Correlation ID with global/device function name.
+   */
+  uint32_t functionId;
+
+  /**
+   * The pc offset for the instruction.
+   */
+  uint32_t pcOffset;
+
+  /**
+   * Undefined. Reserved for internal use.
+   */
+  uint32_t pad;
+} CUpti_ActivityInstructionCorrelation;
+
+/**
+ * \brief The OpenACC event kind for OpenACC activity records.
+ *
+ * \see CUpti_ActivityKindOpenAcc
+ */
+typedef enum {
+  CUPTI_OPENACC_EVENT_KIND_INVALID              = 0,
+  CUPTI_OPENACC_EVENT_KIND_DEVICE_INIT          = 1,
+  CUPTI_OPENACC_EVENT_KIND_DEVICE_SHUTDOWN      = 2,
+  CUPTI_OPENACC_EVENT_KIND_RUNTIME_SHUTDOWN     = 3,
+  CUPTI_OPENACC_EVENT_KIND_ENQUEUE_LAUNCH       = 4,
+  CUPTI_OPENACC_EVENT_KIND_ENQUEUE_UPLOAD       = 5,
+  CUPTI_OPENACC_EVENT_KIND_ENQUEUE_DOWNLOAD     = 6,
+  CUPTI_OPENACC_EVENT_KIND_WAIT                 = 7,
+  CUPTI_OPENACC_EVENT_KIND_IMPLICIT_WAIT        = 8,
+  CUPTI_OPENACC_EVENT_KIND_COMPUTE_CONSTRUCT    = 9,
+  CUPTI_OPENACC_EVENT_KIND_UPDATE               = 10,
+  CUPTI_OPENACC_EVENT_KIND_ENTER_DATA           = 11,
+  CUPTI_OPENACC_EVENT_KIND_EXIT_DATA            = 12,
+  CUPTI_OPENACC_EVENT_KIND_CREATE               = 13,
+  CUPTI_OPENACC_EVENT_KIND_DELETE               = 14,
+  CUPTI_OPENACC_EVENT_KIND_ALLOC                = 15,
+  CUPTI_OPENACC_EVENT_KIND_FREE                 = 16,
+  CUPTI_OPENACC_EVENT_KIND_FORCE_INT            = 0x7fffffff
+} CUpti_OpenAccEventKind;
+
+/**
+ * \brief The OpenACC parent construct kind for OpenACC activity records.
+ */
+typedef enum {
+  CUPTI_OPENACC_CONSTRUCT_KIND_UNKNOWN          = 0,
+  CUPTI_OPENACC_CONSTRUCT_KIND_PARALLEL         = 1,
+  CUPTI_OPENACC_CONSTRUCT_KIND_KERNELS          = 2,
+  CUPTI_OPENACC_CONSTRUCT_KIND_LOOP             = 3,
+  CUPTI_OPENACC_CONSTRUCT_KIND_DATA             = 4,
+  CUPTI_OPENACC_CONSTRUCT_KIND_ENTER_DATA       = 5,
+  CUPTI_OPENACC_CONSTRUCT_KIND_EXIT_DATA        = 6,
+  CUPTI_OPENACC_CONSTRUCT_KIND_HOST_DATA        = 7,
+  CUPTI_OPENACC_CONSTRUCT_KIND_ATOMIC           = 8,
+  CUPTI_OPENACC_CONSTRUCT_KIND_DECLARE          = 9,
+  CUPTI_OPENACC_CONSTRUCT_KIND_INIT             = 10,
+  CUPTI_OPENACC_CONSTRUCT_KIND_SHUTDOWN         = 11,
+  CUPTI_OPENACC_CONSTRUCT_KIND_SET              = 12,
+  CUPTI_OPENACC_CONSTRUCT_KIND_UPDATE           = 13,
+  CUPTI_OPENACC_CONSTRUCT_KIND_ROUTINE          = 14,
+  CUPTI_OPENACC_CONSTRUCT_KIND_WAIT             = 15,
+  CUPTI_OPENACC_CONSTRUCT_KIND_RUNTIME_API      = 16,
+  CUPTI_OPENACC_CONSTRUCT_KIND_FORCE_INT        = 0x7fffffff
+
+} CUpti_OpenAccConstructKind;
+/** \brief The OpenMP event kind for OpenMP activity records. \see CUpti_ActivityOpenMp */
+typedef enum {
+  CUPTI_OPENMP_EVENT_KIND_INVALID               = 0,
+  CUPTI_OPENMP_EVENT_KIND_PARALLEL              = 1,
+  CUPTI_OPENMP_EVENT_KIND_TASK                  = 2,
+  CUPTI_OPENMP_EVENT_KIND_THREAD                = 3,
+  CUPTI_OPENMP_EVENT_KIND_IDLE                  = 4,
+  CUPTI_OPENMP_EVENT_KIND_WAIT_BARRIER          = 5,
+  CUPTI_OPENMP_EVENT_KIND_WAIT_TASKWAIT         = 6,
+  CUPTI_OPENMP_EVENT_KIND_FORCE_INT             = 0x7fffffff
+} CUpti_OpenMpEventKind;
+
+/**
+ * \brief The base activity record for OpenACC records.
+ *
+ * The OpenACC activity API part uses a CUpti_ActivityOpenAcc as a generic
+ * representation for any OpenACC activity. The 'kind' field is used to determine the
+ * specific activity kind, and from that the CUpti_ActivityOpenAcc object can
+ * be cast to the specific OpenACC activity record type appropriate for that kind.
+ *
+ * Note that all OpenACC activity record types are padded and aligned to
+ * ensure that each member of the record is naturally aligned.
+ *
+ * \see CUpti_ActivityKind
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The kind of this activity.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * CUPTI OpenACC event kind (\see CUpti_OpenAccEventKind)
+   */
+  CUpti_OpenAccEventKind eventKind;
+
+  /**
+   * CUPTI OpenACC parent construct kind (\see CUpti_OpenAccConstructKind)
+   *
+   * Note that for applications using PGI OpenACC runtime < 16.1, this
+   * will always be CUPTI_OPENACC_CONSTRUCT_KIND_UNKNOWN.
+   */
+  CUpti_OpenAccConstructKind parentConstruct;
+
+  /**
+   * Version number
+   */
+  uint32_t version;
+
+  /**
+   * 1 for any implicit event, such as an implicit wait at a synchronous data construct
+   * 0 otherwise
+   */
+  uint32_t implicit;
+
+  /**
+   * Device type
+   */
+  uint32_t deviceType;
+
+  /**
+   * Device number
+   */
+  uint32_t deviceNumber;
+
+  /**
+   * ThreadId
+   */
+  uint32_t threadId;
+
+  /**
+   * Value of async() clause of the corresponding directive
+   */
+  uint64_t async;
+
+  /**
+   * Internal asynchronous queue number used
+   */
+  uint64_t asyncMap;
+
+  /**
+   * The line number of the directive or program construct or the starting line
+   * number of the OpenACC construct corresponding to the event.
+   * A zero value means the line number is not known.
+   */
+  uint32_t lineNo;
+
+  /**
+   * For an OpenACC construct, this contains the line number of the end
+   * of the construct. A zero value means the line number is not known.
+   */
+  uint32_t endLineNo;
+
+  /**
+   * The line number of the first line of the function named in funcName.
+   * A zero value means the line number is not known.
+   */
+  uint32_t funcLineNo;
+
+  /**
+   * The last line number of the function named in funcName.
+   * A zero value means the line number is not known.
+   */
+  uint32_t funcEndLineNo;
+
+  /**
+   * CUPTI start timestamp
+   */
+  uint64_t start;
+
+  /**
+   * CUPTI end timestamp
+   */
+  uint64_t end;
+
+  /**
+   * CUDA device id
+   * Valid only if deviceType is acc_device_nvidia.
+   */
+  uint32_t cuDeviceId;
+
+  /**
+   * CUDA context id
+   * Valid only if deviceType is acc_device_nvidia.
+   */
+  uint32_t cuContextId;
+
+  /**
+   * CUDA stream id
+   * Valid only if deviceType is acc_device_nvidia.
+   */
+  uint32_t cuStreamId;
+
+  /**
+   * The ID of the process where the OpenACC activity is executing.
+   */
+  uint32_t cuProcessId;
+
+  /**
+   * The ID of the thread where the OpenACC activity is executing.
+   */
+  uint32_t cuThreadId;
+
+  /**
+   * The OpenACC correlation ID.
+   * Valid only if deviceType is acc_device_nvidia.
+   * If not 0, it uniquely identifies this record. It is identical to the
+   * externalId in the preceding external correlation record of type
+   * CUPTI_EXTERNAL_CORRELATION_KIND_OPENACC.
+   */
+  uint32_t externalId;
+
+  /**
+   * A pointer to a null-terminated string containing the name of or path to
+   * the source file, if known, or a null pointer if not.
+   */
+  const char *srcFile;
+
+  /**
+   * A pointer to a null-terminated string containing the name of the
+   * function in which the event occurred.
+   */
+  const char *funcName;
+} CUpti_ActivityOpenAcc;
+
+/**
+ * \brief The activity record for OpenACC data.
+ *
+ * (CUPTI_ACTIVITY_KIND_OPENACC_DATA).
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_OPENACC_DATA.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * CUPTI OpenACC event kind (\see CUpti_OpenAccEventKind)
+   */
+  CUpti_OpenAccEventKind eventKind;
+
+  /**
+   * CUPTI OpenACC parent construct kind (\see CUpti_OpenAccConstructKind)
+   *
+   * Note that for applications using PGI OpenACC runtime < 16.1, this
+   * will always be CUPTI_OPENACC_CONSTRUCT_KIND_UNKNOWN.
+   */
+  CUpti_OpenAccConstructKind parentConstruct;
+
+  /**
+   * Version number
+   */
+  uint32_t version;
+
+  /**
+   * 1 for any implicit event, such as an implicit wait at a synchronous data construct
+   * 0 otherwise
+   */
+  uint32_t implicit;
+
+  /**
+   * Device type
+   */
+  uint32_t deviceType;
+
+  /**
+   * Device number
+   */
+  uint32_t deviceNumber;
+
+  /**
+   * ThreadId
+   */
+  uint32_t threadId;
+
+  /**
+   * Value of async() clause of the corresponding directive
+   */
+  uint64_t async;
+
+  /**
+   * Internal asynchronous queue number used
+   */
+  uint64_t asyncMap;
+
+  /**
+   * The line number of the directive or program construct or the starting line
+   * number of the OpenACC construct corresponding to the event.
+   * A zero value means the line number is not known.
+   */
+  uint32_t lineNo;
+
+  /**
+   * For an OpenACC construct, this contains the line number of the end
+   * of the construct. A zero value means the line number is not known.
+   */
+  uint32_t endLineNo;
+
+  /**
+   * The line number of the first line of the function named in funcName.
+   * A zero value means the line number is not known.
+   */
+  uint32_t funcLineNo;
+
+  /**
+   * The last line number of the function named in funcName.
+   * A zero value means the line number is not known.
+   */
+  uint32_t funcEndLineNo;
+
+  /**
+   * CUPTI start timestamp
+   */
+  uint64_t start;
+
+  /**
+   * CUPTI end timestamp
+   */
+  uint64_t end;
+
+  /**
+   * CUDA device id
+   * Valid only if deviceType is acc_device_nvidia.
+   */
+  uint32_t cuDeviceId;
+
+  /**
+   * CUDA context id
+   * Valid only if deviceType is acc_device_nvidia.
+   */
+  uint32_t cuContextId;
+
+  /**
+   * CUDA stream id
+   * Valid only if deviceType is acc_device_nvidia.
+   */
+  uint32_t cuStreamId;
+
+  /**
+   * The ID of the process where the OpenACC activity is executing.
+   */
+  uint32_t cuProcessId;
+
+  /**
+   * The ID of the thread where the OpenACC activity is executing.
+   */
+  uint32_t cuThreadId;
+
+  /**
+   * The OpenACC correlation ID.
+   * Valid only if deviceType is acc_device_nvidia.
+   * If not 0, it uniquely identifies this record. It is identical to the
+   * externalId in the preceding external correlation record of type
+   * CUPTI_EXTERNAL_CORRELATION_KIND_OPENACC.
+   */
+  uint32_t externalId;
+
+  /**
+   * A pointer to a null-terminated string containing the name of or path to
+   * the source file, if known, or a null pointer if not.
+   */
+  const char *srcFile;
+
+  /**
+   * A pointer to a null-terminated string containing the name of the
+   * function in which the event occurred.
+   */
+  const char *funcName;
+
+  /* --- end of common CUpti_ActivityOpenAcc part --- */
+
+  /**
+   * Number of bytes
+   */
+  uint64_t bytes;
+
+  /**
+   * Host pointer if available
+   */
+  uint64_t hostPtr;
+
+  /**
+   * Device pointer if available
+   */
+  uint64_t devicePtr;
+
+#ifndef CUPTILP64
+  /**
+   * Undefined. Reserved for internal use.
+   */
+  uint32_t pad1;
+#endif
+
+  /**
+   * A pointer to a null-terminated string containing the name of the variable
+   * for which this event is triggered, if known, or a null pointer if not.
+   */
+  const char *varName;
+
+} CUpti_ActivityOpenAccData;
+
+/**
+ * \brief The activity record for OpenACC launch.
+ *
+ * (CUPTI_ACTIVITY_KIND_OPENACC_LAUNCH).
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_OPENACC_LAUNCH.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * CUPTI OpenACC event kind (\see CUpti_OpenAccEventKind)
+   */
+  CUpti_OpenAccEventKind eventKind;
+
+  /**
+   * CUPTI OpenACC parent construct kind (\see CUpti_OpenAccConstructKind)
+   *
+   * Note that for applications using PGI OpenACC runtime < 16.1, this
+   * will always be CUPTI_OPENACC_CONSTRUCT_KIND_UNKNOWN.
+   */
+  CUpti_OpenAccConstructKind parentConstruct;
+
+  /**
+   * Version number
+   */
+  uint32_t version;
+
+  /**
+   * 1 for any implicit event, such as an implicit wait at a synchronous data construct
+   * 0 otherwise
+   */
+  uint32_t implicit;
+
+  /**
+   * Device type
+   */
+  uint32_t deviceType;
+
+  /**
+   * Device number
+   */
+  uint32_t deviceNumber;
+
+  /**
+   * ThreadId
+   */
+  uint32_t threadId;
+
+  /**
+   * Value of async() clause of the corresponding directive
+   */
+  uint64_t async;
+
+  /**
+   * Internal asynchronous queue number used
+   */
+  uint64_t asyncMap;
+
+  /**
+   * The line number of the directive or program construct or the starting line
+   * number of the OpenACC construct corresponding to the event.
+   * A zero value means the line number is not known.
+   */
+  uint32_t lineNo;
+
+  /**
+   * For an OpenACC construct, this contains the line number of the end
+   * of the construct. A zero value means the line number is not known.
+   */
+  uint32_t endLineNo;
+
+  /**
+   * The line number of the first line of the function named in funcName.
+   * A zero value means the line number is not known.
+   */
+  uint32_t funcLineNo;
+
+  /**
+   * The last line number of the function named in funcName.
+   * A zero value means the line number is not known.
+   */
+  uint32_t funcEndLineNo;
+
+  /**
+   * CUPTI start timestamp
+   */
+  uint64_t start;
+
+  /**
+   * CUPTI end timestamp
+   */
+  uint64_t end;
+
+  /**
+   * CUDA device id
+   * Valid only if deviceType is acc_device_nvidia.
+   */
+  uint32_t cuDeviceId;
+
+  /**
+   * CUDA context id
+   * Valid only if deviceType is acc_device_nvidia.
+   */
+  uint32_t cuContextId;
+
+  /**
+   * CUDA stream id
+   * Valid only if deviceType is acc_device_nvidia.
+   */
+  uint32_t cuStreamId;
+
+  /**
+   * The ID of the process where the OpenACC activity is executing.
+   */
+  uint32_t cuProcessId;
+
+  /**
+   * The ID of the thread where the OpenACC activity is executing.
+   */
+  uint32_t cuThreadId;
+
+  /**
+   * The OpenACC correlation ID.
+   * Valid only if deviceType is acc_device_nvidia.
+   * If not 0, it uniquely identifies this record. It is identical to the
+   * externalId in the preceding external correlation record of type
+   * CUPTI_EXTERNAL_CORRELATION_KIND_OPENACC.
+   */
+  uint32_t externalId;
+
+  /**
+   * A pointer to a null-terminated string containing the name of or path to
+   * the source file, if known, or a null pointer if not.
+   */
+  const char *srcFile;
+
+  /**
+   * A pointer to a null-terminated string containing the name of the
+   * function in which the event occurred.
+   */
+  const char *funcName;
+
+  /* --- end of common CUpti_ActivityOpenAcc part --- */
+
+  /**
+   * The number of gangs created for this kernel launch
+   */
+  uint64_t numGangs;
+
+  /**
+   * The number of workers created for this kernel launch
+   */
+  uint64_t numWorkers;
+
+  /**
+   * The number of vector lanes created for this kernel launch
+   */
+  uint64_t vectorLength;
+
+#ifndef CUPTILP64
+  /**
+   * Undefined. Reserved for internal use.
+   */
+  uint32_t pad1;
+#endif
+
+  /**
+   * A pointer to a null-terminated string containing the name of the
+   * kernel being launched, if known, or a null pointer if not.
+   */
+  const char *kernelName;
+
+} CUpti_ActivityOpenAccLaunch;
+
+/**
+ * \brief The activity record for OpenACC other.
+ *
+ * (CUPTI_ACTIVITY_KIND_OPENACC_OTHER).
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_OPENACC_OTHER.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * CUPTI OpenACC event kind (\see CUpti_OpenAccEventKind)
+   */
+  CUpti_OpenAccEventKind eventKind;
+
+  /**
+   * CUPTI OpenACC parent construct kind (\see CUpti_OpenAccConstructKind)
+   *
+   * Note that for applications using PGI OpenACC runtime < 16.1, this
+   * will always be CUPTI_OPENACC_CONSTRUCT_KIND_UNKNOWN.
+   */
+  CUpti_OpenAccConstructKind parentConstruct;
+
+  /**
+   * Version number
+   */
+  uint32_t version;
+
+  /**
+   * 1 for any implicit event, such as an implicit wait at a synchronous data construct
+   * 0 otherwise
+   */
+  uint32_t implicit;
+
+  /**
+   * Device type
+   */
+  uint32_t deviceType;
+
+  /**
+   * Device number
+   */
+  uint32_t deviceNumber;
+
+  /**
+   * ThreadId
+   */
+  uint32_t threadId;
+
+  /**
+   * Value of async() clause of the corresponding directive
+   */
+  uint64_t async;
+
+  /**
+   * Internal asynchronous queue number used
+   */
+  uint64_t asyncMap;
+
+  /**
+   * The line number of the directive or program construct or the starting line
+   * number of the OpenACC construct corresponding to the event.
+   * A zero value means the line number is not known.
+   */
+  uint32_t lineNo;
+
+  /**
+   * For an OpenACC construct, this contains the line number of the end
+   * of the construct. A zero value means the line number is not known.
+   */
+  uint32_t endLineNo;
+
+  /**
+   * The line number of the first line of the function named in funcName.
+   * A zero value means the line number is not known.
+   */
+  uint32_t funcLineNo;
+
+  /**
+   * The last line number of the function named in funcName.
+   * A zero value means the line number is not known.
+   */
+  uint32_t funcEndLineNo;
+
+  /**
+   * CUPTI start timestamp
+   */
+  uint64_t start;
+
+  /**
+   * CUPTI end timestamp
+   */
+  uint64_t end;
+
+  /**
+   * CUDA device id
+   * Valid only if deviceType is acc_device_nvidia.
+   */
+  uint32_t cuDeviceId;
+
+  /**
+   * CUDA context id
+   * Valid only if deviceType is acc_device_nvidia.
+   */
+  uint32_t cuContextId;
+
+  /**
+   * CUDA stream id
+   * Valid only if deviceType is acc_device_nvidia.
+   */
+  uint32_t cuStreamId;
+
+  /**
+   * The ID of the process where the OpenACC activity is executing.
+   */
+  uint32_t cuProcessId;
+
+  /**
+   * The ID of the thread where the OpenACC activity is executing.
+   */
+  uint32_t cuThreadId;
+
+  /**
+   * The OpenACC correlation ID.
+   * Valid only if deviceType is acc_device_nvidia.
+   * If not 0, it uniquely identifies this record. It is identical to the
+   * externalId in the preceding external correlation record of type
+   * CUPTI_EXTERNAL_CORRELATION_KIND_OPENACC.
+   */
+  uint32_t externalId;
+
+  /**
+   * A pointer to a null-terminated string containing the name of or path to
+   * the source file, if known, or a null pointer if not.
+   */
+  const char *srcFile;
+
+  /**
+   * A pointer to a null-terminated string containing the name of the
+   * function in which the event occurred.
+   */
+  const char *funcName;
+
+  /* --- end of common CUpti_ActivityOpenAcc part --- */
+} CUpti_ActivityOpenAccOther;
+
+/**
+ * \brief The base activity record for OpenMP records.
+ *
+ * \see CUpti_ActivityKind
+ */
+typedef struct PACKED_ALIGNMENT {
+
+  /**
+   * The kind of this activity.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * CUPTI OpenMP event kind (\see CUpti_OpenMpEventKind)
+   */
+  CUpti_OpenMpEventKind eventKind;
+
+  /**
+   * Version number
+   */
+  uint32_t version;
+
+  /**
+   * ThreadId
+   */
+  uint32_t threadId;
+
+  /**
+   * CUPTI start timestamp
+   */
+  uint64_t start;
+
+  /**
+   * CUPTI end timestamp
+   */
+  uint64_t end;
+
+  /**
+   * The ID of the process where the OpenMP activity is executing.
+   */
+  uint32_t cuProcessId;
+
+  /**
+   * The ID of the thread where the OpenMP activity is executing.
+   */
+  uint32_t cuThreadId;
+} CUpti_ActivityOpenMp;
+
+/**
+ * \brief The kind of external APIs supported for correlation.
+ *
+ * Custom correlation kinds are reserved for usage in external tools.
+ *
+ * \see CUpti_ActivityExternalCorrelation
+ */
+typedef enum {
+    CUPTI_EXTERNAL_CORRELATION_KIND_INVALID              = 0,
+
+    /**
+     * The external API is unknown to CUPTI
+     */
+    CUPTI_EXTERNAL_CORRELATION_KIND_UNKNOWN              = 1,
+
+    /**
+     * The external API is OpenACC
+     */
+    CUPTI_EXTERNAL_CORRELATION_KIND_OPENACC              = 2,
+
+    /**
+     * The external API is custom0
+     */
+    CUPTI_EXTERNAL_CORRELATION_KIND_CUSTOM0              = 3,
+
+    /**
+     * The external API is custom1
+     */
+    CUPTI_EXTERNAL_CORRELATION_KIND_CUSTOM1              = 4,
+
+    /**
+     * The external API is custom2
+     */
+    CUPTI_EXTERNAL_CORRELATION_KIND_CUSTOM2              = 5,
+
+    /**
+     * Implicitly one past the last kind above. Add new kinds before this line.
+     */
+    CUPTI_EXTERNAL_CORRELATION_KIND_SIZE,
+
+    CUPTI_EXTERNAL_CORRELATION_KIND_FORCE_INT            = 0x7fffffff
+} CUpti_ExternalCorrelationKind;
+
+/**
+ * \brief The activity record for correlation with external records
+ *
+ * This activity record correlates native CUDA records (e.g. CUDA Driver API,
+ * kernels, memcpys, ...) with records from external APIs such as OpenACC.
+ * (CUPTI_ACTIVITY_KIND_EXTERNAL_CORRELATION).
+ *
+ * \see CUpti_ActivityKind
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The kind of this activity.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The kind of external API this record is correlated to.
+   */
+  CUpti_ExternalCorrelationKind externalKind;
+
+  /**
+   * The correlation ID of the associated non-CUDA API record.
+   * The exact field in the associated external record depends
+   * on that record's activity kind (\see externalKind).
+   */
+  uint64_t externalId;
+
+  /**
+   * The correlation ID of the associated CUDA driver or runtime API record.
+   */
+  uint32_t correlationId;
+
+  /**
+   * Undefined. Reserved for internal use.
+   */
+  uint32_t reserved;
+} CUpti_ActivityExternalCorrelation;
+
+/**
+ * \brief The device type for a device connected to NVLink.
+ */
+typedef enum {
+    CUPTI_DEV_TYPE_INVALID = 0,
+
+    /**
+     * The device type is GPU.
+     */
+    CUPTI_DEV_TYPE_GPU = 1,
+
+    /**
+     * The device type is an NVLink processing unit in a CPU.
+     */
+    CUPTI_DEV_TYPE_NPU = 2,
+
+    CUPTI_DEV_TYPE_FORCE_INT = 0x7fffffff
+} CUpti_DevType;
+
+/**
+ * \brief NVLink information.
+ *
+ * This structure gives capabilities of each logical NVLink connection between two devices,
+ * GPU<->GPU or GPU<->CPU, which can be used to understand the topology.
+ */
+
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_NVLINK.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * NvLink version.
+   */
+  uint32_t nvlinkVersion;
+
+  /**
+   * Type of device 0 \ref CUpti_DevType
+   */
+  CUpti_DevType typeDev0;
+
+  /**
+   * Type of device 1 \ref CUpti_DevType
+   */
+  CUpti_DevType typeDev1;
+
+  /**
+   * If typeDev0 is CUPTI_DEV_TYPE_GPU, UUID for device 0. \ref CUpti_ActivityDevice5.
+   * If typeDev0 is CUPTI_DEV_TYPE_NPU, struct npu for NPU.
+   */
+  union {
+    CUuuid uuidDev;
+    struct {
+      /**
+       * Index of the NPU. First index will always be zero.
+       */
+      uint32_t index;
+
+      /**
+       * Domain ID of NPU. On Linux, this can be queried using lspci.
+       */
+      uint32_t domainId;
+    } npu;
+  } idDev0;
+
+  /**
+   * If typeDev1 is CUPTI_DEV_TYPE_GPU, UUID for device 1. \ref CUpti_ActivityDevice5.
+   * If typeDev1 is CUPTI_DEV_TYPE_NPU, struct npu for NPU.
+   */
+  union {
+    CUuuid uuidDev;
+    struct {
+
+      /**
+       * Index of the NPU. First index will always be zero.
+       */
+      uint32_t index;
+
+      /**
+       * Domain ID of NPU. On Linux, this can be queried using lspci.
+       */
+      uint32_t domainId;
+    } npu;
+  } idDev1;
+
+  /**
+   * Flag gives capabilities of the link \see CUpti_LinkFlag
+   */
+  uint32_t flag;
+
+  /**
+   * Number of physical NVLinks present between two devices.
+   */
+  uint32_t  physicalNvLinkCount;
+
+  /**
+   * Port numbers for maximum 32 NVLinks connected to device 0.
+   * If typeDev0 is CUPTI_DEV_TYPE_NPU, ignore this field.
+   * In case of invalid/unknown port number, this field will be set
+   * to value CUPTI_NVLINK_INVALID_PORT.
+   * This will be used to correlate the metric values to individual
+   * physical link and attribute traffic to the logical NVLink in
+   * the topology.
+   */
+  int8_t  portDev0[CUPTI_MAX_NVLINK_PORTS];
+
+  /**
+   * Port numbers for maximum 32 NVLinks connected to device 1.
+   * If typeDev1 is CUPTI_DEV_TYPE_NPU, ignore this field.
+   * In case of invalid/unknown port number, this field will be set
+   * to value CUPTI_NVLINK_INVALID_PORT.
+   * This will be used to correlate the metric values to individual
+   * physical link and attribute traffic to the logical NVLink in
+   * the topology.
+   */
+  int8_t  portDev1[CUPTI_MAX_NVLINK_PORTS];
+
+  /**
+   * Bandwidth of NVLink in kbytes/sec
+   */
+  uint64_t  bandwidth;
+
+  /**
+   * NVSwitch is connected as an intermediate node.
+   */
+  uint8_t nvswitchConnected;
+
+  /**
+   * Undefined. Reserved for internal use.
+   */
+  uint8_t pad[7];
+} CUpti_ActivityNvLink4;
+
+#define CUPTI_MAX_GPUS 32 /**< Length of the peerDev array in CUpti_ActivityPcie. */
+/**
+ * \brief Field to differentiate whether a PCIE activity record
+ * is for a GPU or a PCI bridge.
+ */
+typedef enum {
+    /**
+     * PCIE GPU record
+     */
+    CUPTI_PCIE_DEVICE_TYPE_GPU       = 0,
+
+    /**
+     * PCIE Bridge record
+     */
+    CUPTI_PCIE_DEVICE_TYPE_BRIDGE    = 1,
+
+    CUPTI_PCIE_DEVICE_TYPE_FORCE_INT = 0x7fffffff
+} CUpti_PcieDeviceType;
+
+/**
+ * \brief PCI devices information required to construct topology
+ *
+ * This structure gives capabilities of GPU and PCI bridge connected to the PCIE bus
+ * which can be used to understand the topology.
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_PCIE.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * Type of device in topology, \ref CUpti_PcieDeviceType. If type is
+   * CUPTI_PCIE_DEVICE_TYPE_GPU, use id.devId and attr.gpuAttr; if type is
+   * CUPTI_PCIE_DEVICE_TYPE_BRIDGE, use id.bridgeId and attr.bridgeAttr.
+   */
+  CUpti_PcieDeviceType type;
+
+  /**
+   * A unique identifier for GPU or Bridge in Topology
+   */
+  union {
+    /**
+     * GPU device ID
+     */
+    CUdevice devId;
+
+    /**
+     * A unique identifier for Bridge in the Topology
+     */
+    uint32_t bridgeId;
+  } id;
+
+  /**
+   * Domain for the GPU or Bridge, required to identify which PCIE bus it belongs to in
+   * multiple NUMA systems.
+   */
+  uint32_t domain;
+
+  /**
+   * PCIE Generation of GPU or Bridge. \see CUpti_PcieGen
+   */
+  uint16_t pcieGeneration;
+
+  /**
+   * Link rate of the GPU or bridge in gigatransfers per second (GT/s)
+   */
+  uint16_t linkRate;
+
+  /**
+   * Link width of the GPU or bridge
+   */
+  uint16_t linkWidth;
+
+  /**
+   * Upstream bus ID for the GPU or PCI bridge. Required to identify which bus it is
+   * connected to in the topology.
+   */
+  uint16_t upstreamBus;
+
+  /**
+   * Attributes for more information about GPU (gpuAttr) or PCI Bridge (bridgeAttr)
+   */
+  union {
+    struct {
+      /**
+       * UUID for the device. \ref CUpti_ActivityDevice5.
+       */
+      CUuuid uuidDev;
+
+      /**
+       * CUdevice with which this device has P2P capability.
+       * This can also be obtained by querying cuDeviceCanAccessPeer or
+       * cudaDeviceCanAccessPeer APIs
+       */
+      CUdevice peerDev[CUPTI_MAX_GPUS];
+    } gpuAttr;
+
+    struct {
+      /**
+       * The downstream bus number, used to search downstream devices/bridges connected
+       * to this bridge.
+       */
+      uint16_t secondaryBus;
+
+      /**
+       * Device ID of the bridge
+       */
+      uint16_t deviceId;
+
+      /**
+       * Vendor ID of the bridge
+       */
+      uint16_t vendorId;
+
+      /**
+       * Padding for alignment
+       */
+      uint16_t pad0;
+    } bridgeAttr;
+  } attr;
+} CUpti_ActivityPcie;
+
+/**
+ * \brief PCIE Generation.
+ *
+ * Enumeration of PCIE Generation for
+ * pcie activity attribute pcieGeneration
+ */
+typedef enum {
+  /**
+   * PCIE Generation 1
+   */
+  CUPTI_PCIE_GEN_GEN1       = 1,
+
+  /**
+   * PCIE Generation 2
+   */
+  CUPTI_PCIE_GEN_GEN2       = 2,
+
+  /**
+   * PCIE Generation 3
+   */
+  CUPTI_PCIE_GEN_GEN3       = 3,
+
+  /**
+   * PCIE Generation 4
+   */
+  CUPTI_PCIE_GEN_GEN4       = 4,
+
+  /**
+   * PCIE Generation 5
+   */
+  CUPTI_PCIE_GEN_GEN5       = 5,
+
+  CUPTI_PCIE_GEN_FORCE_INT  = 0x7fffffff
+} CUpti_PcieGen;
+
+/**
+ * \brief The activity record for an instantaneous CUPTI event.
+ *
+ * This activity record represents a CUPTI event value
+ * (CUPTI_ACTIVITY_KIND_INSTANTANEOUS_EVENT) sampled at a particular instant.
+ * This activity record kind is not produced by the activity API but is
+ * included for completeness and ease-of-use. Profiler frameworks built on
+ * top of CUPTI that collect event data at a particular time may choose to
+ * use this type to store the collected event data.
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_INSTANTANEOUS_EVENT.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The event ID.
+   */
+  CUpti_EventID id;
+
+  /**
+   * The event value.
+   */
+  uint64_t value;
+
+  /**
+   * The timestamp at which the event is sampled.
+   */
+  uint64_t timestamp;
+
+  /**
+   * The device id
+   */
+  uint32_t deviceId;
+
+  /**
+   * Undefined. Reserved for internal use.
+   */
+  uint32_t reserved;
+} CUpti_ActivityInstantaneousEvent;
+
+/**
+ * \brief The activity record for an instantaneous CUPTI event
+ * with event domain instance information.
+ *
+ * This activity record represents a CUPTI event value for a
+ * specific event domain instance
+ * (CUPTI_ACTIVITY_KIND_INSTANTANEOUS_EVENT_INSTANCE) sampled at a particular instant.
+ * This activity record kind is not produced by the activity API but is
+ * included for completeness and ease-of-use. Profiler frameworks built on
+ * top of CUPTI that collect event data may choose to use this type to store the
+ * collected event data. This activity record should be used when
+ * event domain instance information needs to be associated with the
+ * event.
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_INSTANTANEOUS_EVENT_INSTANCE.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The event ID.
+   */
+  CUpti_EventID id;
+
+  /**
+   * The event value.
+   */
+  uint64_t value;
+
+  /**
+   * The timestamp at which the event is sampled.
+   */
+  uint64_t timestamp;
+
+  /**
+   * The device id
+   */
+  uint32_t deviceId;
+
+  /**
+   * The event domain instance
+   */
+  uint8_t instance;
+
+  /**
+   * Undefined. Reserved for internal use.
+   */
+  uint8_t pad[3];
+} CUpti_ActivityInstantaneousEventInstance;
+
+/**
+ * \brief The activity record for an instantaneous CUPTI metric.
+ *
+ * This activity record represents the collection of a CUPTI metric
+ * value (CUPTI_ACTIVITY_KIND_INSTANTANEOUS_METRIC) at a particular instant.
+ * This activity record kind is not produced by the activity API but
+ * is included for completeness and ease-of-use. Profiler frameworks built
+ * on top of CUPTI that collect metric data may choose to use this type to
+ * store the collected metric data.
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_INSTANTANEOUS_METRIC.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The metric ID.
+   */
+  CUpti_MetricID id;
+
+  /**
+   * The metric value.
+   */
+  CUpti_MetricValue value;
+
+  /**
+   * The timestamp at which the metric is sampled.
+   */
+  uint64_t timestamp;
+
+  /**
+   * The device ID.
+   */
+  uint32_t deviceId;
+
+  /**
+   * The properties of this metric. \see CUpti_ActivityFlag
+   */
+  uint8_t flags;
+
+  /**
+   * Undefined. Reserved for internal use.
+   */
+  uint8_t pad[3];
+} CUpti_ActivityInstantaneousMetric;
+
+/**
+ * \brief The instantaneous activity record for a CUPTI metric with instance
+ * information.
+ *
+ * This activity record represents a CUPTI metric value
+ * for a specific metric domain instance
+ * (CUPTI_ACTIVITY_KIND_INSTANTANEOUS_METRIC_INSTANCE) sampled at a particular
+ * time. This activity record kind is not produced by the activity API but is
+ * included for completeness and ease-of-use. Profiler frameworks built on top of
+ * CUPTI that collect metric data may choose to use this type to store
+ * the collected metric data. This activity record should be used when
+ * metric domain instance information needs to be associated with the
+ * metric.
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_INSTANTANEOUS_METRIC_INSTANCE.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The metric ID.
+   */
+  CUpti_MetricID id;
+
+  /**
+   * The metric value.
+   */
+  CUpti_MetricValue value;
+
+  /**
+   * The timestamp at which the metric is sampled.
+   */
+  uint64_t timestamp;
+
+  /**
+   * The device ID.
+   */
+  uint32_t deviceId;
+
+  /**
+   * The properties of this metric. \see CUpti_ActivityFlag
+   */
+  uint8_t flags;
+
+  /**
+   * The metric domain instance.
+   */
+  uint8_t instance;
+
+  /**
+   * Undefined. Reserved for internal use.
+   */
+  uint8_t pad[2];
+} CUpti_ActivityInstantaneousMetricInstance;
+
+/**
+ * \brief The types of JIT entry.
+ *
+ * To be used in CUpti_ActivityJit.
+ */
+typedef enum {
+  CUPTI_ACTIVITY_JIT_ENTRY_INVALID= 0,
+
+  /**
+   * PTX to CUBIN.
+   */
+  CUPTI_ACTIVITY_JIT_ENTRY_PTX_TO_CUBIN = 1,
+
+  /**
+   * NVVM-IR to PTX.
+   */
+  CUPTI_ACTIVITY_JIT_ENTRY_NVVM_IR_TO_PTX = 2,
+
+  CUPTI_ACTIVITY_JIT_ENTRY_TYPE_FORCE_INT = 0x7fffffff
+} CUpti_ActivityJitEntryType;
+
+/**
+ * \brief The types of JIT compilation operations.
+ *
+ * To be used in CUpti_ActivityJit.
+ */
+
+typedef enum {
+  CUPTI_ACTIVITY_JIT_OPERATION_INVALID = 0,
+  /**
+   * Loaded from the compute cache.
+   */
+  CUPTI_ACTIVITY_JIT_OPERATION_CACHE_LOAD = 1,
+
+  /**
+   * Stored in the compute cache.
+   */
+  CUPTI_ACTIVITY_JIT_OPERATION_CACHE_STORE = 2,
+
+  /**
+   * JIT compilation.
+   */
+  CUPTI_ACTIVITY_JIT_OPERATION_COMPILE = 3,
+
+  CUPTI_ACTIVITY_JIT_OPERATION_TYPE_FORCE_INT = 0x7fffffff
+} CUpti_ActivityJitOperationType;
+
+/**
+ * \brief The activity record for JIT operations.
+ *
+ * This activity represents the JIT operations (compile, load, store) of a CUmodule
+ * from the Compute Cache.
+ * Gives the exact hashed path of where the cached module is loaded from,
+ * or where the module will be stored after Just-In-Time (JIT) compilation.
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_JIT.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The JIT entry type.
+   */
+  CUpti_ActivityJitEntryType jitEntryType;
+
+  /**
+   * The JIT operation type.
+   */
+  CUpti_ActivityJitOperationType jitOperationType;
+
+  /**
+   * The device ID.
+   */
+  uint32_t deviceId;
+
+  /**
+   * The start timestamp for the JIT operation, in ns. A value of 0 for
+   * both the start and end timestamps indicates that timestamp
+   * information could not be collected for the JIT operation.
+   */
+  uint64_t start;
+
+  /**
+   * The end timestamp for the JIT operation, in ns. A value of 0 for both
+   * the start and end timestamps indicates that timestamp information
+   * could not be collected for the JIT operation.
+   */
+  uint64_t end;
+
+  /**
+   * The correlation ID of the JIT operation to which this
+   * record belongs. Each JIT operation is
+   * assigned a unique correlation ID that is identical to the
+   * correlation ID in the driver or runtime API activity record that
+   * launched the JIT operation.
+   */
+  uint32_t correlationId;
+
+  /**
+   * Internal use.
+   */
+  uint32_t padding;
+
+  /**
+   * The correlation ID to correlate JIT compilation, load and store operations.
+   * Each JIT compilation unit is assigned a unique correlation ID
+   * at the time of the JIT compilation. This correlation ID can be used
+   * to find the matching JIT cache load/store records.
+   */
+  uint64_t jitOperationCorrelationId;
+
+  /**
+   * The size of the compute cache.
+   */
+  uint64_t cacheSize;
+
+  /**
+   * The path where the fat binary is cached.
+   */
+  const char* cachePath;
+
+  /**
+   * The ID of the process where the JIT operation is executing.
+   */
+  uint32_t processId;
+
+  /**
+   * The ID of the thread where the JIT operation is executing.
+   */
+  uint32_t threadId;
+} CUpti_ActivityJit2;
+
+
+/**
+ * \brief The activity record for trace of graph execution.
+ *
+ * This activity record represents execution for a graph without giving visibility
+ * about the execution of its nodes. This is intended to reduce overheads in tracing
+ * each node. The activity kind is CUPTI_ACTIVITY_KIND_GRAPH_TRACE.
+ */
+/* NOTE(review): unlike the other activity records declared nearby, this struct
+ * is declared without PACKED_ALIGNMENT -- confirm this is intentional. */
+typedef struct {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_GRAPH_TRACE.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The correlation ID of the graph launch. Each graph launch is
+   * assigned a unique correlation ID that is identical to the
+   * correlation ID in the driver API activity record that launched
+   * the graph.
+   */
+  uint32_t correlationId;
+
+  /**
+   * The start timestamp for the graph execution, in ns. A value of 0
+   * for both the start and end timestamps indicates that timestamp
+   * information could not be collected for the graph.
+   */
+  uint64_t start;
+
+  /**
+   * The end timestamp for the graph execution, in ns. A value of 0
+   * for both the start and end timestamps indicates that timestamp
+   * information could not be collected for the graph.
+   */
+  uint64_t end;
+
+  /**
+   * The ID of the device where the first node of the graph is executed.
+   */
+  uint32_t deviceId;
+
+  /**
+   * The unique ID of the graph that is launched.
+   */
+  uint32_t graphId;
+
+  /**
+   * The ID of the context where the first node of the graph is executed.
+   */
+  uint32_t contextId;
+
+  /**
+   * The ID of the stream where the graph is being launched.
+   */
+  uint32_t streamId;
+
+  /**
+   * This field is reserved for internal use.
+   */
+  void *reserved;
+
+  /**
+   * The ID of the device where the last node of the graph is executed.
+   */
+  uint32_t endDeviceId;
+
+  /**
+   * The ID of the context where the last node of the graph is executed.
+   */
+  uint32_t endContextId;
+} CUpti_ActivityGraphTrace2;
+
+END_PACKED_ALIGNMENT
+
+/**
+ * \brief Activity attributes.
+ *
+ * These attributes are used to control the behavior of the activity
+ * API.
+ */
+typedef enum {
+    /**
+     * The device memory size (in bytes) reserved for storing profiling data for concurrent
+     * kernels (activity kind \ref CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL), memcopies and memsets
+     * for each buffer on a context. The value is a size_t.
+     *
+     * There is a limit on how many device buffers can be allocated per context. User
+     * can query and set this limit using the attribute
+     * \ref CUPTI_ACTIVITY_ATTR_DEVICE_BUFFER_POOL_LIMIT.
+     * CUPTI doesn't pre-allocate all the buffers; it pre-allocates only as many
+     * buffers as set by the attribute \ref CUPTI_ACTIVITY_ATTR_DEVICE_BUFFER_PRE_ALLOCATE_VALUE.
+     * When all of the data in a buffer is consumed, it is added in the reuse pool, and
+     * CUPTI picks a buffer from this pool when a new buffer is needed. Thus memory
+     * footprint does not scale with the kernel count. Applications with a high density
+     * of kernels, memcopies and memsets might cause CUPTI to allocate more device buffers.
+     * CUPTI allocates another buffer only when it runs out of the buffers in the
+     * reuse pool.
+     *
+     * Since buffer allocation happens in the main application thread, this might result
+     * in stalls in the critical path. CUPTI pre-allocates 3 buffers of the same size to
+     * mitigate this issue. User can query and set the pre-allocation limit using the
+     * attribute \ref CUPTI_ACTIVITY_ATTR_DEVICE_BUFFER_PRE_ALLOCATE_VALUE.
+     *
+     * Having larger buffer size leaves less device memory for the application.
+     * Having smaller buffer size increases the risk of dropping timestamps for
+     * records if too many kernels or memcopies or memsets are launched at one time.
+     *
+     * This value only applies to new buffer allocations. Set this value before initializing
+     * CUDA or before creating a context to ensure it is considered for the following allocations.
+     *
+     * The default value is 3200000 (~3MB) which can accommodate profiling data
+     * up to 100,000 kernels, memcopies and memsets combined.
+     *
+     * Note: Starting with the CUDA 12.0 Update 1 release, CUPTI allocates profiling buffer in the
+     * device memory by default as this might help in improving the performance of the
+     * tracing run. Refer to the description of the attribute
+     * \ref CUPTI_ACTIVITY_ATTR_MEM_ALLOCATION_TYPE_HOST_PINNED for more details.
+     * Size of the memory and maximum number of pools are still controlled by the attributes
+     * \ref CUPTI_ACTIVITY_ATTR_DEVICE_BUFFER_SIZE and \ref CUPTI_ACTIVITY_ATTR_DEVICE_BUFFER_POOL_LIMIT.
+     *
+     * Note: The actual amount of device memory per buffer reserved by CUPTI might be larger.
+     */
+    CUPTI_ACTIVITY_ATTR_DEVICE_BUFFER_SIZE                      = 0,
+
+    /**
+     * The device memory size (in bytes) reserved for storing profiling
+     * data for CDP operations for each buffer on a context. The
+     * value is a size_t.
+     *
+     * Having larger buffer size means less flush operations but
+     * consumes more device memory. This value only applies to new
+     * allocations.
+     *
+     * Set this value before initializing CUDA or before creating a
+     * context to ensure it is considered for the following allocations.
+     *
+     * The default value is 8388608 (8MB).
+     *
+     * Note: The actual amount of device memory per context reserved by
+     * CUPTI might be larger.
+     */
+    CUPTI_ACTIVITY_ATTR_DEVICE_BUFFER_SIZE_CDP          = 1,
+
+    /**
+     * The maximum number of device memory buffers per context. The value is a size_t.
+     *
+     * For an application with high rate of kernel launches, memcopies and memsets having a bigger pool
+     * limit helps in timestamp collection for all these activities at the expense of a larger memory footprint.
+     * Refer to the description of the attribute \ref CUPTI_ACTIVITY_ATTR_DEVICE_BUFFER_SIZE
+     * for more details.
+     *
+     * Setting this value will not modify the number of memory buffers
+     * currently stored.
+     *
+     * Set this value before initializing CUDA to ensure the limit is
+     * not exceeded.
+     *
+     * The default value is 250.
+     */
+    CUPTI_ACTIVITY_ATTR_DEVICE_BUFFER_POOL_LIMIT                = 2,
+
+    /**
+     * This attribute is not supported starting with CUDA 12.3.
+     * CUPTI no longer uses profiling semaphore pool to store profiling data.
+     *
+     * There is a limit on how many semaphore pools can be allocated per context. User
+     * can query and set this limit using the attribute
+     * \ref CUPTI_ACTIVITY_ATTR_PROFILING_SEMAPHORE_POOL_LIMIT.
+     * CUPTI doesn't pre-allocate all the semaphore pools; it pre-allocates only as many
+     * semaphore pools as set by the attribute \ref CUPTI_ACTIVITY_ATTR_PROFILING_SEMAPHORE_PRE_ALLOCATE_VALUE.
+     * When all of the data in a semaphore pool is consumed, it is added in the reuse pool, and
+     * CUPTI picks a semaphore pool from the reuse pool when a new semaphore pool is needed. Thus memory
+     * footprint does not scale with the kernel count. Applications with a high density
+     * of kernels might cause CUPTI to allocate more semaphore pools.
+     * CUPTI allocates another semaphore pool only when it runs out of the semaphore pools in the
+     * reuse pool.
+     *
+     * Since semaphore pool allocation happens in the main application thread, this might result
+     * in stalls in the critical path. CUPTI pre-allocates 3 semaphore pools of the same size to
+     * mitigate this issue. User can query and set the pre-allocation limit using the
+     * attribute \ref CUPTI_ACTIVITY_ATTR_PROFILING_SEMAPHORE_PRE_ALLOCATE_VALUE.
+     *
+     * Having larger semaphore pool size leaves less device memory for the application.
+     * Having smaller semaphore pool size increases the risk of dropping timestamps for
+     * kernel records if too many kernels are issued/launched at one time.
+     *
+     * This value only applies to new semaphore pool allocations. Set this value before initializing
+     * CUDA or before creating a context to ensure it is considered for the following allocations.
+     *
+     * The default value is 25000 which can accommodate profiling data for up to 25,000 kernels.
+     */
+    CUPTI_ACTIVITY_ATTR_PROFILING_SEMAPHORE_POOL_SIZE           = 3,
+
+    /**
+     * This attribute is not supported starting with CUDA 12.3.
+     * CUPTI no longer uses profiling semaphore pool to store profiling data.
+     *
+     * The maximum number of profiling semaphore pools per context. The value is a size_t.
+     *
+     * Refer to the description of the attribute \ref CUPTI_ACTIVITY_ATTR_PROFILING_SEMAPHORE_POOL_SIZE
+     * for more details.
+     *
+     * Set this value before initializing CUDA to ensure the limit is not exceeded.
+     *
+     * The default value is 250.
+     */
+    CUPTI_ACTIVITY_ATTR_PROFILING_SEMAPHORE_POOL_LIMIT          = 4,
+
+    /**
+     * The flag to indicate whether user should provide activity buffer of zero value.
+     * The value is a uint8_t.
+     *
+     * If the value of this attribute is non-zero, user should provide
+     * a zero value buffer in the \ref CUpti_BuffersCallbackRequestFunc.
+     * If the user does not provide a zero value buffer after setting this to non-zero,
+     * the activity buffer may contain some uninitialized values when CUPTI returns it in
+     * \ref CUpti_BuffersCallbackCompleteFunc.
+     *
+     * If the value of this attribute is zero, CUPTI will initialize the user buffer
+     * received in the \ref CUpti_BuffersCallbackRequestFunc to zero before filling it.
+     * If the user sets this to zero, a few stalls may appear in critical path because CUPTI
+     * will zero out the buffer in the main thread.
+     * Set this value before returning from \ref CUpti_BuffersCallbackRequestFunc to
+     * ensure it is considered for all the subsequent user buffers.
+     *
+     * The default value is 0.
+     */
+    CUPTI_ACTIVITY_ATTR_ZEROED_OUT_ACTIVITY_BUFFER              = 5,
+
+    /**
+     * Number of device buffers to pre-allocate for a context during the initialization phase.
+     * The value is a size_t.
+     *
+     * Refer to the description of the attribute \ref CUPTI_ACTIVITY_ATTR_DEVICE_BUFFER_SIZE
+     * for details.
+     *
+     * This value must be less than the maximum number of device buffers set using
+     * the attribute \ref CUPTI_ACTIVITY_ATTR_DEVICE_BUFFER_POOL_LIMIT.
+     *
+     * Set this value before initializing CUDA or before creating a context to ensure it
+     * is considered by the CUPTI.
+     *
+     * The default value is set to 3 to ping pong between these buffers (if possible).
+     */
+    CUPTI_ACTIVITY_ATTR_DEVICE_BUFFER_PRE_ALLOCATE_VALUE        = 6,
+
+    /**
+     * This attribute is not supported starting with CUDA 12.3.
+     * CUPTI no longer uses profiling semaphore pool to store profiling data.
+     *
+     * Number of profiling semaphore pools to pre-allocate for a context during the
+     * initialization phase. The value is a size_t.
+     *
+     * Refer to the description of the attribute \ref CUPTI_ACTIVITY_ATTR_PROFILING_SEMAPHORE_POOL_SIZE
+     * for details.
+     *
+     * This value must be less than the maximum number of profiling semaphore pools set
+     * using the attribute \ref CUPTI_ACTIVITY_ATTR_PROFILING_SEMAPHORE_POOL_LIMIT.
+     *
+     * Set this value before initializing CUDA or before creating a context to ensure it
+     * is considered by the CUPTI.
+     *
+     * The default value is set to 3 to ping pong between these pools (if possible).
+     */
+    CUPTI_ACTIVITY_ATTR_PROFILING_SEMAPHORE_PRE_ALLOCATE_VALUE  = 7,
+
+    /**
+     * Allocate page-locked (pinned) host memory for storing profiling data for concurrent
+     * kernels, memcopies and memsets for each buffer on a context. The value is a uint8_t.
+     *
+     * Starting with the CUDA 11.2 release, CUPTI allocates profiling buffer in the pinned host
+     * memory by default as this might help in improving the performance of the tracing run.
+     * Allocating excessive amounts of pinned memory may degrade system performance, since it
+     * reduces the amount of memory available to the system for paging. For this reason user
+     * might want to change the location from pinned host memory to device memory by setting
+     * value of this attribute to 0.
+     *
+     * Using page-locked (pinned) host memory buffers is not supported on confidential computing
+     * devices. On setting this attribute to 1, CUPTI will return CUPTI_ERROR_NOT_SUPPORTED.
+     *
+     * The default value is 1.
+     */
+    CUPTI_ACTIVITY_ATTR_MEM_ALLOCATION_TYPE_HOST_PINNED         = 8,
+
+    /**
+     * Request activity buffers per-thread to store CUPTI activity records
+     * in the activity buffer on per-thread basis. The value is a uint8_t.
+     *
+     * The attribute should be set before registering the buffer callbacks using
+     * cuptiActivityRegisterCallbacks API and before any of the CUPTI activity kinds are enabled.
+     * This makes sure that all the records are stored in activity buffers allocated per-thread.
+     * Changing this attribute in the middle of the profiling session will result in undefined behavior.
+     *
+     * The default value is 0.
+     *
+     * Note: this enumerator has no explicit initializer, so it takes the value that
+     * follows \ref CUPTI_ACTIVITY_ATTR_MEM_ALLOCATION_TYPE_HOST_PINNED (i.e. 9).
+     */
+    CUPTI_ACTIVITY_ATTR_PER_THREAD_ACTIVITY_BUFFER,
+
+
+
+    CUPTI_ACTIVITY_ATTR_DEVICE_BUFFER_FORCE_INT                 = 0x7fffffff
+} CUpti_ActivityAttribute;
+
+/**
+ * \brief Thread-Id types.
+ *
+ * CUPTI uses different methods to obtain the thread-id depending on the
+ * support and the underlying platform. This enum documents these methods
+ * for each type. APIs \ref cuptiSetThreadIdType and \ref cuptiGetThreadIdType
+ * can be used to set and get the thread-id type.
+ */
+typedef enum {
+    /**
+     * Default type.
+     * Windows uses API GetCurrentThreadId().
+     * Linux/Mac/Android/QNX use POSIX pthread API pthread_self().
+     */
+    CUPTI_ACTIVITY_THREAD_ID_TYPE_DEFAULT       = 0,
+
+    /**
+     * This type is based on the system API available on the underlying platform
+     * and the thread-id obtained is supposed to be unique for the process lifetime.
+     * Windows uses API GetCurrentThreadId().
+     * Linux uses syscall SYS_gettid.
+     * Mac uses syscall SYS_thread_selfid.
+     * Android/QNX use gettid().
+     */
+    CUPTI_ACTIVITY_THREAD_ID_TYPE_SYSTEM        = 1,
+
+    /**
+     * Add new enums before this field.
+     */
+    CUPTI_ACTIVITY_THREAD_ID_TYPE_SIZE          = 2,
+
+    CUPTI_ACTIVITY_THREAD_ID_TYPE_FORCE_INT     = 0x7fffffff
+} CUpti_ActivityThreadIdType;
+
+/**
+ * \brief Get the CUPTI timestamp.
+ *
+ * Returns a timestamp normalized to correspond with the start and end
+ * timestamps reported in the CUPTI activity records. The timestamp is
+ * reported in nanoseconds.
+ *
+ * \param timestamp Returns the CUPTI timestamp, in nanoseconds
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p timestamp is NULL
+ */
+CUptiResult CUPTIAPI cuptiGetTimestamp(uint64_t *timestamp);
+
+/**
+ * \brief Get the ID of a context.
+ *
+ * Get the ID of a context.
+ *
+ * \param context The context
+ * \param contextId Returns a process-unique ID for the context
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_NOT_INITIALIZED
+ * \retval CUPTI_ERROR_INVALID_CONTEXT if \p context is NULL or not valid
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p contextId is NULL
+ */
+CUptiResult CUPTIAPI cuptiGetContextId(CUcontext context, uint32_t *contextId);
+
+/**
+ * \brief Get the ID of a stream.
+ *
+ * Get the ID of a stream. The stream ID is unique within a context
+ * (i.e. all streams within a context will have unique stream
+ * IDs).
+ *
+ * \param context If non-NULL then the stream is checked to ensure
+ * that it belongs to this context. Typically this parameter should be
+ * NULL.
+ * \param stream The stream
+ * \param streamId Returns a context-unique ID for the stream
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_NOT_INITIALIZED
+ * \retval CUPTI_ERROR_INVALID_STREAM if unable to get stream ID, or
+ * if \p context is non-NULL and \p stream does not belong to the
+ * context
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p streamId is NULL
+ *
+ * **DEPRECATED** This method is deprecated as of CUDA 8.0.
+ * Use method \ref cuptiGetStreamIdEx instead.
+ */
+CUptiResult CUPTIAPI cuptiGetStreamId(CUcontext context, CUstream stream, uint32_t *streamId);
+
+/**
+ * \brief Get the ID of a stream.
+ *
+ * Get the ID of a stream. The stream ID is unique within a context
+ * (i.e. all streams within a context will have unique stream
+ * IDs).
+ *
+ * \param context If non-NULL then the stream is checked to ensure
+ * that it belongs to this context. Typically this parameter should be
+ * NULL.
+ * \param stream The stream
+ * \param perThreadStream Flag to indicate if program is compiled for per-thread streams
+ * \param streamId Returns a context-unique ID for the stream
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_NOT_INITIALIZED
+ * \retval CUPTI_ERROR_INVALID_STREAM if unable to get stream ID, or
+ * if \p context is non-NULL and \p stream does not belong to the
+ * context
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p streamId is NULL
+ */
+CUptiResult CUPTIAPI cuptiGetStreamIdEx(CUcontext context, CUstream stream, uint8_t perThreadStream, uint32_t *streamId);
+
+/**
+ * \brief Get the ID of a device.
+ *
+ * If \p context is NULL, returns the ID of the device that contains
+ * the currently active context. If \p context is non-NULL, returns
+ * the ID of the device which contains that context. Operates in a
+ * similar manner to cudaGetDevice() or cuCtxGetDevice() but may be
+ * called from within callback functions.
+ *
+ * \param context The context, or NULL to indicate the current context.
+ * \param deviceId Returns the ID of the device that contains \p context,
+ * or the currently active context when \p context is NULL.
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_NOT_INITIALIZED
+ * \retval CUPTI_ERROR_INVALID_DEVICE if unable to get device ID
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p deviceId is NULL
+ */
+CUptiResult CUPTIAPI cuptiGetDeviceId(CUcontext context, uint32_t *deviceId);
+
+/**
+ * \brief Get the unique ID of a graph node.
+ *
+ * Returns the unique ID of the CUDA graph node.
+ *
+ * \param node The graph node.
+ * \param nodeId Returns the unique ID of the node.
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_NOT_INITIALIZED
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p node is NULL
+ */
+CUptiResult CUPTIAPI cuptiGetGraphNodeId(CUgraphNode node, uint64_t *nodeId);
+
+/**
+ * \brief Get the unique ID of a graph.
+ *
+ * Returns the unique ID of a CUDA graph.
+ *
+ * \param graph The graph.
+ * \param pId Returns the unique ID of the graph.
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_NOT_INITIALIZED
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p graph is NULL
+ */
+CUptiResult CUPTIAPI cuptiGetGraphId(CUgraph graph, uint32_t *pId);
+
+/**
+ * \brief Get the unique ID of an executable graph.
+ *
+ * Returns the unique ID of an executable CUDA graph.
+ *
+ * \param graphExec The executable graph.
+ * \param pId Returns the unique ID of the executable graph.
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_NOT_INITIALIZED
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p graphExec is NULL
+ */
+CUptiResult CUPTIAPI cuptiGetGraphExecId(CUgraphExec graphExec, uint32_t *pId);
+
+/**
+ * \brief Enable collection of a specific kind of activity record.
+ *
+ * Enable collection of a specific kind of activity record. Multiple
+ * kinds can be enabled by calling this function multiple times. By
+ * default all activity kinds are disabled for collection.
+ *
+ * \param kind The kind of activity record to collect
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_NOT_INITIALIZED
+ * \retval CUPTI_ERROR_NOT_COMPATIBLE if the activity kind cannot be enabled
+ * \retval CUPTI_ERROR_INVALID_KIND if the activity kind is not supported
+ *
+ * \see cuptiActivityDisable
+ * \see cuptiActivityEnableContext
+ */
+CUptiResult CUPTIAPI cuptiActivityEnable(CUpti_ActivityKind kind);
+
+/**
+ * \brief Enable collection of a specific kind of activity record. For certain activity kinds
+ * it dumps existing records.
+ *
+ * In general, the behavior of this API is similar to the API \ref cuptiActivityEnable i.e. it
+ * enables the collection of a specific kind of activity record.
+ * Additionally, this API can help in dumping the records for activities which happened in
+ * the past before enabling the corresponding activity kind.
+ * The API allows to get records for the current resource allocations done in CUDA:
+ * For CUPTI_ACTIVITY_KIND_DEVICE, existing device records are dumped.
+ * For CUPTI_ACTIVITY_KIND_CONTEXT, existing context records are dumped.
+ * For CUPTI_ACTIVITY_KIND_STREAM, existing stream records are dumped.
+ * For CUPTI_ACTIVITY_KIND_NVLINK, existing NVLINK records are dumped.
+ * For CUPTI_ACTIVITY_KIND_PCIE, existing PCIE records are dumped.
+ * For other activities, the behavior is similar to the API \ref cuptiActivityEnable.
+ *
+ * Device records are emitted in CUPTI on CUDA driver initialization. Those records
+ * can only be retrieved by the user if CUPTI is attached before CUDA initialization.
+ * Context and stream records are emitted on context and stream creation.
+ * The use case of the API is to provide the records for CUDA resources
+ * (contexts/streams/devices) that are currently active if user late attaches CUPTI.
+ *
+ * Before calling this function, the user must register buffer callbacks
+ * to get the activity records by calling \ref cuptiActivityRegisterCallbacks.
+ * If the user does not register the buffers and calls API \ref cuptiActivityEnableAndDump,
+ * then CUPTI will enable the activity kind but not provide any records for that
+ * activity kind.
+ *
+ * \param kind The kind of activity record to collect
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_NOT_INITIALIZED
+ * \retval CUPTI_ERROR_UNKNOWN if buffer is not initialized.
+ * \retval CUPTI_ERROR_NOT_COMPATIBLE if the activity kind cannot be enabled
+ * \retval CUPTI_ERROR_INVALID_KIND if the activity kind is not supported
+ */
+CUptiResult CUPTIAPI cuptiActivityEnableAndDump(CUpti_ActivityKind kind);
+
+/**
+ * \brief Disable collection of a specific kind of activity record.
+ *
+ * Disable collection of a specific kind of activity record. Multiple
+ * kinds can be disabled by calling this function multiple times. By
+ * default all activity kinds are disabled for collection.
+ *
+ * \param kind The kind of activity record to stop collecting
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_NOT_INITIALIZED
+ * \retval CUPTI_ERROR_INVALID_KIND if the activity kind is not supported
+ *
+ * \see cuptiActivityEnable
+ * \see cuptiActivityDisableContext
+ */
+CUptiResult CUPTIAPI cuptiActivityDisable(CUpti_ActivityKind kind);
+
+/**
+ * \brief Enable collection of a specific kind of activity record for
+ * a context.
+ *
+ * Enable collection of a specific kind of activity record for a
+ * context. The setting done by this API supersedes the global
+ * settings for activity records enabled by \ref cuptiActivityEnable.
+ * Multiple kinds can be enabled by calling this function multiple
+ * times.
+ *
+ * \param context The context for which activity is to be enabled
+ * \param kind The kind of activity record to collect
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_NOT_INITIALIZED
+ * \retval CUPTI_ERROR_NOT_COMPATIBLE if the activity kind cannot be enabled
+ * \retval CUPTI_ERROR_INVALID_KIND if the activity kind is not supported
+ */
+CUptiResult CUPTIAPI cuptiActivityEnableContext(CUcontext context, CUpti_ActivityKind kind);
+
+/**
+ * \brief Disable collection of a specific kind of activity record for
+ * a context.
+ *
+ * Disable collection of a specific kind of activity record for a context.
+ * The setting done by this API supersedes the global settings
+ * for activity records.
+ * Multiple kinds can be disabled by calling this function multiple times.
+ *
+ * \param context The context for which activity is to be disabled
+ * \param kind The kind of activity record to stop collecting
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_NOT_INITIALIZED
+ * \retval CUPTI_ERROR_INVALID_KIND if the activity kind is not supported
+ */
+CUptiResult CUPTIAPI cuptiActivityDisableContext(CUcontext context, CUpti_ActivityKind kind);
+
+/**
+ * \brief Get the number of activity records that were dropped because of
+ * insufficient buffer space.
+ *
+ * Get the number of records that were dropped because of insufficient
+ * buffer space.  The dropped count includes records that could not be
+ * recorded because CUPTI did not have activity buffer space available
+ * for the record (because the CUpti_BuffersCallbackRequestFunc
+ * callback did not return an empty buffer of sufficient size) and
+ * also CDP records that could not be recorded because the device-size
+ * buffer was full (size is controlled by the
+ * CUPTI_ACTIVITY_ATTR_DEVICE_BUFFER_SIZE_CDP attribute). The dropped
+ * count maintained for the queue is reset to zero when this function
+ * is called.
+ *
+ * \param context The context, or NULL to get dropped count from global queue
+ * \param streamId The stream ID
+ * \param dropped The number of records that were dropped since the last call
+ * to this function.
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_NOT_INITIALIZED
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p dropped is NULL
+ */
+CUptiResult CUPTIAPI cuptiActivityGetNumDroppedRecords(CUcontext context, uint32_t streamId,
+                                                       size_t *dropped);
+
+/**
+ * \brief Iterate over the activity records in a buffer.
+ *
+ * This is a helper function to iterate over the activity records in a
+ * buffer. A buffer of activity records is typically obtained by
+ * receiving a CUpti_BuffersCallbackCompleteFunc callback. Stop iterating
+ * the buffer when an error occurs.
+ *
+ * An example of typical usage:
+ * \code
+ * CUpti_Activity *record = NULL;
+ * CUptiResult status = CUPTI_SUCCESS;
+ * do {
+ *     status = cuptiActivityGetNextRecord(buffer, validSize, &record);
+ *     if (status == CUPTI_SUCCESS) {
+ *         // Use record here...
+ *     }
+ *     else if (status == CUPTI_ERROR_MAX_LIMIT_REACHED)
+ *         break;
+ *     else if (status == CUPTI_ERROR_INVALID_KIND)
+ *         break;
+ *     else {
+ *         goto Error;
+ *     }
+ * } while (1);
+ * \endcode
+ *
+ * \param buffer The buffer containing activity records
+ * \param validBufferSizeBytes The number of valid bytes in the buffer.
+ * \param record Inputs the previous record returned by
+ * cuptiActivityGetNextRecord and returns the next activity record
+ * from the buffer. If input value is NULL, returns the first activity
+ * record in the buffer. Records of certain kinds like CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL
+ * may contain invalid (0) timestamps, indicating that no timing information could
+ * be collected for lack of device memory.
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_NOT_INITIALIZED
+ * \retval CUPTI_ERROR_MAX_LIMIT_REACHED if no more records in the buffer
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p buffer is NULL.
+ * \retval CUPTI_ERROR_INVALID_KIND if activity record is either incomplete or invalid
+ */
+CUptiResult CUPTIAPI cuptiActivityGetNextRecord(uint8_t* buffer, size_t validBufferSizeBytes,
+                                                CUpti_Activity **record);
+
+/**
+ * \brief Function type for callback used by CUPTI to request an empty
+ * buffer for storing activity records.
+ *
+ * This callback function signals the CUPTI client that an activity
+ * buffer is needed by CUPTI. The activity buffer is used by CUPTI to
+ * store activity records. The callback function can decline the
+ * request by setting \p *buffer to NULL. In this case CUPTI may drop
+ * activity records.
+ *
+ * \param buffer Returns the new buffer. If set to NULL then no buffer
+ * is returned.
+ * \param size Returns the size of the returned buffer.
+ * \param maxNumRecords Returns the maximum number of records that
+ * should be placed in the buffer. If 0 then the buffer is filled with
+ * as many records as possible. If > 0 the buffer is filled with at
+ * most that many records before it is returned.
+ */
+typedef void (CUPTIAPI *CUpti_BuffersCallbackRequestFunc)(
+    uint8_t **buffer,
+    size_t *size,
+    size_t *maxNumRecords);
+
+/**
+ * \brief Function type for callback used by CUPTI to return a buffer
+ * of activity records.
+ *
+ * This callback function returns to the CUPTI client a buffer
+ * containing activity records.  The buffer contains \p validSize
+ * bytes of activity records which should be read using
+ * cuptiActivityGetNextRecord. The number of dropped records can be
+ * read using cuptiActivityGetNumDroppedRecords. After this call CUPTI
+ * relinquishes ownership of the buffer and will not use it
+ * anymore. The client may return the buffer to CUPTI using the
+ * CUpti_BuffersCallbackRequestFunc callback.
+ * Note: CUDA 6.0 onwards, all buffers returned by this callback are
+ * global buffers i.e. there is no context/stream specific buffer.
+ * User needs to parse the global buffer to extract the context/stream
+ * specific activity records.
+ *
+ * \param context The context this buffer is associated with. If NULL, the
+ * buffer is associated with the global activities. This field is deprecated
+ * as of CUDA 6.0 and will always be NULL.
+ * \param streamId The stream id this buffer is associated with.
+ * This field is deprecated as of CUDA 6.0 and will always be NULL.
+ * \param buffer The activity record buffer.
+ * \param size The total size of the buffer in bytes as set in
+ * CUpti_BuffersCallbackRequestFunc.
+ * \param validSize The number of valid bytes in the buffer.
+ */
+typedef void (CUPTIAPI *CUpti_BuffersCallbackCompleteFunc)(
+    CUcontext context,
+    uint32_t streamId,
+    uint8_t *buffer,
+    size_t size,
+    size_t validSize);
+
+/**
+ * \brief Registers callback functions with CUPTI for activity buffer
+ * handling.
+ *
+ * This function registers two callback functions to be used in asynchronous
+ * buffer handling. If registered, activity record buffers are handled using
+ * asynchronous requested/completed callbacks from CUPTI.
+ *
+ * Registering these callbacks prevents the client from using CUPTI's
+ * blocking enqueue/dequeue functions.
+ *
+ * \param funcBufferRequested callback which is invoked when an empty
+ * buffer is requested by CUPTI
+ * \param funcBufferCompleted callback which is invoked when a buffer
+ * containing activity records is available from CUPTI
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if either \p
+ * funcBufferRequested or \p funcBufferCompleted is NULL
+ */
+CUptiResult CUPTIAPI cuptiActivityRegisterCallbacks(CUpti_BuffersCallbackRequestFunc funcBufferRequested,
+        CUpti_BuffersCallbackCompleteFunc funcBufferCompleted);
+
+/**
+ * \brief Wait for all activity records to be delivered via the
+ * completion callback.
+ *
+ * This function does not return until all activity records associated
+ * with the specified context/stream are returned to the CUPTI client
+ * using the callback registered in cuptiActivityRegisterCallbacks. To
+ * ensure that all activity records are complete, the requested
+ * stream(s), if any, are synchronized.
+ *
+ * If \p context is NULL, the global activity records (i.e. those not
+ * associated with a particular stream) are flushed (in this case no
+ * streams are synchronized).  If \p context is a valid CUcontext and
+ * \p streamId is 0, the buffers of all streams of this context are
+ * flushed.  Otherwise, the buffers of the specified stream in this
+ * context are flushed.
+ *
+ * Before calling this function, the buffer handling callback api
+ * must be activated by calling cuptiActivityRegisterCallbacks.
+ *
+ * \param context A valid CUcontext or NULL.
+ * \param streamId The stream ID.
+ * \param flag The flag can be set to indicate a forced flush. See CUpti_ActivityFlag
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_NOT_INITIALIZED
+ * \retval CUPTI_ERROR_INVALID_OPERATION if not preceded
+ * by a successful call to cuptiActivityRegisterCallbacks
+ * \retval CUPTI_ERROR_UNKNOWN an internal error occurred
+ *
+ * **DEPRECATED** This method is deprecated
+ * CONTEXT and STREAMID will be ignored. Use cuptiActivityFlushAll
+ * to flush all data.
+ */
+CUptiResult CUPTIAPI cuptiActivityFlush(CUcontext context, uint32_t streamId, uint32_t flag);
+
+/**
+ * \brief Request to deliver activity records via the buffer completion callback.
+ *
+ * This function returns the activity records associated with all contexts/streams
+ * (and the global buffers not associated with any stream) to the CUPTI client
+ * using the callback registered in cuptiActivityRegisterCallbacks.
+ *
+ * This is a blocking call but it doesn't issue any CUDA synchronization calls
+ * implicitly thus it's not guaranteed that all activities are completed on the
+ * underlying devices. Activity record is considered as completed if it has all
+ * the information filled up including the timestamps if any. It is the client's
+ * responsibility to issue necessary CUDA synchronization calls before calling
+ * this function if all activity records with complete information are expected
+ * to be delivered.
+ *
+ * Behavior of the function based on the input flag:
+ * (-) ::For default flush i.e. when flag is set as 0, it returns all the
+ * activity buffers which have all the activity records completed, buffers need not
+ * be full though. It doesn't return buffers which have one or more incomplete
+ * records. Default flush can be done at a regular interval in a separate thread.
+ * (-) ::For forced flush i.e. when flag CUPTI_ACTIVITY_FLAG_FLUSH_FORCED is passed
+ * to the function, it returns all the activity buffers including the ones which have
+ * one or more incomplete activity records. It's suggested for clients to do the
+ * force flush before the termination of the profiling session to allow remaining
+ * buffers to be delivered. In general, it can be done in the at-exit handler.
+ *
+ * Before calling this function, the buffer handling callback api must be activated
+ * by calling cuptiActivityRegisterCallbacks.
+ *
+ * \param flag The flag can be set to indicate a forced flush. See CUpti_ActivityFlag
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_NOT_INITIALIZED
+ * \retval CUPTI_ERROR_INVALID_OPERATION if not preceded by a
+ * successful call to cuptiActivityRegisterCallbacks
+ * \retval CUPTI_ERROR_UNKNOWN an internal error occurred
+ *
+ * \see cuptiActivityFlushPeriod
+ */
+CUptiResult CUPTIAPI cuptiActivityFlushAll(uint32_t flag);
+
+/**
+ * \brief Read an activity API attribute.
+ *
+ * Read an activity API attribute and return it in \p *value.
+ *
+ * \param attr The attribute to read
+ * \param valueSize Size of the buffer pointed to by \p value, and
+ * returns the number of bytes written to \p value
+ * \param value Returns the value of the attribute
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_NOT_INITIALIZED
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p valueSize or \p value is NULL, or
+ * if \p attr is not an activity attribute
+ * \retval CUPTI_ERROR_PARAMETER_SIZE_NOT_SUFFICIENT Indicates that
+ * the \p value buffer is too small to hold the attribute value.
+ */
+CUptiResult CUPTIAPI cuptiActivityGetAttribute(CUpti_ActivityAttribute attr,
+        size_t *valueSize, void* value);
+
+/**
+ * \brief Write an activity API attribute.
+ *
+ * Write an activity API attribute.
+ *
+ * \param attr The attribute to write
+ * \param valueSize The size, in bytes, of the value
+ * \param value The attribute value to write
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_NOT_INITIALIZED
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p valueSize or \p value is NULL, or
+ * if \p attr is not an activity attribute
+ * \retval CUPTI_ERROR_PARAMETER_SIZE_NOT_SUFFICIENT Indicates that
+ * the \p value buffer is too small to hold the attribute value.
+ */
+CUptiResult CUPTIAPI cuptiActivitySetAttribute(CUpti_ActivityAttribute attr,
+        size_t *valueSize, void* value);
+
+
+/**
+ * \brief Set Unified Memory Counter configuration.
+ *
+ * \param config A pointer to \ref CUpti_ActivityUnifiedMemoryCounterConfig structures
+ * containing Unified Memory counter configuration.
+ * \param count Number of Unified Memory counter configuration structures
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_NOT_INITIALIZED
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p config is NULL or
+ * any parameter in the \p config structures is not a valid value
+ * \retval CUPTI_ERROR_UM_PROFILING_NOT_SUPPORTED One potential reason is that
+ * platform (OS/arch) does not support the unified memory counters
+ * \retval CUPTI_ERROR_UM_PROFILING_NOT_SUPPORTED_ON_DEVICE Indicates that the device
+ * does not support the unified memory counters
+ * \retval CUPTI_ERROR_UM_PROFILING_NOT_SUPPORTED_ON_NON_P2P_DEVICES Indicates that
+ * multi-GPU configuration without P2P support between any pair of devices
+ * does not support the unified memory counters
+ */
+CUptiResult CUPTIAPI cuptiActivityConfigureUnifiedMemoryCounter(CUpti_ActivityUnifiedMemoryCounterConfig *config, uint32_t count);
+
+/**
+ * \brief Get auto boost state
+ *
+ * The profiling results can be inconsistent in case auto boost is enabled.
+ * CUPTI tries to disable auto boost while profiling. It can fail to disable in
+ * cases where user does not have the permissions or CUDA_AUTO_BOOST env
+ * variable is set. The function can be used to query whether auto boost is
+ * enabled.
+ *
+ * \param context A valid CUcontext.
+ * \param state A pointer to \ref CUpti_ActivityAutoBoostState structure which
+ * contains the current state and the id of the process that has requested the
+ * current state
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p context or \p state is NULL
+ * \retval CUPTI_ERROR_NOT_SUPPORTED Indicates that the device does not support auto boost
+ * \retval CUPTI_ERROR_UNKNOWN an internal error occurred
+ */
+CUptiResult CUPTIAPI cuptiGetAutoBoostState(CUcontext context, CUpti_ActivityAutoBoostState *state);
+
+/**
+ * \brief Set PC sampling configuration.
+ *
+ * For Pascal and older GPU architectures this API must be called before enabling
+ * activity kind CUPTI_ACTIVITY_KIND_PC_SAMPLING. There is no such requirement
+ * for Volta and newer GPU architectures.
+ *
+ * For Volta and newer GPU architectures if this API is called in the middle of
+ * execution, PC sampling configuration will be updated for subsequent kernel launches.
+ *
+ * \param ctx The context
+ * \param config A pointer to \ref CUpti_ActivityPCSamplingConfig structure
+ * containing PC sampling configuration.
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_INVALID_OPERATION if this api is called while
+ * some valid event collection method is set.
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p config is NULL or
+ * any parameter in the \p config structures is not a valid value
+ * \retval CUPTI_ERROR_NOT_SUPPORTED Indicates that the system/device
+ * does not support PC sampling
+ */
+CUptiResult CUPTIAPI cuptiActivityConfigurePCSampling(CUcontext ctx, CUpti_ActivityPCSamplingConfig *config);
+
+/**
+ * \brief Returns the last error from a cupti call or callback
+ *
+ * Returns the last error that has been produced by any of the cupti api calls
+ * or the callback in the same host thread and resets it to CUPTI_SUCCESS.
+ */
+CUptiResult CUPTIAPI cuptiGetLastError(void);
+
+/**
+ * \brief Set the thread-id type
+ *
+ * CUPTI uses the method corresponding to set type to generate the thread-id.
+ * See enum \ref CUpti_ActivityThreadIdType for the list of methods.
+ * Activity records having thread-id field contain the same value.
+ * Thread id type must not be changed during the profiling session to
+ * avoid thread-id value mismatch across activity records.
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_NOT_SUPPORTED if \p type is not supported on the platform
+ */
+CUptiResult CUPTIAPI cuptiSetThreadIdType(CUpti_ActivityThreadIdType type);
+
+/**
+ * \brief Get the thread-id type
+ *
+ * Returns the thread-id type used in CUPTI
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p type is NULL
+  */
+CUptiResult CUPTIAPI cuptiGetThreadIdType(CUpti_ActivityThreadIdType *type);
+
+/**
+* \brief Check support for a compute capability
+*
+* This function is used to check the support for a device based on
+* its compute capability. It sets the \p support when the compute
+* capability is supported by the current version of CUPTI, and clears
+* it otherwise. This version of CUPTI might not support all GPUs sharing
+* the same compute capability. It is suggested to use API \ref
+* cuptiDeviceSupported which provides correct information.
+*
+* \param major The major revision number of the compute capability
+* \param minor The minor revision number of the compute capability
+* \param support Pointer to an integer to return the support status
+*
+* \retval CUPTI_SUCCESS
+* \retval CUPTI_ERROR_INVALID_PARAMETER if \p support is NULL
+*
+* \sa ::cuptiDeviceSupported
+*/
+CUptiResult CUPTIAPI cuptiComputeCapabilitySupported(int major, int minor, int *support);
+
+/**
+* \brief Check support for a compute device
+*
+* This function is used to check the support for a compute device.
+* It sets the \p support when the device is supported by the current
+* version of CUPTI, and clears it otherwise.
+*
+* \param dev The device handle returned by CUDA Driver API cuDeviceGet
+* \param support Pointer to an integer to return the support status
+*
+* \retval CUPTI_SUCCESS
+* \retval CUPTI_ERROR_INVALID_PARAMETER if \p support is NULL
+* \retval CUPTI_ERROR_INVALID_DEVICE if \p dev is not a valid device
+*
+* \sa ::cuptiComputeCapabilitySupported
+*/
+CUptiResult CUPTIAPI cuptiDeviceSupported(CUdevice dev, int *support);
+
+/**
+ * \brief Virtualization mode in which the CUDA device is running.
+ */
+typedef enum {
+  /**
+   * No virtualization mode is associated with the device
+   * i.e. it's a bare-metal GPU.
+   */
+  CUPTI_DEVICE_VIRTUALIZATION_MODE_NONE = 0,
+  /**
+   * The device is associated with the pass-through GPU.
+   * In this mode, an entire physical GPU is directly assigned
+   * to one virtual machine (VM).
+   */
+  CUPTI_DEVICE_VIRTUALIZATION_MODE_PASS_THROUGH = 1,
+  /**
+   * The device is associated with the virtual GPU (vGPU).
+   * In this mode multiple virtual machines (VMs) have simultaneous,
+   * direct access to a single physical GPU.
+   */
+  CUPTI_DEVICE_VIRTUALIZATION_MODE_VIRTUAL_GPU = 2,
+  /* Not a real mode; presumably pads the enum to int width - TODO confirm. */
+  CUPTI_DEVICE_VIRTUALIZATION_MODE_FORCE_INT = 0x7fffffff
+} CUpti_DeviceVirtualizationMode;
+
+/**
+ * \brief Query the virtualization mode of the device
+ *
+ * This function is used to query the virtualization mode of the CUDA device.
+ *
+ * \param dev The device handle returned by CUDA Driver API cuDeviceGet
+ * \param mode Pointer to a CUpti_DeviceVirtualizationMode to return the virtualization mode
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_INVALID_DEVICE if \p dev is not a valid device
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p mode is NULL
+ *
+ */
+CUptiResult CUPTIAPI cuptiDeviceVirtualizationMode(CUdevice dev, CUpti_DeviceVirtualizationMode *mode);
+
+/**
+ * \brief Detach CUPTI from the running process
+ *
+ * This API detaches the CUPTI from the running process. It destroys and cleans up all the
+ * resources associated with CUPTI in the current process. After CUPTI detaches from the process,
+ * the process will keep on running with no CUPTI attached to it.
+ * For safe operation of the API, it is recommended this API is invoked from the exit callsite
+ * of any of the CUDA Driver or Runtime API. Otherwise CUPTI client needs to make sure that
+ * required CUDA synchronization and CUPTI activity buffer flush is done before calling the API.
+ * Sample code showing the usage of the API in the cupti callback handler code:
+ * \code
+  void CUPTIAPI
+  cuptiCallbackHandler(void *userdata, CUpti_CallbackDomain domain,
+      CUpti_CallbackId cbid, void *cbdata)
+  {
+    const CUpti_CallbackData *cbInfo = (CUpti_CallbackData *)cbdata;
+
+    // Take this code path when CUPTI detach is requested
+    if (detachCupti) {
+      switch(domain)
+      {
+        case CUPTI_CB_DOMAIN_RUNTIME_API:
+        case CUPTI_CB_DOMAIN_DRIVER_API:
+          if (cbInfo->callbackSite == CUPTI_API_EXIT) {
+              // call the CUPTI detach API
+              cuptiFinalize();
+          }
+          break;
+        default:
+          break;
+      }
+    }
+  }
+ \endcode
+ */
+CUptiResult CUPTIAPI cuptiFinalize(void);
+
+/**
+ * \brief Push an external correlation id for the calling thread
+ *
+ * This function notifies CUPTI that the calling thread is entering an external API region.
+ * When a CUPTI activity API record is created while within an external API region and
+ * CUPTI_ACTIVITY_KIND_EXTERNAL_CORRELATION is enabled, the activity API record will
+ * be preceded by a CUpti_ActivityExternalCorrelation record for each \ref CUpti_ExternalCorrelationKind.
+ *
+ * \param kind The kind of external API activities should be correlated with.
+ * \param id External correlation id.
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_INVALID_PARAMETER The external API kind is invalid
+ */
+CUptiResult CUPTIAPI cuptiActivityPushExternalCorrelationId(CUpti_ExternalCorrelationKind kind, uint64_t id);
+
+/**
+ * \brief Pop an external correlation id for the calling thread
+ *
+ * This function notifies CUPTI that the calling thread is leaving an external API region.
+ *
+ * \param kind The kind of external API activities should be correlated with.
+ * \param lastId If the function returns successfully, contains the last external correlation id for this \p kind, can be NULL.
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_INVALID_PARAMETER The external API kind is invalid.
+ * \retval CUPTI_ERROR_QUEUE_EMPTY No external id is currently associated with \p kind.
+ */
+CUptiResult CUPTIAPI cuptiActivityPopExternalCorrelationId(CUpti_ExternalCorrelationKind kind, uint64_t *lastId);
+
+/**
+ * \brief Controls the collection of queued and submitted timestamps for kernels.
+ *
+ * This API is used to control the collection of queued and submitted timestamps
+ * for kernels whose records are provided through the struct \ref CUpti_ActivityKernel9.
+ * Default value is 0, i.e. these timestamps are not collected. This API needs
+ * to be called before initialization of CUDA and this setting should not be
+ * changed during the profiling session.
+ *
+ * \param enable is a boolean, denoting whether these timestamps should be
+ * collected
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_NOT_INITIALIZED
+ */
+CUptiResult CUPTIAPI cuptiActivityEnableLatencyTimestamps(uint8_t enable);
+
+/**
+ * \brief Sets the flush period for the worker thread
+ *
+ * CUPTI creates a worker thread to minimize the perturbance for the application created
+ * threads. CUPTI offloads certain operations from the application threads to the worker
+ * thread, this includes synchronization of profiling resources between host and device,
+ * delivery of the activity buffers to the client using the callback registered in
+ * cuptiActivityRegisterCallbacks. For performance reasons, CUPTI wakes up the worker
+ * thread based on certain heuristics.
+ *
+ * This API is used to control the flush period of the worker thread. This setting will
+ * override the CUPTI heuristics. Setting time to zero disables the periodic flush and
+ * restores the default behavior.
+ *
+ * Periodic flush can return only those activity buffers which are full and have all the
+ * activity records completed.
+ *
+ * It's allowed to use the API \ref cuptiActivityFlushAll to flush the data on-demand, even
+ * when client sets the periodic flush.
+ *
+ * \param time flush period in milliseconds (ms)
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_NOT_INITIALIZED
+ *
+ * \see cuptiActivityFlushAll
+ */
+CUptiResult CUPTIAPI cuptiActivityFlushPeriod(uint32_t time);
+
+/**
+ * \brief Controls the collection of launch attributes for kernels.
+ *
+ * This API is used to control the collection of launch attributes for kernels whose
+ * records are provided through the struct \ref CUpti_ActivityKernel9.
+ * Default value is 0, i.e. these attributes are not collected.
+ *
+ * \param enable is a boolean denoting whether these launch attributes should be collected
+ */
+CUptiResult CUPTIAPI cuptiActivityEnableLaunchAttributes(uint8_t enable);
+
+/**
+ * \brief Function type for callback used by CUPTI to request a timestamp
+ * to be used in activity records.
+ *
+ * This callback function signals the CUPTI client that a timestamp needs
+ * to be returned. This timestamp would be treated as normalized timestamp
+ * to be used for various purposes in CUPTI. For example to store start and
+ * end timestamps reported in the CUPTI activity records.
+ * The returned timestamp must be in nanoseconds.
+ *
+ * \sa ::cuptiActivityRegisterTimestampCallback
+ */
+typedef uint64_t (CUPTIAPI *CUpti_TimestampCallbackFunc)(void);
+
+/**
+ * \brief Registers callback function with CUPTI for providing timestamp.
+ *
+ * This function registers a callback function to obtain timestamp of user's
+ * choice instead of using CUPTI provided timestamp.
+ * By default CUPTI uses different methods, based on the underlying platform,
+ * to retrieve the timestamp:
+ * Linux and Android use clock_gettime(CLOCK_REALTIME, ..)
+ * Windows uses QueryPerformanceCounter()
+ * Mac uses mach_absolute_time()
+ * QNX uses ClockCycles()
+ * Timestamps retrieved using these methods are converted to nanoseconds if needed
+ * before usage.
+ *
+ * The registration of timestamp callback should be done before any of the CUPTI
+ * activity kinds are enabled to make sure that all the records report the timestamp using
+ * the callback function registered through cuptiActivityRegisterTimestampCallback API.
+ *
+ * Changing the timestamp callback function in CUPTI through
+ * cuptiActivityRegisterTimestampCallback API in the middle of the profiling
+ * session can cause records generated prior to the change to report
+ * timestamps through previous timestamp method.
+ *
+ * \param funcTimestamp callback which is invoked when a timestamp is
+ * needed by CUPTI
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p funcTimestamp is NULL
+ * \retval CUPTI_ERROR_NOT_INITIALIZED
+ */
+CUptiResult CUPTIAPI cuptiActivityRegisterTimestampCallback(CUpti_TimestampCallbackFunc funcTimestamp);
+
+/**
+ * \brief Controls the collection of records for device launched graphs.
+ *
+ * This API is used to control the collection of records for device launched graphs.
+ * Default value is 0, i.e. these records are not collected. This API needs
+ * to be called before initialization of CUDA and this setting should not be
+ * changed during the profiling session.
+ *
+ * \param enable is a boolean, denoting whether these records should be
+ * collected
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_NOT_INITIALIZED
+ */
+CUptiResult CUPTIAPI cuptiActivityEnableDeviceGraph(uint8_t enable);
+
+/** @} */ /* END CUPTI_ACTIVITY_API */
+
+#if defined(__GNUC__) && defined(CUPTI_LIB)
+    #pragma GCC visibility pop
+#endif
+
+#if defined(__cplusplus)
+}
+#endif
+
+// Including deprecated structures of CUPTI_ACTIVITY_API
+#include "cupti_activity_deprecated.h"
+
+#endif /*_CUPTI_ACTIVITY_H_*/
diff --git a/.venv/lib/python3.11/site-packages/nvidia/cuda_cupti/include/cupti_activity_deprecated.h b/.venv/lib/python3.11/site-packages/nvidia/cuda_cupti/include/cupti_activity_deprecated.h
new file mode 100644
index 0000000000000000000000000000000000000000..084ea84ed7be17af6d1634d772fd270fb5a0351f
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/nvidia/cuda_cupti/include/cupti_activity_deprecated.h
@@ -0,0 +1,4784 @@
+/*
+ * Copyright 2011-2023 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#if !defined(_CUPTI_ACTIVITY_DEPRECATED_H_)
+#define _CUPTI_ACTIVITY_DEPRECATED_H_
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+#if defined(__GNUC__) && defined(CUPTI_LIB)
+    #pragma GCC visibility push(default)
+#endif
+
+/**
+ * \brief The kinds of activity records.
+ *
+ * Each activity record kind represents information about a GPU or an
+ * activity occurring on a CPU or GPU. Each kind is associated with an
+ * activity record structure that holds the information associated
+ * with the kind.
+ * \see CUpti_ActivityOverhead
+ * \see CUpti_ActivityOverhead2
+ * \see CUpti_ActivityDevice
+ * \see CUpti_ActivityDevice2
+ * \see CUpti_ActivityDevice3
+ * \see CUpti_ActivityDevice4
+ * \see CUpti_ActivityKernel
+ * \see CUpti_ActivityKernel2
+ * \see CUpti_ActivityKernel3
+ * \see CUpti_ActivityKernel4
+ * \see CUpti_ActivityKernel5
+ * \see CUpti_ActivityKernel6
+ * \see CUpti_ActivityKernel7
+ * \see CUpti_ActivityKernel8
+ * \see CUpti_ActivityMemcpy
+ * \see CUpti_ActivityMemcpy3
+ * \see CUpti_ActivityMemcpy4
+ * \see CUpti_ActivityMemcpyPtoP
+ * \see CUpti_ActivityMemcpyPtoP2
+ * \see CUpti_ActivityMemcpyPtoP3
+ * \see CUpti_ActivityMemset
+ * \see CUpti_ActivityMemset2
+ * \see CUpti_ActivityMemset3
+ * \see CUpti_ActivityMemory2
+ * \see CUpti_ActivityMemoryPool
+ * \see CUpti_ActivityMarker
+ * \see CUpti_ActivityGlobalAccess
+ * \see CUpti_ActivityGlobalAccess2
+ * \see CUpti_ActivityBranch
+ * \see CUpti_ActivityPCSampling
+ * \see CUpti_ActivityPCSampling2
+ * \see CUpti_ActivityUnifiedMemoryCounter
+ * \see CUpti_ActivityNvLink
+ * \see CUpti_ActivityNvLink2
+ * \see CUpti_ActivityNvLink3
+ */
+
+/**
+ * \brief The activity record for CUPTI and driver overheads.
+ * (Deprecated in CUDA 12.2)
+ *
+ * This activity record provides CUPTI and driver overhead information
+ * (CUPTI_ACTIVITY_KIND_OVERHEAD). These records are now reported using
+ * the CUpti_ActivityOverhead3 activity record.
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_OVERHEAD.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The kind of overhead, CUPTI, DRIVER, COMPILER etc.
+   */
+  CUpti_ActivityOverheadKind overheadKind;
+
+  /**
+   * The kind of activity object that the overhead is associated with.
+   */
+  CUpti_ActivityObjectKind objectKind;
+
+  /**
+   * The identifier for the activity object. 'objectKind' indicates
+   * which ID is valid for this record.
+   */
+  CUpti_ActivityObjectKindId objectId;
+
+  /**
+   * The start timestamp for the overhead, in ns. A value of 0 for
+   * both the start and end timestamps indicates that timestamp
+   * information could not be collected for the overhead.
+   */
+  uint64_t start;
+
+  /**
+   * The end timestamp for the overhead, in ns. A value of 0 for both
+   * the start and end timestamps indicates that timestamp information
+   * could not be collected for the overhead.
+   */
+  uint64_t end;
+} CUpti_ActivityOverhead;
+
+/**
+ * \brief The activity record for CUPTI and driver overheads. (deprecated)
+ *
+ * This activity record provides CUPTI and driver overhead information
+ * (CUPTI_ACTIVITY_KIND_OVERHEAD). See CUpti_ActivityOverhead3 for its successor.
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_OVERHEAD.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The kind of overhead, CUPTI, DRIVER, COMPILER etc.
+   */
+  CUpti_ActivityOverheadKind overheadKind;
+
+  /**
+   * The kind of activity object that the overhead is associated with.
+   */
+  CUpti_ActivityObjectKind objectKind;
+
+  /**
+   * The identifier for the activity object. 'objectKind' indicates
+   * which ID is valid for this record.
+   */
+  CUpti_ActivityObjectKindId objectId;
+
+  /**
+   * The start timestamp for the overhead, in ns. A value of 0 for
+   * both the start and end timestamps indicates that timestamp
+   * information could not be collected for the overhead.
+   */
+  uint64_t start;
+
+  /**
+   * The end timestamp for the overhead, in ns. A value of 0 for both
+   * the start and end timestamps indicates that timestamp information
+   * could not be collected for the overhead.
+   */
+  uint64_t end;
+
+  /**
+   * The correlation ID of the overhead operation to which
+   * the records belong. This ID is identical to the
+   * correlation ID in the driver or runtime API activity record that
+   * launched the overhead operation.
+   * In some cases, it can be zero, such as for CUPTI_ACTIVITY_OVERHEAD_CUPTI_BUFFER_FLUSH records.
+   */
+  uint32_t correlationId;
+
+  /**
+   * Reserved for internal use.
+   */
+  uint32_t reserved0;
+} CUpti_ActivityOverhead2;
+
+/**
+ * \brief The activity record for a device. (deprecated)
+ *
+ * This activity record represents information about a GPU device
+ * (CUPTI_ACTIVITY_KIND_DEVICE).
+ * Device activity is now reported using the
+ * CUpti_ActivityDevice5 activity record.
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_DEVICE.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The flags associated with the device. \see CUpti_ActivityFlag
+   */
+  CUpti_ActivityFlag flags;
+
+  /**
+   * The global memory bandwidth available on the device, in
+   * kBytes/sec.
+   */
+  uint64_t globalMemoryBandwidth;
+
+  /**
+   * The amount of global memory on the device, in bytes.
+   */
+  uint64_t globalMemorySize;
+
+  /**
+   * The amount of constant memory on the device, in bytes.
+   */
+  uint32_t constantMemorySize;
+
+  /**
+   * The size of the L2 cache on the device, in bytes.
+   */
+  uint32_t l2CacheSize;
+
+  /**
+   * The number of threads per warp on the device.
+   */
+  uint32_t numThreadsPerWarp;
+
+  /**
+   * The core clock rate of the device, in kHz.
+   */
+  uint32_t coreClockRate;
+
+  /**
+   * Number of memory copy engines on the device.
+   */
+  uint32_t numMemcpyEngines;
+
+  /**
+   * Number of multiprocessors on the device.
+   */
+  uint32_t numMultiprocessors;
+
+  /**
+   * The maximum "instructions per cycle" possible on each device
+   * multiprocessor.
+   */
+  uint32_t maxIPC;
+
+  /**
+   * Maximum number of warps that can be present on a multiprocessor
+   * at any given time.
+   */
+  uint32_t maxWarpsPerMultiprocessor;
+
+  /**
+   * Maximum number of blocks that can be present on a multiprocessor
+   * at any given time.
+   */
+  uint32_t maxBlocksPerMultiprocessor;
+
+  /**
+   * Maximum number of registers that can be allocated to a block.
+   */
+  uint32_t maxRegistersPerBlock;
+
+  /**
+   * Maximum amount of shared memory that can be assigned to a block,
+   * in bytes.
+   */
+  uint32_t maxSharedMemoryPerBlock;
+
+  /**
+   * Maximum number of threads allowed in a block.
+   */
+  uint32_t maxThreadsPerBlock;
+
+  /**
+   * Maximum allowed X dimension for a block.
+   */
+  uint32_t maxBlockDimX;
+
+  /**
+   * Maximum allowed Y dimension for a block.
+   */
+  uint32_t maxBlockDimY;
+
+  /**
+   * Maximum allowed Z dimension for a block.
+   */
+  uint32_t maxBlockDimZ;
+
+  /**
+   * Maximum allowed X dimension for a grid.
+   */
+  uint32_t maxGridDimX;
+
+  /**
+   * Maximum allowed Y dimension for a grid.
+   */
+  uint32_t maxGridDimY;
+
+  /**
+   * Maximum allowed Z dimension for a grid.
+   */
+  uint32_t maxGridDimZ;
+
+  /**
+   * Compute capability for the device, major number.
+   */
+  uint32_t computeCapabilityMajor;
+
+  /**
+   * Compute capability for the device, minor number.
+   */
+  uint32_t computeCapabilityMinor;
+
+  /**
+   * The device ID.
+   */
+  uint32_t id;
+
+#ifdef CUPTILP64
+  /**
+   * Undefined. Reserved for internal use. Present only when CUPTILP64 is defined.
+   */
+  uint32_t pad;
+#endif
+
+  /**
+   * The device name. This name is shared across all activity records
+   * representing instances of the device, and so should not be
+   * modified.
+   */
+  const char *name;
+} CUpti_ActivityDevice;
+
+/**
+ * \brief The activity record for a device. (deprecated)
+ *
+ * This activity record represents information about a GPU device
+ * (CUPTI_ACTIVITY_KIND_DEVICE).
+ * Device activity is now reported using the
+ * CUpti_ActivityDevice5 activity record.
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_DEVICE.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The flags associated with the device. \see CUpti_ActivityFlag
+   */
+  CUpti_ActivityFlag flags;
+
+  /**
+   * The global memory bandwidth available on the device, in
+   * kBytes/sec.
+   */
+  uint64_t globalMemoryBandwidth;
+
+  /**
+   * The amount of global memory on the device, in bytes.
+   */
+  uint64_t globalMemorySize;
+
+  /**
+   * The amount of constant memory on the device, in bytes.
+   */
+  uint32_t constantMemorySize;
+
+  /**
+   * The size of the L2 cache on the device, in bytes.
+   */
+  uint32_t l2CacheSize;
+
+  /**
+   * The number of threads per warp on the device.
+   */
+  uint32_t numThreadsPerWarp;
+
+  /**
+   * The core clock rate of the device, in kHz.
+   */
+  uint32_t coreClockRate;
+
+  /**
+   * Number of memory copy engines on the device.
+   */
+  uint32_t numMemcpyEngines;
+
+  /**
+   * Number of multiprocessors on the device.
+   */
+  uint32_t numMultiprocessors;
+
+  /**
+   * The maximum "instructions per cycle" possible on each device
+   * multiprocessor.
+   */
+  uint32_t maxIPC;
+
+  /**
+   * Maximum number of warps that can be present on a multiprocessor
+   * at any given time.
+   */
+  uint32_t maxWarpsPerMultiprocessor;
+
+  /**
+   * Maximum number of blocks that can be present on a multiprocessor
+   * at any given time.
+   */
+  uint32_t maxBlocksPerMultiprocessor;
+
+  /**
+   * Maximum amount of shared memory available per multiprocessor, in bytes.
+   */
+  uint32_t maxSharedMemoryPerMultiprocessor;
+
+  /**
+   * Maximum number of 32-bit registers available per multiprocessor.
+   */
+  uint32_t maxRegistersPerMultiprocessor;
+
+  /**
+   * Maximum number of registers that can be allocated to a block.
+   */
+  uint32_t maxRegistersPerBlock;
+
+  /**
+   * Maximum amount of shared memory that can be assigned to a block,
+   * in bytes.
+   */
+  uint32_t maxSharedMemoryPerBlock;
+
+  /**
+   * Maximum number of threads allowed in a block.
+   */
+  uint32_t maxThreadsPerBlock;
+
+  /**
+   * Maximum allowed X dimension for a block.
+   */
+  uint32_t maxBlockDimX;
+
+  /**
+   * Maximum allowed Y dimension for a block.
+   */
+  uint32_t maxBlockDimY;
+
+  /**
+   * Maximum allowed Z dimension for a block.
+   */
+  uint32_t maxBlockDimZ;
+
+  /**
+   * Maximum allowed X dimension for a grid.
+   */
+  uint32_t maxGridDimX;
+
+  /**
+   * Maximum allowed Y dimension for a grid.
+   */
+  uint32_t maxGridDimY;
+
+  /**
+   * Maximum allowed Z dimension for a grid.
+   */
+  uint32_t maxGridDimZ;
+
+  /**
+   * Compute capability for the device, major number.
+   */
+  uint32_t computeCapabilityMajor;
+
+  /**
+   * Compute capability for the device, minor number.
+   */
+  uint32_t computeCapabilityMinor;
+
+  /**
+   * The device ID.
+   */
+  uint32_t id;
+
+  /**
+   * ECC enabled flag for device
+   */
+  uint32_t eccEnabled;
+
+  /**
+   * The device UUID. This value is the globally unique immutable
+   * alphanumeric identifier of the device.
+   */
+  CUuuid uuid;
+
+#ifndef CUPTILP64
+  /**
+   * Undefined. Reserved for internal use. Present only when CUPTILP64 is not defined.
+   */
+  uint32_t pad;
+#endif
+
+  /**
+   * The device name. This name is shared across all activity records
+   * representing instances of the device, and so should not be
+   * modified.
+   */
+  const char *name;
+} CUpti_ActivityDevice2;
+
+/**
+ * \brief The activity record for a device. (CUDA 7.0 onwards)
+ *
+ * This activity record represents information about a GPU device
+ * (CUPTI_ACTIVITY_KIND_DEVICE).
+ * Device activity is now reported using the
+ * CUpti_ActivityDevice5 activity record.
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_DEVICE.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The flags associated with the device. \see CUpti_ActivityFlag
+   */
+  CUpti_ActivityFlag flags;
+
+  /**
+   * The global memory bandwidth available on the device, in
+   * kBytes/sec.
+   */
+  uint64_t globalMemoryBandwidth;
+
+  /**
+   * The amount of global memory on the device, in bytes.
+   */
+  uint64_t globalMemorySize;
+
+  /**
+   * The amount of constant memory on the device, in bytes.
+   */
+  uint32_t constantMemorySize;
+
+  /**
+   * The size of the L2 cache on the device, in bytes.
+   */
+  uint32_t l2CacheSize;
+
+  /**
+   * The number of threads per warp on the device.
+   */
+  uint32_t numThreadsPerWarp;
+
+  /**
+   * The core clock rate of the device, in kHz.
+   */
+  uint32_t coreClockRate;
+
+  /**
+   * Number of memory copy engines on the device.
+   */
+  uint32_t numMemcpyEngines;
+
+  /**
+   * Number of multiprocessors on the device.
+   */
+  uint32_t numMultiprocessors;
+
+  /**
+   * The maximum "instructions per cycle" possible on each device
+   * multiprocessor.
+   */
+  uint32_t maxIPC;
+
+  /**
+   * Maximum number of warps that can be present on a multiprocessor
+   * at any given time.
+   */
+  uint32_t maxWarpsPerMultiprocessor;
+
+  /**
+   * Maximum number of blocks that can be present on a multiprocessor
+   * at any given time.
+   */
+  uint32_t maxBlocksPerMultiprocessor;
+
+  /**
+   * Maximum amount of shared memory available per multiprocessor, in bytes.
+   */
+  uint32_t maxSharedMemoryPerMultiprocessor;
+
+  /**
+   * Maximum number of 32-bit registers available per multiprocessor.
+   */
+  uint32_t maxRegistersPerMultiprocessor;
+
+  /**
+   * Maximum number of registers that can be allocated to a block.
+   */
+  uint32_t maxRegistersPerBlock;
+
+  /**
+   * Maximum amount of shared memory that can be assigned to a block,
+   * in bytes.
+   */
+  uint32_t maxSharedMemoryPerBlock;
+
+  /**
+   * Maximum number of threads allowed in a block.
+   */
+  uint32_t maxThreadsPerBlock;
+
+  /**
+   * Maximum allowed X dimension for a block.
+   */
+  uint32_t maxBlockDimX;
+
+  /**
+   * Maximum allowed Y dimension for a block.
+   */
+  uint32_t maxBlockDimY;
+
+  /**
+   * Maximum allowed Z dimension for a block.
+   */
+  uint32_t maxBlockDimZ;
+
+  /**
+   * Maximum allowed X dimension for a grid.
+   */
+  uint32_t maxGridDimX;
+
+  /**
+   * Maximum allowed Y dimension for a grid.
+   */
+  uint32_t maxGridDimY;
+
+  /**
+   * Maximum allowed Z dimension for a grid.
+   */
+  uint32_t maxGridDimZ;
+
+  /**
+   * Compute capability for the device, major number.
+   */
+  uint32_t computeCapabilityMajor;
+
+  /**
+   * Compute capability for the device, minor number.
+   */
+  uint32_t computeCapabilityMinor;
+
+  /**
+   * The device ID.
+   */
+  uint32_t id;
+
+  /**
+   * ECC enabled flag for device
+   */
+  uint32_t eccEnabled;
+
+  /**
+   * The device UUID. This value is the globally unique immutable
+   * alphanumeric identifier of the device.
+   */
+  CUuuid uuid;
+
+#ifndef CUPTILP64
+  /**
+   * Undefined. Reserved for internal use. Present only when CUPTILP64 is not defined.
+   */
+  uint32_t pad;
+#endif
+
+  /**
+   * The device name. This name is shared across all activity records
+   * representing instances of the device, and so should not be
+   * modified.
+   */
+  const char *name;
+
+  /**
+   * Flag to indicate whether the device is visible to CUDA. Users can
+   * set the device visibility using the CUDA_VISIBLE_DEVICES environment variable.
+   */
+  uint8_t isCudaVisible;
+
+  uint8_t reserved[7];
+} CUpti_ActivityDevice3;
+
+/**
+ * \brief The activity record for a device. (CUDA 11.6 onwards)
+ *
+ * This activity record represents information about a GPU device
+ * (CUPTI_ACTIVITY_KIND_DEVICE).
+ * Device activity is now reported using the
+ * CUpti_ActivityDevice5 activity record.
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_DEVICE.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The flags associated with the device. \see CUpti_ActivityFlag
+   */
+  CUpti_ActivityFlag flags;
+
+  /**
+   * The global memory bandwidth available on the device, in
+   * kBytes/sec.
+   */
+  uint64_t globalMemoryBandwidth;
+
+  /**
+   * The amount of global memory on the device, in bytes.
+   */
+  uint64_t globalMemorySize;
+
+  /**
+   * The amount of constant memory on the device, in bytes.
+   */
+  uint32_t constantMemorySize;
+
+  /**
+   * The size of the L2 cache on the device, in bytes.
+   */
+  uint32_t l2CacheSize;
+
+  /**
+   * The number of threads per warp on the device.
+   */
+  uint32_t numThreadsPerWarp;
+
+  /**
+   * The core clock rate of the device, in kHz.
+   */
+  uint32_t coreClockRate;
+
+  /**
+   * Number of memory copy engines on the device.
+   */
+  uint32_t numMemcpyEngines;
+
+  /**
+   * Number of multiprocessors on the device.
+   */
+  uint32_t numMultiprocessors;
+
+  /**
+   * The maximum "instructions per cycle" possible on each device
+   * multiprocessor.
+   */
+  uint32_t maxIPC;
+
+  /**
+   * Maximum number of warps that can be present on a multiprocessor
+   * at any given time.
+   */
+  uint32_t maxWarpsPerMultiprocessor;
+
+  /**
+   * Maximum number of blocks that can be present on a multiprocessor
+   * at any given time.
+   */
+  uint32_t maxBlocksPerMultiprocessor;
+
+  /**
+   * Maximum amount of shared memory available per multiprocessor, in bytes.
+   */
+  uint32_t maxSharedMemoryPerMultiprocessor;
+
+  /**
+   * Maximum number of 32-bit registers available per multiprocessor.
+   */
+  uint32_t maxRegistersPerMultiprocessor;
+
+  /**
+   * Maximum number of registers that can be allocated to a block.
+   */
+  uint32_t maxRegistersPerBlock;
+
+  /**
+   * Maximum amount of shared memory that can be assigned to a block,
+   * in bytes.
+   */
+  uint32_t maxSharedMemoryPerBlock;
+
+  /**
+   * Maximum number of threads allowed in a block.
+   */
+  uint32_t maxThreadsPerBlock;
+
+  /**
+   * Maximum allowed X dimension for a block.
+   */
+  uint32_t maxBlockDimX;
+
+  /**
+   * Maximum allowed Y dimension for a block.
+   */
+  uint32_t maxBlockDimY;
+
+  /**
+   * Maximum allowed Z dimension for a block.
+   */
+  uint32_t maxBlockDimZ;
+
+  /**
+   * Maximum allowed X dimension for a grid.
+   */
+  uint32_t maxGridDimX;
+
+  /**
+   * Maximum allowed Y dimension for a grid.
+   */
+  uint32_t maxGridDimY;
+
+  /**
+   * Maximum allowed Z dimension for a grid.
+   */
+  uint32_t maxGridDimZ;
+
+  /**
+   * Compute capability for the device, major number.
+   */
+  uint32_t computeCapabilityMajor;
+
+  /**
+   * Compute capability for the device, minor number.
+   */
+  uint32_t computeCapabilityMinor;
+
+  /**
+   * The device ID.
+   */
+  uint32_t id;
+
+  /**
+   * ECC enabled flag for device
+   */
+  uint32_t eccEnabled;
+
+  /**
+   * The device UUID. This value is the globally unique immutable
+   * alphanumeric identifier of the device.
+   */
+  CUuuid uuid;
+
+#ifndef CUPTILP64
+  /**
+   * Undefined. Reserved for internal use. Present only when CUPTILP64 is not defined.
+   */
+  uint32_t pad;
+#endif
+
+  /**
+   * The device name. This name is shared across all activity records
+   * representing instances of the device, and so should not be
+   * modified.
+   */
+  const char *name;
+
+  /**
+   * Flag to indicate whether the device is visible to CUDA. Users can
+   * set the device visibility using the CUDA_VISIBLE_DEVICES environment variable.
+   */
+  uint8_t isCudaVisible;
+
+  /**
+   * MIG enabled flag for device
+   */
+  uint8_t isMigEnabled;
+
+  uint8_t reserved[6];
+
+  /**
+   * GPU Instance id for MIG enabled devices.
+   * If MIG mode is disabled, the value is set to UINT32_MAX.
+   */
+  uint32_t gpuInstanceId;
+
+  /**
+   * Compute Instance id for MIG enabled devices.
+   * If MIG mode is disabled, the value is set to UINT32_MAX.
+   */
+  uint32_t computeInstanceId;
+
+  /**
+   * The MIG UUID. This value is the globally unique immutable
+   * alphanumeric identifier of the device.
+   */
+  CUuuid migUuid;
+
+} CUpti_ActivityDevice4;
+
+/**
+ * \brief The activity record for kernel. (deprecated)
+ *
+ * This activity record represents a kernel execution
+ * (CUPTI_ACTIVITY_KIND_KERNEL and
+ * CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL) but is no longer generated
+ * by CUPTI. Kernel activities are now reported using the
+ * CUpti_ActivityKernel9 activity record.
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_KERNEL
+   * or CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The cache configuration requested by the kernel. The value is one
+   * of the CUfunc_cache enumeration values from cuda.h.
+   */
+  uint8_t cacheConfigRequested;
+
+  /**
+   * The cache configuration used for the kernel. The value is one of
+   * the CUfunc_cache enumeration values from cuda.h.
+   */
+  uint8_t cacheConfigExecuted;
+
+  /**
+   * The number of registers required for each thread executing the
+   * kernel.
+   */
+  uint16_t registersPerThread;
+
+  /**
+   * The start timestamp for the kernel execution, in ns. A value of 0
+   * for both the start and end timestamps indicates that timestamp
+   * information could not be collected for the kernel.
+   */
+  uint64_t start;
+
+  /**
+   * The end timestamp for the kernel execution, in ns. A value of 0
+   * for both the start and end timestamps indicates that timestamp
+   * information could not be collected for the kernel.
+   */
+  uint64_t end;
+
+  /**
+   * The ID of the device where the kernel is executing.
+   */
+  uint32_t deviceId;
+
+  /**
+   * The ID of the context where the kernel is executing.
+   */
+  uint32_t contextId;
+
+  /**
+   * The ID of the stream where the kernel is executing.
+   */
+  uint32_t streamId;
+
+  /**
+   * The X-dimension grid size for the kernel.
+   */
+  int32_t gridX;
+
+  /**
+   * The Y-dimension grid size for the kernel.
+   */
+  int32_t gridY;
+
+  /**
+   * The Z-dimension grid size for the kernel.
+   */
+  int32_t gridZ;
+
+  /**
+   * The X-dimension block size for the kernel.
+   */
+  int32_t blockX;
+
+  /**
+   * The Y-dimension block size for the kernel.
+   */
+  int32_t blockY;
+
+  /**
+   * The Z-dimension block size for the kernel.
+   */
+  int32_t blockZ;
+
+  /**
+   * The static shared memory allocated for the kernel, in bytes.
+   */
+  int32_t staticSharedMemory;
+
+  /**
+   * The dynamic shared memory reserved for the kernel, in bytes.
+   */
+  int32_t dynamicSharedMemory;
+
+  /**
+   * The amount of local memory reserved for each thread, in bytes.
+   */
+  uint32_t localMemoryPerThread;
+
+  /**
+   * The total amount of local memory reserved for the kernel, in
+   * bytes.
+   */
+  uint32_t localMemoryTotal;
+
+  /**
+   * The correlation ID of the kernel. Each kernel execution is
+   * assigned a unique correlation ID that is identical to the
+   * correlation ID in the driver API activity record that launched
+   * the kernel.
+   */
+  uint32_t correlationId;
+
+  /**
+   * The runtime correlation ID of the kernel. Each kernel execution
+   * is assigned a unique runtime correlation ID that is identical to
+   * the correlation ID in the runtime API activity record that
+   * launched the kernel.
+   */
+  uint32_t runtimeCorrelationId;
+
+  /**
+   * Undefined. Reserved for internal use.
+   */
+  uint32_t pad;
+
+  /**
+   * The name of the kernel. This name is shared across all activity
+   * records representing the same kernel, and so should not be
+   * modified.
+   */
+  const char *name;
+
+  /**
+   * Undefined. Reserved for internal use.
+   */
+  void *reserved0;
+} CUpti_ActivityKernel;
+
+/**
+ * \brief The activity record for kernel. (deprecated)
+ *
+ * This activity record represents a kernel execution
+ * (CUPTI_ACTIVITY_KIND_KERNEL and
+ * CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL) but is no longer generated
+ * by CUPTI. Kernel activities are now reported using the
+ * CUpti_ActivityKernel9 activity record.
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_KERNEL or
+   * CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL.
+   */
+  CUpti_ActivityKind kind;
+
+  union {
+    uint8_t both;
+    struct {
+      /**
+       * The cache configuration requested by the kernel. The value is one
+       * of the CUfunc_cache enumeration values from cuda.h.
+       */
+      uint8_t requested:4;
+
+      /**
+       * The cache configuration used for the kernel. The value is one of
+       * the CUfunc_cache enumeration values from cuda.h.
+       */
+      uint8_t executed:4;
+    } config;
+  } cacheConfig;
+
+  /**
+   * The shared memory configuration used for the kernel. The value is one of
+   * the CUsharedconfig enumeration values from cuda.h.
+   */
+  uint8_t sharedMemoryConfig;
+
+  /**
+   * The number of registers required for each thread executing the
+   * kernel.
+   */
+  uint16_t registersPerThread;
+
+  /**
+   * The start timestamp for the kernel execution, in ns. A value of 0
+   * for both the start and end timestamps indicates that timestamp
+   * information could not be collected for the kernel.
+   */
+  uint64_t start;
+
+  /**
+   * The end timestamp for the kernel execution, in ns. A value of 0
+   * for both the start and end timestamps indicates that timestamp
+   * information could not be collected for the kernel.
+   */
+  uint64_t end;
+
+  /**
+   * The completed timestamp for the kernel execution, in ns.  It
+   * represents the completion of all its child kernels and the
+   * kernel itself. A value of CUPTI_TIMESTAMP_UNKNOWN indicates that
+   * the completion time is unknown.
+   */
+  uint64_t completed;
+
+  /**
+   * The ID of the device where the kernel is executing.
+   */
+  uint32_t deviceId;
+
+  /**
+   * The ID of the context where the kernel is executing.
+   */
+  uint32_t contextId;
+
+  /**
+   * The ID of the stream where the kernel is executing.
+   */
+  uint32_t streamId;
+
+  /**
+   * The X-dimension grid size for the kernel.
+   */
+  int32_t gridX;
+
+  /**
+   * The Y-dimension grid size for the kernel.
+   */
+  int32_t gridY;
+
+  /**
+   * The Z-dimension grid size for the kernel.
+   */
+  int32_t gridZ;
+
+  /**
+   * The X-dimension block size for the kernel.
+   */
+  int32_t blockX;
+
+  /**
+   * The Y-dimension block size for the kernel.
+   */
+  int32_t blockY;
+
+  /**
+   * The Z-dimension block size for the kernel.
+   */
+  int32_t blockZ;
+
+  /**
+   * The static shared memory allocated for the kernel, in bytes.
+   */
+  int32_t staticSharedMemory;
+
+  /**
+   * The dynamic shared memory reserved for the kernel, in bytes.
+   */
+  int32_t dynamicSharedMemory;
+
+  /**
+   * The amount of local memory reserved for each thread, in bytes.
+   */
+  uint32_t localMemoryPerThread;
+
+  /**
+   * The total amount of local memory reserved for the kernel, in
+   * bytes.
+   */
+  uint32_t localMemoryTotal;
+
+  /**
+   * The correlation ID of the kernel. Each kernel execution is
+   * assigned a unique correlation ID that is identical to the
+   * correlation ID in the driver or runtime API activity record that
+   * launched the kernel.
+   */
+  uint32_t correlationId;
+
+  /**
+   * The grid ID of the kernel. Each kernel is assigned a unique
+   * grid ID at runtime.
+   */
+  int64_t gridId;
+
+  /**
+   * The name of the kernel. This name is shared across all activity
+   * records representing the same kernel, and so should not be
+   * modified.
+   */
+  const char *name;
+
+  /**
+   * Undefined. Reserved for internal use.
+   */
+  void *reserved0;
+} CUpti_ActivityKernel2;
+
+/**
+ * \brief The activity record for a kernel (CUDA 6.5(with sm_52 support) onwards).
+ * (deprecated in CUDA 9.0)
+ *
+ * This activity record represents a kernel execution
+ * (CUPTI_ACTIVITY_KIND_KERNEL and
+ * CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL).
+ * Kernel activities are now reported using the CUpti_ActivityKernel9 activity
+ * record.
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_KERNEL or
+   * CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL.
+   */
+  CUpti_ActivityKind kind;
+
+  union {
+    uint8_t both; /**< Raw byte that overlays the two 4-bit fields of config. */
+    struct {
+      /**
+       * The cache configuration requested by the kernel. The value is one
+       * of the CUfunc_cache enumeration values from cuda.h.
+       */
+      uint8_t requested:4;
+
+      /**
+       * The cache configuration used for the kernel. The value is one of
+       * the CUfunc_cache enumeration values from cuda.h.
+       */
+      uint8_t executed:4;
+    } config;
+  } cacheConfig;
+
+  /**
+   * The shared memory configuration used for the kernel. The value is one of
+   * the CUsharedconfig enumeration values from cuda.h.
+   */
+  uint8_t sharedMemoryConfig;
+
+  /**
+   * The number of registers required for each thread executing the
+   * kernel.
+   */
+  uint16_t registersPerThread;
+
+  /**
+   * The partitioned global caching requested for the kernel. Partitioned
+   * global caching is required to enable caching on certain chips, such as
+   * devices with compute capability 5.2.
+   */
+  CUpti_ActivityPartitionedGlobalCacheConfig partitionedGlobalCacheRequested;
+
+  /**
+   * The partitioned global caching executed for the kernel. Partitioned
+   * global caching is required to enable caching on certain chips, such as
+   * devices with compute capability 5.2. Partitioned global caching can be
+   * automatically disabled if the occupancy requirement of the launch cannot
+   * support caching.
+   */
+  CUpti_ActivityPartitionedGlobalCacheConfig partitionedGlobalCacheExecuted;
+
+  /**
+   * The start timestamp for the kernel execution, in ns. A value of 0
+   * for both the start and end timestamps indicates that timestamp
+   * information could not be collected for the kernel.
+   */
+  uint64_t start;
+
+  /**
+   * The end timestamp for the kernel execution, in ns. A value of 0
+   * for both the start and end timestamps indicates that timestamp
+   * information could not be collected for the kernel.
+   */
+  uint64_t end;
+
+  /**
+   * The completed timestamp for the kernel execution, in ns. It
+   * represents the completion of all its child kernels and the
+   * kernel itself. A value of CUPTI_TIMESTAMP_UNKNOWN indicates that
+   * the completion time is unknown.
+   */
+  uint64_t completed;
+
+  /**
+   * The ID of the device where the kernel is executing.
+   */
+  uint32_t deviceId;
+
+  /**
+   * The ID of the context where the kernel is executing.
+   */
+  uint32_t contextId;
+
+  /**
+   * The ID of the stream where the kernel is executing.
+   */
+  uint32_t streamId;
+
+  /**
+   * The X-dimension grid size for the kernel.
+   */
+  int32_t gridX;
+
+  /**
+   * The Y-dimension grid size for the kernel.
+   */
+  int32_t gridY;
+
+  /**
+   * The Z-dimension grid size for the kernel.
+   */
+  int32_t gridZ;
+
+  /**
+   * The X-dimension block size for the kernel.
+   */
+  int32_t blockX;
+
+  /**
+   * The Y-dimension block size for the kernel.
+   */
+  int32_t blockY;
+
+  /**
+   * The Z-dimension block size for the kernel.
+   */
+  int32_t blockZ;
+
+  /**
+   * The static shared memory allocated for the kernel, in bytes.
+   */
+  int32_t staticSharedMemory;
+
+  /**
+   * The dynamic shared memory reserved for the kernel, in bytes.
+   */
+  int32_t dynamicSharedMemory;
+
+  /**
+   * The amount of local memory reserved for each thread, in bytes.
+   */
+  uint32_t localMemoryPerThread;
+
+  /**
+   * The total amount of local memory reserved for the kernel, in
+   * bytes.
+   */
+  uint32_t localMemoryTotal;
+
+  /**
+   * The correlation ID of the kernel. Each kernel execution is
+   * assigned a unique correlation ID that is identical to the
+   * correlation ID in the driver or runtime API activity record that
+   * launched the kernel.
+   */
+  uint32_t correlationId;
+
+  /**
+   * The grid ID of the kernel. Each kernel is assigned a unique
+   * grid ID at runtime.
+   */
+  int64_t gridId;
+
+  /**
+   * The name of the kernel. This name is shared across all activity
+   * records representing the same kernel, and so should not be
+   * modified.
+   */
+  const char *name;
+
+  /**
+   * Undefined. Reserved for internal use.
+   */
+  void *reserved0;
+} CUpti_ActivityKernel3;
+
+/**
+ * \brief The activity record for a kernel (CUDA 9.0(with sm_70 support) onwards).
+ * (deprecated in CUDA 11.0)
+ *
+ * This activity record represents a kernel execution
+ * (CUPTI_ACTIVITY_KIND_KERNEL and
+ * CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL).
+ * Kernel activities are now reported using the CUpti_ActivityKernel9 activity
+ * record.
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_KERNEL or
+   * CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * For devices with compute capability 7.0+ cacheConfig values are not updated
+   * in case field isSharedMemoryCarveoutRequested is set.
+   */
+  union {
+    uint8_t both; /**< Raw byte that overlays the two 4-bit fields of config. */
+    struct {
+      /**
+       * The cache configuration requested by the kernel. The value is one
+       * of the CUfunc_cache enumeration values from cuda.h.
+       */
+      uint8_t requested:4;
+
+      /**
+       * The cache configuration used for the kernel. The value is one of
+       * the CUfunc_cache enumeration values from cuda.h.
+       */
+      uint8_t executed:4;
+    } config;
+  } cacheConfig;
+
+  /**
+   * The shared memory configuration used for the kernel. The value is one of
+   * the CUsharedconfig enumeration values from cuda.h.
+   */
+  uint8_t sharedMemoryConfig;
+
+  /**
+   * The number of registers required for each thread executing the
+   * kernel.
+   */
+  uint16_t registersPerThread;
+
+  /**
+   * The partitioned global caching requested for the kernel. Partitioned
+   * global caching is required to enable caching on certain chips, such as
+   * devices with compute capability 5.2.
+   */
+  CUpti_ActivityPartitionedGlobalCacheConfig partitionedGlobalCacheRequested;
+
+  /**
+   * The partitioned global caching executed for the kernel. Partitioned
+   * global caching is required to enable caching on certain chips, such as
+   * devices with compute capability 5.2. Partitioned global caching can be
+   * automatically disabled if the occupancy requirement of the launch cannot
+   * support caching.
+   */
+  CUpti_ActivityPartitionedGlobalCacheConfig partitionedGlobalCacheExecuted;
+
+  /**
+   * The start timestamp for the kernel execution, in ns. A value of 0
+   * for both the start and end timestamps indicates that timestamp
+   * information could not be collected for the kernel.
+   */
+  uint64_t start;
+
+  /**
+   * The end timestamp for the kernel execution, in ns. A value of 0
+   * for both the start and end timestamps indicates that timestamp
+   * information could not be collected for the kernel.
+   */
+  uint64_t end;
+
+  /**
+   * The completed timestamp for the kernel execution, in ns. It
+   * represents the completion of all its child kernels and the
+   * kernel itself. A value of CUPTI_TIMESTAMP_UNKNOWN indicates that
+   * the completion time is unknown.
+   */
+  uint64_t completed;
+
+  /**
+   * The ID of the device where the kernel is executing.
+   */
+  uint32_t deviceId;
+
+  /**
+   * The ID of the context where the kernel is executing.
+   */
+  uint32_t contextId;
+
+  /**
+   * The ID of the stream where the kernel is executing.
+   */
+  uint32_t streamId;
+
+  /**
+   * The X-dimension grid size for the kernel.
+   */
+  int32_t gridX;
+
+  /**
+   * The Y-dimension grid size for the kernel.
+   */
+  int32_t gridY;
+
+  /**
+   * The Z-dimension grid size for the kernel.
+   */
+  int32_t gridZ;
+
+  /**
+   * The X-dimension block size for the kernel.
+   */
+  int32_t blockX;
+
+  /**
+   * The Y-dimension block size for the kernel.
+   */
+  int32_t blockY;
+
+  /**
+   * The Z-dimension block size for the kernel.
+   */
+  int32_t blockZ;
+
+  /**
+   * The static shared memory allocated for the kernel, in bytes.
+   */
+  int32_t staticSharedMemory;
+
+  /**
+   * The dynamic shared memory reserved for the kernel, in bytes.
+   */
+  int32_t dynamicSharedMemory;
+
+  /**
+   * The amount of local memory reserved for each thread, in bytes.
+   */
+  uint32_t localMemoryPerThread;
+
+  /**
+   * The total amount of local memory reserved for the kernel, in
+   * bytes.
+   */
+  uint32_t localMemoryTotal;
+
+  /**
+   * The correlation ID of the kernel. Each kernel execution is
+   * assigned a unique correlation ID that is identical to the
+   * correlation ID in the driver or runtime API activity record that
+   * launched the kernel.
+   */
+  uint32_t correlationId;
+
+  /**
+   * The grid ID of the kernel. Each kernel is assigned a unique
+   * grid ID at runtime.
+   */
+  int64_t gridId;
+
+  /**
+   * The name of the kernel. This name is shared across all activity
+   * records representing the same kernel, and so should not be
+   * modified.
+   */
+  const char *name;
+
+  /**
+   * Undefined. Reserved for internal use.
+   */
+  void *reserved0;
+
+  /**
+   * The timestamp when the kernel is queued up in the command buffer, in ns.
+   * A value of CUPTI_TIMESTAMP_UNKNOWN indicates that the queued time
+   * could not be collected for the kernel. This timestamp is not collected
+   * by default. Use API \ref cuptiActivityEnableLatencyTimestamps() to
+   * enable collection.
+   *
+   * Command buffer is a buffer written by CUDA driver to send commands
+   * like kernel launch, memory copy etc. to the GPU. All launches of CUDA
+   * kernels are asynchronous with respect to the host: the host requests
+   * the launch by writing commands into the command buffer, then returns
+   * without checking the GPU's progress.
+   */
+  uint64_t queued;
+
+  /**
+   * The timestamp when the command buffer containing the kernel launch
+   * is submitted to the GPU, in ns. A value of CUPTI_TIMESTAMP_UNKNOWN
+   * indicates that the submitted time could not be collected for the kernel.
+   * This timestamp is not collected by default. Use API \ref
+   * cuptiActivityEnableLatencyTimestamps() to enable collection.
+   */
+  uint64_t submitted;
+
+  /**
+   * This indicates if the kernel was executed via a regular launch or via a
+   * single/multi device cooperative launch. \see CUpti_ActivityLaunchType
+   */
+  uint8_t launchType;
+
+  /**
+   * This indicates if CU_FUNC_ATTRIBUTE_PREFERRED_SHARED_MEMORY_CARVEOUT was
+   * updated for the kernel launch.
+   */
+  uint8_t isSharedMemoryCarveoutRequested;
+
+  /**
+   * Shared memory carveout value requested for the function in percentage of
+   * the total resource. The value will be updated only if field
+   * isSharedMemoryCarveoutRequested is set.
+   */
+  uint8_t sharedMemoryCarveoutRequested;
+
+  /**
+   * Undefined. Reserved for internal use.
+   */
+  uint8_t padding;
+
+ /**
+  * Shared memory size set by the driver.
+  */
+  uint32_t sharedMemoryExecuted;
+} CUpti_ActivityKernel4;
+
+/**
+ * \brief The activity record for a kernel (CUDA 11.0(with sm_80 support) onwards).
+ * (deprecated in CUDA 11.2)
+ *
+ * This activity record represents a kernel execution (CUPTI_ACTIVITY_KIND_KERNEL
+ * and CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL) but is no longer generated
+ * by CUPTI. Kernel activities are now reported using the
+ * CUpti_ActivityKernel9 activity record.
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_KERNEL or
+   * CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * For devices with compute capability 7.0+ cacheConfig values are not updated
+   * in case field isSharedMemoryCarveoutRequested is set.
+   */
+  union {
+    uint8_t both; /**< Raw byte that overlays the two 4-bit fields of config. */
+    struct {
+      /**
+       * The cache configuration requested by the kernel. The value is one
+       * of the CUfunc_cache enumeration values from cuda.h.
+       */
+      uint8_t requested:4;
+
+      /**
+       * The cache configuration used for the kernel. The value is one of
+       * the CUfunc_cache enumeration values from cuda.h.
+       */
+      uint8_t executed:4;
+    } config;
+  } cacheConfig;
+
+  /**
+   * The shared memory configuration used for the kernel. The value is one of
+   * the CUsharedconfig enumeration values from cuda.h.
+   */
+  uint8_t sharedMemoryConfig;
+
+  /**
+   * The number of registers required for each thread executing the
+   * kernel.
+   */
+  uint16_t registersPerThread;
+
+  /**
+   * The partitioned global caching requested for the kernel. Partitioned
+   * global caching is required to enable caching on certain chips, such as
+   * devices with compute capability 5.2.
+   */
+  CUpti_ActivityPartitionedGlobalCacheConfig partitionedGlobalCacheRequested;
+
+  /**
+   * The partitioned global caching executed for the kernel. Partitioned
+   * global caching is required to enable caching on certain chips, such as
+   * devices with compute capability 5.2. Partitioned global caching can be
+   * automatically disabled if the occupancy requirement of the launch cannot
+   * support caching.
+   */
+  CUpti_ActivityPartitionedGlobalCacheConfig partitionedGlobalCacheExecuted;
+
+  /**
+   * The start timestamp for the kernel execution, in ns. A value of 0
+   * for both the start and end timestamps indicates that timestamp
+   * information could not be collected for the kernel.
+   */
+  uint64_t start;
+
+  /**
+   * The end timestamp for the kernel execution, in ns. A value of 0
+   * for both the start and end timestamps indicates that timestamp
+   * information could not be collected for the kernel.
+   */
+  uint64_t end;
+
+  /**
+   * The completed timestamp for the kernel execution, in ns. It
+   * represents the completion of all its child kernels and the
+   * kernel itself. A value of CUPTI_TIMESTAMP_UNKNOWN indicates that
+   * the completion time is unknown.
+   */
+  uint64_t completed;
+
+  /**
+   * The ID of the device where the kernel is executing.
+   */
+  uint32_t deviceId;
+
+  /**
+   * The ID of the context where the kernel is executing.
+   */
+  uint32_t contextId;
+
+  /**
+   * The ID of the stream where the kernel is executing.
+   */
+  uint32_t streamId;
+
+  /**
+   * The X-dimension grid size for the kernel.
+   */
+  int32_t gridX;
+
+  /**
+   * The Y-dimension grid size for the kernel.
+   */
+  int32_t gridY;
+
+  /**
+   * The Z-dimension grid size for the kernel.
+   */
+  int32_t gridZ;
+
+  /**
+   * The X-dimension block size for the kernel.
+   */
+  int32_t blockX;
+
+  /**
+   * The Y-dimension block size for the kernel.
+   */
+  int32_t blockY;
+
+  /**
+   * The Z-dimension block size for the kernel.
+   */
+  int32_t blockZ;
+
+  /**
+   * The static shared memory allocated for the kernel, in bytes.
+   */
+  int32_t staticSharedMemory;
+
+  /**
+   * The dynamic shared memory reserved for the kernel, in bytes.
+   */
+  int32_t dynamicSharedMemory;
+
+  /**
+   * The amount of local memory reserved for each thread, in bytes.
+   */
+  uint32_t localMemoryPerThread;
+
+  /**
+   * The total amount of local memory reserved for the kernel, in
+   * bytes.
+   */
+  uint32_t localMemoryTotal;
+
+  /**
+   * The correlation ID of the kernel. Each kernel execution is
+   * assigned a unique correlation ID that is identical to the
+   * correlation ID in the driver or runtime API activity record that
+   * launched the kernel.
+   */
+  uint32_t correlationId;
+
+  /**
+   * The grid ID of the kernel. Each kernel is assigned a unique
+   * grid ID at runtime.
+   */
+  int64_t gridId;
+
+  /**
+   * The name of the kernel. This name is shared across all activity
+   * records representing the same kernel, and so should not be
+   * modified.
+   */
+  const char *name;
+
+  /**
+   * Undefined. Reserved for internal use.
+   */
+  void *reserved0;
+
+  /**
+   * The timestamp when the kernel is queued up in the command buffer, in ns.
+   * A value of CUPTI_TIMESTAMP_UNKNOWN indicates that the queued time
+   * could not be collected for the kernel. This timestamp is not collected
+   * by default. Use API \ref cuptiActivityEnableLatencyTimestamps() to
+   * enable collection.
+   *
+   * Command buffer is a buffer written by CUDA driver to send commands
+   * like kernel launch, memory copy etc. to the GPU. All launches of CUDA
+   * kernels are asynchronous with respect to the host: the host requests
+   * the launch by writing commands into the command buffer, then returns
+   * without checking the GPU's progress.
+   */
+  uint64_t queued;
+
+  /**
+   * The timestamp when the command buffer containing the kernel launch
+   * is submitted to the GPU, in ns. A value of CUPTI_TIMESTAMP_UNKNOWN
+   * indicates that the submitted time could not be collected for the kernel.
+   * This timestamp is not collected by default. Use API \ref
+   * cuptiActivityEnableLatencyTimestamps() to enable collection.
+   */
+  uint64_t submitted;
+
+  /**
+   * This indicates if the kernel was executed via a regular launch or via a
+   * single/multi device cooperative launch. \see CUpti_ActivityLaunchType
+   */
+  uint8_t launchType;
+
+  /**
+   * This indicates if CU_FUNC_ATTRIBUTE_PREFERRED_SHARED_MEMORY_CARVEOUT was
+   * updated for the kernel launch.
+   */
+  uint8_t isSharedMemoryCarveoutRequested;
+
+  /**
+   * Shared memory carveout value requested for the function in percentage of
+   * the total resource. The value will be updated only if field
+   * isSharedMemoryCarveoutRequested is set.
+   */
+  uint8_t sharedMemoryCarveoutRequested;
+
+  /**
+   * Undefined. Reserved for internal use.
+   */
+  uint8_t padding;
+
+ /**
+  * Shared memory size set by the driver.
+  */
+  uint32_t sharedMemoryExecuted;
+
+  /**
+   * The unique ID of the graph node that launched this kernel through graph launch APIs.
+   * This field will be 0 if the kernel is not launched through graph launch APIs.
+   */
+  uint64_t graphNodeId;
+
+  /**
+   * The shared memory limit config for the kernel. This field shows whether the user has
+   * opted for a higher per block limit of dynamic shared memory.
+   */
+  CUpti_FuncShmemLimitConfig shmemLimitConfig;
+
+  /**
+   * The unique ID of the graph that launched this kernel through graph launch APIs.
+   * This field will be 0 if the kernel is not launched through graph launch APIs.
+   */
+  uint32_t graphId;
+} CUpti_ActivityKernel5;
+
+/**
+ * \brief The activity record for kernel. (deprecated in CUDA 11.6)
+ *
+ * This activity record represents a kernel execution
+ * (CUPTI_ACTIVITY_KIND_KERNEL and
+ * CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL) but is no longer generated
+ * by CUPTI. Kernel activities are now reported using the
+ * CUpti_ActivityKernel9 activity record.
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_KERNEL or
+   * CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * For devices with compute capability 7.0+ cacheConfig values are not updated
+   * in case field isSharedMemoryCarveoutRequested is set.
+   */
+  union {
+    uint8_t both; /**< Raw byte that overlays the two 4-bit fields of config. */
+    struct {
+      /**
+       * The cache configuration requested by the kernel. The value is one
+       * of the CUfunc_cache enumeration values from cuda.h.
+       */
+      uint8_t requested:4;
+
+      /**
+       * The cache configuration used for the kernel. The value is one of
+       * the CUfunc_cache enumeration values from cuda.h.
+       */
+      uint8_t executed:4;
+    } config;
+  } cacheConfig;
+
+  /**
+   * The shared memory configuration used for the kernel. The value is one of
+   * the CUsharedconfig enumeration values from cuda.h.
+   */
+  uint8_t sharedMemoryConfig;
+
+  /**
+   * The number of registers required for each thread executing the
+   * kernel.
+   */
+  uint16_t registersPerThread;
+
+  /**
+   * The partitioned global caching requested for the kernel. Partitioned
+   * global caching is required to enable caching on certain chips, such as
+   * devices with compute capability 5.2.
+   */
+  CUpti_ActivityPartitionedGlobalCacheConfig partitionedGlobalCacheRequested;
+
+  /**
+   * The partitioned global caching executed for the kernel. Partitioned
+   * global caching is required to enable caching on certain chips, such as
+   * devices with compute capability 5.2. Partitioned global caching can be
+   * automatically disabled if the occupancy requirement of the launch cannot
+   * support caching.
+   */
+  CUpti_ActivityPartitionedGlobalCacheConfig partitionedGlobalCacheExecuted;
+
+  /**
+   * The start timestamp for the kernel execution, in ns. A value of 0
+   * for both the start and end timestamps indicates that timestamp
+   * information could not be collected for the kernel.
+   */
+  uint64_t start;
+
+  /**
+   * The end timestamp for the kernel execution, in ns. A value of 0
+   * for both the start and end timestamps indicates that timestamp
+   * information could not be collected for the kernel.
+   */
+  uint64_t end;
+
+  /**
+   * The completed timestamp for the kernel execution, in ns. It
+   * represents the completion of all its child kernels and the
+   * kernel itself. A value of CUPTI_TIMESTAMP_UNKNOWN indicates that
+   * the completion time is unknown.
+   */
+  uint64_t completed;
+
+  /**
+   * The ID of the device where the kernel is executing.
+   */
+  uint32_t deviceId;
+
+  /**
+   * The ID of the context where the kernel is executing.
+   */
+  uint32_t contextId;
+
+  /**
+   * The ID of the stream where the kernel is executing.
+   */
+  uint32_t streamId;
+
+  /**
+   * The X-dimension grid size for the kernel.
+   */
+  int32_t gridX;
+
+  /**
+   * The Y-dimension grid size for the kernel.
+   */
+  int32_t gridY;
+
+  /**
+   * The Z-dimension grid size for the kernel.
+   */
+  int32_t gridZ;
+
+  /**
+   * The X-dimension block size for the kernel.
+   */
+  int32_t blockX;
+
+  /**
+   * The Y-dimension block size for the kernel.
+   */
+  int32_t blockY;
+
+  /**
+   * The Z-dimension block size for the kernel.
+   */
+  int32_t blockZ;
+
+  /**
+   * The static shared memory allocated for the kernel, in bytes.
+   */
+  int32_t staticSharedMemory;
+
+  /**
+   * The dynamic shared memory reserved for the kernel, in bytes.
+   */
+  int32_t dynamicSharedMemory;
+
+  /**
+   * The amount of local memory reserved for each thread, in bytes.
+   */
+  uint32_t localMemoryPerThread;
+
+  /**
+   * The total amount of local memory reserved for the kernel, in
+   * bytes.
+   */
+  uint32_t localMemoryTotal;
+
+  /**
+   * The correlation ID of the kernel. Each kernel execution is
+   * assigned a unique correlation ID that is identical to the
+   * correlation ID in the driver or runtime API activity record that
+   * launched the kernel.
+   */
+  uint32_t correlationId;
+
+  /**
+   * The grid ID of the kernel. Each kernel is assigned a unique
+   * grid ID at runtime.
+   */
+  int64_t gridId;
+
+  /**
+   * The name of the kernel. This name is shared across all activity
+   * records representing the same kernel, and so should not be
+   * modified.
+   */
+  const char *name;
+
+  /**
+   * Undefined. Reserved for internal use.
+   */
+  void *reserved0;
+
+  /**
+   * The timestamp when the kernel is queued up in the command buffer, in ns.
+   * A value of CUPTI_TIMESTAMP_UNKNOWN indicates that the queued time
+   * could not be collected for the kernel. This timestamp is not collected
+   * by default. Use API \ref cuptiActivityEnableLatencyTimestamps() to
+   * enable collection.
+   *
+   * Command buffer is a buffer written by CUDA driver to send commands
+   * like kernel launch, memory copy etc. to the GPU. All launches of CUDA
+   * kernels are asynchronous with respect to the host: the host requests
+   * the launch by writing commands into the command buffer, then returns
+   * without checking the GPU's progress.
+   */
+  uint64_t queued;
+
+  /**
+   * The timestamp when the command buffer containing the kernel launch
+   * is submitted to the GPU, in ns. A value of CUPTI_TIMESTAMP_UNKNOWN
+   * indicates that the submitted time could not be collected for the kernel.
+   * This timestamp is not collected by default. Use API \ref
+   * cuptiActivityEnableLatencyTimestamps() to enable collection.
+   */
+  uint64_t submitted;
+
+  /**
+   * This indicates if the kernel was executed via a regular launch or via a
+   * single/multi device cooperative launch. \see CUpti_ActivityLaunchType
+   */
+  uint8_t launchType;
+
+  /**
+   * This indicates if CU_FUNC_ATTRIBUTE_PREFERRED_SHARED_MEMORY_CARVEOUT was
+   * updated for the kernel launch.
+   */
+  uint8_t isSharedMemoryCarveoutRequested;
+
+  /**
+   * Shared memory carveout value requested for the function in percentage of
+   * the total resource. The value will be updated only if field
+   * isSharedMemoryCarveoutRequested is set.
+   */
+  uint8_t sharedMemoryCarveoutRequested;
+
+  /**
+   * Undefined. Reserved for internal use.
+   */
+  uint8_t padding;
+
+ /**
+  * Shared memory size set by the driver.
+  */
+  uint32_t sharedMemoryExecuted;
+
+  /**
+   * The unique ID of the graph node that launched this kernel through graph launch APIs.
+   * This field will be 0 if the kernel is not launched through graph launch APIs.
+   */
+  uint64_t graphNodeId;
+
+  /**
+   * The shared memory limit config for the kernel. This field shows whether the user has
+   * opted for a higher per block limit of dynamic shared memory.
+   */
+  CUpti_FuncShmemLimitConfig shmemLimitConfig;
+
+  /**
+   * The unique ID of the graph that launched this kernel through graph launch APIs.
+   * This field will be 0 if the kernel is not launched through graph launch APIs.
+   */
+  uint32_t graphId;
+
+  /**
+   * The pointer to the access policy window. The structure CUaccessPolicyWindow is
+   * defined in cuda.h.
+   */
+  CUaccessPolicyWindow *pAccessPolicyWindow;
+} CUpti_ActivityKernel6;
+
+/**
+ * \brief The activity record for kernel. (deprecated in CUDA 11.8)
+ *
+ * This activity record represents a kernel execution
+ * (CUPTI_ACTIVITY_KIND_KERNEL and
+ * CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL) but is no longer generated
+ * by CUPTI. Kernel activities are now reported using the
+ * CUpti_ActivityKernel9 activity record.
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_KERNEL or
+   * CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * For devices with compute capability 7.0+ cacheConfig values are not updated
+   * in case field isSharedMemoryCarveoutRequested is set.
+   */
+  union {
+    uint8_t both; /**< Raw byte that overlays the two 4-bit fields of config. */
+    struct {
+      /**
+       * The cache configuration requested by the kernel. The value is one
+       * of the CUfunc_cache enumeration values from cuda.h.
+       */
+      uint8_t requested:4;
+
+      /**
+       * The cache configuration used for the kernel. The value is one of
+       * the CUfunc_cache enumeration values from cuda.h.
+       */
+      uint8_t executed:4;
+    } config;
+  } cacheConfig;
+
+  /**
+   * The shared memory configuration used for the kernel. The value is one of
+   * the CUsharedconfig enumeration values from cuda.h.
+   */
+  uint8_t sharedMemoryConfig;
+
+  /**
+   * The number of registers required for each thread executing the
+   * kernel.
+   */
+  uint16_t registersPerThread;
+
+  /**
+   * The partitioned global caching requested for the kernel. Partitioned
+   * global caching is required to enable caching on certain chips, such as
+   * devices with compute capability 5.2.
+   */
+  CUpti_ActivityPartitionedGlobalCacheConfig partitionedGlobalCacheRequested;
+
+  /**
+   * The partitioned global caching executed for the kernel. Partitioned
+   * global caching is required to enable caching on certain chips, such as
+   * devices with compute capability 5.2. Partitioned global caching can be
+   * automatically disabled if the occupancy requirement of the launch cannot
+   * support caching.
+   */
+  CUpti_ActivityPartitionedGlobalCacheConfig partitionedGlobalCacheExecuted;
+
+  /**
+   * The start timestamp for the kernel execution, in ns. A value of 0
+   * for both the start and end timestamps indicates that timestamp
+   * information could not be collected for the kernel.
+   */
+  uint64_t start;
+
+  /**
+   * The end timestamp for the kernel execution, in ns. A value of 0
+   * for both the start and end timestamps indicates that timestamp
+   * information could not be collected for the kernel.
+   */
+  uint64_t end;
+
+  /**
+   * The completed timestamp for the kernel execution, in ns. It
+   * represents the completion of all its child kernels and the
+   * kernel itself. A value of CUPTI_TIMESTAMP_UNKNOWN indicates that
+   * the completion time is unknown.
+   */
+  uint64_t completed;
+
+  /**
+   * The ID of the device where the kernel is executing.
+   */
+  uint32_t deviceId;
+
+  /**
+   * The ID of the context where the kernel is executing.
+   */
+  uint32_t contextId;
+
+  /**
+   * The ID of the stream where the kernel is executing.
+   */
+  uint32_t streamId;
+
+  /**
+   * The X-dimension grid size for the kernel.
+   */
+  int32_t gridX;
+
+  /**
+   * The Y-dimension grid size for the kernel.
+   */
+  int32_t gridY;
+
+  /**
+   * The Z-dimension grid size for the kernel.
+   */
+  int32_t gridZ;
+
+  /**
+   * The X-dimension block size for the kernel.
+   */
+  int32_t blockX;
+
+  /**
+   * The Y-dimension block size for the kernel.
+   */
+  int32_t blockY;
+
+  /**
+   * The Z-dimension block size for the kernel.
+   */
+  int32_t blockZ;
+
+  /**
+   * The static shared memory allocated for the kernel, in bytes.
+   */
+  int32_t staticSharedMemory;
+
+  /**
+   * The dynamic shared memory reserved for the kernel, in bytes.
+   */
+  int32_t dynamicSharedMemory;
+
+  /**
+   * The amount of local memory reserved for each thread, in bytes.
+   */
+  uint32_t localMemoryPerThread;
+
+  /**
+   * The total amount of local memory reserved for the kernel, in
+   * bytes.
+   */
+  uint32_t localMemoryTotal;
+
+  /**
+   * The correlation ID of the kernel. Each kernel execution is
+   * assigned a unique correlation ID that is identical to the
+   * correlation ID in the driver or runtime API activity record that
+   * launched the kernel.
+   */
+  uint32_t correlationId;
+
+  /**
+   * The grid ID of the kernel. Each kernel is assigned a unique
+   * grid ID at runtime.
+   */
+  int64_t gridId;
+
+  /**
+   * The name of the kernel. This name is shared across all activity
+   * records representing the same kernel, and so should not be
+   * modified.
+   */
+  const char *name;
+
+  /**
+   * Undefined. Reserved for internal use.
+   */
+  void *reserved0;
+
+  /**
+   * The timestamp when the kernel is queued up in the command buffer, in ns.
+   * A value of CUPTI_TIMESTAMP_UNKNOWN indicates that the queued time
+   * could not be collected for the kernel. This timestamp is not collected
+   * by default. Use API \ref cuptiActivityEnableLatencyTimestamps() to
+   * enable collection.
+   *
+   * Command buffer is a buffer written by CUDA driver to send commands
+   * like kernel launch, memory copy etc. to the GPU. All launches of CUDA
+   * kernels are asynchronous with respect to the host: the host requests
+   * the launch by writing commands into the command buffer, then returns
+   * without checking the GPU's progress.
+   */
+  uint64_t queued;
+
+  /**
+   * The timestamp when the command buffer containing the kernel launch
+   * is submitted to the GPU, in ns. A value of CUPTI_TIMESTAMP_UNKNOWN
+   * indicates that the submitted time could not be collected for the kernel.
+   * This timestamp is not collected by default. Use API \ref
+   * cuptiActivityEnableLatencyTimestamps() to enable collection.
+   */
+  uint64_t submitted;
+
+  /**
+   * This indicates if the kernel was executed via a regular launch or via a
+   * single/multi device cooperative launch. \see CUpti_ActivityLaunchType
+   */
+  uint8_t launchType;
+
+  /**
+   * This indicates if CU_FUNC_ATTRIBUTE_PREFERRED_SHARED_MEMORY_CARVEOUT was
+   * updated for the kernel launch.
+   */
+  uint8_t isSharedMemoryCarveoutRequested;
+
+  /**
+   * Shared memory carveout value requested for the function in percentage of
+   * the total resource. The value will be updated only if field
+   * isSharedMemoryCarveoutRequested is set.
+   */
+  uint8_t sharedMemoryCarveoutRequested;
+
+  /**
+   * Undefined. Reserved for internal use.
+   */
+  uint8_t padding;
+
+ /**
+  * Shared memory size set by the driver.
+  */
+  uint32_t sharedMemoryExecuted;
+
+  /**
+   * The unique ID of the graph node that launched this kernel through graph launch APIs.
+   * This field will be 0 if the kernel is not launched through graph launch APIs.
+   */
+  uint64_t graphNodeId;
+
+  /**
+   * The shared memory limit config for the kernel. This field shows whether the user has
+   * opted for a higher per block limit of dynamic shared memory.
+   */
+  CUpti_FuncShmemLimitConfig shmemLimitConfig;
+
+  /**
+   * The unique ID of the graph that launched this kernel through graph launch APIs.
+   * This field will be 0 if the kernel is not launched through graph launch APIs.
+   */
+  uint32_t graphId;
+
+  /**
+   * The pointer to the access policy window. The structure CUaccessPolicyWindow is
+   * defined in cuda.h.
+   */
+  CUaccessPolicyWindow *pAccessPolicyWindow;
+
+  /**
+   * The ID of the HW channel on which the kernel is launched.
+   */
+  uint32_t channelID;
+
+  /**
+   * The type of the channel.
+   */
+  CUpti_ChannelType channelType;
+} CUpti_ActivityKernel7;
+
+/**
+ * \brief The activity record for kernel.
+ *
+ * This activity record represents a kernel execution
+ * (CUPTI_ACTIVITY_KIND_KERNEL and
+ * CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL)
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_KERNEL or
+   * CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * For devices with compute capability 7.0+ cacheConfig values are not updated
+   * in case field isSharedMemoryCarveoutRequested is set.
+   */
+  union {
+    uint8_t both;
+    struct {
+      /**
+       * The cache configuration requested by the kernel. The value is one
+       * of the CUfunc_cache enumeration values from cuda.h.
+       */
+      uint8_t requested:4;
+
+      /**
+       * The cache configuration used for the kernel. The value is one of
+       * the CUfunc_cache enumeration values from cuda.h.
+       */
+      uint8_t executed:4;
+    } config;
+  } cacheConfig;
+
+  /**
+   * The shared memory configuration used for the kernel. The value is one of
+   * the CUsharedconfig enumeration values from cuda.h.
+   */
+  uint8_t sharedMemoryConfig;
+
+  /**
+   * The number of registers required for each thread executing the
+   * kernel.
+   */
+  uint16_t registersPerThread;
+
+  /**
+   * The partitioned global caching requested for the kernel. Partitioned
+   * global caching is required to enable caching on certain chips, such as
+   * devices with compute capability 5.2.
+   */
+  CUpti_ActivityPartitionedGlobalCacheConfig partitionedGlobalCacheRequested;
+
+  /**
+   * The partitioned global caching executed for the kernel. Partitioned
+   * global caching is required to enable caching on certain chips, such as
+   * devices with compute capability 5.2. Partitioned global caching can be
+   * automatically disabled if the occupancy requirement of the launch cannot
+   * support caching.
+   */
+  CUpti_ActivityPartitionedGlobalCacheConfig partitionedGlobalCacheExecuted;
+
+  /**
+   * The start timestamp for the kernel execution, in ns. A value of 0
+   * for both the start and end timestamps indicates that timestamp
+   * information could not be collected for the kernel.
+   */
+  uint64_t start;
+
+  /**
+   * The end timestamp for the kernel execution, in ns. A value of 0
+   * for both the start and end timestamps indicates that timestamp
+   * information could not be collected for the kernel.
+   */
+  uint64_t end;
+
+  /**
+   * The completed timestamp for the kernel execution, in ns.  It
+   * represents the completion of all its child kernels and the
+   * kernel itself. A value of CUPTI_TIMESTAMP_UNKNOWN indicates that
+   * the completion time is unknown.
+   */
+  uint64_t completed;
+
+  /**
+   * The ID of the device where the kernel is executing.
+   */
+  uint32_t deviceId;
+
+  /**
+   * The ID of the context where the kernel is executing.
+   */
+  uint32_t contextId;
+
+  /**
+   * The ID of the stream where the kernel is executing.
+   */
+  uint32_t streamId;
+
+  /**
+   * The X-dimension grid size for the kernel.
+   */
+  int32_t gridX;
+
+  /**
+   * The Y-dimension grid size for the kernel.
+   */
+  int32_t gridY;
+
+  /**
+   * The Z-dimension grid size for the kernel.
+   */
+  int32_t gridZ;
+
+  /**
+   * The X-dimension block size for the kernel.
+   */
+  int32_t blockX;
+
+  /**
+   * The Y-dimension block size for the kernel.
+   */
+  int32_t blockY;
+
+  /**
+   * The Z-dimension block size for the kernel.
+   */
+  int32_t blockZ;
+
+  /**
+   * The static shared memory allocated for the kernel, in bytes.
+   */
+  int32_t staticSharedMemory;
+
+  /**
+   * The dynamic shared memory reserved for the kernel, in bytes.
+   */
+  int32_t dynamicSharedMemory;
+
+  /**
+   * The amount of local memory reserved for each thread, in bytes.
+   */
+  uint32_t localMemoryPerThread;
+
+  /**
+   * The total amount of local memory reserved for the kernel, in
+   * bytes (deprecated in CUDA 11.8).
+   * Refer to field localMemoryTotal_v2.
+   */
+  uint32_t localMemoryTotal;
+
+  /**
+   * The correlation ID of the kernel. Each kernel execution is
+   * assigned a unique correlation ID that is identical to the
+   * correlation ID in the driver or runtime API activity record that
+   * launched the kernel.
+   */
+  uint32_t correlationId;
+
+  /**
+   * The grid ID of the kernel. Each kernel is assigned a unique
+   * grid ID at runtime.
+   */
+  int64_t gridId;
+
+  /**
+   * The name of the kernel. This name is shared across all activity
+   * records representing the same kernel, and so should not be
+   * modified.
+   */
+  const char *name;
+
+  /**
+   * Undefined. Reserved for internal use.
+   */
+  void *reserved0;
+
+  /**
+   * The timestamp when the kernel is queued up in the command buffer, in ns.
+   * A value of CUPTI_TIMESTAMP_UNKNOWN indicates that the queued time
+   * could not be collected for the kernel. This timestamp is not collected
+   * by default. Use API \ref cuptiActivityEnableLatencyTimestamps() to
+   * enable collection.
+   *
+   * Command buffer is a buffer written by CUDA driver to send commands
+   * like kernel launch, memory copy etc to the GPU. All launches of CUDA
+   * kernels are asynchronous with respect to the host, the host requests
+   * the launch by writing commands into the command buffer, then returns
+   * without checking the GPU's progress.
+   */
+  uint64_t queued;
+
+  /**
+   * The timestamp when the command buffer containing the kernel launch
+   * is submitted to the GPU, in ns. A value of CUPTI_TIMESTAMP_UNKNOWN
+   * indicates that the submitted time could not be collected for the kernel.
+   * This timestamp is not collected by default. Use API \ref
+   * cuptiActivityEnableLatencyTimestamps() to enable collection.
+   */
+  uint64_t submitted;
+
+  /**
+   * This indicates if the kernel was executed via a regular launch or via a
+   * single/multi device cooperative launch. \see CUpti_ActivityLaunchType
+   */
+  uint8_t launchType;
+
+  /**
+   * This indicates if CU_FUNC_ATTRIBUTE_PREFERRED_SHARED_MEMORY_CARVEOUT was
+   * updated for the kernel launch.
+   */
+  uint8_t isSharedMemoryCarveoutRequested;
+
+  /**
+   * Shared memory carveout value requested for the function in percentage of
+   * the total resource. The value will be updated only if field
+   * isSharedMemoryCarveoutRequested is set.
+   */
+  uint8_t sharedMemoryCarveoutRequested;
+
+  /**
+   * Undefined. Reserved for internal use.
+   */
+  uint8_t padding;
+
+  /**
+   * Shared memory size set by the driver.
+   */
+  uint32_t sharedMemoryExecuted;
+
+  /**
+   * The unique ID of the graph node that launched this kernel through graph launch APIs.
+   * This field will be 0 if the kernel is not launched through graph launch APIs.
+   */
+  uint64_t graphNodeId;
+
+  /**
+   * The shared memory limit config for the kernel. This field shows whether user has opted for a
+   * higher per block limit of dynamic shared memory.
+   */
+  CUpti_FuncShmemLimitConfig shmemLimitConfig;
+
+  /**
+   * The unique ID of the graph that launched this kernel through graph launch APIs.
+   * This field will be 0 if the kernel is not launched through graph launch APIs.
+   */
+  uint32_t graphId;
+
+  /**
+   * The pointer to the access policy window. The structure CUaccessPolicyWindow is
+   * defined in cuda.h.
+   */
+  CUaccessPolicyWindow *pAccessPolicyWindow;
+
+  /**
+   * The ID of the HW channel on which the kernel is launched.
+   */
+  uint32_t channelID;
+
+  /**
+   * The type of the channel. \see CUpti_ChannelType
+   */
+  CUpti_ChannelType channelType;
+
+  /**
+   * The X-dimension cluster size for the kernel.
+   * Field is valid for devices with compute capability 9.0 and higher.
+   */
+  uint32_t clusterX;
+
+  /**
+   * The Y-dimension cluster size for the kernel.
+   * Field is valid for devices with compute capability 9.0 and higher.
+   */
+  uint32_t clusterY;
+
+  /**
+   * The Z-dimension cluster size for the kernel.
+   * Field is valid for devices with compute capability 9.0 and higher.
+   */
+  uint32_t clusterZ;
+
+  /**
+   * The cluster scheduling policy for the kernel. Refer to CUclusterSchedulingPolicy.
+   * Field is valid for devices with compute capability 9.0 and higher.
+   */
+  uint32_t clusterSchedulingPolicy;
+
+  /**
+   * The total amount of local memory reserved for the kernel, in
+   * bytes.
+   */
+  uint64_t localMemoryTotal_v2;
+} CUpti_ActivityKernel8;
+
+/**
+ * \brief The activity record for memory copies. (deprecated)
+ *
+ * This activity record represents a memory copy
+ * (CUPTI_ACTIVITY_KIND_MEMCPY).
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_MEMCPY.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The kind of the memory copy, stored as a byte to reduce record
+   * size. \see CUpti_ActivityMemcpyKind
+   */
+  uint8_t copyKind;
+
+  /**
+   * The source memory kind read by the memory copy, stored as a byte
+   * to reduce record size. \see CUpti_ActivityMemoryKind
+   */
+  uint8_t srcKind;
+
+  /**
+   * The destination memory kind written by the memory copy, stored as a
+   * byte to reduce record size. \see CUpti_ActivityMemoryKind
+   */
+  uint8_t dstKind;
+
+  /**
+   * The flags associated with the memory copy. \see CUpti_ActivityFlag
+   */
+  uint8_t flags;
+
+  /**
+   * The number of bytes transferred by the memory copy.
+   */
+  uint64_t bytes;
+
+  /**
+   * The start timestamp for the memory copy, in ns. A value of 0 for
+   * both the start and end timestamps indicates that timestamp
+   * information could not be collected for the memory copy.
+   */
+  uint64_t start;
+
+  /**
+   * The end timestamp for the memory copy, in ns. A value of 0 for
+   * both the start and end timestamps indicates that timestamp
+   * information could not be collected for the memory copy.
+   */
+  uint64_t end;
+
+  /**
+   * The ID of the device where the memory copy is occurring.
+   */
+  uint32_t deviceId;
+
+  /**
+   * The ID of the context where the memory copy is occurring.
+   */
+  uint32_t contextId;
+
+  /**
+   * The ID of the stream where the memory copy is occurring.
+   */
+  uint32_t streamId;
+
+  /**
+   * The correlation ID of the memory copy. Each memory copy is
+   * assigned a unique correlation ID that is identical to the
+   * correlation ID in the driver API activity record that launched
+   * the memory copy.
+   */
+  uint32_t correlationId;
+
+  /**
+   * The runtime correlation ID of the memory copy. Each memory copy
+   * is assigned a unique runtime correlation ID that is identical to
+   * the correlation ID in the runtime API activity record that
+   * launched the memory copy.
+   */
+  uint32_t runtimeCorrelationId;
+
+#ifdef CUPTILP64
+  /**
+   * Undefined. Reserved for internal use.
+   */
+  uint32_t pad;
+#endif
+
+  /**
+   * Undefined. Reserved for internal use.
+   */
+  void *reserved0;
+} CUpti_ActivityMemcpy;
+
+/**
+ * \brief The activity record for memory copies. (deprecated in CUDA 11.1)
+ *
+ * This activity record represents a memory copy
+ * (CUPTI_ACTIVITY_KIND_MEMCPY).
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_MEMCPY.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The kind of the memory copy, stored as a byte to reduce record
+   * size. \see CUpti_ActivityMemcpyKind
+   */
+  uint8_t copyKind;
+
+  /**
+   * The source memory kind read by the memory copy, stored as a byte
+   * to reduce record size. \see CUpti_ActivityMemoryKind
+   */
+  uint8_t srcKind;
+
+  /**
+   * The destination memory kind written by the memory copy, stored as a
+   * byte to reduce record size. \see CUpti_ActivityMemoryKind
+   */
+  uint8_t dstKind;
+
+  /**
+   * The flags associated with the memory copy. \see CUpti_ActivityFlag
+   */
+  uint8_t flags;
+
+  /**
+   * The number of bytes transferred by the memory copy.
+   */
+  uint64_t bytes;
+
+  /**
+   * The start timestamp for the memory copy, in ns. A value of 0 for
+   * both the start and end timestamps indicates that timestamp
+   * information could not be collected for the memory copy.
+   */
+  uint64_t start;
+
+  /**
+   * The end timestamp for the memory copy, in ns. A value of 0 for
+   * both the start and end timestamps indicates that timestamp
+   * information could not be collected for the memory copy.
+   */
+  uint64_t end;
+
+  /**
+   * The ID of the device where the memory copy is occurring.
+   */
+  uint32_t deviceId;
+
+  /**
+   * The ID of the context where the memory copy is occurring.
+   */
+  uint32_t contextId;
+
+  /**
+   * The ID of the stream where the memory copy is occurring.
+   */
+  uint32_t streamId;
+
+  /**
+   * The correlation ID of the memory copy. Each memory copy is
+   * assigned a unique correlation ID that is identical to the
+   * correlation ID in the driver API activity record that launched
+   * the memory copy.
+   */
+  uint32_t correlationId;
+
+  /**
+   * The runtime correlation ID of the memory copy. Each memory copy
+   * is assigned a unique runtime correlation ID that is identical to
+   * the correlation ID in the runtime API activity record that
+   * launched the memory copy.
+   */
+  uint32_t runtimeCorrelationId;
+
+#ifdef CUPTILP64
+  /**
+   * Undefined. Reserved for internal use.
+   */
+  uint32_t pad;
+#endif
+
+  /**
+   * Undefined. Reserved for internal use.
+   */
+  void *reserved0;
+
+  /**
+   * The unique ID of the graph node that executed this memcpy through graph launch.
+   * This field will be 0 if the memcpy is not done through graph launch.
+   */
+  uint64_t graphNodeId;
+} CUpti_ActivityMemcpy3;
+
+/**
+ * \brief The activity record for memory copies. (deprecated in CUDA 11.6)
+ *
+ * This activity record represents a memory copy
+ * (CUPTI_ACTIVITY_KIND_MEMCPY).
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_MEMCPY.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The kind of the memory copy, stored as a byte to reduce record
+   * size. \see CUpti_ActivityMemcpyKind
+   */
+  uint8_t copyKind;
+
+  /**
+   * The source memory kind read by the memory copy, stored as a byte
+   * to reduce record size. \see CUpti_ActivityMemoryKind
+   */
+  uint8_t srcKind;
+
+  /**
+   * The destination memory kind written by the memory copy, stored as a
+   * byte to reduce record size. \see CUpti_ActivityMemoryKind
+   */
+  uint8_t dstKind;
+
+  /**
+   * The flags associated with the memory copy. \see CUpti_ActivityFlag
+   */
+  uint8_t flags;
+
+  /**
+   * The number of bytes transferred by the memory copy.
+   */
+  uint64_t bytes;
+
+  /**
+   * The start timestamp for the memory copy, in ns. A value of 0 for
+   * both the start and end timestamps indicates that timestamp
+   * information could not be collected for the memory copy.
+   */
+  uint64_t start;
+
+  /**
+   * The end timestamp for the memory copy, in ns. A value of 0 for
+   * both the start and end timestamps indicates that timestamp
+   * information could not be collected for the memory copy.
+   */
+  uint64_t end;
+
+  /**
+   * The ID of the device where the memory copy is occurring.
+   */
+  uint32_t deviceId;
+
+  /**
+   * The ID of the context where the memory copy is occurring.
+   */
+  uint32_t contextId;
+
+  /**
+   * The ID of the stream where the memory copy is occurring.
+   */
+  uint32_t streamId;
+
+  /**
+   * The correlation ID of the memory copy. Each memory copy is
+   * assigned a unique correlation ID that is identical to the
+   * correlation ID in the driver API activity record that launched
+   * the memory copy.
+   */
+  uint32_t correlationId;
+
+  /**
+   * The runtime correlation ID of the memory copy. Each memory copy
+   * is assigned a unique runtime correlation ID that is identical to
+   * the correlation ID in the runtime API activity record that
+   * launched the memory copy.
+   */
+  uint32_t runtimeCorrelationId;
+
+#ifdef CUPTILP64
+  /**
+   * Undefined. Reserved for internal use.
+   */
+  uint32_t pad;
+#endif
+
+  /**
+   * Undefined. Reserved for internal use.
+   */
+  void *reserved0;
+
+  /**
+   * The unique ID of the graph node that executed this memcpy through graph launch.
+   * This field will be 0 if the memcpy is not done through graph launch.
+   */
+  uint64_t graphNodeId;
+
+  /**
+   * The unique ID of the graph that executed this memcpy through graph launch.
+   * This field will be 0 if the memcpy is not done through graph launch.
+   */
+  uint32_t graphId;
+
+  /**
+   * Undefined. Reserved for internal use.
+   */
+  uint32_t padding;
+} CUpti_ActivityMemcpy4;
+
+/**
+ * \brief The activity record for peer-to-peer memory copies.
+ *
+ * This activity record represents a peer-to-peer memory copy
+ * (CUPTI_ACTIVITY_KIND_MEMCPY2) but is no longer generated
+ * by CUPTI. Peer-to-peer memory copy activities are now reported using the
+ * CUpti_ActivityMemcpyPtoP2 activity record.
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_MEMCPY2.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The kind of the memory copy, stored as a byte to reduce record
+   * size.  \see CUpti_ActivityMemcpyKind
+   */
+  uint8_t copyKind;
+
+  /**
+   * The source memory kind read by the memory copy, stored as a byte
+   * to reduce record size.  \see CUpti_ActivityMemoryKind
+   */
+  uint8_t srcKind;
+
+  /**
+   * The destination memory kind written by the memory copy, stored as a
+   * byte to reduce record size.  \see CUpti_ActivityMemoryKind
+   */
+  uint8_t dstKind;
+
+  /**
+   * The flags associated with the memory copy. \see
+   * CUpti_ActivityFlag
+   */
+  uint8_t flags;
+
+  /**
+   * The number of bytes transferred by the memory copy.
+   */
+  uint64_t bytes;
+
+  /**
+   * The start timestamp for the memory copy, in ns. A value of 0 for
+   * both the start and end timestamps indicates that timestamp
+   * information could not be collected for the memory copy.
+   */
+  uint64_t start;
+
+  /**
+   * The end timestamp for the memory copy, in ns. A value of 0 for
+   * both the start and end timestamps indicates that timestamp
+   * information could not be collected for the memory copy.
+   */
+  uint64_t end;
+
+  /**
+   * The ID of the device where the memory copy is occurring.
+   */
+  uint32_t deviceId;
+
+  /**
+   * The ID of the context where the memory copy is occurring.
+   */
+  uint32_t contextId;
+
+  /**
+   * The ID of the stream where the memory copy is occurring.
+   */
+  uint32_t streamId;
+
+  /**
+   * The ID of the device where memory is being copied from.
+   */
+  uint32_t srcDeviceId;
+
+  /**
+   * The ID of the context owning the memory being copied from.
+   */
+  uint32_t srcContextId;
+
+  /**
+   * The ID of the device where memory is being copied to.
+   */
+  uint32_t dstDeviceId;
+
+  /**
+   * The ID of the context owning the memory being copied to.
+   */
+  uint32_t dstContextId;
+
+  /**
+   * The correlation ID of the memory copy. Each memory copy is
+   * assigned a unique correlation ID that is identical to the
+   * correlation ID in the driver and runtime API activity record that
+   * launched the memory copy.
+   */
+  uint32_t correlationId;
+
+#ifndef CUPTILP64
+  /**
+   * Undefined. Reserved for internal use.
+   */
+  uint32_t pad;
+#endif
+
+  /**
+   * Undefined. Reserved for internal use.
+   */
+  void *reserved0;
+} CUpti_ActivityMemcpyPtoP;
+
+typedef CUpti_ActivityMemcpyPtoP CUpti_ActivityMemcpy2;
+
+/**
+ * \brief The activity record for peer-to-peer memory copies.
+ * (deprecated in CUDA 11.1)
+ *
+ * This activity record represents a peer-to-peer memory copy
+ * (CUPTI_ACTIVITY_KIND_MEMCPY2).
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_MEMCPY2.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The kind of the memory copy, stored as a byte to reduce record
+   * size.  \see CUpti_ActivityMemcpyKind
+   */
+  uint8_t copyKind;
+
+  /**
+   * The source memory kind read by the memory copy, stored as a byte
+   * to reduce record size.  \see CUpti_ActivityMemoryKind
+   */
+  uint8_t srcKind;
+
+  /**
+   * The destination memory kind written by the memory copy, stored as a
+   * byte to reduce record size.  \see CUpti_ActivityMemoryKind
+   */
+  uint8_t dstKind;
+
+  /**
+   * The flags associated with the memory copy. \see
+   * CUpti_ActivityFlag
+   */
+  uint8_t flags;
+
+  /**
+   * The number of bytes transferred by the memory copy.
+   */
+  uint64_t bytes;
+
+  /**
+   * The start timestamp for the memory copy, in ns. A value of 0 for
+   * both the start and end timestamps indicates that timestamp
+   * information could not be collected for the memory copy.
+   */
+  uint64_t start;
+
+  /**
+   * The end timestamp for the memory copy, in ns. A value of 0 for
+   * both the start and end timestamps indicates that timestamp
+   * information could not be collected for the memory copy.
+   */
+  uint64_t end;
+
+  /**
+   * The ID of the device where the memory copy is occurring.
+   */
+  uint32_t deviceId;
+
+  /**
+   * The ID of the context where the memory copy is occurring.
+   */
+  uint32_t contextId;
+
+  /**
+   * The ID of the stream where the memory copy is occurring.
+   */
+  uint32_t streamId;
+
+  /**
+   * The ID of the device where memory is being copied from.
+   */
+  uint32_t srcDeviceId;
+
+  /**
+   * The ID of the context owning the memory being copied from.
+   */
+  uint32_t srcContextId;
+
+  /**
+   * The ID of the device where memory is being copied to.
+   */
+  uint32_t dstDeviceId;
+
+  /**
+   * The ID of the context owning the memory being copied to.
+   */
+  uint32_t dstContextId;
+
+  /**
+   * The correlation ID of the memory copy. Each memory copy is
+   * assigned a unique correlation ID that is identical to the
+   * correlation ID in the driver and runtime API activity record that
+   * launched the memory copy.
+   */
+  uint32_t correlationId;
+
+#ifndef CUPTILP64
+  /**
+   * Undefined. Reserved for internal use.
+   */
+  uint32_t pad;
+#endif
+
+  /**
+   * Undefined. Reserved for internal use.
+   */
+  void *reserved0;
+
+  /**
+   * The unique ID of the graph node that executed the memcpy through graph launch.
+   * This field will be 0 if memcpy is not done using graph launch.
+   */
+  uint64_t graphNodeId;
+} CUpti_ActivityMemcpyPtoP2;
+
+/**
+ * \brief The activity record for peer-to-peer memory copies.
+ * (deprecated in CUDA 11.6)
+ *
+ * This activity record represents a peer-to-peer memory copy
+ * (CUPTI_ACTIVITY_KIND_MEMCPY2).
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_MEMCPY2.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The kind of the memory copy, stored as a byte to reduce record
+   * size.  \see CUpti_ActivityMemcpyKind
+   */
+  uint8_t copyKind;
+
+  /**
+   * The source memory kind read by the memory copy, stored as a byte
+   * to reduce record size.  \see CUpti_ActivityMemoryKind
+   */
+  uint8_t srcKind;
+
+  /**
+   * The destination memory kind written by the memory copy, stored as a
+   * byte to reduce record size.  \see CUpti_ActivityMemoryKind
+   */
+  uint8_t dstKind;
+
+  /**
+   * The flags associated with the memory copy. \see
+   * CUpti_ActivityFlag
+   */
+  uint8_t flags;
+
+  /**
+   * The number of bytes transferred by the memory copy.
+   */
+  uint64_t bytes;
+
+  /**
+   * The start timestamp for the memory copy, in ns. A value of 0 for
+   * both the start and end timestamps indicates that timestamp
+   * information could not be collected for the memory copy.
+   */
+  uint64_t start;
+
+  /**
+   * The end timestamp for the memory copy, in ns. A value of 0 for
+   * both the start and end timestamps indicates that timestamp
+   * information could not be collected for the memory copy.
+   */
+  uint64_t end;
+
+  /**
+   * The ID of the device where the memory copy is occurring.
+   */
+  uint32_t deviceId;
+
+  /**
+   * The ID of the context where the memory copy is occurring.
+   */
+  uint32_t contextId;
+
+  /**
+   * The ID of the stream where the memory copy is occurring.
+   */
+  uint32_t streamId;
+
+  /**
+   * The ID of the device where memory is being copied from.
+   */
+  uint32_t srcDeviceId;
+
+  /**
+   * The ID of the context owning the memory being copied from.
+   */
+  uint32_t srcContextId;
+
+  /**
+   * The ID of the device where memory is being copied to.
+   */
+  uint32_t dstDeviceId;
+
+  /**
+   * The ID of the context owning the memory being copied to.
+   */
+  uint32_t dstContextId;
+
+  /**
+   * The correlation ID of the memory copy. Each memory copy is
+   * assigned a unique correlation ID that is identical to the
+   * correlation ID in the driver and runtime API activity record that
+   * launched the memory copy.
+   */
+  uint32_t correlationId;
+
+#ifndef CUPTILP64
+  /**
+   * Undefined. Reserved for internal use.
+   */
+  uint32_t pad;
+#endif
+
+  /**
+   * Undefined. Reserved for internal use.
+   */
+  void *reserved0;
+
+  /**
+   * The unique ID of the graph node that executed the memcpy through graph launch.
+   * This field will be 0 if memcpy is not done using graph launch.
+   */
+  uint64_t graphNodeId;
+
+  /**
+   * The unique ID of the graph that executed this memcpy through graph launch.
+   * This field will be 0 if the memcpy is not done through graph launch.
+   */
+  uint32_t graphId;
+
+  /**
+   * Undefined. Reserved for internal use.
+   */
+  uint32_t padding;
+} CUpti_ActivityMemcpyPtoP3;
+
+/**
+ * \brief The activity record for memset. (deprecated)
+ *
+ * This activity record represents a memory set operation
+ * (CUPTI_ACTIVITY_KIND_MEMSET).
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_MEMSET.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The value being assigned to memory by the memory set.
+   */
+  uint32_t value;
+
+  /**
+   * The number of bytes being set by the memory set.
+   */
+  uint64_t bytes;
+
+  /**
+   * The start timestamp for the memory set, in ns. A value of 0 for
+   * both the start and end timestamps indicates that timestamp
+   * information could not be collected for the memory set.
+   */
+  uint64_t start;
+
+  /**
+   * The end timestamp for the memory set, in ns. A value of 0 for
+   * both the start and end timestamps indicates that timestamp
+   * information could not be collected for the memory set.
+   */
+  uint64_t end;
+
+  /**
+   * The ID of the device where the memory set is occurring.
+   */
+  uint32_t deviceId;
+
+  /**
+   * The ID of the context where the memory set is occurring.
+   */
+  uint32_t contextId;
+
+  /**
+   * The ID of the stream where the memory set is occurring.
+   */
+  uint32_t streamId;
+
+  /**
+   * The correlation ID of the memory set. Each memory set is assigned
+   * a unique correlation ID that is identical to the correlation ID
+   * in the driver API activity record that launched the memory set.
+   */
+  uint32_t correlationId;
+
+  /**
+   * The flags associated with the memset. \see CUpti_ActivityFlag
+   */
+  uint16_t flags;
+
+  /**
+   * The memory kind of the memory set. \see CUpti_ActivityMemoryKind
+   */
+  uint16_t memoryKind;
+
+#ifdef CUPTILP64
+  /**
+   * Undefined. Reserved for internal use.
+   */
+  uint32_t pad;
+#endif
+
+  /**
+   * Undefined. Reserved for internal use.
+   */
+  void *reserved0;
+} CUpti_ActivityMemset;
+
+/**
+ * \brief The activity record for memset. (deprecated in CUDA 11.1)
+ *
+ * This activity record represents a memory set operation
+ * (CUPTI_ACTIVITY_KIND_MEMSET).
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_MEMSET.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The value being assigned to memory by the memory set.
+   */
+  uint32_t value;
+
+  /**
+   * The number of bytes being set by the memory set.
+   */
+  uint64_t bytes;
+
+  /**
+   * The start timestamp for the memory set, in ns. A value of 0 for
+   * both the start and end timestamps indicates that timestamp
+   * information could not be collected for the memory set.
+   */
+  uint64_t start;
+
+  /**
+   * The end timestamp for the memory set, in ns. A value of 0 for
+   * both the start and end timestamps indicates that timestamp
+   * information could not be collected for the memory set.
+   */
+  uint64_t end;
+
+  /**
+   * The ID of the device where the memory set is occurring.
+   */
+  uint32_t deviceId;
+
+  /**
+   * The ID of the context where the memory set is occurring.
+   */
+  uint32_t contextId;
+
+  /**
+   * The ID of the stream where the memory set is occurring.
+   */
+  uint32_t streamId;
+
+  /**
+   * The correlation ID of the memory set. Each memory set is assigned
+   * a unique correlation ID that is identical to the correlation ID
+   * in the driver API activity record that launched the memory set.
+   */
+  uint32_t correlationId;
+
+  /**
+   * The flags associated with the memset. \see CUpti_ActivityFlag
+   */
+  uint16_t flags;
+
+  /**
+   * The memory kind of the memory set. \see CUpti_ActivityMemoryKind
+   */
+  uint16_t memoryKind;
+
+#ifdef CUPTILP64
+  /**
+   * Undefined. Reserved for internal use.
+   */
+  uint32_t pad;
+#endif
+
+  /**
+   * Undefined. Reserved for internal use.
+   */
+  void *reserved0;
+
+  /**
+   * The unique ID of the graph node that executed this memset through graph launch.
+   * This field will be 0 if the memset is not executed through graph launch.
+   */
+  uint64_t graphNodeId;
+} CUpti_ActivityMemset2;
+
+/**
+ * \brief The activity record for memset. (deprecated in CUDA 11.6)
+ *
+ * This activity record represents a memory set operation
+ * (CUPTI_ACTIVITY_KIND_MEMSET).
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_MEMSET.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The value being assigned to memory by the memory set.
+   */
+  uint32_t value;
+
+  /**
+   * The number of bytes being set by the memory set.
+   */
+  uint64_t bytes;
+
+  /**
+   * The start timestamp for the memory set, in ns. A value of 0 for
+   * both the start and end timestamps indicates that timestamp
+   * information could not be collected for the memory set.
+   */
+  uint64_t start;
+
+  /**
+   * The end timestamp for the memory set, in ns. A value of 0 for
+   * both the start and end timestamps indicates that timestamp
+   * information could not be collected for the memory set.
+   */
+  uint64_t end;
+
+  /**
+   * The ID of the device where the memory set is occurring.
+   */
+  uint32_t deviceId;
+
+  /**
+   * The ID of the context where the memory set is occurring.
+   */
+  uint32_t contextId;
+
+  /**
+   * The ID of the stream where the memory set is occurring.
+   */
+  uint32_t streamId;
+
+  /**
+   * The correlation ID of the memory set. Each memory set is assigned
+   * a unique correlation ID that is identical to the correlation ID
+   * in the driver API activity record that launched the memory set.
+   */
+  uint32_t correlationId;
+
+  /**
+   * The flags associated with the memset. \see CUpti_ActivityFlag
+   */
+  uint16_t flags;
+
+  /**
+   * The memory kind of the memory set. \see CUpti_ActivityMemoryKind
+   */
+  uint16_t memoryKind;
+
+#ifdef CUPTILP64
+  /**
+   * Undefined. Reserved for internal use.
+   */
+  uint32_t pad;
+#endif
+
+  /**
+   * Undefined. Reserved for internal use.
+   */
+  void *reserved0;
+
+  /**
+   * The unique ID of the graph node that executed this memset through graph launch.
+   * This field will be 0 if the memset is not executed through graph launch.
+   */
+  uint64_t graphNodeId;
+
+  /**
+   * The unique ID of the graph that executed this memset through graph launch.
+   * This field will be 0 if the memset is not executed through graph launch.
+   */
+  uint32_t graphId;
+
+  /**
+   * Undefined. Reserved for internal use.
+   */
+  uint32_t padding;
+} CUpti_ActivityMemset3;
+
+/**
+ * \brief The activity record for memory.
+ *
+ * This activity record represents a memory allocation and free operation
+ * (CUPTI_ACTIVITY_KIND_MEMORY2).
+ * This activity record provides separate records for memory allocation and
+ * memory release operations.
+ * This allows correlating the corresponding driver and runtime API
+ * activity record with the memory operation.
+ *
+ * Note: This activity record is an upgrade over \ref CUpti_ActivityMemory
+ * enabled using the kind \ref CUPTI_ACTIVITY_KIND_MEMORY.
+ * \ref CUpti_ActivityMemory provides a single record for the memory
+ * allocation and memory release operations.
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_MEMORY2.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The memory operation requested by the user, \ref CUpti_ActivityMemoryOperationType.
+   */
+  CUpti_ActivityMemoryOperationType memoryOperationType;
+
+  /**
+   * The memory kind requested by the user, \ref CUpti_ActivityMemoryKind.
+   */
+  CUpti_ActivityMemoryKind memoryKind;
+
+  /**
+   * The correlation ID of the memory operation. Each memory operation is
+   * assigned a unique correlation ID that is identical to the
+   * correlation ID in the driver and runtime API activity record that
+   * launched the memory operation.
+   */
+  uint32_t correlationId;
+
+  /**
+   * The virtual address of the allocation.
+   */
+  uint64_t address;
+
+  /**
+   * The number of bytes of memory allocated.
+   */
+  uint64_t bytes;
+
+  /**
+   * The start timestamp for the memory operation, in ns.
+   */
+  uint64_t timestamp;
+
+  /**
+   * The program counter of the memory operation.
+   */
+  uint64_t PC;
+
+  /**
+   * The ID of the process to which this record belongs.
+   */
+  uint32_t processId;
+
+  /**
+   * The ID of the device where the memory operation is taking place.
+   */
+  uint32_t deviceId;
+
+  /**
+   * The ID of the context. If context is NULL, \p contextId is set to CUPTI_INVALID_CONTEXT_ID.
+   */
+  uint32_t contextId;
+
+  /**
+   * The ID of the stream. If memory operation is not async, \p streamId is set to CUPTI_INVALID_STREAM_ID.
+   */
+  uint32_t streamId;
+
+  /**
+   * Variable name. This name is shared across all activity
+   * records representing the same symbol, and so should not be
+   * modified.
+   */
+  const char* name;
+
+  /**
+   * \p isAsync is set if memory operation happens through async memory APIs.
+   */
+  uint32_t isAsync;
+
+#ifdef CUPTILP64
+  /**
+   * Undefined. Reserved for internal use. Only present when CUPTILP64 is defined.
+   */
+  uint32_t pad1;
+#endif
+
+  /**
+   * The memory pool configuration used for the memory operations.
+   */
+  struct {
+    /**
+     * The type of the memory pool, \ref CUpti_ActivityMemoryPoolType
+     */
+    CUpti_ActivityMemoryPoolType memoryPoolType;
+
+#ifdef CUPTILP64
+    /**
+     * Undefined. Reserved for internal use. Only present when CUPTILP64 is defined.
+     */
+    uint32_t pad2;
+#endif
+
+    /**
+     * The base address of the memory pool.
+     */
+    uint64_t address;
+
+    /**
+     * The release threshold of the memory pool in bytes. \p releaseThreshold is
+     * valid for CUPTI_ACTIVITY_MEMORY_POOL_TYPE_LOCAL, \ref CUpti_ActivityMemoryPoolType.
+     */
+    uint64_t releaseThreshold;
+
+   /**
+    * The size of the memory pool in bytes, or the process ID of the memory pool.
+    * \p size is valid if \p memoryPoolType is
+    * CUPTI_ACTIVITY_MEMORY_POOL_TYPE_LOCAL, \ref CUpti_ActivityMemoryPoolType.
+    * \p processId is valid if \p memoryPoolType is
+    * CUPTI_ACTIVITY_MEMORY_POOL_TYPE_IMPORTED, \ref CUpti_ActivityMemoryPoolType.
+    */
+   union {
+      uint64_t size;
+      uint64_t processId;
+    } pool;
+  } memoryPoolConfig;
+
+} CUpti_ActivityMemory2;
+
+/**
+ * \brief The activity record for memory pool.
+ *
+ * This activity record represents a memory pool creation, destruction and
+ * trimming (CUPTI_ACTIVITY_KIND_MEMORY_POOL).
+ * This activity record provides separate records for memory pool creation,
+ * destruction and trimming operations.
+ * This allows correlating the corresponding driver and runtime API
+ * activity record with the memory pool operation.
+ *
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_MEMORY_POOL.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The memory pool operation requested by the user, \ref CUpti_ActivityMemoryPoolOperationType.
+   */
+  CUpti_ActivityMemoryPoolOperationType memoryPoolOperationType;
+
+  /**
+   * The type of the memory pool, \ref CUpti_ActivityMemoryPoolType
+   */
+  CUpti_ActivityMemoryPoolType memoryPoolType;
+
+  /**
+   * The correlation ID of the memory pool operation. Each memory pool
+   * operation is assigned a unique correlation ID that is identical to the
+   * correlation ID in the driver and runtime API activity record that
+   * launched the memory pool operation.
+   */
+  uint32_t correlationId;
+
+  /**
+   * The ID of the process to which this record belongs.
+   */
+  uint32_t processId;
+
+  /**
+   * The ID of the device where the memory pool is created.
+   */
+  uint32_t deviceId;
+
+  /**
+   * The minimum bytes to keep of the memory pool. \p minBytesToKeep is
+   * valid for CUPTI_ACTIVITY_MEMORY_POOL_OPERATION_TYPE_TRIMMED,
+   * \ref CUpti_ActivityMemoryPoolOperationType
+   */
+  size_t minBytesToKeep;
+
+#ifndef CUPTILP64
+  /**
+   * Undefined. Reserved for internal use. Only present when CUPTILP64 is not defined.
+   */
+  uint32_t pad;
+#endif
+
+  /**
+   * The virtual address of the allocation.
+   */
+  uint64_t address;
+
+  /**
+   * The size of the memory pool operation in bytes. \p size is
+   * valid for CUPTI_ACTIVITY_MEMORY_POOL_TYPE_LOCAL, \ref CUpti_ActivityMemoryPoolType.
+   */
+  uint64_t size;
+
+  /**
+   * The release threshold of the memory pool. \p releaseThreshold is
+   * valid for CUPTI_ACTIVITY_MEMORY_POOL_TYPE_LOCAL, \ref CUpti_ActivityMemoryPoolType.
+   */
+  uint64_t releaseThreshold;
+
+  /**
+   * The start timestamp for the memory operation, in ns.
+   */
+  uint64_t timestamp;
+} CUpti_ActivityMemoryPool;
+
+/**
+ * \brief The activity record providing a marker which is an
+ * instantaneous point in time. (deprecated in CUDA 8.0)
+ *
+ * The marker is specified with a descriptive name and unique ID
+ * (CUPTI_ACTIVITY_KIND_MARKER).
+ * Marker activity is now reported using the
+ * CUpti_ActivityMarker2 activity record.
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_MARKER.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The flags associated with the marker. \see CUpti_ActivityFlag
+   */
+  CUpti_ActivityFlag flags;
+
+  /**
+   * The timestamp for the marker, in ns. A value of 0 indicates that
+   * timestamp information could not be collected for the marker.
+   */
+  uint64_t timestamp;
+
+  /**
+   * The marker ID.
+   */
+  uint32_t id;
+
+  /**
+   * The kind of activity object associated with this marker.
+   */
+  CUpti_ActivityObjectKind objectKind;
+
+  /**
+   * The identifier for the activity object associated with this
+   * marker. 'objectKind' indicates which ID is valid for this record.
+   */
+  CUpti_ActivityObjectKindId objectId;
+
+#ifdef CUPTILP64
+  /**
+   * Undefined. Reserved for internal use. Only present when CUPTILP64 is defined.
+   */
+  uint32_t pad;
+#endif
+
+  /**
+   * The marker name for an instantaneous or start marker. This will
+   * be NULL for an end marker.
+   */
+  const char *name;
+
+} CUpti_ActivityMarker;
+
+/**
+ * \brief The activity record for source-level global
+ * access. (deprecated)
+ *
+ * This activity records the locations of the global
+ * accesses in the source (CUPTI_ACTIVITY_KIND_GLOBAL_ACCESS).
+ * Global access activities are now reported using the
+ * CUpti_ActivityGlobalAccess3 activity record.
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_GLOBAL_ACCESS.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The properties of this global access.
+   */
+  CUpti_ActivityFlag flags;
+
+  /**
+   * The ID for source locator.
+   */
+  uint32_t sourceLocatorId;
+
+  /**
+   * The correlation ID of the kernel to which this result is associated.
+   */
+  uint32_t correlationId;
+
+  /**
+   * The pc offset for the access.
+   */
+  uint32_t pcOffset;
+
+  /**
+   * The number of times this instruction was executed per warp. It will be incremented
+   * when at least one thread in the warp is active with predicate and condition code
+   * evaluating to true.
+   */
+  uint32_t executed;
+
+  /**
+   * Incremented, each time this instruction is executed, by the number of threads
+   * that executed this instruction with predicate and condition code evaluating to true.
+   */
+  uint64_t threadsExecuted;
+
+  /**
+   * The total number of 32-byte transactions to L2 cache generated by this access.
+   */
+  uint64_t l2_transactions;
+} CUpti_ActivityGlobalAccess;
+
+/**
+ * \brief The activity record for source-level global
+ * access. (deprecated in CUDA 9.0)
+ *
+ * This activity records the locations of the global
+ * accesses in the source (CUPTI_ACTIVITY_KIND_GLOBAL_ACCESS).
+ * Global access activities are now reported using the
+ * CUpti_ActivityGlobalAccess3 activity record.
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_GLOBAL_ACCESS.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The properties of this global access.
+   */
+  CUpti_ActivityFlag flags;
+
+  /**
+   * The ID for source locator.
+   */
+  uint32_t sourceLocatorId;
+
+  /**
+   * The correlation ID of the kernel to which this result is associated.
+   */
+  uint32_t correlationId;
+
+  /**
+   * Correlation ID with global/device function name.
+   */
+  uint32_t functionId;
+
+  /**
+   * The pc offset for the access.
+   */
+  uint32_t pcOffset;
+
+  /**
+   * Incremented, each time this instruction is executed, by the number of threads
+   * that executed this instruction with predicate and condition code evaluating to true.
+   */
+  uint64_t threadsExecuted;
+
+  /**
+   * The total number of 32-byte transactions to L2 cache generated by this access.
+   */
+  uint64_t l2_transactions;
+
+  /**
+   * The minimum number of L2 transactions possible based on the access pattern.
+   */
+  uint64_t theoreticalL2Transactions;
+
+  /**
+   * The number of times this instruction was executed per warp. It will be incremented
+   * when at least one thread in the warp is active with predicate and condition code
+   * evaluating to true.
+   */
+  uint32_t executed;
+
+  /**
+   * Undefined. Reserved for internal use.
+   */
+  uint32_t pad;
+} CUpti_ActivityGlobalAccess2;
+
+/**
+ * \brief The activity record for source level result
+ * branch. (deprecated)
+ *
+ * This activity records the locations of the branches in the
+ * source (CUPTI_ACTIVITY_KIND_BRANCH).
+ * Branch activities are now reported using the
+ * CUpti_ActivityBranch2 activity record.
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_BRANCH.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The ID for source locator.
+   */
+  uint32_t sourceLocatorId;
+
+  /**
+   * The correlation ID of the kernel to which this result is associated.
+   */
+  uint32_t correlationId;
+
+  /**
+   * The pc offset for the branch.
+   */
+  uint32_t pcOffset;
+
+  /**
+   * The number of times this instruction was executed per warp. It will be incremented
+   * regardless of predicate or condition code.
+   */
+  uint32_t executed;
+
+  /**
+   * Number of times this branch diverged.
+   */
+  uint32_t diverged;
+
+  /**
+   * Incremented, each time this instruction is executed, by the number
+   * of threads that executed this instruction.
+   */
+  uint64_t threadsExecuted;
+} CUpti_ActivityBranch;
+
+/**
+ * \brief The activity record for PC sampling. (deprecated in CUDA 8.0)
+ *
+ * This activity records information obtained by sampling PC
+ * (CUPTI_ACTIVITY_KIND_PC_SAMPLING).
+ * PC sampling activities are now reported using the
+ * CUpti_ActivityPCSampling2 activity record.
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_PC_SAMPLING.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The properties of this instruction.
+   */
+  CUpti_ActivityFlag flags;
+
+  /**
+   * The ID for source locator.
+   */
+  uint32_t sourceLocatorId;
+
+  /**
+   * The correlation ID of the kernel to which this result is associated.
+   */
+  uint32_t correlationId;
+
+  /**
+   * Correlation ID with global/device function name.
+   */
+  uint32_t functionId;
+
+  /**
+   * The pc offset for the instruction.
+   */
+  uint32_t pcOffset;
+
+  /**
+   * Number of times the PC was sampled with the stallReason in the record.
+   * The same PC can be sampled with different stall reasons.
+   */
+  uint32_t samples;
+
+  /**
+   * Current stall reason. Includes one of the reasons from
+   * \ref CUpti_ActivityPCSamplingStallReason
+   */
+  CUpti_ActivityPCSamplingStallReason stallReason;
+} CUpti_ActivityPCSampling;
+
+/**
+ * \brief The activity record for PC sampling. (deprecated in CUDA 9.0)
+ *
+ * This activity records information obtained by sampling PC
+ * (CUPTI_ACTIVITY_KIND_PC_SAMPLING).
+ * PC sampling activities are now reported using the
+ * CUpti_ActivityPCSampling3 activity record.
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_PC_SAMPLING.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The properties of this instruction.
+   */
+  CUpti_ActivityFlag flags;
+
+  /**
+   * The ID for source locator.
+   */
+  uint32_t sourceLocatorId;
+
+  /**
+   * The correlation ID of the kernel to which this result is associated.
+   */
+  uint32_t correlationId;
+
+  /**
+   * Correlation ID with global/device function name.
+   */
+  uint32_t functionId;
+
+  /**
+   * The pc offset for the instruction.
+   */
+  uint32_t pcOffset;
+
+  /**
+   * Number of times the PC was sampled with the stallReason in the record.
+   * These samples indicate that no instruction was issued in that cycle from
+   * the warp scheduler from where the warp was sampled.
+   * Field is valid for devices with compute capability 6.0 and higher.
+   */
+  uint32_t latencySamples;
+
+  /**
+   * Number of times the PC was sampled with the stallReason in the record.
+   * The same PC can be sampled with different stall reasons. The count includes
+   * latencySamples.
+   */
+  uint32_t samples;
+
+  /**
+   * Current stall reason. Includes one of the reasons from
+   * \ref CUpti_ActivityPCSamplingStallReason
+   */
+  CUpti_ActivityPCSamplingStallReason stallReason;
+
+  uint32_t pad; /**< Undefined. Reserved for internal use. */
+} CUpti_ActivityPCSampling2;
+
+/**
+ * \brief The activity record for Unified Memory counters (deprecated in CUDA 7.0)
+ *
+ * This activity record represents a Unified Memory counter
+ * (CUPTI_ACTIVITY_KIND_UNIFIED_MEMORY_COUNTER).
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_UNIFIED_MEMORY_COUNTER.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The Unified Memory counter kind. See \ref CUpti_ActivityUnifiedMemoryCounterKind
+   */
+  CUpti_ActivityUnifiedMemoryCounterKind counterKind;
+
+  /**
+   * Scope of the Unified Memory counter. See \ref CUpti_ActivityUnifiedMemoryCounterScope
+   */
+  CUpti_ActivityUnifiedMemoryCounterScope scope;
+
+  /**
+   * The ID of the device involved in the memory transfer operation.
+   * It is not relevant if the scope of the counter is global (all devices).
+   */
+  uint32_t deviceId;
+
+  /**
+   * Value of the counter.
+   *
+   */
+  uint64_t value;
+
+  /**
+   * The timestamp when this sample was retrieved, in ns. A value of 0
+   * indicates that timestamp information could not be collected.
+   */
+  uint64_t timestamp;
+
+  /**
+   * The ID of the process to which this record belongs. In case of
+   * global scope, processId is undefined.
+   */
+  uint32_t processId;
+
+  /**
+   * Undefined. Reserved for internal use.
+   */
+  uint32_t pad;
+} CUpti_ActivityUnifiedMemoryCounter;
+
+/**
+* \brief NVLink information. (deprecated in CUDA 9.0)
+*
+* This structure gives capabilities of each logical NVLink connection between two devices,
+* gpu<->gpu or gpu<->CPU which can be used to understand the topology.
+* NVLink information is now reported using the
+* CUpti_ActivityNvLink2 activity record.
+*/
+typedef struct PACKED_ALIGNMENT {
+  /**
+  * The activity record kind, must be CUPTI_ACTIVITY_KIND_NVLINK.
+  */
+  CUpti_ActivityKind kind;
+
+  /**
+  * NVLink version.
+  */
+  uint32_t nvlinkVersion;
+
+  /**
+  * Type of device 0 \ref CUpti_DevType
+  */
+  CUpti_DevType typeDev0;
+
+  /**
+  * Type of device 1 \ref CUpti_DevType
+  */
+  CUpti_DevType typeDev1;
+
+  /**
+  * If typeDev0 is CUPTI_DEV_TYPE_GPU, UUID for device 0. \ref CUpti_ActivityDevice5.
+  * If typeDev0 is CUPTI_DEV_TYPE_NPU, struct npu for NPU.
+  */
+  union {
+    CUuuid uuidDev;
+    struct {
+      /**
+      * Index of the NPU. First index will always be zero.
+      */
+      uint32_t index;
+
+      /**
+      * Domain ID of NPU. On Linux, this can be queried using lspci.
+      */
+      uint32_t domainId;
+    } npu;
+  } idDev0;
+
+  /**
+  * If typeDev1 is CUPTI_DEV_TYPE_GPU, UUID for device 1. \ref CUpti_ActivityDevice5.
+  * If typeDev1 is CUPTI_DEV_TYPE_NPU, struct npu for NPU.
+  */
+  union {
+    CUuuid uuidDev;
+    struct {
+      /**
+      * Index of the NPU. First index will always be zero.
+      */
+      uint32_t index;
+
+      /**
+      * Domain ID of NPU. On Linux, this can be queried using lspci.
+      */
+      uint32_t domainId;
+    } npu;
+  } idDev1;
+
+  /**
+  * Flag gives capabilities of the link. \see CUpti_LinkFlag
+  */
+  uint32_t flag;
+
+  /**
+  * Number of physical NVLinks present between two devices.
+  */
+  uint32_t physicalNvLinkCount;
+
+  /**
+  * Port numbers for maximum 4 NVLinks connected to device 0.
+  * If typeDev0 is CUPTI_DEV_TYPE_NPU, ignore this field.
+  * In case of invalid/unknown port number, this field will be set
+  * to value CUPTI_NVLINK_INVALID_PORT.
+  * This will be used to correlate the metric values to individual
+  * physical link and attribute traffic to the logical NVLink in
+  * the topology.
+  */
+  int8_t portDev0[4];
+
+  /**
+  * Port numbers for maximum 4 NVLinks connected to device 1.
+  * If typeDev1 is CUPTI_DEV_TYPE_NPU, ignore this field.
+  * In case of invalid/unknown port number, this field will be set
+  * to value CUPTI_NVLINK_INVALID_PORT.
+  * This will be used to correlate the metric values to individual
+  * physical link and attribute traffic to the logical NVLink in
+  * the topology.
+  */
+  int8_t portDev1[4];
+
+  /**
+  * Bandwidth of NVLink in kbytes/sec.
+  */
+  uint64_t bandwidth;
+} CUpti_ActivityNvLink;
+
+/**
+* \brief NVLink information. (deprecated in CUDA 10.0)
+*
+* This structure gives capabilities of each logical NVLink connection between two devices,
+* gpu<->gpu or gpu<->CPU which can be used to understand the topology.
+* NVLink information is now reported using the
+* CUpti_ActivityNvLink4 activity record.
+*/
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_NVLINK.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * NVLink version.
+   */
+  uint32_t nvlinkVersion;
+
+  /**
+   * Type of device 0 \ref CUpti_DevType
+   */
+  CUpti_DevType typeDev0;
+
+  /**
+   * Type of device 1 \ref CUpti_DevType
+   */
+  CUpti_DevType typeDev1;
+
+  /**
+   * If typeDev0 is CUPTI_DEV_TYPE_GPU, UUID for device 0. \ref CUpti_ActivityDevice5.
+   * If typeDev0 is CUPTI_DEV_TYPE_NPU, struct npu for NPU.
+   */
+  union {
+    CUuuid uuidDev;
+    struct {
+      /**
+       * Index of the NPU. First index will always be zero.
+       */
+      uint32_t index;
+
+      /**
+       * Domain ID of NPU. On Linux, this can be queried using lspci.
+       */
+      uint32_t domainId;
+    } npu;
+  } idDev0;
+
+  /**
+   * If typeDev1 is CUPTI_DEV_TYPE_GPU, UUID for device 1. \ref CUpti_ActivityDevice5.
+   * If typeDev1 is CUPTI_DEV_TYPE_NPU, struct npu for NPU.
+   */
+  union {
+    CUuuid uuidDev;
+    struct {
+      /**
+       * Index of the NPU. First index will always be zero.
+       */
+      uint32_t index;
+
+      /**
+       * Domain ID of NPU. On Linux, this can be queried using lspci.
+       */
+      uint32_t domainId;
+    } npu;
+  } idDev1;
+
+  /**
+   * Flag gives capabilities of the link. \see CUpti_LinkFlag
+   */
+  uint32_t flag;
+
+  /**
+   * Number of physical NVLinks present between two devices.
+   */
+  uint32_t physicalNvLinkCount;
+
+  /**
+   * Port numbers for maximum 16 NVLinks connected to device 0.
+   * If typeDev0 is CUPTI_DEV_TYPE_NPU, ignore this field.
+   * In case of invalid/unknown port number, this field will be set
+   * to value CUPTI_NVLINK_INVALID_PORT.
+   * This will be used to correlate the metric values to individual
+   * physical link and attribute traffic to the logical NVLink in
+   * the topology.
+   */
+  int8_t portDev0[CUPTI_MAX_NVLINK_PORTS];
+
+  /**
+   * Port numbers for maximum 16 NVLinks connected to device 1.
+   * If typeDev1 is CUPTI_DEV_TYPE_NPU, ignore this field.
+   * In case of invalid/unknown port number, this field will be set
+   * to value CUPTI_NVLINK_INVALID_PORT.
+   * This will be used to correlate the metric values to individual
+   * physical link and attribute traffic to the logical NVLink in
+   * the topology.
+   */
+  int8_t portDev1[CUPTI_MAX_NVLINK_PORTS];
+
+  /**
+   * Bandwidth of NVLink in kbytes/sec.
+   */
+  uint64_t  bandwidth;
+} CUpti_ActivityNvLink2;
+
+/**
+* \brief NVLink information.
+*
+* This structure gives capabilities of each logical NVLink connection between two devices,
+* gpu<->gpu or gpu<->CPU which can be used to understand the topology.
+* NVLink information is now reported using the
+* CUpti_ActivityNvLink4 activity record.
+*/
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_NVLINK.
+   */
+  CUpti_ActivityKind kind;
+  /**
+   * NVLink version.
+   */
+  uint32_t nvlinkVersion;
+
+  /**
+   * Type of device 0 \ref CUpti_DevType
+   */
+  CUpti_DevType typeDev0;
+
+  /**
+   * Type of device 1 \ref CUpti_DevType
+   */
+  CUpti_DevType typeDev1;
+
+  /**
+   * If typeDev0 is CUPTI_DEV_TYPE_GPU, UUID for device 0. \ref CUpti_ActivityDevice5.
+   * If typeDev0 is CUPTI_DEV_TYPE_NPU, struct npu for NPU.
+   */
+  union {
+    CUuuid uuidDev;
+    struct {
+      /**
+       * Index of the NPU. First index will always be zero.
+       */
+      uint32_t index;
+
+      /**
+       * Domain ID of NPU. On Linux, this can be queried using lspci.
+       */
+      uint32_t domainId;
+    } npu;
+  } idDev0;
+
+  /**
+   * If typeDev1 is CUPTI_DEV_TYPE_GPU, UUID for device 1. \ref CUpti_ActivityDevice5.
+   * If typeDev1 is CUPTI_DEV_TYPE_NPU, struct npu for NPU.
+   */
+  union {
+    CUuuid uuidDev;
+    struct {
+      /**
+       * Index of the NPU. First index will always be zero.
+       */
+      uint32_t index;
+
+      /**
+       * Domain ID of NPU. On Linux, this can be queried using lspci.
+       */
+      uint32_t domainId;
+    } npu;
+  } idDev1;
+
+  /**
+   * Flag gives capabilities of the link. \see CUpti_LinkFlag
+   */
+  uint32_t flag;
+
+  /**
+   * Number of physical NVLinks present between two devices.
+   */
+  uint32_t physicalNvLinkCount;
+
+  /**
+   * Port numbers for maximum 16 NVLinks connected to device 0.
+   * If typeDev0 is CUPTI_DEV_TYPE_NPU, ignore this field.
+   * In case of invalid/unknown port number, this field will be set
+   * to value CUPTI_NVLINK_INVALID_PORT.
+   * This will be used to correlate the metric values to individual
+   * physical link and attribute traffic to the logical NVLink in
+   * the topology.
+   */
+  int8_t portDev0[CUPTI_MAX_NVLINK_PORTS];
+
+  /**
+   * Port numbers for maximum 16 NVLinks connected to device 1.
+   * If typeDev1 is CUPTI_DEV_TYPE_NPU, ignore this field.
+   * In case of invalid/unknown port number, this field will be set
+   * to value CUPTI_NVLINK_INVALID_PORT.
+   * This will be used to correlate the metric values to individual
+   * physical link and attribute traffic to the logical NVLink in
+   * the topology.
+   */
+  int8_t portDev1[CUPTI_MAX_NVLINK_PORTS];
+
+  /**
+   * Bandwidth of NVLink in kbytes/sec.
+   */
+  uint64_t bandwidth;
+
+  /**
+   * NVSwitch is connected as an intermediate node.
+   */
+  uint8_t nvswitchConnected;
+
+  /**
+   * Undefined. Reserved for internal use.
+   */
+  uint8_t pad[7];
+} CUpti_ActivityNvLink3;
+
+/**
+ * \brief The activity record for trace of graph execution.
+ *
+ * This activity record represents execution for a graph without giving visibility
+ * about the execution of its nodes. This is intended to reduce overheads in tracing
+ * each node. The activity kind is CUPTI_ACTIVITY_KIND_GRAPH_TRACE.
+ * Graph trace activity is now reported using the CUpti_ActivityGraphTrace2 record.
+ */
+typedef struct { /* NOTE(review): declared without PACKED_ALIGNMENT, unlike the neighbouring records - confirm intentional */
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_GRAPH_TRACE.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The correlation ID of the graph launch. Each graph launch is
+   * assigned a unique correlation ID that is identical to the
+   * correlation ID in the driver API activity record that launched
+   * the graph.
+   */
+  uint32_t correlationId;
+
+  /**
+   * The start timestamp for the graph execution, in ns. A value of 0
+   * for both the start and end timestamps indicates that timestamp
+   * information could not be collected for the graph.
+   */
+  uint64_t start;
+
+  /**
+   * The end timestamp for the graph execution, in ns. A value of 0
+   * for both the start and end timestamps indicates that timestamp
+   * information could not be collected for the graph.
+   */
+  uint64_t end;
+
+  /**
+   * The ID of the device where the graph execution is occurring.
+   */
+  uint32_t deviceId;
+
+  /**
+   * The unique ID of the graph that is launched.
+   */
+  uint32_t graphId;
+
+  /**
+   * The ID of the context where the graph is being launched.
+   */
+  uint32_t contextId;
+
+  /**
+   * The ID of the stream where the graph is being launched.
+   */
+  uint32_t streamId;
+
+  /**
+   * This field is reserved for internal use.
+   */
+  void *reserved;
+} CUpti_ActivityGraphTrace;
+
+/**
+ * \brief The activity record for a context.
+ *
+ * This activity record represents information about a context
+ * (CUPTI_ACTIVITY_KIND_CONTEXT).
+ * Context activity is now reported using the CUpti_ActivityContext2 record.
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_CONTEXT.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The context ID.
+   */
+  uint32_t contextId;
+
+  /**
+   * The device ID.
+   */
+  uint32_t deviceId;
+
+  /**
+   * The compute API kind. \see CUpti_ActivityComputeApiKind
+   */
+  uint16_t computeApiKind;
+
+  /**
+   * The ID for the NULL stream in this context.
+   */
+  uint16_t nullStreamId;
+} CUpti_ActivityContext;
+
+/**
+ * \brief The activity record for JIT operations.
+ * This activity represents the JIT operations (compile, load, store) of a CUmodule
+ * from the Compute Cache.
+ * Gives the exact hashed path of where the cached module is loaded from,
+ * or where the module will be stored after Just-In-Time (JIT) compilation.
+ *
+ * JIT activity is now reported using the CUpti_ActivityJit2 record.
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * The activity record kind, must be CUPTI_ACTIVITY_KIND_JIT.
+   */
+  CUpti_ActivityKind kind;
+
+  /**
+   * The JIT entry type.
+   */
+  CUpti_ActivityJitEntryType jitEntryType;
+
+  /**
+   * The JIT operation type.
+   */
+  CUpti_ActivityJitOperationType jitOperationType;
+
+  /**
+   * The device ID.
+   */
+  uint32_t deviceId;
+
+  /**
+   * The start timestamp for the JIT operation, in ns. A value of 0 for
+   * both the start and end timestamps indicates that timestamp
+   * information could not be collected for the JIT operation.
+   */
+  uint64_t start;
+
+  /**
+   * The end timestamp for the JIT operation, in ns. A value of 0 for both
+   * the start and end timestamps indicates that timestamp information
+   * could not be collected for the JIT operation.
+   */
+  uint64_t end;
+
+  /**
+   * The correlation ID of the JIT operation to which
+   * this record belongs. Each JIT operation is
+   * assigned a unique correlation ID that is identical to the
+   * correlation ID in the driver or runtime API activity record that
+   * launched the JIT operation.
+   */
+  uint32_t correlationId;
+
+  /**
+   * Undefined. Reserved for internal use.
+   */
+  uint32_t padding;
+
+  /**
+   * The correlation ID to correlate JIT compilation, load and store operations.
+   * Each JIT compilation unit is assigned a unique correlation ID
+   * at the time of the JIT compilation. This correlation ID can be used
+   * to find the matching JIT cache load/store records.
+   */
+  uint64_t jitOperationCorrelationId;
+
+  /**
+   * The size of compute cache.
+   */
+  uint64_t cacheSize;
+
+  /**
+   * The path where the fat binary is cached.
+   */
+  const char* cachePath;
+} CUpti_ActivityJit;
+
+
+#if defined(__GNUC__) && defined(CUPTI_LIB)
+    #pragma GCC visibility pop
+#endif
+
+#if defined(__cplusplus)
+}
+#endif
+
+#endif /*_CUPTI_ACTIVITY_DEPRECATED_H_*/
diff --git a/.venv/lib/python3.11/site-packages/nvidia/cuda_cupti/include/cupti_callbacks.h b/.venv/lib/python3.11/site-packages/nvidia/cuda_cupti/include/cupti_callbacks.h
new file mode 100644
index 0000000000000000000000000000000000000000..afc6939244ef7a06c598c1d5750f638305a31501
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/nvidia/cuda_cupti/include/cupti_callbacks.h
@@ -0,0 +1,860 @@
+/*
+ * Copyright 2010-2023 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#if !defined(__CUPTI_CALLBACKS_H__)
+#define __CUPTI_CALLBACKS_H__
+
+#include <cuda.h>
+#include <builtin_types.h>
+#include <string.h>
+#include <cuda_stdint.h>
+#include <cupti_result.h>
+
+#ifndef CUPTIAPI
+#ifdef _WIN32
+#define CUPTIAPI __stdcall
+#else
+#define CUPTIAPI
+#endif
+#endif
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+#if defined(__GNUC__) && defined(CUPTI_LIB)
+    #pragma GCC visibility push(default)
+#endif
+
+/**
+ * \defgroup CUPTI_CALLBACK_API CUPTI Callback API
+ * Functions, types, and enums that implement the CUPTI Callback API.
+ * @{
+ */
+
+/**
+ * \brief Specifies the point in an API call that a callback is issued.
+ *
+ * Specifies the point in an API call that a callback is issued. This
+ * value is communicated to the callback function via \ref
+ * CUpti_CallbackData::callbackSite.
+ */
+typedef enum {
+  /**
+   * The callback is at the entry of the API call.
+   */
+  CUPTI_API_ENTER                 = 0,
+  /**
+   * The callback is at the exit of the API call.
+   */
+  CUPTI_API_EXIT                  = 1,
+  CUPTI_API_CBSITE_FORCE_INT     = 0x7fffffff
+} CUpti_ApiCallbackSite;
+
+/**
+ * \brief Callback domains.
+ *
+ * Callback domains. Each domain represents callback points for a
+ * group of related API functions or CUDA driver activity.
+ */
+typedef enum {
+  /**
+   * Invalid domain.
+   */
+  CUPTI_CB_DOMAIN_INVALID           = 0,
+  /**
+   * Domain containing callback points for all driver API functions.
+   */
+  CUPTI_CB_DOMAIN_DRIVER_API        = 1,
+  /**
+   * Domain containing callback points for all runtime API
+   * functions.
+   */
+  CUPTI_CB_DOMAIN_RUNTIME_API       = 2,
+  /**
+   * Domain containing callback points for CUDA resource tracking.
+   */
+  CUPTI_CB_DOMAIN_RESOURCE          = 3,
+  /**
+   * Domain containing callback points for CUDA synchronization.
+   */
+  CUPTI_CB_DOMAIN_SYNCHRONIZE       = 4,
+  /**
+   * Domain containing callback points for NVTX API functions.
+   */
+  CUPTI_CB_DOMAIN_NVTX              = 5,
+  /**
+   * Domain containing callback points for various states.
+   */
+  CUPTI_CB_DOMAIN_STATE,
+  CUPTI_CB_DOMAIN_SIZE,
+
+  CUPTI_CB_DOMAIN_FORCE_INT         = 0x7fffffff
+} CUpti_CallbackDomain;
+
+/**
+ * \brief Callback IDs for resource domain.
+ *
+ * Callback IDs for resource domain, CUPTI_CB_DOMAIN_RESOURCE.  This
+ * value is communicated to the callback function via the \p cbid
+ * parameter.
+ */
+typedef enum {
+  /**
+   * Invalid resource callback ID.
+   */
+  CUPTI_CBID_RESOURCE_INVALID                               = 0,
+  /**
+   * A new context has been created.
+   */
+  CUPTI_CBID_RESOURCE_CONTEXT_CREATED                       = 1,
+  /**
+   * A context is about to be destroyed.
+   */
+  CUPTI_CBID_RESOURCE_CONTEXT_DESTROY_STARTING              = 2,
+  /**
+   * A new stream has been created.
+   */
+  CUPTI_CBID_RESOURCE_STREAM_CREATED                        = 3,
+  /**
+   * A stream is about to be destroyed.
+   */
+  CUPTI_CBID_RESOURCE_STREAM_DESTROY_STARTING               = 4,
+  /**
+   * The driver has finished initializing.
+   */
+  CUPTI_CBID_RESOURCE_CU_INIT_FINISHED                      = 5,
+  /**
+   * A module has been loaded.
+   */
+  CUPTI_CBID_RESOURCE_MODULE_LOADED                         = 6,
+  /**
+   * A module is about to be unloaded.
+   */
+  CUPTI_CBID_RESOURCE_MODULE_UNLOAD_STARTING                = 7,
+  /**
+   * The current module which is being profiled.
+   */
+  CUPTI_CBID_RESOURCE_MODULE_PROFILED                       = 8,
+  /**
+   * CUDA graph has been created.
+   */
+  CUPTI_CBID_RESOURCE_GRAPH_CREATED                         = 9,
+  /**
+   * CUDA graph is about to be destroyed.
+   */
+  CUPTI_CBID_RESOURCE_GRAPH_DESTROY_STARTING                = 10,
+  /**
+   * CUDA graph is cloned.
+   */
+  CUPTI_CBID_RESOURCE_GRAPH_CLONED                          = 11,
+  /**
+   * CUDA graph node is about to be created
+   */
+  CUPTI_CBID_RESOURCE_GRAPHNODE_CREATE_STARTING             = 12,
+  /**
+   * CUDA graph node is created.
+   */
+  CUPTI_CBID_RESOURCE_GRAPHNODE_CREATED                     = 13,
+  /**
+   * CUDA graph node is about to be destroyed.
+   */
+  CUPTI_CBID_RESOURCE_GRAPHNODE_DESTROY_STARTING            = 14,
+  /**
+   * Dependency on a CUDA graph node is created.
+   */
+  CUPTI_CBID_RESOURCE_GRAPHNODE_DEPENDENCY_CREATED          = 15,
+  /**
+   * Dependency on a CUDA graph node is destroyed.
+   */
+  CUPTI_CBID_RESOURCE_GRAPHNODE_DEPENDENCY_DESTROY_STARTING = 16,
+  /**
+   * An executable CUDA graph is about to be created.
+   */
+  CUPTI_CBID_RESOURCE_GRAPHEXEC_CREATE_STARTING             = 17,
+  /**
+   * An executable CUDA graph is created.
+   */
+  CUPTI_CBID_RESOURCE_GRAPHEXEC_CREATED                     = 18,
+  /**
+   * An executable CUDA graph is about to be destroyed.
+   */
+  CUPTI_CBID_RESOURCE_GRAPHEXEC_DESTROY_STARTING            = 19,
+  /**
+   * CUDA graph node is cloned.
+   */
+  CUPTI_CBID_RESOURCE_GRAPHNODE_CLONED                      = 20,
+  /**
+   * CUDA stream attribute is changed.
+   */
+  CUPTI_CBID_RESOURCE_STREAM_ATTRIBUTE_CHANGED              = 21,
+
+  CUPTI_CBID_RESOURCE_SIZE,
+  CUPTI_CBID_RESOURCE_FORCE_INT                   = 0x7fffffff
+} CUpti_CallbackIdResource;
+
+/**
+ * \brief Callback IDs for synchronization domain.
+ *
+ * Callback IDs for synchronization domain,
+ * CUPTI_CB_DOMAIN_SYNCHRONIZE.  This value is communicated to the
+ * callback function via the \p cbid parameter.
+ */
+typedef enum {
+  /**
+   * Invalid synchronize callback ID.
+   */
+  CUPTI_CBID_SYNCHRONIZE_INVALID                  = 0,
+  /**
+   * Stream synchronization has completed for the stream.
+   */
+  CUPTI_CBID_SYNCHRONIZE_STREAM_SYNCHRONIZED      = 1,
+  /**
+   * Context synchronization has completed for the context.
+   */
+  CUPTI_CBID_SYNCHRONIZE_CONTEXT_SYNCHRONIZED     = 2,
+  CUPTI_CBID_SYNCHRONIZE_SIZE,
+  CUPTI_CBID_SYNCHRONIZE_FORCE_INT                = 0x7fffffff
+} CUpti_CallbackIdSync;
+
+/**
+ * \brief Callback IDs for state domain.
+ *
+ * Callback IDs for state domain,
+ * CUPTI_CB_DOMAIN_STATE. This value is communicated to the
+ * callback function via the \p cbid parameter.
+ */
+typedef enum {
+  /**
+   * Invalid state callback ID.
+   */
+  CUPTI_CBID_STATE_INVALID                        = 0,
+  /**
+   * Notification of fatal errors - high impact, non-recoverable.
+   * When encountered, CUPTI automatically invokes cuptiFinalize().
+   * The user can control the future behavior of the application on
+   * receiving this callback - such as continuing without profiling, or
+   * terminating the whole application.
+   */
+  CUPTI_CBID_STATE_FATAL_ERROR                    = 1,
+  /**
+   * Notification of non-fatal errors - high impact, but recoverable.
+   * This notification is not issued in the current release.
+   */
+  CUPTI_CBID_STATE_ERROR                          = 2,
+  /**
+   * Notification of warnings - low impact, recoverable.
+   * This notification is not issued in the current release.
+   */
+  CUPTI_CBID_STATE_WARNING                        = 3,
+
+  CUPTI_CBID_STATE_SIZE,
+  CUPTI_CBID_STATE_FORCE_INT         = 0x7fffffff
+} CUpti_CallbackIdState;
+
+/**
+ * \brief Data passed into a runtime or driver API callback function.
+ *
+ * Data passed into a runtime or driver API callback function as the
+ * \p cbdata argument to \ref CUpti_CallbackFunc. The \p cbdata will
+ * be this type for \p domain equal to CUPTI_CB_DOMAIN_DRIVER_API or
+ * CUPTI_CB_DOMAIN_RUNTIME_API. The callback data is valid only within
+ * the invocation of the callback function that is passed the data. If
+ * you need to retain some data for use outside of the callback, you
+ * must make a copy of that data. For example, if you make a shallow
+ * copy of CUpti_CallbackData within a callback, you cannot
+ * dereference \p functionParams outside of that callback to access
+ * the function parameters. \p functionName is an exception: the
+ * string pointed to by \p functionName is a global constant and so
+ * may be accessed outside of the callback.
+ */
+typedef struct {
+  /**
+   * Point in the runtime or driver function from where the callback
+   * was issued.
+   */
+  CUpti_ApiCallbackSite callbackSite;
+
+  /**
+   * Name of the runtime or driver API function which issued the
+   * callback. This string is a global constant and so may be
+   * accessed outside of the callback.
+   */
+  const char *functionName;
+
+  /**
+   * Pointer to the arguments passed to the runtime or driver API
+   * call. See generated_cuda_runtime_api_meta.h and
+   * generated_cuda_meta.h for structure definitions for the
+   * parameters for each runtime and driver API function.
+   */
+  const void *functionParams;
+
+  /**
+   * Pointer to the return value of the runtime or driver API
+   * call. This field is only valid within the exit (CUPTI_API_EXIT)
+   * callback. For a runtime API \p functionReturnValue points to a
+   * \p cudaError_t. For a driver API \p functionReturnValue points
+   * to a \p CUresult.
+   */
+  void *functionReturnValue;
+
+  /**
+   * Name of the symbol operated on by the runtime or driver API
+   * function which issued the callback. This entry is valid only for
+   * driver and runtime launch callbacks, where it returns the name of
+   * the kernel.
+   */
+  const char *symbolName;
+
+  /**
+   * Driver context current to the thread, or null if no context is
+   * current. This value can change from the entry to exit callback
+   * of a runtime API function if the runtime initializes a context.
+   */
+  CUcontext context;
+
+  /**
+   * Unique ID for the CUDA context associated with the thread. The
+   * UIDs are assigned sequentially as contexts are created and are
+   * unique within a process.
+   */
+  uint32_t contextUid;
+
+  /**
+   * Pointer to data shared between the entry and exit callbacks of
+   * a given runtime or driver API function invocation. This field
+   * can be used to pass 64-bit values from the entry callback to
+   * the corresponding exit callback.
+   */
+  uint64_t *correlationData;
+
+  /**
+   * The activity record correlation ID for this callback. For a
+   * driver domain callback (i.e. \p domain
+   * CUPTI_CB_DOMAIN_DRIVER_API) this ID will equal the correlation ID
+   * in the CUpti_ActivityAPI record corresponding to the CUDA driver
+   * function call. For a runtime domain callback (i.e. \p domain
+   * CUPTI_CB_DOMAIN_RUNTIME_API) this ID will equal the correlation
+   * ID in the CUpti_ActivityAPI record corresponding to the CUDA
+   * runtime function call. Within the callback, this ID can be
+   * recorded to correlate user data with the activity record. This
+   * field is new in 4.1.
+   */
+  uint32_t correlationId;
+
+} CUpti_CallbackData;
+
+/**
+ * \brief Data passed into a resource callback function.
+ *
+ * Data passed into a resource callback function as the \p cbdata
+ * argument to \ref CUpti_CallbackFunc. The \p cbdata will be this
+ * type for \p domain equal to CUPTI_CB_DOMAIN_RESOURCE. The callback
+ * data is valid only within the invocation of the callback function
+ * that is passed the data. If you need to retain some data for use
+ * outside of the callback, you must make a copy of that data.
+ */
+typedef struct {
+  /**
+   * For CUPTI_CBID_RESOURCE_CONTEXT_CREATED and
+   * CUPTI_CBID_RESOURCE_CONTEXT_DESTROY_STARTING, the context being
+   * created or destroyed. For CUPTI_CBID_RESOURCE_STREAM_CREATED and
+   * CUPTI_CBID_RESOURCE_STREAM_DESTROY_STARTING, the context
+   * containing the stream being created or destroyed.
+   */
+  CUcontext context;
+
+  union {
+    /**
+     * For CUPTI_CBID_RESOURCE_STREAM_CREATED and
+     * CUPTI_CBID_RESOURCE_STREAM_DESTROY_STARTING, the stream being
+     * created or destroyed.
+     */
+    CUstream stream;
+  } resourceHandle;
+
+  /**
+   * Reserved for future use.
+   */
+  void *resourceDescriptor;
+} CUpti_ResourceData;
+
+
+/**
+ * \brief Module data passed into a resource callback function.
+ *
+ * CUDA module data passed into a resource callback function as the \p cbdata
+ * argument to \ref CUpti_CallbackFunc. The \p cbdata will be this
+ * type for \p domain equal to CUPTI_CB_DOMAIN_RESOURCE. The module
+ * data is valid only within the invocation of the callback function
+ * that is passed the data. If you need to retain some data for use
+ * outside of the callback, you must make a copy of that data.
+ */
+
+typedef struct {
+  /**
+   * Identifier to associate with the CUDA module.
+   */
+    uint32_t moduleId;
+
+  /**
+   * The size of the cubin.
+   */
+    size_t cubinSize;
+
+  /**
+   * Pointer to the associated cubin.
+   */
+    const char *pCubin;
+} CUpti_ModuleResourceData;
+
+/**
+ * \brief CUDA graphs data passed into a resource callback function.
+ *
+ * CUDA graphs data passed into a resource callback function as the \p cbdata
+ * argument to \ref CUpti_CallbackFunc. The \p cbdata will be this
+ * type for \p domain equal to CUPTI_CB_DOMAIN_RESOURCE. The graph
+ * data is valid only within the invocation of the callback function
+ * that is passed the data. If you need to retain some data for use
+ * outside of the callback, you must make a copy of that data.
+ */
+
+typedef struct {
+  /**
+   * CUDA graph
+   */
+    CUgraph graph;
+  /**
+   * The original CUDA graph from which \p graph is cloned
+   */
+    CUgraph originalGraph;
+  /**
+   * CUDA graph node
+   */
+    CUgraphNode node;
+  /**
+   * The original CUDA graph node from which \p node is cloned
+   */
+    CUgraphNode originalNode;
+  /**
+   * Type of the \p node
+   */
+    CUgraphNodeType nodeType;
+  /**
+   * The dependent graph node.
+   * NOTE(review): this struct declares no numDependencies member - confirm.
+   */
+    CUgraphNode dependency;
+  /**
+   * CUDA executable graph
+   */
+    CUgraphExec graphExec;
+} CUpti_GraphData;
+
+/**
+ * \brief Data passed into a synchronize callback function.
+ *
+ * Data passed into a synchronize callback function as the \p cbdata
+ * argument to \ref CUpti_CallbackFunc. The \p cbdata will be this
+ * type for \p domain equal to CUPTI_CB_DOMAIN_SYNCHRONIZE. The
+ * callback data is valid only within the invocation of the callback
+ * function that is passed the data. If you need to retain some data
+ * for use outside of the callback, you must make a copy of that data.
+ */
+typedef struct {
+  /**
+   * The context of the stream being synchronized.
+   */
+  CUcontext context;
+  /**
+   * The stream being synchronized.
+   */
+  CUstream  stream;
+} CUpti_SynchronizeData;
+
+/**
+ * \brief Data passed into a NVTX callback function.
+ *
+ * Data passed into a NVTX callback function as the \p cbdata argument
+ * to \ref CUpti_CallbackFunc. The \p cbdata will be this type for \p
+ * domain equal to CUPTI_CB_DOMAIN_NVTX. Unless otherwise noted, the
+ * callback data is valid only within the invocation of the callback
+ * function that is passed the data. If you need to retain some data
+ * for use outside of the callback, you must make a copy of that data.
+ */
+typedef struct {
+  /**
+   * Name of the NVTX API function which issued the callback. This
+   * string is a global constant and so may be accessed outside of the
+   * callback.
+   */
+  const char *functionName;
+
+  /**
+   * Pointer to the arguments passed to the NVTX API call. See
+   * generated_nvtx_meta.h for structure definitions for the
+   * parameters for each NVTX API function.
+   */
+  const void *functionParams;
+
+  /**
+   * Pointer to the return value of the NVTX API call. See
+   * nvToolsExt.h for each NVTX API function's return value.
+   */
+  const void *functionReturnValue;
+} CUpti_NvtxData;
+
+/**
+ * \brief Stream attribute data passed into a resource callback function
+ * for CUPTI_CBID_RESOURCE_STREAM_ATTRIBUTE_CHANGED callback.
+ *
+ * Data passed into a resource callback function as the \p cbdata
+ * argument to \ref CUpti_CallbackFunc. The \p cbdata will be this
+ * type for \p domain equal to CUPTI_CB_DOMAIN_RESOURCE. The
+ * stream attribute data is valid only within the invocation of the callback
+ * function that is passed the data. If you need to retain some data
+ * for use outside of the callback, you must make a copy of that data.
+ */
+typedef struct {
+  /**
+   * The CUDA stream handle for the attribute
+   */
+  CUstream stream;
+
+  /**
+   * The type of the CUDA stream attribute
+   */
+  CUstreamAttrID attr;
+
+  /**
+   * The value of the CUDA stream attribute
+   */
+  const CUstreamAttrValue *value;
+} CUpti_StreamAttrData;
+
+/**
+ * \brief Data passed into a State callback function.
+ *
+ * Data passed into a State callback function as the \p cbdata argument
+ * to \ref CUpti_CallbackFunc. The \p cbdata will be this type for \p
+ * domain equal to CUPTI_CB_DOMAIN_STATE and callback Ids belonging to CUpti_CallbackIdState. 
+ * Unless otherwise noted, the callback data is valid only within the invocation of the callback
+ * function that is passed the data. If you need to retain some data
+ * for use outside of the callback, you must make a copy of that data.
+ */
+typedef struct {
+  union {
+    /**
+     * Data passed along with the callback Ids 
+     * Enum CUpti_CallbackIdState used to denote callback ids
+     */
+    struct {
+      /**
+       * Error code
+       */
+      CUptiResult result;
+      /**
+       * String containing more details. It can be NULL.
+       */
+      const char *message;
+    } notification;
+  };
+} CUpti_StateData;
+/**
+ * \brief An ID for a driver API, runtime API, resource or
+ * synchronization callback.
+ *
+ * An ID for a driver API, runtime API, resource or synchronization
+ * callback. Within a driver API callback this should be interpreted
+ * as a CUpti_driver_api_trace_cbid value (these values are defined in
+ * cupti_driver_cbid.h). Within a runtime API callback this should be
+ * interpreted as a CUpti_runtime_api_trace_cbid value (these values
+ * are defined in cupti_runtime_cbid.h). Within a resource API
+ * callback this should be interpreted as a \ref
+ * CUpti_CallbackIdResource value. Within a synchronize API callback
+ * this should be interpreted as a \ref CUpti_CallbackIdSync value.
+ */
+typedef uint32_t CUpti_CallbackId;
+
+/**
+ * \brief Function type for a callback.
+ *
+ * Function type for a callback. The type of the data passed to the
+ * callback in \p cbdata depends on the \p domain. If \p domain is
+ * CUPTI_CB_DOMAIN_DRIVER_API or CUPTI_CB_DOMAIN_RUNTIME_API the type
+ * of \p cbdata will be CUpti_CallbackData. If \p domain is
+ * CUPTI_CB_DOMAIN_RESOURCE the type of \p cbdata will be
+ * CUpti_ResourceData. If \p domain is CUPTI_CB_DOMAIN_SYNCHRONIZE the
+ * type of \p cbdata will be CUpti_SynchronizeData. If \p domain is
+ * CUPTI_CB_DOMAIN_NVTX the type of \p cbdata will be CUpti_NvtxData.
+ *
+ * \param userdata User data supplied at subscription of the callback
+ * \param domain The domain of the callback
+ * \param cbid The ID of the callback
+ * \param cbdata Data passed to the callback.
+ */
+typedef void (CUPTIAPI *CUpti_CallbackFunc)(
+    void *userdata,
+    CUpti_CallbackDomain domain,
+    CUpti_CallbackId cbid,
+    const void *cbdata);
+
+/**
+ * \brief A callback subscriber.
+ */
+typedef struct CUpti_Subscriber_st *CUpti_SubscriberHandle;
+
+/**
+ * \brief Pointer to an array of callback domains.
+ */
+typedef CUpti_CallbackDomain *CUpti_DomainTable;
+
+/**
+ * \brief Get the available callback domains.
+ *
+ * Returns in \p *domainTable an array of size \p *domainCount of all
+ * the available callback domains.
+ * \note \b Thread-safety: this function is thread safe.
+ *
+ * \param domainCount Returns number of callback domains
+ * \param domainTable Returns pointer to array of available callback domains
+ *
+ * \retval CUPTI_SUCCESS on success
+ * \retval CUPTI_ERROR_NOT_INITIALIZED if unable to initialize CUPTI
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p domainCount or \p domainTable are NULL
+ */
+CUptiResult CUPTIAPI cuptiSupportedDomains(size_t *domainCount,
+                                           CUpti_DomainTable *domainTable);
+
+/**
+ * \brief Initialize a callback subscriber with a callback function
+ * and user data.
+ *
+ * Initializes a callback subscriber with a callback function and
+ * (optionally) a pointer to user data. The returned subscriber handle
+ * can be used to enable and disable the callback for specific domains
+ * and callback IDs.
+ * \note Only a single subscriber can be registered at a time. To ensure
+ * that no other CUPTI client interrupts the profiling session, it's the
+ * responsibility of all the CUPTI clients to call this function before
+ * starting the profiling session. In case a profiling session is already
+ * started by another CUPTI client, this function returns the error code
+ * CUPTI_ERROR_MULTIPLE_SUBSCRIBERS_NOT_SUPPORTED.
+ * Note that this function returns the same error when application is
+ * launched using NVIDIA tools like nvprof, Visual Profiler, Nsight Systems,
+ * Nsight Compute, cuda-gdb and cuda-memcheck.
+ * \note This function does not enable any callbacks.
+ * \note \b Thread-safety: this function is thread safe.
+ *
+ * \param subscriber Returns handle to the initialized subscriber
+ * \param callback The callback function
+ * \param userdata A pointer to user data. This data will be passed to
+ * the callback function via the \p userdata parameter.
+ *
+ * \retval CUPTI_SUCCESS on success
+ * \retval CUPTI_ERROR_NOT_INITIALIZED if unable to initialize CUPTI
+ * \retval CUPTI_ERROR_MULTIPLE_SUBSCRIBERS_NOT_SUPPORTED if there is already a CUPTI subscriber
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p subscriber is NULL
+ */
+CUptiResult CUPTIAPI cuptiSubscribe(CUpti_SubscriberHandle *subscriber,
+                                    CUpti_CallbackFunc callback,
+                                    void *userdata);
+
+/**
+ * \brief Unregister a callback subscriber.
+ *
+ * Removes a callback subscriber so that no future callbacks will be
+ * issued to that subscriber.
+ * \note \b Thread-safety: this function is thread safe.
+ *
+ * \param subscriber Handle to the initialized subscriber
+ *
+ * \retval CUPTI_SUCCESS on success
+ * \retval CUPTI_ERROR_NOT_INITIALIZED if unable to initialize CUPTI
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p subscriber is NULL or not initialized
+ */
+CUptiResult CUPTIAPI cuptiUnsubscribe(CUpti_SubscriberHandle subscriber);
+
+/**
+ * \brief Get the current enabled/disabled state of a callback for a specific
+ * domain and function ID.
+ *
+ * Returns non-zero in \p *enable if the callback for a domain and
+ * callback ID is enabled, and zero if not enabled.
+ *
+ * \note \b Thread-safety: a subscriber must serialize access to
+ * cuptiGetCallbackState, cuptiEnableCallback, cuptiEnableDomain, and
+ * cuptiEnableAllDomains. For example, if cuptiGetCallbackState(sub,
+ * d, c) and cuptiEnableCallback(sub, d, c) are called concurrently,
+ * the results are undefined.
+ *
+ * \param enable Returns non-zero if callback enabled, zero if not enabled
+ * \param subscriber Handle to the initialized subscriber
+ * \param domain The domain of the callback
+ * \param cbid The ID of the callback
+ *
+ * \retval CUPTI_SUCCESS on success
+ * \retval CUPTI_ERROR_NOT_INITIALIZED if unable to initialize CUPTI
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p enable is NULL, or if \p
+ * subscriber, \p domain or \p cbid is invalid.
+ */
+CUptiResult CUPTIAPI cuptiGetCallbackState(uint32_t *enable,
+                                           CUpti_SubscriberHandle subscriber,
+                                           CUpti_CallbackDomain domain,
+                                           CUpti_CallbackId cbid);
+
+/**
+ * \brief Enable or disable callbacks for a specific domain and
+ * callback ID.
+ *
+ * Enable or disable callbacks for a subscriber for a specific domain
+ * and callback ID.
+ *
+ * \note \b Thread-safety: a subscriber must serialize access to
+ * cuptiGetCallbackState, cuptiEnableCallback, cuptiEnableDomain, and
+ * cuptiEnableAllDomains. For example, if cuptiGetCallbackState(sub,
+ * d, c) and cuptiEnableCallback(sub, d, c) are called concurrently,
+ * the results are undefined.
+ *
+ * \param enable New enable state for the callback. Zero disables the
+ * callback, non-zero enables the callback.
+ * \param subscriber - Handle to callback subscription
+ * \param domain The domain of the callback
+ * \param cbid The ID of the callback
+ *
+ * \retval CUPTI_SUCCESS on success
+ * \retval CUPTI_ERROR_NOT_INITIALIZED if unable to initialize CUPTI
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p subscriber, \p domain or \p
+ * cbid is invalid.
+ */
+CUptiResult CUPTIAPI cuptiEnableCallback(uint32_t enable,
+                                         CUpti_SubscriberHandle subscriber,
+                                         CUpti_CallbackDomain domain,
+                                         CUpti_CallbackId cbid);
+
+/**
+ * \brief Enable or disable all callbacks for a specific domain.
+ *
+ * Enable or disable all callbacks for a specific domain.
+ *
+ * \note \b Thread-safety: a subscriber must serialize access to
+ * cuptiGetCallbackState, cuptiEnableCallback, cuptiEnableDomain, and
+ * cuptiEnableAllDomains. For example, if cuptiGetCallbackState(sub,
+ * d, *) and cuptiEnableDomain(sub, d) are called concurrently, the
+ * results are undefined.
+ *
+ * \param enable New enable state for all callbacks in the
+ * domain. Zero disables all callbacks, non-zero enables all
+ * callbacks.
+ * \param subscriber - Handle to callback subscription
+ * \param domain The domain of the callback
+ *
+ * \retval CUPTI_SUCCESS on success
+ * \retval CUPTI_ERROR_NOT_INITIALIZED if unable to initialize CUPTI
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p subscriber or \p domain is invalid
+ */
+CUptiResult CUPTIAPI cuptiEnableDomain(uint32_t enable,
+                                       CUpti_SubscriberHandle subscriber,
+                                       CUpti_CallbackDomain domain);
+
+/**
+ * \brief Enable or disable all callbacks in all domains.
+ *
+ * Enable or disable all callbacks in all domains.
+ *
+ * \note \b Thread-safety: a subscriber must serialize access to
+ * cuptiGetCallbackState, cuptiEnableCallback, cuptiEnableDomain, and
+ * cuptiEnableAllDomains. For example, if cuptiGetCallbackState(sub,
+ * d, *) and cuptiEnableAllDomains(sub) are called concurrently, the
+ * results are undefined.
+ *
+ * \param enable New enable state for all callbacks in all
+ * domains. Zero disables all callbacks, non-zero enables all
+ * callbacks.
+ * \param subscriber - Handle to callback subscription
+ *
+ * \retval CUPTI_SUCCESS on success
+ * \retval CUPTI_ERROR_NOT_INITIALIZED if unable to initialize CUPTI
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p subscriber is invalid
+ */
+CUptiResult CUPTIAPI cuptiEnableAllDomains(uint32_t enable,
+                                           CUpti_SubscriberHandle subscriber);
+
+/**
+ * \brief Get the name of a callback for a specific domain and callback ID.
+ *
+ * Returns a pointer to the name C string in \p *name.
+ *
+ * \note \b Names are available only for the DRIVER and RUNTIME domains.
+ *
+ * \param domain The domain of the callback
+ * \param cbid The ID of the callback
+ * \param name Returns pointer to the name string on success, NULL otherwise
+ *
+ * \retval CUPTI_SUCCESS on success
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p name is NULL, or if
+ * \p domain or \p cbid is invalid.
+ */
+CUptiResult CUPTIAPI cuptiGetCallbackName(CUpti_CallbackDomain domain,
+                                          uint32_t cbid,
+                                          const char **name);
+
+/** @} */ /* END CUPTI_CALLBACK_API */
+
+#if defined(__GNUC__) && defined(CUPTI_LIB)
+    #pragma GCC visibility pop
+#endif
+
+#if defined(__cplusplus)
+}
+#endif
+
+#endif  // file guard
+
diff --git a/.venv/lib/python3.11/site-packages/nvidia/cuda_cupti/include/cupti_checkpoint.h b/.venv/lib/python3.11/site-packages/nvidia/cuda_cupti/include/cupti_checkpoint.h
new file mode 100644
index 0000000000000000000000000000000000000000..36eeddc4e2b7bfd1902ce313d71f173db70beaef
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/nvidia/cuda_cupti/include/cupti_checkpoint.h
@@ -0,0 +1,127 @@
+#pragma once
+
+#include <cuda.h>
+#include <cupti_result.h>
+
+#include <stddef.h>
+#include <stdint.h>
+
+namespace NV { namespace Cupti { namespace Checkpoint {
+
+#ifdef __cplusplus
+extern "C"
+{
+#endif
+
+/**
+ * \defgroup CUPTI_CHECKPOINT_API CUPTI Checkpoint API
+ * Functions, types, and enums that implement the CUPTI Checkpoint API.
+ * @{
+ */
+
+/**
+ * \brief Optimization options for a checkpoint. Values may be OR'd together to specify multiple options.
+ */
+typedef enum
+{
+    CUPTI_CHECKPOINT_OPT_NONE     = 0, //!< Default behavior
+    CUPTI_CHECKPOINT_OPT_TRANSFER = 1, //!< Determine which memory blocks have changed, and only restore those. Because this optimization is cached, cuptiCheckpointRestore must always be called at the same point in the application when this option is enabled; otherwise the result may be incorrect.
+} CUpti_CheckpointOptimizations;
+
+/**
+ * \brief Configuration and handle for a CUPTI Checkpoint
+ *
+ * A CUpti_Checkpoint object should be initialized with the desired options prior to passing it into any
+ * CUPTI Checkpoint API function.  The first call into a Checkpoint API function will initialize internal
+ * state based on these options.  Subsequent changes to these options will not have any effect.
+ *
+ * Checkpoint data is saved in device, host, and filesystem space.  There are options to reserve memory
+ * at each level (device, host, filesystem) which are intended to allow a guarantee that a certain amount
+ * of memory will remain free for use after the checkpoint is saved.
+ * Note, however, that falling back to slower levels of memory (host, and then filesystem) to save the checkpoint
+ * will result in performance degradation.
+ * Currently, the filesystem limitation is not implemented.  Note that falling back to filesystem storage may
+ * significantly impact the performance of saving and restoring a checkpoint.
+ */
+typedef struct
+{
+   size_t structSize;      //!< [in] Must be set to CUpti_Checkpoint_STRUCT_SIZE
+
+   CUcontext ctx;          //!< [in] Set to the context to save from; the current context is used if NULL
+
+   size_t reserveDeviceMB; //!< [in] Restrict checkpoint from using last N MB of device memory (-1 = use no device memory)
+   size_t reserveHostMB;   //!< [in] Restrict checkpoint from using last N MB of host memory (-1 = use no host memory)
+   uint8_t allowOverwrite; //!< [in] Boolean; allow checkpoint to save over an existing checkpoint
+   uint8_t optimizations;  //!< [in] Mask of CUpti_CheckpointOptimizations flags for this checkpoint
+
+   void * pPriv;           //!< [in] Assign to NULL
+} CUpti_Checkpoint;
+
+#define CUpti_Checkpoint_STRUCT_SIZE  \
+(offsetof(CUpti_Checkpoint, pPriv) +  \
+sizeof(((CUpti_Checkpoint*)(nullptr))->pPriv)) /* offset of pPriv plus its size, i.e. bytes through the end of the pPriv member */
+
+#if defined(__GNUC__) && defined(CUPTI_LIB)
+    #pragma GCC visibility push(default)
+#endif
+
+/**
+ * \brief Initialize and save a checkpoint of the device state associated with the handle context
+ *
+ * Uses the handle options to configure and save a checkpoint of the device state associated with the specified context.
+ *
+ * \param handle A pointer to a CUpti_Checkpoint object
+ *
+ * \retval CUPTI_SUCCESS if a checkpoint was successfully initialized and saved
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p handle does not appear to refer to a valid CUpti_Checkpoint
+ * \retval CUPTI_ERROR_INVALID_CONTEXT
+ * \retval CUPTI_ERROR_INVALID_DEVICE if the device associated with the context is not compatible with the checkpoint API
+ * \retval CUPTI_ERROR_INVALID_OPERATION if Save is requested over an existing checkpoint, but \p allowOverwrite was not originally specified
+ * \retval CUPTI_ERROR_OUT_OF_MEMORY if, as configured, there is not enough backing storage space to save the checkpoint
+ */
+CUptiResult cuptiCheckpointSave(CUpti_Checkpoint * const handle);
+
+/**
+ * \brief Restore a checkpoint to the device associated with its context
+ *
+ * Restores device, pinned, and allocated memory to the state it was in when the checkpoint was saved
+ *
+ * \param handle A pointer to a previously saved CUpti_Checkpoint object
+ *
+ * \retval CUPTI_SUCCESS if the checkpoint was successfully restored
+ * \retval CUPTI_ERROR_NOT_INITIALIZED if the checkpoint was not previously initialized
+ * \retval CUPTI_ERROR_INVALID_CONTEXT
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if the handle appears invalid
+ * \retval CUPTI_ERROR_UNKNOWN if the restore or optimization operation fails
+ */
+CUptiResult cuptiCheckpointRestore(CUpti_Checkpoint * const handle);
+
+/**
+ * \brief Free the backing data for a checkpoint
+ *
+ * Frees all associated device memory, host memory, and filesystem storage used for this context.
+ * After freeing a handle, it may be re-used as if it were new - options may be re-configured and will
+ * take effect on the next call to \p cuptiCheckpointSave.
+ *
+ * \param handle A pointer to a previously saved CUpti_Checkpoint object
+ *
+ * \retval CUPTI_SUCCESS if the handle was successfully freed
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if the handle was already freed or appears invalid
+ * \retval CUPTI_ERROR_INVALID_CONTEXT if the context is no longer valid
+ */
+CUptiResult cuptiCheckpointFree(CUpti_Checkpoint * const handle);
+
+#if defined(__GNUC__) && defined(CUPTI_LIB)
+    #pragma GCC visibility pop
+#endif
+
+/**
+ * @}
+ */
+
+#ifdef __cplusplus
+}
+#endif
+
+// Exit namespace NV::Cupti::Checkpoint
+}}}
diff --git a/.venv/lib/python3.11/site-packages/nvidia/cuda_cupti/include/cupti_common.h b/.venv/lib/python3.11/site-packages/nvidia/cuda_cupti/include/cupti_common.h
new file mode 100644
index 0000000000000000000000000000000000000000..96d228c4df3c1f090a4979bfe10132e080042fef
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/nvidia/cuda_cupti/include/cupti_common.h
@@ -0,0 +1,93 @@
+/*
+ * Copyright 2023 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+ 
+#if !defined(__CUPTI_COMMON_H__)
+#define __CUPTI_COMMON_H__
+
+#ifndef CUPTIAPI // decoration placed on CUPTI API declarations; an existing definition is kept
+#ifdef _WIN32
+#define CUPTIAPI __stdcall
+#else // non-Windows: expands to nothing
+#define CUPTIAPI
+#endif
+#endif
+
+#ifndef CUPTIUTILAPI // defined with the same scheme as CUPTIAPI; an existing definition is kept
+#ifdef _WIN32
+#define CUPTIUTILAPI __stdcall
+#else // non-Windows: expands to nothing
+#define CUPTIUTILAPI
+#endif
+#endif
+
+#if defined(__LP64__) // CUPTILP64 is 1 when __LP64__ or _WIN64 is defined, and undefined otherwise
+#define CUPTILP64 1
+#elif defined(_WIN64)
+#define CUPTILP64 1
+#else // neither macro is set: make sure CUPTILP64 is not defined
+#undef CUPTILP64
+#endif
+
+#define ACTIVITY_RECORD_ALIGNMENT 8 // alignment value used by PACKED_ALIGNMENT below
+#if defined(_WIN32) // Windows 32- and 64-bit
+#define START_PACKED_ALIGNMENT __pragma(pack(push,1)) // exact fit - no padding
+#define PACKED_ALIGNMENT __declspec(align(ACTIVITY_RECORD_ALIGNMENT))
+#define END_PACKED_ALIGNMENT __pragma(pack(pop))
+#elif defined(__GNUC__) // GCC: packing and alignment are both expressed as attributes, so START/END are empty
+#define START_PACKED_ALIGNMENT
+#define PACKED_ALIGNMENT __attribute__ ((__packed__)) __attribute__ ((aligned (ACTIVITY_RECORD_ALIGNMENT)))
+#define END_PACKED_ALIGNMENT
+#else // all other compilers: all three macros expand to nothing
+#define START_PACKED_ALIGNMENT
+#define PACKED_ALIGNMENT
+#define END_PACKED_ALIGNMENT
+#endif
+
+#endif /*__CUPTI_COMMON_H__*/
+
diff --git a/.venv/lib/python3.11/site-packages/nvidia/cuda_cupti/include/cupti_driver_cbid.h b/.venv/lib/python3.11/site-packages/nvidia/cuda_cupti/include/cupti_driver_cbid.h
new file mode 100644
index 0000000000000000000000000000000000000000..331f2e4d63ff18f677c763570845bdc77fa25faf
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/nvidia/cuda_cupti/include/cupti_driver_cbid.h
@@ -0,0 +1,767 @@
+
+// *************************************************************************
+//      Definitions of indices for API functions, unique across entire API
+// *************************************************************************
+
+// This file is generated.  Any changes you make will be lost during the next clean build.
+// CUDA public interface, for type definitions and cu* function prototypes
+
+typedef enum CUpti_driver_api_trace_cbid_enum {
+    CUPTI_DRIVER_TRACE_CBID_INVALID                                                        = 0,
+    CUPTI_DRIVER_TRACE_CBID_cuInit                                                         = 1,
+    CUPTI_DRIVER_TRACE_CBID_cuDriverGetVersion                                             = 2,
+    CUPTI_DRIVER_TRACE_CBID_cuDeviceGet                                                    = 3,
+    CUPTI_DRIVER_TRACE_CBID_cuDeviceGetCount                                               = 4,
+    CUPTI_DRIVER_TRACE_CBID_cuDeviceGetName                                                = 5,
+    CUPTI_DRIVER_TRACE_CBID_cuDeviceComputeCapability                                      = 6,
+    CUPTI_DRIVER_TRACE_CBID_cuDeviceTotalMem                                               = 7,
+    CUPTI_DRIVER_TRACE_CBID_cuDeviceGetProperties                                          = 8,
+    CUPTI_DRIVER_TRACE_CBID_cuDeviceGetAttribute                                           = 9,
+    CUPTI_DRIVER_TRACE_CBID_cuCtxCreate                                                    = 10,
+    CUPTI_DRIVER_TRACE_CBID_cuCtxDestroy                                                   = 11,
+    CUPTI_DRIVER_TRACE_CBID_cuCtxAttach                                                    = 12,
+    CUPTI_DRIVER_TRACE_CBID_cuCtxDetach                                                    = 13,
+    CUPTI_DRIVER_TRACE_CBID_cuCtxPushCurrent                                               = 14,
+    CUPTI_DRIVER_TRACE_CBID_cuCtxPopCurrent                                                = 15,
+    CUPTI_DRIVER_TRACE_CBID_cuCtxGetDevice                                                 = 16,
+    CUPTI_DRIVER_TRACE_CBID_cuCtxSynchronize                                               = 17,
+    CUPTI_DRIVER_TRACE_CBID_cuModuleLoad                                                   = 18,
+    CUPTI_DRIVER_TRACE_CBID_cuModuleLoadData                                               = 19,
+    CUPTI_DRIVER_TRACE_CBID_cuModuleLoadDataEx                                             = 20,
+    CUPTI_DRIVER_TRACE_CBID_cuModuleLoadFatBinary                                          = 21,
+    CUPTI_DRIVER_TRACE_CBID_cuModuleUnload                                                 = 22,
+    CUPTI_DRIVER_TRACE_CBID_cuModuleGetFunction                                            = 23,
+    CUPTI_DRIVER_TRACE_CBID_cuModuleGetGlobal                                              = 24,
+    CUPTI_DRIVER_TRACE_CBID_cu64ModuleGetGlobal                                            = 25,
+    CUPTI_DRIVER_TRACE_CBID_cuModuleGetTexRef                                              = 26,
+    CUPTI_DRIVER_TRACE_CBID_cuMemGetInfo                                                   = 27,
+    CUPTI_DRIVER_TRACE_CBID_cu64MemGetInfo                                                 = 28,
+    CUPTI_DRIVER_TRACE_CBID_cuMemAlloc                                                     = 29,
+    CUPTI_DRIVER_TRACE_CBID_cu64MemAlloc                                                   = 30,
+    CUPTI_DRIVER_TRACE_CBID_cuMemAllocPitch                                                = 31,
+    CUPTI_DRIVER_TRACE_CBID_cu64MemAllocPitch                                              = 32,
+    CUPTI_DRIVER_TRACE_CBID_cuMemFree                                                      = 33,
+    CUPTI_DRIVER_TRACE_CBID_cu64MemFree                                                    = 34,
+    CUPTI_DRIVER_TRACE_CBID_cuMemGetAddressRange                                           = 35,
+    CUPTI_DRIVER_TRACE_CBID_cu64MemGetAddressRange                                         = 36,
+    CUPTI_DRIVER_TRACE_CBID_cuMemAllocHost                                                 = 37,
+    CUPTI_DRIVER_TRACE_CBID_cuMemFreeHost                                                  = 38,
+    CUPTI_DRIVER_TRACE_CBID_cuMemHostAlloc                                                 = 39,
+    CUPTI_DRIVER_TRACE_CBID_cuMemHostGetDevicePointer                                      = 40,
+    CUPTI_DRIVER_TRACE_CBID_cu64MemHostGetDevicePointer                                    = 41,
+    CUPTI_DRIVER_TRACE_CBID_cuMemHostGetFlags                                              = 42,
+    CUPTI_DRIVER_TRACE_CBID_cuMemcpyHtoD                                                   = 43,
+    CUPTI_DRIVER_TRACE_CBID_cu64MemcpyHtoD                                                 = 44,
+    CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoH                                                   = 45,
+    CUPTI_DRIVER_TRACE_CBID_cu64MemcpyDtoH                                                 = 46,
+    CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoD                                                   = 47,
+    CUPTI_DRIVER_TRACE_CBID_cu64MemcpyDtoD                                                 = 48,
+    CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoA                                                   = 49,
+    CUPTI_DRIVER_TRACE_CBID_cu64MemcpyDtoA                                                 = 50,
+    CUPTI_DRIVER_TRACE_CBID_cuMemcpyAtoD                                                   = 51,
+    CUPTI_DRIVER_TRACE_CBID_cu64MemcpyAtoD                                                 = 52,
+    CUPTI_DRIVER_TRACE_CBID_cuMemcpyHtoA                                                   = 53,
+    CUPTI_DRIVER_TRACE_CBID_cuMemcpyAtoH                                                   = 54,
+    CUPTI_DRIVER_TRACE_CBID_cuMemcpyAtoA                                                   = 55,
+    CUPTI_DRIVER_TRACE_CBID_cuMemcpy2D                                                     = 56,
+    CUPTI_DRIVER_TRACE_CBID_cuMemcpy2DUnaligned                                            = 57,
+    CUPTI_DRIVER_TRACE_CBID_cuMemcpy3D                                                     = 58,
+    CUPTI_DRIVER_TRACE_CBID_cu64Memcpy3D                                                   = 59,
+    CUPTI_DRIVER_TRACE_CBID_cuMemcpyHtoDAsync                                              = 60,
+    CUPTI_DRIVER_TRACE_CBID_cu64MemcpyHtoDAsync                                            = 61,
+    CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoHAsync                                              = 62,
+    CUPTI_DRIVER_TRACE_CBID_cu64MemcpyDtoHAsync                                            = 63,
+    CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoDAsync                                              = 64,
+    CUPTI_DRIVER_TRACE_CBID_cu64MemcpyDtoDAsync                                            = 65,
+    CUPTI_DRIVER_TRACE_CBID_cuMemcpyHtoAAsync                                              = 66,
+    CUPTI_DRIVER_TRACE_CBID_cuMemcpyAtoHAsync                                              = 67,
+    CUPTI_DRIVER_TRACE_CBID_cuMemcpy2DAsync                                                = 68,
+    CUPTI_DRIVER_TRACE_CBID_cuMemcpy3DAsync                                                = 69,
+    CUPTI_DRIVER_TRACE_CBID_cu64Memcpy3DAsync                                              = 70,
+    CUPTI_DRIVER_TRACE_CBID_cuMemsetD8                                                     = 71,
+    CUPTI_DRIVER_TRACE_CBID_cu64MemsetD8                                                   = 72,
+    CUPTI_DRIVER_TRACE_CBID_cuMemsetD16                                                    = 73,
+    CUPTI_DRIVER_TRACE_CBID_cu64MemsetD16                                                  = 74,
+    CUPTI_DRIVER_TRACE_CBID_cuMemsetD32                                                    = 75,
+    CUPTI_DRIVER_TRACE_CBID_cu64MemsetD32                                                  = 76,
+    CUPTI_DRIVER_TRACE_CBID_cuMemsetD2D8                                                   = 77,
+    CUPTI_DRIVER_TRACE_CBID_cu64MemsetD2D8                                                 = 78,
+    CUPTI_DRIVER_TRACE_CBID_cuMemsetD2D16                                                  = 79,
+    CUPTI_DRIVER_TRACE_CBID_cu64MemsetD2D16                                                = 80,
+    CUPTI_DRIVER_TRACE_CBID_cuMemsetD2D32                                                  = 81,
+    CUPTI_DRIVER_TRACE_CBID_cu64MemsetD2D32                                                = 82,
+    CUPTI_DRIVER_TRACE_CBID_cuFuncSetBlockShape                                            = 83,
+    CUPTI_DRIVER_TRACE_CBID_cuFuncSetSharedSize                                            = 84,
+    CUPTI_DRIVER_TRACE_CBID_cuFuncGetAttribute                                             = 85,
+    CUPTI_DRIVER_TRACE_CBID_cuFuncSetCacheConfig                                           = 86,
+    CUPTI_DRIVER_TRACE_CBID_cuArrayCreate                                                  = 87,
+    CUPTI_DRIVER_TRACE_CBID_cuArrayGetDescriptor                                           = 88,
+    CUPTI_DRIVER_TRACE_CBID_cuArrayDestroy                                                 = 89,
+    CUPTI_DRIVER_TRACE_CBID_cuArray3DCreate                                                = 90,
+    CUPTI_DRIVER_TRACE_CBID_cuArray3DGetDescriptor                                         = 91,
+    CUPTI_DRIVER_TRACE_CBID_cuTexRefCreate                                                 = 92,
+    CUPTI_DRIVER_TRACE_CBID_cuTexRefDestroy                                                = 93,
+    CUPTI_DRIVER_TRACE_CBID_cuTexRefSetArray                                               = 94,
+    CUPTI_DRIVER_TRACE_CBID_cuTexRefSetAddress                                             = 95,
+    CUPTI_DRIVER_TRACE_CBID_cu64TexRefSetAddress                                           = 96,
+    CUPTI_DRIVER_TRACE_CBID_cuTexRefSetAddress2D                                           = 97,
+    CUPTI_DRIVER_TRACE_CBID_cu64TexRefSetAddress2D                                         = 98,
+    CUPTI_DRIVER_TRACE_CBID_cuTexRefSetFormat                                              = 99,
+    CUPTI_DRIVER_TRACE_CBID_cuTexRefSetAddressMode                                         = 100,
+    CUPTI_DRIVER_TRACE_CBID_cuTexRefSetFilterMode                                          = 101,
+    CUPTI_DRIVER_TRACE_CBID_cuTexRefSetFlags                                               = 102,
+    CUPTI_DRIVER_TRACE_CBID_cuTexRefGetAddress                                             = 103,
+    CUPTI_DRIVER_TRACE_CBID_cu64TexRefGetAddress                                           = 104,
+    CUPTI_DRIVER_TRACE_CBID_cuTexRefGetArray                                               = 105,
+    CUPTI_DRIVER_TRACE_CBID_cuTexRefGetAddressMode                                         = 106,
+    CUPTI_DRIVER_TRACE_CBID_cuTexRefGetFilterMode                                          = 107,
+    CUPTI_DRIVER_TRACE_CBID_cuTexRefGetFormat                                              = 108,
+    CUPTI_DRIVER_TRACE_CBID_cuTexRefGetFlags                                               = 109,
+    CUPTI_DRIVER_TRACE_CBID_cuParamSetSize                                                 = 110,
+    CUPTI_DRIVER_TRACE_CBID_cuParamSeti                                                    = 111,
+    CUPTI_DRIVER_TRACE_CBID_cuParamSetf                                                    = 112,
+    CUPTI_DRIVER_TRACE_CBID_cuParamSetv                                                    = 113,
+    CUPTI_DRIVER_TRACE_CBID_cuParamSetTexRef                                               = 114,
+    CUPTI_DRIVER_TRACE_CBID_cuLaunch                                                       = 115,
+    CUPTI_DRIVER_TRACE_CBID_cuLaunchGrid                                                   = 116,
+    CUPTI_DRIVER_TRACE_CBID_cuLaunchGridAsync                                              = 117,
+    CUPTI_DRIVER_TRACE_CBID_cuEventCreate                                                  = 118,
+    CUPTI_DRIVER_TRACE_CBID_cuEventRecord                                                  = 119,
+    CUPTI_DRIVER_TRACE_CBID_cuEventQuery                                                   = 120,
+    CUPTI_DRIVER_TRACE_CBID_cuEventSynchronize                                             = 121,
+    CUPTI_DRIVER_TRACE_CBID_cuEventDestroy                                                 = 122,
+    CUPTI_DRIVER_TRACE_CBID_cuEventElapsedTime                                             = 123,
+    CUPTI_DRIVER_TRACE_CBID_cuStreamCreate                                                 = 124,
+    CUPTI_DRIVER_TRACE_CBID_cuStreamQuery                                                  = 125,
+    CUPTI_DRIVER_TRACE_CBID_cuStreamSynchronize                                            = 126,
+    CUPTI_DRIVER_TRACE_CBID_cuStreamDestroy                                                = 127,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphicsUnregisterResource                                   = 128,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphicsSubResourceGetMappedArray                            = 129,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphicsResourceGetMappedPointer                             = 130,
+    CUPTI_DRIVER_TRACE_CBID_cu64GraphicsResourceGetMappedPointer                           = 131,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphicsResourceSetMapFlags                                  = 132,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphicsMapResources                                         = 133,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphicsUnmapResources                                       = 134,
+    CUPTI_DRIVER_TRACE_CBID_cuGetExportTable                                               = 135,
+    CUPTI_DRIVER_TRACE_CBID_cuCtxSetLimit                                                  = 136,
+    CUPTI_DRIVER_TRACE_CBID_cuCtxGetLimit                                                  = 137,
+    CUPTI_DRIVER_TRACE_CBID_cuD3D10GetDevice                                               = 138,
+    CUPTI_DRIVER_TRACE_CBID_cuD3D10CtxCreate                                               = 139,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphicsD3D10RegisterResource                                = 140,
+    CUPTI_DRIVER_TRACE_CBID_cuD3D10RegisterResource                                        = 141,
+    CUPTI_DRIVER_TRACE_CBID_cuD3D10UnregisterResource                                      = 142,
+    CUPTI_DRIVER_TRACE_CBID_cuD3D10MapResources                                            = 143,
+    CUPTI_DRIVER_TRACE_CBID_cuD3D10UnmapResources                                          = 144,
+    CUPTI_DRIVER_TRACE_CBID_cuD3D10ResourceSetMapFlags                                     = 145,
+    CUPTI_DRIVER_TRACE_CBID_cuD3D10ResourceGetMappedArray                                  = 146,
+    CUPTI_DRIVER_TRACE_CBID_cuD3D10ResourceGetMappedPointer                                = 147,
+    CUPTI_DRIVER_TRACE_CBID_cuD3D10ResourceGetMappedSize                                   = 148,
+    CUPTI_DRIVER_TRACE_CBID_cuD3D10ResourceGetMappedPitch                                  = 149,
+    CUPTI_DRIVER_TRACE_CBID_cuD3D10ResourceGetSurfaceDimensions                            = 150,
+    CUPTI_DRIVER_TRACE_CBID_cuD3D11GetDevice                                               = 151,
+    CUPTI_DRIVER_TRACE_CBID_cuD3D11CtxCreate                                               = 152,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphicsD3D11RegisterResource                                = 153,
+    CUPTI_DRIVER_TRACE_CBID_cuD3D9GetDevice                                                = 154,
+    CUPTI_DRIVER_TRACE_CBID_cuD3D9CtxCreate                                                = 155,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphicsD3D9RegisterResource                                 = 156,
+    CUPTI_DRIVER_TRACE_CBID_cuD3D9GetDirect3DDevice                                        = 157,
+    CUPTI_DRIVER_TRACE_CBID_cuD3D9RegisterResource                                         = 158,
+    CUPTI_DRIVER_TRACE_CBID_cuD3D9UnregisterResource                                       = 159,
+    CUPTI_DRIVER_TRACE_CBID_cuD3D9MapResources                                             = 160,
+    CUPTI_DRIVER_TRACE_CBID_cuD3D9UnmapResources                                           = 161,
+    CUPTI_DRIVER_TRACE_CBID_cuD3D9ResourceSetMapFlags                                      = 162,
+    CUPTI_DRIVER_TRACE_CBID_cuD3D9ResourceGetSurfaceDimensions                             = 163,
+    CUPTI_DRIVER_TRACE_CBID_cuD3D9ResourceGetMappedArray                                   = 164,
+    CUPTI_DRIVER_TRACE_CBID_cuD3D9ResourceGetMappedPointer                                 = 165,
+    CUPTI_DRIVER_TRACE_CBID_cuD3D9ResourceGetMappedSize                                    = 166,
+    CUPTI_DRIVER_TRACE_CBID_cuD3D9ResourceGetMappedPitch                                   = 167,
+    CUPTI_DRIVER_TRACE_CBID_cuD3D9Begin                                                    = 168,
+    CUPTI_DRIVER_TRACE_CBID_cuD3D9End                                                      = 169,
+    CUPTI_DRIVER_TRACE_CBID_cuD3D9RegisterVertexBuffer                                     = 170,
+    CUPTI_DRIVER_TRACE_CBID_cuD3D9MapVertexBuffer                                          = 171,
+    CUPTI_DRIVER_TRACE_CBID_cuD3D9UnmapVertexBuffer                                        = 172,
+    CUPTI_DRIVER_TRACE_CBID_cuD3D9UnregisterVertexBuffer                                   = 173,
+    CUPTI_DRIVER_TRACE_CBID_cuGLCtxCreate                                                  = 174,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphicsGLRegisterBuffer                                     = 175,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphicsGLRegisterImage                                      = 176,
+    CUPTI_DRIVER_TRACE_CBID_cuWGLGetDevice                                                 = 177,
+    CUPTI_DRIVER_TRACE_CBID_cuGLInit                                                       = 178,
+    CUPTI_DRIVER_TRACE_CBID_cuGLRegisterBufferObject                                       = 179,
+    CUPTI_DRIVER_TRACE_CBID_cuGLMapBufferObject                                            = 180,
+    CUPTI_DRIVER_TRACE_CBID_cuGLUnmapBufferObject                                          = 181,
+    CUPTI_DRIVER_TRACE_CBID_cuGLUnregisterBufferObject                                     = 182,
+    CUPTI_DRIVER_TRACE_CBID_cuGLSetBufferObjectMapFlags                                    = 183,
+    CUPTI_DRIVER_TRACE_CBID_cuGLMapBufferObjectAsync                                       = 184,
+    CUPTI_DRIVER_TRACE_CBID_cuGLUnmapBufferObjectAsync                                     = 185,
+    CUPTI_DRIVER_TRACE_CBID_cuVDPAUGetDevice                                               = 186,
+    CUPTI_DRIVER_TRACE_CBID_cuVDPAUCtxCreate                                               = 187,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphicsVDPAURegisterVideoSurface                            = 188,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphicsVDPAURegisterOutputSurface                           = 189,
+    CUPTI_DRIVER_TRACE_CBID_cuModuleGetSurfRef                                             = 190,
+    CUPTI_DRIVER_TRACE_CBID_cuSurfRefCreate                                                = 191,
+    CUPTI_DRIVER_TRACE_CBID_cuSurfRefDestroy                                               = 192,
+    CUPTI_DRIVER_TRACE_CBID_cuSurfRefSetFormat                                             = 193,
+    CUPTI_DRIVER_TRACE_CBID_cuSurfRefSetArray                                              = 194,
+    CUPTI_DRIVER_TRACE_CBID_cuSurfRefGetFormat                                             = 195,
+    CUPTI_DRIVER_TRACE_CBID_cuSurfRefGetArray                                              = 196,
+    CUPTI_DRIVER_TRACE_CBID_cu64DeviceTotalMem                                             = 197,
+    CUPTI_DRIVER_TRACE_CBID_cu64D3D10ResourceGetMappedPointer                              = 198,
+    CUPTI_DRIVER_TRACE_CBID_cu64D3D10ResourceGetMappedSize                                 = 199,
+    CUPTI_DRIVER_TRACE_CBID_cu64D3D10ResourceGetMappedPitch                                = 200,
+    CUPTI_DRIVER_TRACE_CBID_cu64D3D10ResourceGetSurfaceDimensions                          = 201,
+    CUPTI_DRIVER_TRACE_CBID_cu64D3D9ResourceGetSurfaceDimensions                           = 202,
+    CUPTI_DRIVER_TRACE_CBID_cu64D3D9ResourceGetMappedPointer                               = 203,
+    CUPTI_DRIVER_TRACE_CBID_cu64D3D9ResourceGetMappedSize                                  = 204,
+    CUPTI_DRIVER_TRACE_CBID_cu64D3D9ResourceGetMappedPitch                                 = 205,
+    CUPTI_DRIVER_TRACE_CBID_cu64D3D9MapVertexBuffer                                        = 206,
+    CUPTI_DRIVER_TRACE_CBID_cu64GLMapBufferObject                                          = 207,
+    CUPTI_DRIVER_TRACE_CBID_cu64GLMapBufferObjectAsync                                     = 208,
+    CUPTI_DRIVER_TRACE_CBID_cuD3D11GetDevices                                              = 209,
+    CUPTI_DRIVER_TRACE_CBID_cuD3D11CtxCreateOnDevice                                       = 210,
+    CUPTI_DRIVER_TRACE_CBID_cuD3D10GetDevices                                              = 211,
+    CUPTI_DRIVER_TRACE_CBID_cuD3D10CtxCreateOnDevice                                       = 212,
+    CUPTI_DRIVER_TRACE_CBID_cuD3D9GetDevices                                               = 213,
+    CUPTI_DRIVER_TRACE_CBID_cuD3D9CtxCreateOnDevice                                        = 214,
+    CUPTI_DRIVER_TRACE_CBID_cu64MemHostAlloc                                               = 215,
+    CUPTI_DRIVER_TRACE_CBID_cuMemsetD8Async                                                = 216,
+    CUPTI_DRIVER_TRACE_CBID_cu64MemsetD8Async                                              = 217,
+    CUPTI_DRIVER_TRACE_CBID_cuMemsetD16Async                                               = 218,
+    CUPTI_DRIVER_TRACE_CBID_cu64MemsetD16Async                                             = 219,
+    CUPTI_DRIVER_TRACE_CBID_cuMemsetD32Async                                               = 220,
+    CUPTI_DRIVER_TRACE_CBID_cu64MemsetD32Async                                             = 221,
+    CUPTI_DRIVER_TRACE_CBID_cuMemsetD2D8Async                                              = 222,
+    CUPTI_DRIVER_TRACE_CBID_cu64MemsetD2D8Async                                            = 223,
+    CUPTI_DRIVER_TRACE_CBID_cuMemsetD2D16Async                                             = 224,
+    CUPTI_DRIVER_TRACE_CBID_cu64MemsetD2D16Async                                           = 225,
+    CUPTI_DRIVER_TRACE_CBID_cuMemsetD2D32Async                                             = 226,
+    CUPTI_DRIVER_TRACE_CBID_cu64MemsetD2D32Async                                           = 227,
+    CUPTI_DRIVER_TRACE_CBID_cu64ArrayCreate                                                = 228,
+    CUPTI_DRIVER_TRACE_CBID_cu64ArrayGetDescriptor                                         = 229,
+    CUPTI_DRIVER_TRACE_CBID_cu64Array3DCreate                                              = 230,
+    CUPTI_DRIVER_TRACE_CBID_cu64Array3DGetDescriptor                                       = 231,
+    CUPTI_DRIVER_TRACE_CBID_cu64Memcpy2D                                                   = 232,
+    CUPTI_DRIVER_TRACE_CBID_cu64Memcpy2DUnaligned                                          = 233,
+    CUPTI_DRIVER_TRACE_CBID_cu64Memcpy2DAsync                                              = 234,
+    CUPTI_DRIVER_TRACE_CBID_cuCtxCreate_v2                                                 = 235,
+    CUPTI_DRIVER_TRACE_CBID_cuD3D10CtxCreate_v2                                            = 236,
+    CUPTI_DRIVER_TRACE_CBID_cuD3D11CtxCreate_v2                                            = 237,
+    CUPTI_DRIVER_TRACE_CBID_cuD3D9CtxCreate_v2                                             = 238,
+    CUPTI_DRIVER_TRACE_CBID_cuGLCtxCreate_v2                                               = 239,
+    CUPTI_DRIVER_TRACE_CBID_cuVDPAUCtxCreate_v2                                            = 240,
+    CUPTI_DRIVER_TRACE_CBID_cuModuleGetGlobal_v2                                           = 241,
+    CUPTI_DRIVER_TRACE_CBID_cuMemGetInfo_v2                                                = 242,
+    CUPTI_DRIVER_TRACE_CBID_cuMemAlloc_v2                                                  = 243,
+    CUPTI_DRIVER_TRACE_CBID_cuMemAllocPitch_v2                                             = 244,
+    CUPTI_DRIVER_TRACE_CBID_cuMemFree_v2                                                   = 245,
+    CUPTI_DRIVER_TRACE_CBID_cuMemGetAddressRange_v2                                        = 246,
+    CUPTI_DRIVER_TRACE_CBID_cuMemHostGetDevicePointer_v2                                   = 247,
+    CUPTI_DRIVER_TRACE_CBID_cuMemcpy_v2                                                    = 248,
+    CUPTI_DRIVER_TRACE_CBID_cuMemsetD8_v2                                                  = 249,
+    CUPTI_DRIVER_TRACE_CBID_cuMemsetD16_v2                                                 = 250,
+    CUPTI_DRIVER_TRACE_CBID_cuMemsetD32_v2                                                 = 251,
+    CUPTI_DRIVER_TRACE_CBID_cuMemsetD2D8_v2                                                = 252,
+    CUPTI_DRIVER_TRACE_CBID_cuMemsetD2D16_v2                                               = 253,
+    CUPTI_DRIVER_TRACE_CBID_cuMemsetD2D32_v2                                               = 254,
+    CUPTI_DRIVER_TRACE_CBID_cuTexRefSetAddress_v2                                          = 255,
+    CUPTI_DRIVER_TRACE_CBID_cuTexRefSetAddress2D_v2                                        = 256,
+    CUPTI_DRIVER_TRACE_CBID_cuTexRefGetAddress_v2                                          = 257,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphicsResourceGetMappedPointer_v2                          = 258,
+    CUPTI_DRIVER_TRACE_CBID_cuDeviceTotalMem_v2                                            = 259,
+    CUPTI_DRIVER_TRACE_CBID_cuD3D10ResourceGetMappedPointer_v2                             = 260,
+    CUPTI_DRIVER_TRACE_CBID_cuD3D10ResourceGetMappedSize_v2                                = 261,
+    CUPTI_DRIVER_TRACE_CBID_cuD3D10ResourceGetMappedPitch_v2                               = 262,
+    CUPTI_DRIVER_TRACE_CBID_cuD3D10ResourceGetSurfaceDimensions_v2                         = 263,
+    CUPTI_DRIVER_TRACE_CBID_cuD3D9ResourceGetSurfaceDimensions_v2                          = 264,
+    CUPTI_DRIVER_TRACE_CBID_cuD3D9ResourceGetMappedPointer_v2                              = 265,
+    CUPTI_DRIVER_TRACE_CBID_cuD3D9ResourceGetMappedSize_v2                                 = 266,
+    CUPTI_DRIVER_TRACE_CBID_cuD3D9ResourceGetMappedPitch_v2                                = 267,
+    CUPTI_DRIVER_TRACE_CBID_cuD3D9MapVertexBuffer_v2                                       = 268,
+    CUPTI_DRIVER_TRACE_CBID_cuGLMapBufferObject_v2                                         = 269,
+    CUPTI_DRIVER_TRACE_CBID_cuGLMapBufferObjectAsync_v2                                    = 270,
+    CUPTI_DRIVER_TRACE_CBID_cuMemHostAlloc_v2                                              = 271,
+    CUPTI_DRIVER_TRACE_CBID_cuArrayCreate_v2                                               = 272,
+    CUPTI_DRIVER_TRACE_CBID_cuArrayGetDescriptor_v2                                        = 273,
+    CUPTI_DRIVER_TRACE_CBID_cuArray3DCreate_v2                                             = 274,
+    CUPTI_DRIVER_TRACE_CBID_cuArray3DGetDescriptor_v2                                      = 275,
+    CUPTI_DRIVER_TRACE_CBID_cuMemcpyHtoD_v2                                                = 276,
+    CUPTI_DRIVER_TRACE_CBID_cuMemcpyHtoDAsync_v2                                           = 277,
+    CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoH_v2                                                = 278,
+    CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoHAsync_v2                                           = 279,
+    CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoD_v2                                                = 280,
+    CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoDAsync_v2                                           = 281,
+    CUPTI_DRIVER_TRACE_CBID_cuMemcpyAtoH_v2                                                = 282,
+    CUPTI_DRIVER_TRACE_CBID_cuMemcpyAtoHAsync_v2                                           = 283,
+    CUPTI_DRIVER_TRACE_CBID_cuMemcpyAtoD_v2                                                = 284,
+    CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoA_v2                                                = 285,
+    CUPTI_DRIVER_TRACE_CBID_cuMemcpyAtoA_v2                                                = 286,
+    CUPTI_DRIVER_TRACE_CBID_cuMemcpy2D_v2                                                  = 287,
+    CUPTI_DRIVER_TRACE_CBID_cuMemcpy2DUnaligned_v2                                         = 288,
+    CUPTI_DRIVER_TRACE_CBID_cuMemcpy2DAsync_v2                                             = 289,
+    CUPTI_DRIVER_TRACE_CBID_cuMemcpy3D_v2                                                  = 290,
+    CUPTI_DRIVER_TRACE_CBID_cuMemcpy3DAsync_v2                                             = 291,
+    CUPTI_DRIVER_TRACE_CBID_cuMemcpyHtoA_v2                                                = 292,
+    CUPTI_DRIVER_TRACE_CBID_cuMemcpyHtoAAsync_v2                                           = 293,
+    CUPTI_DRIVER_TRACE_CBID_cuMemAllocHost_v2                                              = 294,
+    CUPTI_DRIVER_TRACE_CBID_cuStreamWaitEvent                                              = 295,
+    CUPTI_DRIVER_TRACE_CBID_cuCtxGetApiVersion                                             = 296,
+    CUPTI_DRIVER_TRACE_CBID_cuD3D10GetDirect3DDevice                                       = 297,
+    CUPTI_DRIVER_TRACE_CBID_cuD3D11GetDirect3DDevice                                       = 298,
+    CUPTI_DRIVER_TRACE_CBID_cuCtxGetCacheConfig                                            = 299,
+    CUPTI_DRIVER_TRACE_CBID_cuCtxSetCacheConfig                                            = 300,
+    CUPTI_DRIVER_TRACE_CBID_cuMemHostRegister                                              = 301,
+    CUPTI_DRIVER_TRACE_CBID_cuMemHostUnregister                                            = 302,
+    CUPTI_DRIVER_TRACE_CBID_cuCtxSetCurrent                                                = 303,
+    CUPTI_DRIVER_TRACE_CBID_cuCtxGetCurrent                                                = 304,
+    CUPTI_DRIVER_TRACE_CBID_cuMemcpy                                                       = 305,
+    CUPTI_DRIVER_TRACE_CBID_cuMemcpyAsync                                                  = 306,
+    CUPTI_DRIVER_TRACE_CBID_cuLaunchKernel                                                 = 307,
+    CUPTI_DRIVER_TRACE_CBID_cuProfilerStart                                                = 308,
+    CUPTI_DRIVER_TRACE_CBID_cuProfilerStop                                                 = 309,
+    CUPTI_DRIVER_TRACE_CBID_cuPointerGetAttribute                                          = 310,
+    CUPTI_DRIVER_TRACE_CBID_cuProfilerInitialize                                           = 311,
+    CUPTI_DRIVER_TRACE_CBID_cuDeviceCanAccessPeer                                          = 312,
+    CUPTI_DRIVER_TRACE_CBID_cuCtxEnablePeerAccess                                          = 313,
+    CUPTI_DRIVER_TRACE_CBID_cuCtxDisablePeerAccess                                         = 314,
+    CUPTI_DRIVER_TRACE_CBID_cuMemPeerRegister                                              = 315,
+    CUPTI_DRIVER_TRACE_CBID_cuMemPeerUnregister                                            = 316,
+    CUPTI_DRIVER_TRACE_CBID_cuMemPeerGetDevicePointer                                      = 317,
+    CUPTI_DRIVER_TRACE_CBID_cuMemcpyPeer                                                   = 318,
+    CUPTI_DRIVER_TRACE_CBID_cuMemcpyPeerAsync                                              = 319,
+    CUPTI_DRIVER_TRACE_CBID_cuMemcpy3DPeer                                                 = 320,
+    CUPTI_DRIVER_TRACE_CBID_cuMemcpy3DPeerAsync                                            = 321,
+    CUPTI_DRIVER_TRACE_CBID_cuCtxDestroy_v2                                                = 322,
+    CUPTI_DRIVER_TRACE_CBID_cuCtxPushCurrent_v2                                            = 323,
+    CUPTI_DRIVER_TRACE_CBID_cuCtxPopCurrent_v2                                             = 324,
+    CUPTI_DRIVER_TRACE_CBID_cuEventDestroy_v2                                              = 325,
+    CUPTI_DRIVER_TRACE_CBID_cuStreamDestroy_v2                                             = 326,
+    CUPTI_DRIVER_TRACE_CBID_cuTexRefSetAddress2D_v3                                        = 327,
+    CUPTI_DRIVER_TRACE_CBID_cuIpcGetMemHandle                                              = 328,
+    CUPTI_DRIVER_TRACE_CBID_cuIpcOpenMemHandle                                             = 329,
+    CUPTI_DRIVER_TRACE_CBID_cuIpcCloseMemHandle                                            = 330,
+    CUPTI_DRIVER_TRACE_CBID_cuDeviceGetByPCIBusId                                          = 331,
+    CUPTI_DRIVER_TRACE_CBID_cuDeviceGetPCIBusId                                            = 332,
+    CUPTI_DRIVER_TRACE_CBID_cuGLGetDevices                                                 = 333,
+    CUPTI_DRIVER_TRACE_CBID_cuIpcGetEventHandle                                            = 334,
+    CUPTI_DRIVER_TRACE_CBID_cuIpcOpenEventHandle                                           = 335,
+    CUPTI_DRIVER_TRACE_CBID_cuCtxSetSharedMemConfig                                        = 336,
+    CUPTI_DRIVER_TRACE_CBID_cuCtxGetSharedMemConfig                                        = 337,
+    CUPTI_DRIVER_TRACE_CBID_cuFuncSetSharedMemConfig                                       = 338,
+    CUPTI_DRIVER_TRACE_CBID_cuTexObjectCreate                                              = 339,
+    CUPTI_DRIVER_TRACE_CBID_cuTexObjectDestroy                                             = 340,
+    CUPTI_DRIVER_TRACE_CBID_cuTexObjectGetResourceDesc                                     = 341,
+    CUPTI_DRIVER_TRACE_CBID_cuTexObjectGetTextureDesc                                      = 342,
+    CUPTI_DRIVER_TRACE_CBID_cuSurfObjectCreate                                             = 343,
+    CUPTI_DRIVER_TRACE_CBID_cuSurfObjectDestroy                                            = 344,
+    CUPTI_DRIVER_TRACE_CBID_cuSurfObjectGetResourceDesc                                    = 345,
+    CUPTI_DRIVER_TRACE_CBID_cuStreamAddCallback                                            = 346,
+    CUPTI_DRIVER_TRACE_CBID_cuMipmappedArrayCreate                                         = 347,
+    CUPTI_DRIVER_TRACE_CBID_cuMipmappedArrayGetLevel                                       = 348,
+    CUPTI_DRIVER_TRACE_CBID_cuMipmappedArrayDestroy                                        = 349,
+    CUPTI_DRIVER_TRACE_CBID_cuTexRefSetMipmappedArray                                      = 350,
+    CUPTI_DRIVER_TRACE_CBID_cuTexRefSetMipmapFilterMode                                    = 351,
+    CUPTI_DRIVER_TRACE_CBID_cuTexRefSetMipmapLevelBias                                     = 352,
+    CUPTI_DRIVER_TRACE_CBID_cuTexRefSetMipmapLevelClamp                                    = 353,
+    CUPTI_DRIVER_TRACE_CBID_cuTexRefSetMaxAnisotropy                                       = 354,
+    CUPTI_DRIVER_TRACE_CBID_cuTexRefGetMipmappedArray                                      = 355,
+    CUPTI_DRIVER_TRACE_CBID_cuTexRefGetMipmapFilterMode                                    = 356,
+    CUPTI_DRIVER_TRACE_CBID_cuTexRefGetMipmapLevelBias                                     = 357,
+    CUPTI_DRIVER_TRACE_CBID_cuTexRefGetMipmapLevelClamp                                    = 358,
+    CUPTI_DRIVER_TRACE_CBID_cuTexRefGetMaxAnisotropy                                       = 359,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphicsResourceGetMappedMipmappedArray                      = 360,
+    CUPTI_DRIVER_TRACE_CBID_cuTexObjectGetResourceViewDesc                                 = 361,
+    CUPTI_DRIVER_TRACE_CBID_cuLinkCreate                                                   = 362,
+    CUPTI_DRIVER_TRACE_CBID_cuLinkAddData                                                  = 363,
+    CUPTI_DRIVER_TRACE_CBID_cuLinkAddFile                                                  = 364,
+    CUPTI_DRIVER_TRACE_CBID_cuLinkComplete                                                 = 365,
+    CUPTI_DRIVER_TRACE_CBID_cuLinkDestroy                                                  = 366,
+    CUPTI_DRIVER_TRACE_CBID_cuStreamCreateWithPriority                                     = 367,
+    CUPTI_DRIVER_TRACE_CBID_cuStreamGetPriority                                            = 368,
+    CUPTI_DRIVER_TRACE_CBID_cuStreamGetFlags                                               = 369,
+    CUPTI_DRIVER_TRACE_CBID_cuCtxGetStreamPriorityRange                                    = 370,
+    CUPTI_DRIVER_TRACE_CBID_cuMemAllocManaged                                              = 371,
+    CUPTI_DRIVER_TRACE_CBID_cuGetErrorString                                               = 372,
+    CUPTI_DRIVER_TRACE_CBID_cuGetErrorName                                                 = 373,
+    CUPTI_DRIVER_TRACE_CBID_cuOccupancyMaxActiveBlocksPerMultiprocessor                    = 374,
+    CUPTI_DRIVER_TRACE_CBID_cuCompilePtx                                                   = 375,
+    CUPTI_DRIVER_TRACE_CBID_cuBinaryFree                                                   = 376,
+    CUPTI_DRIVER_TRACE_CBID_cuStreamAttachMemAsync                                         = 377,
+    CUPTI_DRIVER_TRACE_CBID_cuPointerSetAttribute                                          = 378,
+    CUPTI_DRIVER_TRACE_CBID_cuMemHostRegister_v2                                           = 379,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphicsResourceSetMapFlags_v2                               = 380,
+    CUPTI_DRIVER_TRACE_CBID_cuLinkCreate_v2                                                = 381,
+    CUPTI_DRIVER_TRACE_CBID_cuLinkAddData_v2                                               = 382,
+    CUPTI_DRIVER_TRACE_CBID_cuLinkAddFile_v2                                               = 383,
+    CUPTI_DRIVER_TRACE_CBID_cuOccupancyMaxPotentialBlockSize                               = 384,
+    CUPTI_DRIVER_TRACE_CBID_cuGLGetDevices_v2                                              = 385,
+    CUPTI_DRIVER_TRACE_CBID_cuDevicePrimaryCtxRetain                                       = 386,
+    CUPTI_DRIVER_TRACE_CBID_cuDevicePrimaryCtxRelease                                      = 387,
+    CUPTI_DRIVER_TRACE_CBID_cuDevicePrimaryCtxSetFlags                                     = 388,
+    CUPTI_DRIVER_TRACE_CBID_cuDevicePrimaryCtxReset                                        = 389,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphicsEGLRegisterImage                                     = 390,
+    CUPTI_DRIVER_TRACE_CBID_cuCtxGetFlags                                                  = 391,
+    CUPTI_DRIVER_TRACE_CBID_cuDevicePrimaryCtxGetState                                     = 392,
+    CUPTI_DRIVER_TRACE_CBID_cuEGLStreamConsumerConnect                                     = 393,
+    CUPTI_DRIVER_TRACE_CBID_cuEGLStreamConsumerDisconnect                                  = 394,
+    CUPTI_DRIVER_TRACE_CBID_cuEGLStreamConsumerAcquireFrame                                = 395,
+    CUPTI_DRIVER_TRACE_CBID_cuEGLStreamConsumerReleaseFrame                                = 396,
+    CUPTI_DRIVER_TRACE_CBID_cuMemcpyHtoD_v2_ptds                                           = 397,
+    CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoH_v2_ptds                                           = 398,
+    CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoD_v2_ptds                                           = 399,
+    CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoA_v2_ptds                                           = 400,
+    CUPTI_DRIVER_TRACE_CBID_cuMemcpyAtoD_v2_ptds                                           = 401,
+    CUPTI_DRIVER_TRACE_CBID_cuMemcpyHtoA_v2_ptds                                           = 402,
+    CUPTI_DRIVER_TRACE_CBID_cuMemcpyAtoH_v2_ptds                                           = 403,
+    CUPTI_DRIVER_TRACE_CBID_cuMemcpyAtoA_v2_ptds                                           = 404,
+    CUPTI_DRIVER_TRACE_CBID_cuMemcpy2D_v2_ptds                                             = 405,
+    CUPTI_DRIVER_TRACE_CBID_cuMemcpy2DUnaligned_v2_ptds                                    = 406,
+    CUPTI_DRIVER_TRACE_CBID_cuMemcpy3D_v2_ptds                                             = 407,
+    CUPTI_DRIVER_TRACE_CBID_cuMemcpy_ptds                                                  = 408,
+    CUPTI_DRIVER_TRACE_CBID_cuMemcpyPeer_ptds                                              = 409,
+    CUPTI_DRIVER_TRACE_CBID_cuMemcpy3DPeer_ptds                                            = 410,
+    CUPTI_DRIVER_TRACE_CBID_cuMemsetD8_v2_ptds                                             = 411,
+    CUPTI_DRIVER_TRACE_CBID_cuMemsetD16_v2_ptds                                            = 412,
+    CUPTI_DRIVER_TRACE_CBID_cuMemsetD32_v2_ptds                                            = 413,
+    CUPTI_DRIVER_TRACE_CBID_cuMemsetD2D8_v2_ptds                                           = 414,
+    CUPTI_DRIVER_TRACE_CBID_cuMemsetD2D16_v2_ptds                                          = 415,
+    CUPTI_DRIVER_TRACE_CBID_cuMemsetD2D32_v2_ptds                                          = 416,
+    CUPTI_DRIVER_TRACE_CBID_cuGLMapBufferObject_v2_ptds                                    = 417,
+    CUPTI_DRIVER_TRACE_CBID_cuMemcpyAsync_ptsz                                             = 418,
+    CUPTI_DRIVER_TRACE_CBID_cuMemcpyHtoAAsync_v2_ptsz                                      = 419,
+    CUPTI_DRIVER_TRACE_CBID_cuMemcpyAtoHAsync_v2_ptsz                                      = 420,
+    CUPTI_DRIVER_TRACE_CBID_cuMemcpyHtoDAsync_v2_ptsz                                      = 421,
+    CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoHAsync_v2_ptsz                                      = 422,
+    CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoDAsync_v2_ptsz                                      = 423,
+    CUPTI_DRIVER_TRACE_CBID_cuMemcpy2DAsync_v2_ptsz                                        = 424,
+    CUPTI_DRIVER_TRACE_CBID_cuMemcpy3DAsync_v2_ptsz                                        = 425,
+    CUPTI_DRIVER_TRACE_CBID_cuMemcpyPeerAsync_ptsz                                         = 426,
+    CUPTI_DRIVER_TRACE_CBID_cuMemcpy3DPeerAsync_ptsz                                       = 427,
+    CUPTI_DRIVER_TRACE_CBID_cuMemsetD8Async_ptsz                                           = 428,
+    CUPTI_DRIVER_TRACE_CBID_cuMemsetD16Async_ptsz                                          = 429,
+    CUPTI_DRIVER_TRACE_CBID_cuMemsetD32Async_ptsz                                          = 430,
+    CUPTI_DRIVER_TRACE_CBID_cuMemsetD2D8Async_ptsz                                         = 431,
+    CUPTI_DRIVER_TRACE_CBID_cuMemsetD2D16Async_ptsz                                        = 432,
+    CUPTI_DRIVER_TRACE_CBID_cuMemsetD2D32Async_ptsz                                        = 433,
+    CUPTI_DRIVER_TRACE_CBID_cuStreamGetPriority_ptsz                                       = 434,
+    CUPTI_DRIVER_TRACE_CBID_cuStreamGetFlags_ptsz                                          = 435,
+    CUPTI_DRIVER_TRACE_CBID_cuStreamWaitEvent_ptsz                                         = 436,
+    CUPTI_DRIVER_TRACE_CBID_cuStreamAddCallback_ptsz                                       = 437,
+    CUPTI_DRIVER_TRACE_CBID_cuStreamAttachMemAsync_ptsz                                    = 438,
+    CUPTI_DRIVER_TRACE_CBID_cuStreamQuery_ptsz                                             = 439,
+    CUPTI_DRIVER_TRACE_CBID_cuStreamSynchronize_ptsz                                       = 440,
+    CUPTI_DRIVER_TRACE_CBID_cuEventRecord_ptsz                                             = 441,
+    CUPTI_DRIVER_TRACE_CBID_cuLaunchKernel_ptsz                                            = 442,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphicsMapResources_ptsz                                    = 443,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphicsUnmapResources_ptsz                                  = 444,
+    CUPTI_DRIVER_TRACE_CBID_cuGLMapBufferObjectAsync_v2_ptsz                               = 445,
+    CUPTI_DRIVER_TRACE_CBID_cuEGLStreamProducerConnect                                     = 446,
+    CUPTI_DRIVER_TRACE_CBID_cuEGLStreamProducerDisconnect                                  = 447,
+    CUPTI_DRIVER_TRACE_CBID_cuEGLStreamProducerPresentFrame                                = 448,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphicsResourceGetMappedEglFrame                            = 449,
+    CUPTI_DRIVER_TRACE_CBID_cuPointerGetAttributes                                         = 450,
+    CUPTI_DRIVER_TRACE_CBID_cuOccupancyMaxActiveBlocksPerMultiprocessorWithFlags           = 451,
+    CUPTI_DRIVER_TRACE_CBID_cuOccupancyMaxPotentialBlockSizeWithFlags                      = 452,
+    CUPTI_DRIVER_TRACE_CBID_cuEGLStreamProducerReturnFrame                                 = 453,
+    CUPTI_DRIVER_TRACE_CBID_cuDeviceGetP2PAttribute                                        = 454,
+    CUPTI_DRIVER_TRACE_CBID_cuTexRefSetBorderColor                                         = 455,
+    CUPTI_DRIVER_TRACE_CBID_cuTexRefGetBorderColor                                         = 456,
+    CUPTI_DRIVER_TRACE_CBID_cuMemAdvise                                                    = 457,
+    CUPTI_DRIVER_TRACE_CBID_cuStreamWaitValue32                                            = 458,
+    CUPTI_DRIVER_TRACE_CBID_cuStreamWaitValue32_ptsz                                       = 459,
+    CUPTI_DRIVER_TRACE_CBID_cuStreamWriteValue32                                           = 460,
+    CUPTI_DRIVER_TRACE_CBID_cuStreamWriteValue32_ptsz                                      = 461,
+    CUPTI_DRIVER_TRACE_CBID_cuStreamBatchMemOp                                             = 462,
+    CUPTI_DRIVER_TRACE_CBID_cuStreamBatchMemOp_ptsz                                        = 463,
+    CUPTI_DRIVER_TRACE_CBID_cuNVNbufferGetPointer                                          = 464,
+    CUPTI_DRIVER_TRACE_CBID_cuNVNtextureGetArray                                           = 465,
+    CUPTI_DRIVER_TRACE_CBID_cuNNSetAllocator                                               = 466,
+    CUPTI_DRIVER_TRACE_CBID_cuMemPrefetchAsync                                             = 467,
+    CUPTI_DRIVER_TRACE_CBID_cuMemPrefetchAsync_ptsz                                        = 468,
+    CUPTI_DRIVER_TRACE_CBID_cuEventCreateFromNVNSync                                       = 469,
+    CUPTI_DRIVER_TRACE_CBID_cuEGLStreamConsumerConnectWithFlags                            = 470,
+    CUPTI_DRIVER_TRACE_CBID_cuMemRangeGetAttribute                                         = 471,
+    CUPTI_DRIVER_TRACE_CBID_cuMemRangeGetAttributes                                        = 472,
+    CUPTI_DRIVER_TRACE_CBID_cuStreamWaitValue64                                            = 473,
+    CUPTI_DRIVER_TRACE_CBID_cuStreamWaitValue64_ptsz                                       = 474,
+    CUPTI_DRIVER_TRACE_CBID_cuStreamWriteValue64                                           = 475,
+    CUPTI_DRIVER_TRACE_CBID_cuStreamWriteValue64_ptsz                                      = 476,
+    CUPTI_DRIVER_TRACE_CBID_cuLaunchCooperativeKernel                                      = 477,
+    CUPTI_DRIVER_TRACE_CBID_cuLaunchCooperativeKernel_ptsz                                 = 478,
+    CUPTI_DRIVER_TRACE_CBID_cuEventCreateFromEGLSync                                       = 479,
+    CUPTI_DRIVER_TRACE_CBID_cuLaunchCooperativeKernelMultiDevice                           = 480,
+    CUPTI_DRIVER_TRACE_CBID_cuFuncSetAttribute                                             = 481,
+    CUPTI_DRIVER_TRACE_CBID_cuDeviceGetUuid                                                = 482,
+    CUPTI_DRIVER_TRACE_CBID_cuStreamGetCtx                                                 = 483,
+    CUPTI_DRIVER_TRACE_CBID_cuStreamGetCtx_ptsz                                            = 484,
+    CUPTI_DRIVER_TRACE_CBID_cuImportExternalMemory                                         = 485,
+    CUPTI_DRIVER_TRACE_CBID_cuExternalMemoryGetMappedBuffer                                = 486,
+    CUPTI_DRIVER_TRACE_CBID_cuExternalMemoryGetMappedMipmappedArray                        = 487,
+    CUPTI_DRIVER_TRACE_CBID_cuDestroyExternalMemory                                        = 488,
+    CUPTI_DRIVER_TRACE_CBID_cuImportExternalSemaphore                                      = 489,
+    CUPTI_DRIVER_TRACE_CBID_cuSignalExternalSemaphoresAsync                                = 490,
+    CUPTI_DRIVER_TRACE_CBID_cuSignalExternalSemaphoresAsync_ptsz                           = 491,
+    CUPTI_DRIVER_TRACE_CBID_cuWaitExternalSemaphoresAsync                                  = 492,
+    CUPTI_DRIVER_TRACE_CBID_cuWaitExternalSemaphoresAsync_ptsz                             = 493,
+    CUPTI_DRIVER_TRACE_CBID_cuDestroyExternalSemaphore                                     = 494,
+    CUPTI_DRIVER_TRACE_CBID_cuStreamBeginCapture                                           = 495,
+    CUPTI_DRIVER_TRACE_CBID_cuStreamBeginCapture_ptsz                                      = 496,
+    CUPTI_DRIVER_TRACE_CBID_cuStreamEndCapture                                             = 497,
+    CUPTI_DRIVER_TRACE_CBID_cuStreamEndCapture_ptsz                                        = 498,
+    CUPTI_DRIVER_TRACE_CBID_cuStreamIsCapturing                                            = 499,
+    CUPTI_DRIVER_TRACE_CBID_cuStreamIsCapturing_ptsz                                       = 500,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphCreate                                                  = 501,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphAddKernelNode                                           = 502,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphKernelNodeGetParams                                     = 503,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphAddMemcpyNode                                           = 504,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphMemcpyNodeGetParams                                     = 505,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphAddMemsetNode                                           = 506,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphMemsetNodeGetParams                                     = 507,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphMemsetNodeSetParams                                     = 508,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphNodeGetType                                             = 509,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphGetRootNodes                                            = 510,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphNodeGetDependencies                                     = 511,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphNodeGetDependentNodes                                   = 512,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphInstantiate                                             = 513,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphLaunch                                                  = 514,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphLaunch_ptsz                                             = 515,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphExecDestroy                                             = 516,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphDestroy                                                 = 517,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphAddDependencies                                         = 518,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphRemoveDependencies                                      = 519,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphMemcpyNodeSetParams                                     = 520,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphKernelNodeSetParams                                     = 521,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphDestroyNode                                             = 522,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphClone                                                   = 523,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphNodeFindInClone                                         = 524,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphAddChildGraphNode                                       = 525,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphAddEmptyNode                                            = 526,
+    CUPTI_DRIVER_TRACE_CBID_cuLaunchHostFunc                                               = 527,
+    CUPTI_DRIVER_TRACE_CBID_cuLaunchHostFunc_ptsz                                          = 528,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphChildGraphNodeGetGraph                                  = 529,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphAddHostNode                                             = 530,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphHostNodeGetParams                                       = 531,
+    CUPTI_DRIVER_TRACE_CBID_cuDeviceGetLuid                                                = 532,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphHostNodeSetParams                                       = 533,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphGetNodes                                                = 534,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphGetEdges                                                = 535,
+    CUPTI_DRIVER_TRACE_CBID_cuStreamGetCaptureInfo                                         = 536,
+    CUPTI_DRIVER_TRACE_CBID_cuStreamGetCaptureInfo_ptsz                                    = 537,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphExecKernelNodeSetParams                                 = 538,
+    CUPTI_DRIVER_TRACE_CBID_cuStreamBeginCapture_v2                                        = 539,
+    CUPTI_DRIVER_TRACE_CBID_cuStreamBeginCapture_v2_ptsz                                   = 540,
+    CUPTI_DRIVER_TRACE_CBID_cuThreadExchangeStreamCaptureMode                              = 541,
+    CUPTI_DRIVER_TRACE_CBID_cuDeviceGetNvSciSyncAttributes                                 = 542,
+    CUPTI_DRIVER_TRACE_CBID_cuOccupancyAvailableDynamicSMemPerBlock                        = 543,
+    CUPTI_DRIVER_TRACE_CBID_cuDevicePrimaryCtxRelease_v2                                   = 544,
+    CUPTI_DRIVER_TRACE_CBID_cuDevicePrimaryCtxReset_v2                                     = 545,
+    CUPTI_DRIVER_TRACE_CBID_cuDevicePrimaryCtxSetFlags_v2                                  = 546,
+    CUPTI_DRIVER_TRACE_CBID_cuMemAddressReserve                                            = 547,
+    CUPTI_DRIVER_TRACE_CBID_cuMemAddressFree                                               = 548,
+    CUPTI_DRIVER_TRACE_CBID_cuMemCreate                                                    = 549,
+    CUPTI_DRIVER_TRACE_CBID_cuMemRelease                                                   = 550,
+    CUPTI_DRIVER_TRACE_CBID_cuMemMap                                                       = 551,
+    CUPTI_DRIVER_TRACE_CBID_cuMemUnmap                                                     = 552,
+    CUPTI_DRIVER_TRACE_CBID_cuMemSetAccess                                                 = 553,
+    CUPTI_DRIVER_TRACE_CBID_cuMemExportToShareableHandle                                   = 554,
+    CUPTI_DRIVER_TRACE_CBID_cuMemImportFromShareableHandle                                 = 555,
+    CUPTI_DRIVER_TRACE_CBID_cuMemGetAllocationGranularity                                  = 556,
+    CUPTI_DRIVER_TRACE_CBID_cuMemGetAllocationPropertiesFromHandle                         = 557,
+    CUPTI_DRIVER_TRACE_CBID_cuMemGetAccess                                                 = 558,
+    CUPTI_DRIVER_TRACE_CBID_cuStreamSetFlags                                               = 559,
+    CUPTI_DRIVER_TRACE_CBID_cuStreamSetFlags_ptsz                                          = 560,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphExecUpdate                                              = 561,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphExecMemcpyNodeSetParams                                 = 562,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphExecMemsetNodeSetParams                                 = 563,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphExecHostNodeSetParams                                   = 564,
+    CUPTI_DRIVER_TRACE_CBID_cuMemRetainAllocationHandle                                    = 565,
+    CUPTI_DRIVER_TRACE_CBID_cuFuncGetModule                                                = 566,
+    CUPTI_DRIVER_TRACE_CBID_cuIpcOpenMemHandle_v2                                          = 567,
+    CUPTI_DRIVER_TRACE_CBID_cuCtxResetPersistingL2Cache                                    = 568,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphKernelNodeCopyAttributes                                = 569,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphKernelNodeGetAttribute                                  = 570,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphKernelNodeSetAttribute                                  = 571,
+    CUPTI_DRIVER_TRACE_CBID_cuStreamCopyAttributes                                         = 572,
+    CUPTI_DRIVER_TRACE_CBID_cuStreamCopyAttributes_ptsz                                    = 573,
+    CUPTI_DRIVER_TRACE_CBID_cuStreamGetAttribute                                           = 574,
+    CUPTI_DRIVER_TRACE_CBID_cuStreamGetAttribute_ptsz                                      = 575,
+    CUPTI_DRIVER_TRACE_CBID_cuStreamSetAttribute                                           = 576,
+    CUPTI_DRIVER_TRACE_CBID_cuStreamSetAttribute_ptsz                                      = 577,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphInstantiate_v2                                          = 578,
+    CUPTI_DRIVER_TRACE_CBID_cuDeviceGetTexture1DLinearMaxWidth                             = 579,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphUpload                                                  = 580,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphUpload_ptsz                                             = 581,
+    CUPTI_DRIVER_TRACE_CBID_cuArrayGetSparseProperties                                     = 582,
+    CUPTI_DRIVER_TRACE_CBID_cuMipmappedArrayGetSparseProperties                            = 583,
+    CUPTI_DRIVER_TRACE_CBID_cuMemMapArrayAsync                                             = 584,
+    CUPTI_DRIVER_TRACE_CBID_cuMemMapArrayAsync_ptsz                                        = 585,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphExecChildGraphNodeSetParams                             = 586,
+    CUPTI_DRIVER_TRACE_CBID_cuEventRecordWithFlags                                         = 587,
+    CUPTI_DRIVER_TRACE_CBID_cuEventRecordWithFlags_ptsz                                    = 588,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphAddEventRecordNode                                      = 589,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphAddEventWaitNode                                        = 590,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphEventRecordNodeGetEvent                                 = 591,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphEventWaitNodeGetEvent                                   = 592,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphEventRecordNodeSetEvent                                 = 593,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphEventWaitNodeSetEvent                                   = 594,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphExecEventRecordNodeSetEvent                             = 595,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphExecEventWaitNodeSetEvent                               = 596,
+    CUPTI_DRIVER_TRACE_CBID_cuArrayGetPlane                                                = 597,
+    CUPTI_DRIVER_TRACE_CBID_cuMemAllocAsync                                                = 598,
+    CUPTI_DRIVER_TRACE_CBID_cuMemAllocAsync_ptsz                                           = 599,
+    CUPTI_DRIVER_TRACE_CBID_cuMemFreeAsync                                                 = 600,
+    CUPTI_DRIVER_TRACE_CBID_cuMemFreeAsync_ptsz                                            = 601,
+    CUPTI_DRIVER_TRACE_CBID_cuMemPoolTrimTo                                                = 602,
+    CUPTI_DRIVER_TRACE_CBID_cuMemPoolSetAttribute                                          = 603,
+    CUPTI_DRIVER_TRACE_CBID_cuMemPoolGetAttribute                                          = 604,
+    CUPTI_DRIVER_TRACE_CBID_cuMemPoolSetAccess                                             = 605,
+    CUPTI_DRIVER_TRACE_CBID_cuDeviceGetDefaultMemPool                                      = 606,
+    CUPTI_DRIVER_TRACE_CBID_cuMemPoolCreate                                                = 607,
+    CUPTI_DRIVER_TRACE_CBID_cuMemPoolDestroy                                               = 608,
+    CUPTI_DRIVER_TRACE_CBID_cuDeviceSetMemPool                                             = 609,
+    CUPTI_DRIVER_TRACE_CBID_cuDeviceGetMemPool                                             = 610,
+    CUPTI_DRIVER_TRACE_CBID_cuMemAllocFromPoolAsync                                        = 611,
+    CUPTI_DRIVER_TRACE_CBID_cuMemAllocFromPoolAsync_ptsz                                   = 612,
+    CUPTI_DRIVER_TRACE_CBID_cuMemPoolExportToShareableHandle                               = 613,
+    CUPTI_DRIVER_TRACE_CBID_cuMemPoolImportFromShareableHandle                             = 614,
+    CUPTI_DRIVER_TRACE_CBID_cuMemPoolExportPointer                                         = 615,
+    CUPTI_DRIVER_TRACE_CBID_cuMemPoolImportPointer                                         = 616,
+    CUPTI_DRIVER_TRACE_CBID_cuMemPoolGetAccess                                             = 617,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphAddExternalSemaphoresSignalNode                         = 618,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphExternalSemaphoresSignalNodeGetParams                   = 619,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphExternalSemaphoresSignalNodeSetParams                   = 620,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphAddExternalSemaphoresWaitNode                           = 621,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphExternalSemaphoresWaitNodeGetParams                     = 622,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphExternalSemaphoresWaitNodeSetParams                     = 623,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphExecExternalSemaphoresSignalNodeSetParams               = 624,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphExecExternalSemaphoresWaitNodeSetParams                 = 625,
+    CUPTI_DRIVER_TRACE_CBID_cuGetProcAddress                                               = 626,
+    CUPTI_DRIVER_TRACE_CBID_cuFlushGPUDirectRDMAWrites                                     = 627,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphDebugDotPrint                                           = 628,
+    CUPTI_DRIVER_TRACE_CBID_cuStreamGetCaptureInfo_v2                                      = 629,
+    CUPTI_DRIVER_TRACE_CBID_cuStreamGetCaptureInfo_v2_ptsz                                 = 630,
+    CUPTI_DRIVER_TRACE_CBID_cuStreamUpdateCaptureDependencies                              = 631,
+    CUPTI_DRIVER_TRACE_CBID_cuStreamUpdateCaptureDependencies_ptsz                         = 632,
+    CUPTI_DRIVER_TRACE_CBID_cuUserObjectCreate                                             = 633,
+    CUPTI_DRIVER_TRACE_CBID_cuUserObjectRetain                                             = 634,
+    CUPTI_DRIVER_TRACE_CBID_cuUserObjectRelease                                            = 635,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphRetainUserObject                                        = 636,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphReleaseUserObject                                       = 637,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphAddMemAllocNode                                         = 638,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphAddMemFreeNode                                          = 639,
+    CUPTI_DRIVER_TRACE_CBID_cuDeviceGraphMemTrim                                           = 640,
+    CUPTI_DRIVER_TRACE_CBID_cuDeviceGetGraphMemAttribute                                   = 641,
+    CUPTI_DRIVER_TRACE_CBID_cuDeviceSetGraphMemAttribute                                   = 642,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphInstantiateWithFlags                                    = 643,
+    CUPTI_DRIVER_TRACE_CBID_cuDeviceGetExecAffinitySupport                                 = 644,
+    CUPTI_DRIVER_TRACE_CBID_cuCtxCreate_v3                                                 = 645,
+    CUPTI_DRIVER_TRACE_CBID_cuCtxGetExecAffinity                                           = 646,
+    CUPTI_DRIVER_TRACE_CBID_cuDeviceGetUuid_v2                                             = 647,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphMemAllocNodeGetParams                                   = 648,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphMemFreeNodeGetParams                                    = 649,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphNodeSetEnabled                                          = 650,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphNodeGetEnabled                                          = 651,
+    CUPTI_DRIVER_TRACE_CBID_cuLaunchKernelEx                                               = 652,
+    CUPTI_DRIVER_TRACE_CBID_cuLaunchKernelEx_ptsz                                          = 653,
+    CUPTI_DRIVER_TRACE_CBID_cuArrayGetMemoryRequirements                                   = 654,
+    CUPTI_DRIVER_TRACE_CBID_cuMipmappedArrayGetMemoryRequirements                          = 655,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphInstantiateWithParams                                   = 656,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphInstantiateWithParams_ptsz                              = 657,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphExecGetFlags                                            = 658,
+    CUPTI_DRIVER_TRACE_CBID_cuStreamWaitValue32_v2                                         = 659,
+    CUPTI_DRIVER_TRACE_CBID_cuStreamWaitValue32_v2_ptsz                                    = 660,
+    CUPTI_DRIVER_TRACE_CBID_cuStreamWaitValue64_v2                                         = 661,
+    CUPTI_DRIVER_TRACE_CBID_cuStreamWaitValue64_v2_ptsz                                    = 662,
+    CUPTI_DRIVER_TRACE_CBID_cuStreamWriteValue32_v2                                        = 663,
+    CUPTI_DRIVER_TRACE_CBID_cuStreamWriteValue32_v2_ptsz                                   = 664,
+    CUPTI_DRIVER_TRACE_CBID_cuStreamWriteValue64_v2                                        = 665,
+    CUPTI_DRIVER_TRACE_CBID_cuStreamWriteValue64_v2_ptsz                                   = 666,
+    CUPTI_DRIVER_TRACE_CBID_cuStreamBatchMemOp_v2                                          = 667,
+    CUPTI_DRIVER_TRACE_CBID_cuStreamBatchMemOp_v2_ptsz                                     = 668,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphAddBatchMemOpNode                                       = 669,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphBatchMemOpNodeGetParams                                 = 670,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphBatchMemOpNodeSetParams                                 = 671,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphExecBatchMemOpNodeSetParams                             = 672,
+    CUPTI_DRIVER_TRACE_CBID_cuModuleGetLoadingMode                                         = 673,
+    CUPTI_DRIVER_TRACE_CBID_cuMemGetHandleForAddressRange                                  = 674,
+    CUPTI_DRIVER_TRACE_CBID_cuOccupancyMaxPotentialClusterSize                             = 675,
+    CUPTI_DRIVER_TRACE_CBID_cuOccupancyMaxActiveClusters                                   = 676,
+    CUPTI_DRIVER_TRACE_CBID_cuGetProcAddress_v2                                            = 677,
+    CUPTI_DRIVER_TRACE_CBID_cuLibraryLoadData                                              = 678,
+    CUPTI_DRIVER_TRACE_CBID_cuLibraryLoadFromFile                                          = 679,
+    CUPTI_DRIVER_TRACE_CBID_cuLibraryUnload                                                = 680,
+    CUPTI_DRIVER_TRACE_CBID_cuLibraryGetKernel                                             = 681,
+    CUPTI_DRIVER_TRACE_CBID_cuLibraryGetModule                                             = 682,
+    CUPTI_DRIVER_TRACE_CBID_cuKernelGetFunction                                            = 683,
+    CUPTI_DRIVER_TRACE_CBID_cuLibraryGetGlobal                                             = 684,
+    CUPTI_DRIVER_TRACE_CBID_cuLibraryGetManaged                                            = 685,
+    CUPTI_DRIVER_TRACE_CBID_cuKernelGetAttribute                                           = 686,
+    CUPTI_DRIVER_TRACE_CBID_cuKernelSetAttribute                                           = 687,
+    CUPTI_DRIVER_TRACE_CBID_cuKernelSetCacheConfig                                         = 688,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphAddKernelNode_v2                                        = 689,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphKernelNodeGetParams_v2                                  = 690,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphKernelNodeSetParams_v2                                  = 691,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphExecKernelNodeSetParams_v2                              = 692,
+    CUPTI_DRIVER_TRACE_CBID_cuStreamGetId                                                  = 693,
+    CUPTI_DRIVER_TRACE_CBID_cuStreamGetId_ptsz                                             = 694,
+    CUPTI_DRIVER_TRACE_CBID_cuCtxGetId                                                     = 695,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphExecUpdate_v2                                           = 696,
+    CUPTI_DRIVER_TRACE_CBID_cuTensorMapEncodeTiled                                         = 697,
+    CUPTI_DRIVER_TRACE_CBID_cuTensorMapEncodeIm2col                                        = 698,
+    CUPTI_DRIVER_TRACE_CBID_cuTensorMapReplaceAddress                                      = 699,
+    CUPTI_DRIVER_TRACE_CBID_cuLibraryGetUnifiedFunction                                    = 700,
+    CUPTI_DRIVER_TRACE_CBID_cuCoredumpGetAttribute                                         = 701,
+    CUPTI_DRIVER_TRACE_CBID_cuCoredumpGetAttributeGlobal                                   = 702,
+    CUPTI_DRIVER_TRACE_CBID_cuCoredumpSetAttribute                                         = 703,
+    CUPTI_DRIVER_TRACE_CBID_cuCoredumpSetAttributeGlobal                                   = 704,
+    CUPTI_DRIVER_TRACE_CBID_cuCtxSetFlags                                                  = 705,
+    CUPTI_DRIVER_TRACE_CBID_cuMulticastCreate                                              = 706,
+    CUPTI_DRIVER_TRACE_CBID_cuMulticastAddDevice                                           = 707,
+    CUPTI_DRIVER_TRACE_CBID_cuMulticastBindMem                                             = 708,
+    CUPTI_DRIVER_TRACE_CBID_cuMulticastBindAddr                                            = 709,
+    CUPTI_DRIVER_TRACE_CBID_cuMulticastUnbind                                              = 710,
+    CUPTI_DRIVER_TRACE_CBID_cuMulticastGetGranularity                                      = 711,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphAddNode                                                 = 712,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphNodeSetParams                                           = 713,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphExecNodeSetParams                                       = 714,
+    CUPTI_DRIVER_TRACE_CBID_cuMemAdvise_v2                                                 = 715,
+    CUPTI_DRIVER_TRACE_CBID_cuMemPrefetchAsync_v2                                          = 716,
+    CUPTI_DRIVER_TRACE_CBID_cuMemPrefetchAsync_v2_ptsz                                     = 717,
+    CUPTI_DRIVER_TRACE_CBID_cuFuncGetName                                                  = 718,
+    CUPTI_DRIVER_TRACE_CBID_cuKernelGetName                                                = 719,
+    CUPTI_DRIVER_TRACE_CBID_cuStreamBeginCaptureToGraph                                    = 720,
+    CUPTI_DRIVER_TRACE_CBID_cuStreamBeginCaptureToGraph_ptsz                               = 721,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphConditionalHandleCreate                                 = 722,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphAddNode_v2                                              = 723,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphGetEdges_v2                                             = 724,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphNodeGetDependencies_v2                                  = 725,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphNodeGetDependentNodes_v2                                = 726,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphAddDependencies_v2                                      = 727,
+    CUPTI_DRIVER_TRACE_CBID_cuGraphRemoveDependencies_v2                                   = 728,
+    CUPTI_DRIVER_TRACE_CBID_cuStreamGetCaptureInfo_v3                                      = 729,
+    CUPTI_DRIVER_TRACE_CBID_cuStreamGetCaptureInfo_v3_ptsz                                 = 730,
+    CUPTI_DRIVER_TRACE_CBID_cuStreamUpdateCaptureDependencies_v2                           = 731,
+    CUPTI_DRIVER_TRACE_CBID_cuStreamUpdateCaptureDependencies_v2_ptsz                      = 732,
+    CUPTI_DRIVER_TRACE_CBID_cuFuncGetParamInfo                                             = 733,
+    CUPTI_DRIVER_TRACE_CBID_cuKernelGetParamInfo                                           = 734,
+    CUPTI_DRIVER_TRACE_CBID_cuDeviceRegisterAsyncNotification                              = 735,
+    CUPTI_DRIVER_TRACE_CBID_cuDeviceUnregisterAsyncNotification                            = 736,
+    CUPTI_DRIVER_TRACE_CBID_cuModuleGetFunctionCount                                       = 737,
+    CUPTI_DRIVER_TRACE_CBID_cuModuleEnumerateFunctions                                     = 738,
+    CUPTI_DRIVER_TRACE_CBID_cuLibraryGetKernelCount                                        = 739,
+    CUPTI_DRIVER_TRACE_CBID_cuLibraryEnumerateKernels                                      = 740,
+    CUPTI_DRIVER_TRACE_CBID_cuFuncIsLoaded                                                 = 741,
+    CUPTI_DRIVER_TRACE_CBID_cuFuncLoad                                                     = 742,
+    CUPTI_DRIVER_TRACE_CBID_cuGreenCtxCreate                                               = 743,
+    CUPTI_DRIVER_TRACE_CBID_cuGreenCtxDestroy                                              = 744,
+    CUPTI_DRIVER_TRACE_CBID_cuDeviceGetDevResource                                         = 745,
+    CUPTI_DRIVER_TRACE_CBID_cuCtxGetDevResource                                            = 746,
+    CUPTI_DRIVER_TRACE_CBID_cuGreenCtxGetDevResource                                       = 747,
+    CUPTI_DRIVER_TRACE_CBID_cuDevResourceGenerateDesc                                      = 748,
+    CUPTI_DRIVER_TRACE_CBID_cuGreenCtxRecordEvent                                          = 749,
+    CUPTI_DRIVER_TRACE_CBID_cuGreenCtxWaitEvent                                            = 750,
+    CUPTI_DRIVER_TRACE_CBID_cuDevSmResourceSplitByCount                                    = 751,
+    CUPTI_DRIVER_TRACE_CBID_cuStreamGetGreenCtx                                            = 752,
+    CUPTI_DRIVER_TRACE_CBID_cuCtxFromGreenCtx                                              = 753,
+    CUPTI_DRIVER_TRACE_CBID_SIZE                                                           = 754,
+    CUPTI_DRIVER_TRACE_CBID_FORCE_INT                                                      = 0x7fffffff
+} CUpti_driver_api_trace_cbid;
+
diff --git a/.venv/lib/python3.11/site-packages/nvidia/cuda_cupti/include/cupti_events.h b/.venv/lib/python3.11/site-packages/nvidia/cuda_cupti/include/cupti_events.h
new file mode 100644
index 0000000000000000000000000000000000000000..142d70fd10f6ba5b680c0917c475208e657e94a0
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/nvidia/cuda_cupti/include/cupti_events.h
@@ -0,0 +1,1350 @@
+/*
+ * Copyright 2010-2021 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#if !defined(_CUPTI_EVENTS_H_)
+#define _CUPTI_EVENTS_H_
+
+#include <cuda.h>
+#include <string.h>
+#include <cuda_stdint.h>
+#include <cupti_result.h>
+
+#ifndef CUPTIAPI
+#ifdef _WIN32
+#define CUPTIAPI __stdcall
+#else
+#define CUPTIAPI
+#endif
+#endif
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+#if defined(__GNUC__) && defined(CUPTI_LIB)
+    #pragma GCC visibility push(default)
+#endif
+
+/**
+ * \defgroup CUPTI_EVENT_API CUPTI Event API
+ * Functions, types, and enums that implement the CUPTI Event API.
+ *
+ * \note The CUPTI event APIs in the header cupti_events.h are not supported on
+ * devices with compute capability 7.5 and higher (i.e. Turing and later GPU
+ * architectures). These APIs will be deprecated in a future CUDA release. They
+ * are replaced by the Profiling API in the header cupti_profiler_target.h and
+ * the Perfworks metrics API in the headers nvperf_host.h and nvperf_target.h,
+ * which are supported on devices with compute capability 7.0 and higher (i.e.
+ * Volta and later GPU architectures).
+ *
+ * @{
+ */
+
+/**
+ * \brief ID for an event.
+ *
+ * An event represents a countable activity, action, or occurrence on
+ * the device. The ID is a 32-bit unsigned integer.
+ */
+typedef uint32_t CUpti_EventID;
+
+/**
+ * \brief ID for an event domain.
+ *
+ * An event domain represents a group of related events. A device may
+ * have multiple instances of a domain, indicating that the device can
+ * simultaneously record multiple instances of each event within that
+ * domain. The ID is a 32-bit unsigned integer.
+ */
+typedef uint32_t CUpti_EventDomainID;
+
+/**
+ * \brief A group of events.
+ *
+ * An event group is a collection of events that are managed
+ * together. All events in an event group must belong to the same
+ * domain. The group is referenced through an untyped (void *) handle.
+ */
+typedef void *CUpti_EventGroup;
+
+/**
+ * \brief Device class.
+ *
+ * Enumeration of device classes. This is the value type of the device
+ * attribute CUPTI_DEVICE_ATTR_DEVICE_CLASS.
+ */
+typedef enum {
+  CUPTI_DEVICE_ATTR_DEVICE_CLASS_TESLA              = 0,
+  CUPTI_DEVICE_ATTR_DEVICE_CLASS_QUADRO             = 1,
+  CUPTI_DEVICE_ATTR_DEVICE_CLASS_GEFORCE            = 2,
+  CUPTI_DEVICE_ATTR_DEVICE_CLASS_TEGRA              = 3,
+} CUpti_DeviceAttributeDeviceClass;
+
+/**
+ * \brief Device attributes.
+ *
+ * CUPTI device attributes. These attributes can be read using \ref
+ * cuptiDeviceGetAttribute.
+ */
+typedef enum {
+  /**
+   * Number of event IDs for a device. Value is a uint32_t.
+   */
+  CUPTI_DEVICE_ATTR_MAX_EVENT_ID                            = 1,
+  /**
+   * Number of event domain IDs for a device. Value is a uint32_t.
+   */
+  CUPTI_DEVICE_ATTR_MAX_EVENT_DOMAIN_ID                     = 2,
+  /**
+   * Get the global memory bandwidth in Kbytes/sec. Value is a uint64_t.
+   */
+  CUPTI_DEVICE_ATTR_GLOBAL_MEMORY_BANDWIDTH                 = 3,
+  /**
+   * Get the theoretical maximum number of instructions per cycle. Value
+   * is a uint32_t.
+   */
+  CUPTI_DEVICE_ATTR_INSTRUCTION_PER_CYCLE                   = 4,
+  /**
+   * Get the theoretical maximum number of single precision instructions
+   * that can be executed per second. Value is a uint64_t.
+   */
+  CUPTI_DEVICE_ATTR_INSTRUCTION_THROUGHPUT_SINGLE_PRECISION = 5,
+  /**
+   * Get the number of frame buffers for the device. Value is a uint64_t.
+   */
+  CUPTI_DEVICE_ATTR_MAX_FRAME_BUFFERS                       = 6,
+  /**
+   * Get the PCIE link rate in megabits/sec for the device. Returns 0 if
+   * the bus type is non-PCIE. Value is a uint64_t.
+   */
+  CUPTI_DEVICE_ATTR_PCIE_LINK_RATE                          = 7,
+  /**
+   * Get the PCIE link width for the device. Returns 0 if the bus type
+   * is non-PCIE. Value is a uint64_t.
+   */
+  CUPTI_DEVICE_ATTR_PCIE_LINK_WIDTH                         = 8,
+  /**
+   * Get the PCIE generation for the device. Returns 0 if the bus type
+   * is non-PCIE. Value is a uint64_t.
+   */
+  CUPTI_DEVICE_ATTR_PCIE_GEN                                = 9,
+  /**
+   * Get the class for the device. Value is a
+   * CUpti_DeviceAttributeDeviceClass.
+   */
+  CUPTI_DEVICE_ATTR_DEVICE_CLASS                            = 10,
+  /**
+   * Get the peak single precision flop per cycle. Value is a uint64_t.
+   */
+  CUPTI_DEVICE_ATTR_FLOP_SP_PER_CYCLE                       = 11,
+  /**
+   * Get the peak double precision flop per cycle. Value is a uint64_t.
+   */
+  CUPTI_DEVICE_ATTR_FLOP_DP_PER_CYCLE                       = 12,
+  /**
+   * Get the number of L2 units. Value is a uint64_t.
+   */
+  CUPTI_DEVICE_ATTR_MAX_L2_UNITS                           = 13,
+  /**
+   * Get the maximum shared memory for the CU_FUNC_CACHE_PREFER_SHARED
+   * preference. Value is a uint64_t.
+   */
+  CUPTI_DEVICE_ATTR_MAX_SHARED_MEMORY_CACHE_CONFIG_PREFER_SHARED = 14,
+  /**
+   * Get the maximum shared memory for the CU_FUNC_CACHE_PREFER_L1
+   * preference. Value is a uint64_t.
+   */
+  CUPTI_DEVICE_ATTR_MAX_SHARED_MEMORY_CACHE_CONFIG_PREFER_L1 = 15,
+  /**
+   * Get the maximum shared memory for the CU_FUNC_CACHE_PREFER_EQUAL
+   * preference. Value is a uint64_t.
+   */
+  CUPTI_DEVICE_ATTR_MAX_SHARED_MEMORY_CACHE_CONFIG_PREFER_EQUAL = 16,
+  /**
+   * Get the peak half precision flop per cycle. Value is a uint64_t.
+   */
+  CUPTI_DEVICE_ATTR_FLOP_HP_PER_CYCLE                       = 17,
+  /**
+   * Check if NVLink is connected to the device. Returns 1 if at least
+   * one NVLink is connected to the device, returns 0 otherwise.
+   * Value is a uint32_t.
+   */
+  CUPTI_DEVICE_ATTR_NVLINK_PRESENT                          = 18,
+  /**
+   * Check if NVLink is present between GPU and CPU. Returns the bandwidth
+   * in bytes/sec if NVLink is present, returns 0 otherwise.
+   * Value is a uint64_t.
+   */
+  CUPTI_DEVICE_ATTR_GPU_CPU_NVLINK_BW                       = 19,
+  /**
+   * Check if NVSwitch is present in the underlying topology.
+   * Returns 1 if present, returns 0 otherwise.
+   * Value is a uint32_t.
+   */
+  CUPTI_DEVICE_ATTR_NVSWITCH_PRESENT                        = 20,
+  CUPTI_DEVICE_ATTR_FORCE_INT                               = 0x7fffffff,
+} CUpti_DeviceAttribute;
+
+/**
+ * \brief Event domain attributes.
+ *
+ * Event domain attributes. Except where noted, all the attributes can
+ * be read using either \ref cuptiDeviceGetEventDomainAttribute or
+ * \ref cuptiEventDomainGetAttribute.
+ */
+typedef enum {
+  /**
+   * Event domain name. Value is a null-terminated const C string.
+   */
+  CUPTI_EVENT_DOMAIN_ATTR_NAME                 = 0,
+  /**
+   * Number of instances of the domain for which event counts will be
+   * collected. The domain may have additional instances that cannot
+   * be profiled (see CUPTI_EVENT_DOMAIN_ATTR_TOTAL_INSTANCE_COUNT).
+   * Can be read only with \ref
+   * cuptiDeviceGetEventDomainAttribute. Value is a uint32_t.
+   */
+  CUPTI_EVENT_DOMAIN_ATTR_INSTANCE_COUNT       = 1,
+  /**
+   * Total number of instances of the domain, including instances that
+   * cannot be profiled. Use CUPTI_EVENT_DOMAIN_ATTR_INSTANCE_COUNT
+   * to get the number of instances that can be profiled. Can be read
+   * only with \ref cuptiDeviceGetEventDomainAttribute. Value is a
+   * uint32_t. Note: this enum defines no attribute with value 2.
+   */
+  CUPTI_EVENT_DOMAIN_ATTR_TOTAL_INSTANCE_COUNT = 3,
+  /**
+   * Collection method used for events contained in the event domain.
+   * Value is a \ref CUpti_EventCollectionMethod.
+   */
+  CUPTI_EVENT_DOMAIN_ATTR_COLLECTION_METHOD    = 4,
+
+  CUPTI_EVENT_DOMAIN_ATTR_FORCE_INT      = 0x7fffffff,
+} CUpti_EventDomainAttribute;
+
+/**
+ * \brief The collection method used for an event.
+ *
+ * The collection method indicates how an event is collected.
+ */
+typedef enum {
+  /**
+   * The event is collected using a hardware global performance monitor.
+   */
+  CUPTI_EVENT_COLLECTION_METHOD_PM                  = 0,
+  /**
+   * The event is collected using a hardware SM performance monitor.
+   */
+  CUPTI_EVENT_COLLECTION_METHOD_SM                  = 1,
+  /**
+   * The event is collected using software instrumentation.
+   */
+  CUPTI_EVENT_COLLECTION_METHOD_INSTRUMENTED        = 2,
+  /**
+   * The event is collected using the NVLink throughput counter method.
+   */
+  CUPTI_EVENT_COLLECTION_METHOD_NVLINK_TC           = 3,
+  CUPTI_EVENT_COLLECTION_METHOD_FORCE_INT           = 0x7fffffff
+} CUpti_EventCollectionMethod;
+
+/**
+ * \brief Event group attributes.
+ *
+ * Event group attributes. These attributes can be read using \ref
+ * cuptiEventGroupGetAttribute. Attributes marked [rw] can also be
+ * written using \ref cuptiEventGroupSetAttribute.
+ */
+typedef enum {
+  /**
+   * The domain to which the event group is bound. This attribute is
+   * set when the first event is added to the group.  Value is a
+   * CUpti_EventDomainID.
+   */
+  CUPTI_EVENT_GROUP_ATTR_EVENT_DOMAIN_ID              = 0,
+  /**
+   * [rw] Profile all the instances of the domain for this
+   * eventgroup. This feature can be used to get load balancing
+   * across all instances of a domain. Value is an integer.
+   */
+  CUPTI_EVENT_GROUP_ATTR_PROFILE_ALL_DOMAIN_INSTANCES = 1,
+  /**
+   * [rw] Reserved for user data.
+   */
+  CUPTI_EVENT_GROUP_ATTR_USER_DATA                    = 2,
+  /**
+   * Number of events in the group. Value is a uint32_t.
+   */
+  CUPTI_EVENT_GROUP_ATTR_NUM_EVENTS                   = 3,
+  /**
+   * Enumerates events in the group. Value is a pointer to a buffer of
+   * size sizeof(CUpti_EventID) * num_of_events in the eventgroup.
+   * num_of_events can be queried using
+   * CUPTI_EVENT_GROUP_ATTR_NUM_EVENTS.
+   */
+  CUPTI_EVENT_GROUP_ATTR_EVENTS                       = 4,
+  /**
+   * Number of instances of the domain bound to this event group that
+   * will be counted.  Value is a uint32_t.
+   */
+  CUPTI_EVENT_GROUP_ATTR_INSTANCE_COUNT               = 5,
+  /**
+   * [rw] Profiling scope of the event group. The scope can be set to
+   * CUPTI_EVENT_PROFILING_SCOPE_DEVICE or
+   * CUPTI_EVENT_PROFILING_SCOPE_CONTEXT for an eventGroup, before
+   * adding any event.
+   * Sets the scope of eventgroup as CUPTI_EVENT_PROFILING_SCOPE_DEVICE or
+   * CUPTI_EVENT_PROFILING_SCOPE_CONTEXT when the scope of the events
+   * that will be added is CUPTI_EVENT_PROFILING_SCOPE_BOTH.
+   * If the profiling scope of an event is either
+   * CUPTI_EVENT_PROFILING_SCOPE_DEVICE or CUPTI_EVENT_PROFILING_SCOPE_CONTEXT
+   * then setting this attribute will not affect the default scope.
+   * It is not allowed to add events of different scopes to the same
+   * eventgroup.
+   * Value is a uint32_t.
+   */
+  CUPTI_EVENT_GROUP_ATTR_PROFILING_SCOPE               = 6,
+  /*
+   * NOTE(review): appears to be a sentinel rather than a real attribute,
+   * presumably present to force the enum to a 32-bit integer
+   * representation -- confirm against the CUPTI documentation.
+   */
+  CUPTI_EVENT_GROUP_ATTR_FORCE_INT                     = 0x7fffffff,
+} CUpti_EventGroupAttribute;
+
+/**
+ * \brief Profiling scope for an event.
+ *
+ * The profiling scope of an event indicates whether the event can be
+ * collected at context scope, at device scope, or at either of the two.
+ */
+typedef enum {
+  /**
+   * Event is collected at context scope.
+   */
+  CUPTI_EVENT_PROFILING_SCOPE_CONTEXT                 = 0,
+  /**
+   * Event is collected at device scope.
+   */
+  CUPTI_EVENT_PROFILING_SCOPE_DEVICE                  = 1,
+  /**
+   * Event can be collected at device or context scope.
+   * The scope can be set using \ref cuptiEventGroupSetAttribute API.
+   */
+  CUPTI_EVENT_PROFILING_SCOPE_BOTH                    = 2,
+  /*
+   * NOTE(review): appears to be a sentinel rather than a real scope,
+   * presumably present to force the enum to a 32-bit integer
+   * representation -- confirm against the CUPTI documentation.
+   */
+  CUPTI_EVENT_PROFILING_SCOPE_FORCE_INT               = 0x7fffffff
+} CUpti_EventProfilingScope;
+
+/**
+ * \brief Event attributes.
+ *
+ * Event attributes. These attributes can be read using \ref
+ * cuptiEventGetAttribute.
+ */
+typedef enum {
+  /**
+   * Event name. Value is a null terminated const c-string.
+   */
+  CUPTI_EVENT_ATTR_NAME              = 0,
+  /**
+   * Short description of event. Value is a null terminated const
+   * c-string.
+   */
+  CUPTI_EVENT_ATTR_SHORT_DESCRIPTION = 1,
+  /**
+   * Long description of event. Value is a null terminated const
+   * c-string.
+   */
+  CUPTI_EVENT_ATTR_LONG_DESCRIPTION  = 2,
+  /**
+   * Category of event. Value is CUpti_EventCategory.
+   */
+  CUPTI_EVENT_ATTR_CATEGORY          = 3,
+  /* NOTE: value 4 is not assigned to any enumerator in this enum. */
+  /**
+   * Profiling scope of the event. It can be either device or context or both.
+   * Value is a \ref CUpti_EventProfilingScope.
+   */
+  CUPTI_EVENT_ATTR_PROFILING_SCOPE   = 5,
+
+  /*
+   * NOTE(review): appears to be a sentinel rather than a real attribute,
+   * presumably present to force the enum to a 32-bit integer
+   * representation -- confirm against the CUPTI documentation.
+   */
+  CUPTI_EVENT_ATTR_FORCE_INT         = 0x7fffffff,
+} CUpti_EventAttribute;
+
+/**
+ * \brief Event collection modes.
+ *
+ * The event collection mode determines the period over which the
+ * events within the enabled event groups will be collected.
+ */
+typedef enum {
+  /**
+   * Events are collected for the entire duration between the
+   * cuptiEventGroupEnable and cuptiEventGroupDisable calls.
+   * Event values are reset when the events are read.
+   * For CUDA toolkit v6.0 and older this was the default mode.
+   */
+  CUPTI_EVENT_COLLECTION_MODE_CONTINUOUS          = 0,
+  /**
+   * Events are collected only for the durations of kernel executions
+   * that occur between the cuptiEventGroupEnable and
+   * cuptiEventGroupDisable calls. Event collection begins when a
+   * kernel execution begins, and stops when kernel execution
+   * completes. Event values are reset to zero when each kernel
+   * execution begins. If multiple kernel executions occur between the
+   * cuptiEventGroupEnable and cuptiEventGroupDisable calls then the
+   * event values must be read after each kernel launch if those
+   * events need to be associated with the specific kernel launch.
+   * Note that collection in this mode may significantly change the
+   * overall performance characteristics of the application because
+   * kernel executions that occur between the cuptiEventGroupEnable and
+   * cuptiEventGroupDisable calls are serialized on the GPU.
+   * This is the default mode from CUDA toolkit v6.5.
+   */
+  CUPTI_EVENT_COLLECTION_MODE_KERNEL              = 1,
+  /*
+   * NOTE(review): appears to be a sentinel rather than a real collection
+   * mode, presumably present to force the enum to a 32-bit integer
+   * representation -- confirm against the CUPTI documentation.
+   */
+  CUPTI_EVENT_COLLECTION_MODE_FORCE_INT           = 0x7fffffff
+} CUpti_EventCollectionMode;
+
+/**
+ * \brief An event category.
+ *
+ * Each event is assigned to a category that represents the general
+ * type of the event. An event's category is accessed using \ref
+ * cuptiEventGetAttribute and the CUPTI_EVENT_ATTR_CATEGORY attribute.
+ */
+typedef enum {
+  /**
+   * An instruction related event.
+   */
+  CUPTI_EVENT_CATEGORY_INSTRUCTION     = 0,
+  /**
+   * A memory related event.
+   */
+  CUPTI_EVENT_CATEGORY_MEMORY          = 1,
+  /**
+   * A cache related event.
+   */
+  CUPTI_EVENT_CATEGORY_CACHE           = 2,
+  /**
+   * A profile-trigger event.
+   */
+  CUPTI_EVENT_CATEGORY_PROFILE_TRIGGER = 3,
+  /**
+   * A system event.
+   */
+  CUPTI_EVENT_CATEGORY_SYSTEM  = 4,
+  /*
+   * NOTE(review): appears to be a sentinel rather than a real category,
+   * presumably present to force the enum to a 32-bit integer
+   * representation -- confirm against the CUPTI documentation.
+   */
+  CUPTI_EVENT_CATEGORY_FORCE_INT       = 0x7fffffff
+} CUpti_EventCategory;
+
+/**
+ * \brief The overflow value for a CUPTI event.
+ *
+ * The CUPTI event value that indicates an overflow.
+ */
+#define CUPTI_EVENT_OVERFLOW ((uint64_t)0xFFFFFFFFFFFFFFFFULL)
+
+/**
+ * \brief The value that indicates the event value is invalid
+ */
+#define CUPTI_EVENT_INVALID ((uint64_t)0xFFFFFFFFFFFFFFFEULL)
+
+/**
+ * \brief Flags for cuptiEventGroupReadEvent and
+ * cuptiEventGroupReadAllEvents.
+ *
+ * Flags for \ref cuptiEventGroupReadEvent and \ref
+ * cuptiEventGroupReadAllEvents.
+ */
+typedef enum {
+  /**
+   * No flags.
+   */
+  CUPTI_EVENT_READ_FLAG_NONE          = 0,
+  /*
+   * NOTE(review): appears to be a sentinel rather than a real flag,
+   * presumably present to force the enum to a 32-bit integer
+   * representation -- confirm against the CUPTI documentation.
+   */
+  CUPTI_EVENT_READ_FLAG_FORCE_INT     = 0x7fffffff,
+} CUpti_ReadEventFlags;
+
+
+/**
+ * \brief A set of event groups.
+ *
+ * A set of event groups. When returned by \ref
+ * cuptiEventGroupSetsCreate and \ref cuptiMetricCreateEventGroupSets
+ * a set indicates the event groups that can be enabled at the same
+ * time (i.e. all the events in the set can be collected
+ * simultaneously).
+ */
+typedef struct {
+  /**
+   * The number of event groups in the set.
+   */
+  uint32_t numEventGroups;
+  /**
+   * Pointer to an array of \p numEventGroups event groups.
+   */
+  CUpti_EventGroup *eventGroups;
+} CUpti_EventGroupSet;
+
+/**
+ * \brief A set of event group sets.
+ *
+ * A set of event group sets. When returned by \ref
+ * cuptiEventGroupSetsCreate and \ref cuptiMetricCreateEventGroupSets
+ * a CUpti_EventGroupSets indicates the number of passes required to
+ * collect all the events, and the event groups that should be
+ * collected during each pass.
+ */
+typedef struct {
+  /**
+   * Number of event group sets. When the structure is returned by the
+   * APIs named above, this is the number of passes required.
+   */
+  uint32_t numSets;
+  /**
+   * Pointer to an array of \p numSets event group sets, one
+   * \ref CUpti_EventGroupSet per entry.
+   */
+  CUpti_EventGroupSet *sets;
+} CUpti_EventGroupSets;
+
+/**
+ * \brief Set the event collection mode.
+ *
+ * Set the event collection mode for a \p context.  The \p mode
+ * controls the event collection behavior of all events in event
+ * groups created in the \p context. This API is invalid in kernel
+ * replay mode.
+ * \note \b Thread-safety: this function is thread safe.
+ *
+ * \param context The context
+ * \param mode The event collection mode
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_NOT_INITIALIZED
+ * \retval CUPTI_ERROR_INVALID_CONTEXT
+ * \retval CUPTI_ERROR_INVALID_OPERATION if called when replay mode is enabled
+ * \retval CUPTI_ERROR_NOT_SUPPORTED if mode is not supported on the device
+ */
+
+CUptiResult CUPTIAPI cuptiSetEventCollectionMode(CUcontext context,
+                                                 CUpti_EventCollectionMode mode);
+
+/**
+ * \brief Read a device attribute.
+ *
+ * Read a device attribute and return it in \p *value.
+ * \note \b Thread-safety: this function is thread safe.
+ *
+ * \param device The CUDA device
+ * \param attrib The attribute to read
+ * \param valueSize Size of the buffer pointed to by \p value, and
+ * returns the number of bytes written to \p value
+ * \param value Returns the value of the attribute
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_NOT_INITIALIZED
+ * \retval CUPTI_ERROR_INVALID_DEVICE
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p valueSize or \p value
+ * is NULL, or if \p attrib is not a device attribute
+ * \retval CUPTI_ERROR_PARAMETER_SIZE_NOT_SUFFICIENT For non-c-string
+ * attribute values, indicates that the \p value buffer is too small
+ * to hold the attribute value.
+ */
+CUptiResult CUPTIAPI cuptiDeviceGetAttribute(CUdevice device,
+                                             CUpti_DeviceAttribute attrib,
+                                             size_t *valueSize,
+                                             void *value);
+
+/**
+ * \brief Get the number of domains for a device.
+ *
+ * Returns the number of domains in \p numDomains for a device.
+ * \note \b Thread-safety: this function is thread safe.
+ *
+ * \param device The CUDA device
+ * \param numDomains Returns the number of domains
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_NOT_INITIALIZED
+ * \retval CUPTI_ERROR_INVALID_DEVICE
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p numDomains is NULL
+ */
+CUptiResult CUPTIAPI cuptiDeviceGetNumEventDomains(CUdevice device,
+                                                   uint32_t *numDomains);
+
+/**
+ * \brief Get the event domains for a device.
+ *
+ * Returns the event domain IDs in \p domainArray for a device.  The
+ * size of the \p domainArray buffer is given by \p
+ * *arraySizeBytes. The size of the \p domainArray buffer must be at
+ * least numDomains * sizeof(CUpti_EventDomainID), where numDomains is
+ * the count reported by \ref cuptiDeviceGetNumEventDomains, or else
+ * not all domains will be returned. The value returned in \p
+ * *arraySizeBytes contains the number of bytes returned in \p
+ * domainArray.
+ * \note \b Thread-safety: this function is thread safe.
+ *
+ * \param device The CUDA device
+ * \param arraySizeBytes The size of \p domainArray in bytes, and
+ * returns the number of bytes written to \p domainArray
+ * \param domainArray Returns the IDs of the event domains for the device
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_NOT_INITIALIZED
+ * \retval CUPTI_ERROR_INVALID_DEVICE
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p arraySizeBytes or
+ * \p domainArray are NULL
+ */
+CUptiResult CUPTIAPI cuptiDeviceEnumEventDomains(CUdevice device,
+                                                 size_t *arraySizeBytes,
+                                                 CUpti_EventDomainID *domainArray);
+
+/**
+ * \brief Read an event domain attribute.
+ *
+ * Returns an event domain attribute in \p *value. The size of the \p
+ * value buffer is given by \p *valueSize. The value returned in \p
+ * *valueSize contains the number of bytes returned in \p value.
+ *
+ * If the attribute value is a c-string that is longer than \p
+ * *valueSize, then only the first \p *valueSize characters will be
+ * returned and there will be no terminating null byte.
+ * \note \b Thread-safety: this function is thread safe.
+ *
+ * \param device The CUDA device
+ * \param eventDomain ID of the event domain
+ * \param attrib The event domain attribute to read
+ * \param valueSize The size of the \p value buffer in bytes, and
+ * returns the number of bytes written to \p value
+ * \param value Returns the attribute's value
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_NOT_INITIALIZED
+ * \retval CUPTI_ERROR_INVALID_DEVICE
+ * \retval CUPTI_ERROR_INVALID_EVENT_DOMAIN_ID
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p valueSize or \p value
+ * is NULL, or if \p attrib is not an event domain attribute
+ * \retval CUPTI_ERROR_PARAMETER_SIZE_NOT_SUFFICIENT For non-c-string
+ * attribute values, indicates that the \p value buffer is too small
+ * to hold the attribute value.
+ */
+CUptiResult CUPTIAPI cuptiDeviceGetEventDomainAttribute(CUdevice device,
+                                                        CUpti_EventDomainID eventDomain,
+                                                        CUpti_EventDomainAttribute attrib,
+                                                        size_t *valueSize,
+                                                        void *value);
+
+/**
+ * \brief Get the number of event domains available on any device.
+ *
+ * Returns the total number of event domains available on any
+ * CUDA-capable device.
+ * \note \b Thread-safety: this function is thread safe.
+ *
+ * \param numDomains Returns the number of domains
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p numDomains is NULL
+ */
+CUptiResult CUPTIAPI cuptiGetNumEventDomains(uint32_t *numDomains);
+
+/**
+ * \brief Get the event domains available on any device.
+ *
+ * Returns all the event domains available on any CUDA-capable device.
+ * Event domain IDs are returned in \p domainArray. The size of the \p
+ * domainArray buffer is given by \p *arraySizeBytes. The size of the
+ * \p domainArray buffer must be at least numDomains *
+ * sizeof(CUpti_EventDomainID), where numDomains is the count reported
+ * by \ref cuptiGetNumEventDomains, or else not all domains will be
+ * returned. The value returned in \p *arraySizeBytes contains the
+ * number of bytes returned in \p domainArray.
+ * \note \b Thread-safety: this function is thread safe.
+ *
+ * \param arraySizeBytes The size of \p domainArray in bytes, and
+ * returns the number of bytes written to \p domainArray
+ * \param domainArray Returns all the event domains
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p arraySizeBytes or
+ * \p domainArray are NULL
+ */
+CUptiResult CUPTIAPI cuptiEnumEventDomains(size_t *arraySizeBytes,
+                                           CUpti_EventDomainID *domainArray);
+
+/**
+ * \brief Read an event domain attribute.
+ *
+ * Returns an event domain attribute in \p *value. The size of the \p
+ * value buffer is given by \p *valueSize. The value returned in \p
+ * *valueSize contains the number of bytes returned in \p value.
+ *
+ * If the attribute value is a c-string that is longer than \p
+ * *valueSize, then only the first \p *valueSize characters will be
+ * returned and there will be no terminating null byte.
+ * \note \b Thread-safety: this function is thread safe.
+ *
+ * \param eventDomain ID of the event domain
+ * \param attrib The event domain attribute to read
+ * \param valueSize The size of the \p value buffer in bytes, and
+ * returns the number of bytes written to \p value
+ * \param value Returns the attribute's value
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_NOT_INITIALIZED
+ * \retval CUPTI_ERROR_INVALID_EVENT_DOMAIN_ID
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p valueSize or \p value
+ * is NULL, or if \p attrib is not an event domain attribute
+ * \retval CUPTI_ERROR_PARAMETER_SIZE_NOT_SUFFICIENT For non-c-string
+ * attribute values, indicates that the \p value buffer is too small
+ * to hold the attribute value.
+ */
+CUptiResult CUPTIAPI cuptiEventDomainGetAttribute(CUpti_EventDomainID eventDomain,
+                                                  CUpti_EventDomainAttribute attrib,
+                                                  size_t *valueSize,
+                                                  void *value);
+
+/**
+ * \brief Get number of events in a domain.
+ *
+ * Returns the number of events in \p numEvents for a domain.
+ * \note \b Thread-safety: this function is thread safe.
+ *
+ * \param eventDomain ID of the event domain
+ * \param numEvents Returns the number of events in the domain
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_NOT_INITIALIZED
+ * \retval CUPTI_ERROR_INVALID_EVENT_DOMAIN_ID
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p numEvents is NULL
+ */
+CUptiResult CUPTIAPI cuptiEventDomainGetNumEvents(CUpti_EventDomainID eventDomain,
+                                                  uint32_t *numEvents);
+
+/**
+ * \brief Get the events in a domain.
+ *
+ * Returns the event IDs in \p eventArray for a domain.  The size of
+ * the \p eventArray buffer is given by \p *arraySizeBytes. The size
+ * of the \p eventArray buffer must be at least numEvents *
+ * sizeof(CUpti_EventID), where numEvents is the count reported by
+ * \ref cuptiEventDomainGetNumEvents, or else not all events will be
+ * returned. The value returned in \p *arraySizeBytes contains the
+ * number of bytes returned in \p eventArray.
+ * \note \b Thread-safety: this function is thread safe.
+ *
+ * \param eventDomain ID of the event domain
+ * \param arraySizeBytes The size of \p eventArray in bytes, and
+ * returns the number of bytes written to \p eventArray
+ * \param eventArray Returns the IDs of the events in the domain
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_NOT_INITIALIZED
+ * \retval CUPTI_ERROR_INVALID_EVENT_DOMAIN_ID
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p arraySizeBytes or \p
+ * eventArray are NULL
+ */
+CUptiResult CUPTIAPI cuptiEventDomainEnumEvents(CUpti_EventDomainID eventDomain,
+                                                size_t *arraySizeBytes,
+                                                CUpti_EventID *eventArray);
+
+/**
+ * \brief Get an event attribute.
+ *
+ * Returns an event attribute in \p *value. The size of the \p
+ * value buffer is given by \p *valueSize. The value returned in \p
+ * *valueSize contains the number of bytes returned in \p value.
+ *
+ * If the attribute value is a c-string that is longer than \p
+ * *valueSize, then only the first \p *valueSize characters will be
+ * returned and there will be no terminating null byte.
+ * \note \b Thread-safety: this function is thread safe.
+ *
+ * \param event ID of the event
+ * \param attrib The event attribute to read
+ * \param valueSize The size of the \p value buffer in bytes, and
+ * returns the number of bytes written to \p value
+ * \param value Returns the attribute's value
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_NOT_INITIALIZED
+ * \retval CUPTI_ERROR_INVALID_EVENT_ID
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p valueSize or \p value
+ * is NULL, or if \p attrib is not an event attribute
+ * \retval CUPTI_ERROR_PARAMETER_SIZE_NOT_SUFFICIENT For non-c-string
+ * attribute values, indicates that the \p value buffer is too small
+ * to hold the attribute value.
+ */
+CUptiResult CUPTIAPI cuptiEventGetAttribute(CUpti_EventID event,
+                                            CUpti_EventAttribute attrib,
+                                            size_t *valueSize,
+                                            void *value);
+
+/**
+ * \brief Find an event by name.
+ *
+ * Find an event by name and return the event ID in \p *event.
+ * \note \b Thread-safety: this function is thread safe.
+ *
+ * \param device The CUDA device
+ * \param eventName The name of the event to find
+ * \param event Returns the ID of the found event or undefined if
+ * unable to find the event
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_NOT_INITIALIZED
+ * \retval CUPTI_ERROR_INVALID_DEVICE
+ * \retval CUPTI_ERROR_INVALID_EVENT_NAME if unable to find an event
+ * with name \p eventName. In this case \p *event is undefined
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p eventName or \p event are NULL
+ */
+CUptiResult CUPTIAPI cuptiEventGetIdFromName(CUdevice device,
+                                             const char *eventName,
+                                             CUpti_EventID *event);
+
+/**
+ * \brief Create a new event group for a context.
+ *
+ * Creates a new event group for \p context and returns the new group
+ * in \p *eventGroup.
+ * \note \p flags are reserved for future use and should be set to zero.
+ * \note \b Thread-safety: this function is thread safe.
+ *
+ * \param context The context for the event group
+ * \param eventGroup Returns the new event group
+ * \param flags Reserved - must be zero
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_NOT_INITIALIZED
+ * \retval CUPTI_ERROR_INVALID_CONTEXT
+ * \retval CUPTI_ERROR_OUT_OF_MEMORY
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p eventGroup is NULL
+ */
+CUptiResult CUPTIAPI cuptiEventGroupCreate(CUcontext context,
+                                           CUpti_EventGroup *eventGroup,
+                                           uint32_t flags);
+
+/**
+ * \brief Destroy an event group.
+ *
+ * Destroy an \p eventGroup and free its resources. An event group
+ * cannot be destroyed if it is enabled.
+ * \note \b Thread-safety: this function is thread safe.
+ *
+ * \param eventGroup The event group to destroy
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_NOT_INITIALIZED
+ * \retval CUPTI_ERROR_INVALID_OPERATION if the event group is enabled
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p eventGroup is NULL
+ */
+CUptiResult CUPTIAPI cuptiEventGroupDestroy(CUpti_EventGroup eventGroup);
+
+/**
+ * \brief Read an event group attribute.
+ *
+ * Read an event group attribute and return it in \p *value.
+ * \note \b Thread-safety: this function is thread safe but client
+ * must guard against simultaneous destruction or modification of \p
+ * eventGroup (for example, client must guard against simultaneous
+ * calls to \ref cuptiEventGroupDestroy, \ref cuptiEventGroupAddEvent,
+ * etc.), and must guard against simultaneous destruction of the
+ * context in which \p eventGroup was created (for example, client
+ * must guard against simultaneous calls to cudaDeviceReset,
+ * cuCtxDestroy, etc.).
+ *
+ * \param eventGroup The event group
+ * \param attrib The attribute to read
+ * \param valueSize Size of the buffer pointed to by \p value, and
+ * returns the number of bytes written to \p value
+ * \param value Returns the value of the attribute
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_NOT_INITIALIZED
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p valueSize or \p value
+ * is NULL, or if \p attrib is not an eventgroup attribute
+ * \retval CUPTI_ERROR_PARAMETER_SIZE_NOT_SUFFICIENT For non-c-string
+ * attribute values, indicates that the \p value buffer is too small
+ * to hold the attribute value.
+ */
+CUptiResult CUPTIAPI cuptiEventGroupGetAttribute(CUpti_EventGroup eventGroup,
+                                                 CUpti_EventGroupAttribute attrib,
+                                                 size_t *valueSize,
+                                                 void *value);
+
+/**
+ * \brief Write an event group attribute.
+ *
+ * Write an event group attribute.
+ * \note \b Thread-safety: this function is thread safe.
+ *
+ * \param eventGroup The event group
+ * \param attrib The attribute to write
+ * \param valueSize The size, in bytes, of the value
+ * \param value The attribute value to write
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_NOT_INITIALIZED
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p valueSize is 0 or
+ * \p value is NULL, or if \p attrib is not an event group attribute,
+ * or if \p attrib is not a writable attribute
+ * \retval CUPTI_ERROR_PARAMETER_SIZE_NOT_SUFFICIENT Indicates that
+ * the \p value buffer is too small to hold the attribute value.
+ */
+CUptiResult CUPTIAPI cuptiEventGroupSetAttribute(CUpti_EventGroup eventGroup,
+                                                 CUpti_EventGroupAttribute attrib,
+                                                 size_t valueSize,
+                                                 void *value);
+
+/**
+ * \brief Add an event to an event group.
+ *
+ * Add an event to an event group. The event add can fail for a number of reasons:
+ * \li The event group is enabled
+ * \li The event does not belong to the same event domain as the
+ * events that are already in the event group
+ * \li Device limitations on the events that can belong to the same group
+ * \li The event group is full
+ *
+ * \note \b Thread-safety: this function is thread safe.
+ *
+ * \param eventGroup The event group
+ * \param event The event to add to the group
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_NOT_INITIALIZED
+ * \retval CUPTI_ERROR_INVALID_EVENT_ID
+ * \retval CUPTI_ERROR_OUT_OF_MEMORY
+ * \retval CUPTI_ERROR_INVALID_OPERATION if \p eventGroup is enabled
+ * \retval CUPTI_ERROR_NOT_COMPATIBLE if \p event belongs to a
+ * different event domain than the events already in \p eventGroup, or
+ * if a device limitation prevents \p event from being collected at
+ * the same time as the events already in \p eventGroup
+ * \retval CUPTI_ERROR_MAX_LIMIT_REACHED if \p eventGroup is full
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p eventGroup is NULL
+ */
+CUptiResult CUPTIAPI cuptiEventGroupAddEvent(CUpti_EventGroup eventGroup,
+                                             CUpti_EventID event);
+
+/**
+ * \brief Remove an event from an event group.
+ *
+ * Remove \p event from an event group. The event cannot be
+ * removed if the event group is enabled.
+ * \note \b Thread-safety: this function is thread safe.
+ *
+ * \param eventGroup The event group
+ * \param event The event to remove from the group
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_NOT_INITIALIZED
+ * \retval CUPTI_ERROR_INVALID_EVENT_ID
+ * \retval CUPTI_ERROR_INVALID_OPERATION if \p eventGroup is enabled
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p eventGroup is NULL
+ */
+CUptiResult CUPTIAPI cuptiEventGroupRemoveEvent(CUpti_EventGroup eventGroup,
+                                                CUpti_EventID event);
+
+/**
+ * \brief Remove all events from an event group.
+ *
+ * Remove all events from an event group. Events cannot be removed if
+ * the event group is enabled.
+ * \note \b Thread-safety: this function is thread safe.
+ *
+ * \param eventGroup The event group
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_NOT_INITIALIZED
+ * \retval CUPTI_ERROR_INVALID_OPERATION if \p eventGroup is enabled
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p eventGroup is NULL
+ */
+CUptiResult CUPTIAPI cuptiEventGroupRemoveAllEvents(CUpti_EventGroup eventGroup);
+
+/**
+ * \brief Zero all the event counts in an event group.
+ *
+ * Zero all the event counts in an event group.
+ * \note \b Thread-safety: this function is thread safe but client
+ * must guard against simultaneous destruction or modification of \p
+ * eventGroup (for example, client must guard against simultaneous
+ * calls to \ref cuptiEventGroupDestroy, \ref cuptiEventGroupAddEvent,
+ * etc.), and must guard against simultaneous destruction of the
+ * context in which \p eventGroup was created (for example, client
+ * must guard against simultaneous calls to cudaDeviceReset,
+ * cuCtxDestroy, etc.).
+ *
+ * \param eventGroup The event group
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_NOT_INITIALIZED
+ * \retval CUPTI_ERROR_HARDWARE
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p eventGroup is NULL
+ */
+CUptiResult CUPTIAPI cuptiEventGroupResetAllEvents(CUpti_EventGroup eventGroup);
+
+/**
+ * \brief Enable an event group.
+ *
+ * Enable an event group. Enabling an event group zeros the value of
+ * all the events in the group and then starts collection of those
+ * events.
+ * \note \b Thread-safety: this function is thread safe.
+ *
+ * \param eventGroup The event group
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_NOT_INITIALIZED
+ * \retval CUPTI_ERROR_HARDWARE
+ * \retval CUPTI_ERROR_NOT_READY if \p eventGroup does not contain any events
+ * \retval CUPTI_ERROR_NOT_COMPATIBLE if \p eventGroup cannot be
+ * enabled due to other already enabled event groups
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p eventGroup is NULL
+ * \retval CUPTI_ERROR_HARDWARE_BUSY if another client is profiling
+ * and hardware is busy
+ */
+CUptiResult CUPTIAPI cuptiEventGroupEnable(CUpti_EventGroup eventGroup);
+
+/**
+ * \brief Disable an event group.
+ *
+ * Disable an event group. Disabling an event group stops collection
+ * of events contained in the group.
+ * \note \b Thread-safety: this function is thread safe.
+ *
+ * \param eventGroup The event group
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_NOT_INITIALIZED
+ * \retval CUPTI_ERROR_HARDWARE
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p eventGroup is NULL
+ */
+CUptiResult CUPTIAPI cuptiEventGroupDisable(CUpti_EventGroup eventGroup);
+
+/**
+ * \brief Read the value for an event in an event group.
+ *
+ * Read the value for an event in an event group. The event value is
+ * returned in the \p eventValueBuffer buffer. \p
+ * eventValueBufferSizeBytes indicates the size of the \p
+ * eventValueBuffer buffer. The buffer must be at least sizeof(uint64_t)
+ * if ::CUPTI_EVENT_GROUP_ATTR_PROFILE_ALL_DOMAIN_INSTANCES is not set
+ * on the group containing the event.  The buffer must be at least
+ * (sizeof(uint64_t) * number of domain instances) if
+ * ::CUPTI_EVENT_GROUP_ATTR_PROFILE_ALL_DOMAIN_INSTANCES is set on the
+ * group.
+ *
+ * If any instance of an event counter overflows, the value returned
+ * for that event instance will be ::CUPTI_EVENT_OVERFLOW.
+ *
+ * The only allowed value for \p flags is ::CUPTI_EVENT_READ_FLAG_NONE.
+ *
+ * Reading an event from a disabled event group is not allowed. After
+ * being read, an event's value is reset to zero.
+ * \note \b Thread-safety: this function is thread safe but client
+ * must guard against simultaneous destruction or modification of \p
+ * eventGroup (for example, client must guard against simultaneous
+ * calls to \ref cuptiEventGroupDestroy, \ref cuptiEventGroupAddEvent,
+ * etc.), and must guard against simultaneous destruction of the
+ * context in which \p eventGroup was created (for example, client
+ * must guard against simultaneous calls to cudaDeviceReset,
+ * cuCtxDestroy, etc.). If \ref cuptiEventGroupResetAllEvents is
+ * called simultaneously with this function, then returned event
+ * values are undefined.
+ *
+ * \param eventGroup The event group
+ * \param flags Flags controlling the reading mode
+ * \param event The event to read
+ * \param eventValueBufferSizeBytes The size of \p eventValueBuffer
+ * in bytes, and returns the number of bytes written to \p
+ * eventValueBuffer
+ * \param eventValueBuffer Returns the event value(s)
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_NOT_INITIALIZED
+ * \retval CUPTI_ERROR_INVALID_EVENT_ID
+ * \retval CUPTI_ERROR_HARDWARE
+ * \retval CUPTI_ERROR_INVALID_OPERATION if \p eventGroup is disabled
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p eventGroup, \p
+ * eventValueBufferSizeBytes or \p eventValueBuffer is NULL
+ * \retval CUPTI_ERROR_PARAMETER_SIZE_NOT_SUFFICIENT if size of \p eventValueBuffer
+ * is not sufficient
+ */
+CUptiResult CUPTIAPI cuptiEventGroupReadEvent(CUpti_EventGroup eventGroup,
+                                              CUpti_ReadEventFlags flags,
+                                              CUpti_EventID event,
+                                              size_t *eventValueBufferSizeBytes,
+                                              uint64_t *eventValueBuffer);
+
+/**
+ * \brief Read the values for all the events in an event group.
+ *
+ * Read the values for all the events in an event group. The event
+ * values are returned in the \p eventValueBuffer buffer. \p
+ * eventValueBufferSizeBytes indicates the size of \p
+ * eventValueBuffer.  The buffer must be at least (sizeof(uint64_t) *
+ * number of events in group) if
+ * ::CUPTI_EVENT_GROUP_ATTR_PROFILE_ALL_DOMAIN_INSTANCES is not set on
+ * the group containing the events.  The buffer must be at least
+ * (sizeof(uint64_t) * number of domain instances * number of events in
+ * group) if ::CUPTI_EVENT_GROUP_ATTR_PROFILE_ALL_DOMAIN_INSTANCES is
+ * set on the group.
+ *
+ * The data format returned in \p eventValueBuffer is:
+ *    - domain instance 0: event0 event1 ... eventN
+ *    - domain instance 1: event0 event1 ... eventN
+ *    - ...
+ *    - domain instance M: event0 event1 ... eventN
+ *
+ * The event order in \p eventValueBuffer is returned in \p
+ * eventIdArray. The size of \p eventIdArray is specified in \p
+ * eventIdArraySizeBytes. The size should be at least
+ * (sizeof(CUpti_EventID) * number of events in group).
+ *
+ * If any instance of any event counter overflows, the value returned
+ * for that event instance will be ::CUPTI_EVENT_OVERFLOW.
+ *
+ * The only allowed value for \p flags is ::CUPTI_EVENT_READ_FLAG_NONE.
+ *
+ * Reading events from a disabled event group is not allowed. After
+ * being read, an event's value is reset to zero.
+ * \note \b Thread-safety: this function is thread safe but client
+ * must guard against simultaneous destruction or modification of \p
+ * eventGroup (for example, client must guard against simultaneous
+ * calls to \ref cuptiEventGroupDestroy, \ref cuptiEventGroupAddEvent,
+ * etc.), and must guard against simultaneous destruction of the
+ * context in which \p eventGroup was created (for example, client
+ * must guard against simultaneous calls to cudaDeviceReset,
+ * cuCtxDestroy, etc.). If \ref cuptiEventGroupResetAllEvents is
+ * called simultaneously with this function, then returned event
+ * values are undefined.
+ *
+ * \param eventGroup The event group
+ * \param flags Flags controlling the reading mode
+ * \param eventValueBufferSizeBytes The size of \p eventValueBuffer in
+ * bytes, and returns the number of bytes written to \p
+ * eventValueBuffer
+ * \param eventValueBuffer Returns the event values
+ * \param eventIdArraySizeBytes The size of \p eventIdArray in bytes,
+ * and returns the number of bytes written to \p eventIdArray
+ * \param eventIdArray Returns the IDs of the events in the same order
+ * as the values returned in \p eventValueBuffer.
+ * \param numEventIdsRead Returns the number of event IDs returned
+ * in \p eventIdArray
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_NOT_INITIALIZED
+ * \retval CUPTI_ERROR_HARDWARE
+ * \retval CUPTI_ERROR_INVALID_OPERATION if \p eventGroup is disabled
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p eventGroup, \p
+ * eventValueBufferSizeBytes, \p eventValueBuffer, \p
+ * eventIdArraySizeBytes, \p eventIdArray or \p numEventIdsRead is
+ * NULL
+ * \retval CUPTI_ERROR_PARAMETER_SIZE_NOT_SUFFICIENT if size of \p eventValueBuffer
+ * or \p eventIdArray is not sufficient
+ */
+CUptiResult CUPTIAPI cuptiEventGroupReadAllEvents(CUpti_EventGroup       eventGroup,
+                                                  CUpti_ReadEventFlags   flags,
+                                                  size_t                 *eventValueBufferSizeBytes,
+                                                  uint64_t               *eventValueBuffer,
+                                                  size_t                 *eventIdArraySizeBytes,
+                                                  CUpti_EventID          *eventIdArray,
+                                                  size_t                 *numEventIdsRead);
+
+/**
+ * \brief For a set of events, get the grouping that indicates the
+ * number of passes and the event groups necessary to collect the
+ * events.
+ *
+ * The number of events that can be collected simultaneously varies by
+ * device and by the type of the events. When events can be collected
+ * simultaneously, they may need to be grouped into multiple event
+ * groups because they are from different event domains. This function
+ * takes a set of events and determines how many passes are required
+ * to collect all those events, and which events can be collected
+ * simultaneously in each pass.
+ *
+ * The CUpti_EventGroupSets returned in \p eventGroupPasses indicates
+ * how many passes are required to collect the events with the \p
+ * numSets field. Within each event group set, the \p sets array
+ * indicates the event groups that should be collected on each pass.
+ * \note \b Thread-safety: this function is thread safe, but client
+ * must guard against another thread simultaneously destroying \p
+ * context.
+ *
+ * \param context The context for event collection
+ * \param eventIdArraySizeBytes Size of \p eventIdArray in bytes
+ * \param eventIdArray Array of event IDs that need to be grouped
+ * \param eventGroupPasses Returns a CUpti_EventGroupSets object that
+ * indicates the number of passes required to collect the events and
+ * the events to collect on each pass
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_NOT_INITIALIZED
+ * \retval CUPTI_ERROR_INVALID_CONTEXT
+ * \retval CUPTI_ERROR_INVALID_EVENT_ID
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p eventIdArray or
+ * \p eventGroupPasses is NULL
+ */
+CUptiResult CUPTIAPI cuptiEventGroupSetsCreate(CUcontext context,
+                                               size_t eventIdArraySizeBytes,
+                                               CUpti_EventID *eventIdArray,
+                                               CUpti_EventGroupSets **eventGroupPasses);
+
+/**
+ * \brief Destroy an event group sets object.
+ *
+ * Destroy a CUpti_EventGroupSets object.
+ * \note \b Thread-safety: this function is thread safe.
+ *
+ * \param eventGroupSets The object to destroy
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_NOT_INITIALIZED
+ * \retval CUPTI_ERROR_INVALID_OPERATION if any of the event groups
+ * contained in the sets is enabled
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p eventGroupSets is NULL
+ */
+CUptiResult CUPTIAPI cuptiEventGroupSetsDestroy(CUpti_EventGroupSets *eventGroupSets);
+
+
+/**
+ * \brief Enable an event group set.
+ *
+ * Enable a set of event groups. Enabling a set of event groups zeros the value of
+ * all the events in all the groups and then starts collection of those events.
+ * \note \b Thread-safety: this function is thread safe.
+ *
+ * \param eventGroupSet The pointer to the event group set
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_NOT_INITIALIZED
+ * \retval CUPTI_ERROR_HARDWARE
+ * \retval CUPTI_ERROR_NOT_READY if a group in \p eventGroupSet contains no events
+ * \retval CUPTI_ERROR_NOT_COMPATIBLE if a group in \p eventGroupSet cannot be
+ * enabled due to other already enabled event groups
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p eventGroupSet is NULL
+ * \retval CUPTI_ERROR_HARDWARE_BUSY if other client is profiling and hardware is
+ * busy
+ */
+CUptiResult CUPTIAPI cuptiEventGroupSetEnable(CUpti_EventGroupSet *eventGroupSet);
+
+/**
+ * \brief Disable an event group set.
+ *
+ * Disable a set of event groups. Disabling a set of event groups
+ * stops collection of events contained in the groups.
+ * \note \b Thread-safety: this function is thread safe.
+ * \note \b If this call fails, some of the event groups in the set may be disabled
+ * and other event groups may remain enabled.
+ *
+ * \param eventGroupSet The pointer to the event group set
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_NOT_INITIALIZED
+ * \retval CUPTI_ERROR_HARDWARE
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p eventGroupSet is NULL
+ */
+CUptiResult CUPTIAPI cuptiEventGroupSetDisable(CUpti_EventGroupSet *eventGroupSet);
+
+/**
+ * \brief Enable kernel replay mode.
+ *
+ * Set profiling mode for the context to replay mode. In this mode,
+ * any number of events can be collected in one run of the kernel. The
+ * event collection mode will automatically switch to
+ * CUPTI_EVENT_COLLECTION_MODE_KERNEL.  In this mode, \ref
+ * cuptiSetEventCollectionMode will return
+ * CUPTI_ERROR_INVALID_OPERATION.
+ * \note \b Kernels might take longer to run if many events are enabled.
+ * \note \b Thread-safety: this function is thread safe.
+ *
+ * \param context The context
+ * \retval CUPTI_SUCCESS
+ */
+CUptiResult CUPTIAPI cuptiEnableKernelReplayMode(CUcontext context);
+
+/**
+ * \brief Disable kernel replay mode.
+ *
+ * Set profiling mode for the context to non-replay (default)
+ * mode. Event collection mode will be set to
+ * CUPTI_EVENT_COLLECTION_MODE_KERNEL.  All previously enabled
+ * event groups and event group sets will be disabled.
+ * \note \b Thread-safety: this function is thread safe.
+ *
+ * \param context The context
+ * \retval CUPTI_SUCCESS
+ */
+CUptiResult CUPTIAPI cuptiDisableKernelReplayMode(CUcontext context);
+
+/**
+ * \brief Function type for getting updates on kernel replay.
+ *
+ * \param kernelName The mangled kernel name
+ * \param numReplaysDone Number of replays done so far
+ * \param customData Pointer to any custom data passed in when subscribing
+ */
+typedef void (CUPTIAPI *CUpti_KernelReplayUpdateFunc)(
+    const char *kernelName,
+    int numReplaysDone,
+    void *customData);
+
+/**
+ * \brief Subscribe to kernel replay updates.
+ *
+ * When subscribed, the function pointer passed in will be called each time a
+ * kernel run is finished during kernel replay. Any previously subscribed function
+ * pointer will be replaced. Passing NULL as the function pointer unsubscribes
+ * from the updates.
+ *
+ * \param updateFunc The update function pointer
+ * \param customData Pointer to any custom data
+ * \retval CUPTI_SUCCESS
+ */
+CUptiResult CUPTIAPI cuptiKernelReplaySubscribeUpdate(CUpti_KernelReplayUpdateFunc updateFunc, void *customData);
+
+/** @} */ /* END CUPTI_EVENT_API */
+
+#if defined(__GNUC__) && defined(CUPTI_LIB)
+    #pragma GCC visibility pop
+#endif
+
+#if defined(__cplusplus)
+}
+#endif
+
+#endif /*_CUPTI_EVENTS_H_*/
+
+
diff --git a/.venv/lib/python3.11/site-packages/nvidia/cuda_cupti/include/cupti_metrics.h b/.venv/lib/python3.11/site-packages/nvidia/cuda_cupti/include/cupti_metrics.h
new file mode 100644
index 0000000000000000000000000000000000000000..28d441e6b51a1be18f22a018800316fda0a779ec
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/nvidia/cuda_cupti/include/cupti_metrics.h
@@ -0,0 +1,825 @@
+/*
+ * Copyright 2011-2020   NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#if !defined(_CUPTI_METRIC_H_)
+#define _CUPTI_METRIC_H_
+
+#include <cuda.h>
+#include <string.h>
+#include <cuda_stdint.h>
+#include <cupti_result.h>
+
+#ifndef CUPTIAPI
+#ifdef _WIN32
+#define CUPTIAPI __stdcall
+#else
+#define CUPTIAPI
+#endif
+#endif
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+#if defined(__GNUC__) && defined(CUPTI_LIB)
+    #pragma GCC visibility push(default)
+#endif
+
+/**
+ * \defgroup CUPTI_METRIC_API CUPTI Metric API
+ * Functions, types, and enums that implement the CUPTI Metric API.
+ *
+ * \note The CUPTI metric API from the header cupti_metrics.h is not supported on devices
+ * with compute capability 7.5 and higher (i.e. Turing and later GPU architectures).
+ * This API will be deprecated in a future CUDA release. It is replaced by the
+ * Profiling API in the header cupti_profiler_target.h and the Perfworks metrics API
+ * in the headers nvperf_host.h and nvperf_target.h which are supported on
+ * devices with compute capability 7.0 and higher (i.e. Volta and later GPU
+ * architectures).
+ *
+ * @{
+ */
+
+/**
+ * \brief ID for a metric.
+ *
+ * A metric provides a measure of some aspect of the device.
+ */
+typedef uint32_t CUpti_MetricID;
+
+/**
+ * \brief A metric category.
+ *
+ * Each metric is assigned to a category that represents the general
+ * type of the metric. A metric's category is accessed using \ref
+ * cuptiMetricGetAttribute and the CUPTI_METRIC_ATTR_CATEGORY
+ * attribute.
+ */
+typedef enum {
+  /**
+   * A memory related metric.
+   */
+  CUPTI_METRIC_CATEGORY_MEMORY          = 0,
+  /**
+   * An instruction related metric.
+   */
+  CUPTI_METRIC_CATEGORY_INSTRUCTION     = 1,
+  /**
+   * A multiprocessor related metric.
+   */
+  CUPTI_METRIC_CATEGORY_MULTIPROCESSOR  = 2,
+  /**
+   * A cache related metric.
+   */
+  CUPTI_METRIC_CATEGORY_CACHE           = 3,
+  /**
+   * A texture related metric.
+   */
+  CUPTI_METRIC_CATEGORY_TEXTURE         = 4,
+  /**
+   * An NVLink related metric.
+   */
+  CUPTI_METRIC_CATEGORY_NVLINK          = 5,
+  /**
+   * A PCIe related metric.
+   */
+  CUPTI_METRIC_CATEGORY_PCIE           = 6,
+  CUPTI_METRIC_CATEGORY_FORCE_INT                         = 0x7fffffff,
+} CUpti_MetricCategory;
+
+/**
+ * \brief A metric evaluation mode.
+ *
+ * A metric can be evaluated per hardware instance to know the load balancing
+ * across instances of a domain or the metric can be evaluated in aggregate mode
+ * when the events involved in metric evaluation are from different event
+ * domains. It might be possible to evaluate some metrics in both
+ * modes for convenience. A metric's evaluation mode is accessed using \ref
+ * cuptiMetricGetAttribute and the CUPTI_METRIC_ATTR_EVALUATION_MODE
+ * attribute.
+ */
+typedef enum {
+  /**
+   * If this bit is set, the metric can be profiled for each instance of the
+   * domain. The event values passed to \ref cuptiMetricGetValue can contain
+   * values for one instance of the domain. And \ref cuptiMetricGetValue can
+   * be called for each instance.
+   */
+  CUPTI_METRIC_EVALUATION_MODE_PER_INSTANCE         = 1,
+  /**
+   * If this bit is set, the metric can be profiled over all instances. The
+   * event values passed to \ref cuptiMetricGetValue can be aggregated values
+   * of events for all instances of the domain.
+   */
+  CUPTI_METRIC_EVALUATION_MODE_AGGREGATE            = 1 << 1,
+  CUPTI_METRIC_EVALUATION_MODE_FORCE_INT            = 0x7fffffff,
+} CUpti_MetricEvaluationMode;
+
+/**
+ * \brief Kinds of metric values.
+ *
+ * Metric values can be one of several different kinds. Corresponding
+ * to each kind is a member of the CUpti_MetricValue union. The metric
+ * value returned by \ref cuptiMetricGetValue should be accessed using
+ * the appropriate member of that union based on its value kind.
+ */
+typedef enum {
+  /**
+   * The metric value is a 64-bit double.
+   */
+  CUPTI_METRIC_VALUE_KIND_DOUBLE            = 0,
+  /**
+   * The metric value is a 64-bit unsigned integer.
+   */
+  CUPTI_METRIC_VALUE_KIND_UINT64            = 1,
+  /**
+   * The metric value is a percentage represented by a 64-bit
+   * double. For example, 57.5% is represented by the value 57.5.
+   */
+  CUPTI_METRIC_VALUE_KIND_PERCENT           = 2,
+  /**
+   * The metric value is a throughput represented by a 64-bit
+   * unsigned integer. The unit for throughput values is bytes/second.
+   */
+  CUPTI_METRIC_VALUE_KIND_THROUGHPUT        = 3,
+  /**
+   * The metric value is a 64-bit signed integer.
+   */
+  CUPTI_METRIC_VALUE_KIND_INT64             = 4,
+  /**
+   * The metric value is a utilization level, as represented by
+   * CUpti_MetricValueUtilizationLevel.
+   */
+  CUPTI_METRIC_VALUE_KIND_UTILIZATION_LEVEL = 5,
+
+  CUPTI_METRIC_VALUE_KIND_FORCE_INT  = 0x7fffffff
+} CUpti_MetricValueKind;
+
+/**
+ * \brief Enumeration of utilization levels for metrics values of kind
+ * CUPTI_METRIC_VALUE_KIND_UTILIZATION_LEVEL. Utilization values can
+ * vary from IDLE (0) to MAX (10) but the enumeration only provides
+ * specific names for a few values.
+ */
+typedef enum {
+  CUPTI_METRIC_VALUE_UTILIZATION_IDLE      = 0,
+  CUPTI_METRIC_VALUE_UTILIZATION_LOW       = 2,
+  CUPTI_METRIC_VALUE_UTILIZATION_MID       = 5,
+  CUPTI_METRIC_VALUE_UTILIZATION_HIGH      = 8,
+  CUPTI_METRIC_VALUE_UTILIZATION_MAX       = 10,
+  CUPTI_METRIC_VALUE_UTILIZATION_FORCE_INT = 0x7fffffff
+} CUpti_MetricValueUtilizationLevel;
+
+/**
+ * \brief Metric attributes.
+ *
+ * Metric attributes describe properties of a metric. These attributes
+ * can be read using \ref cuptiMetricGetAttribute.
+ */
+typedef enum {
+  /**
+   * Metric name. Value is a null terminated const c-string.
+   */
+  CUPTI_METRIC_ATTR_NAME              = 0,
+  /**
+   * Short description of metric. Value is a null terminated const c-string.
+   */
+  CUPTI_METRIC_ATTR_SHORT_DESCRIPTION = 1,
+  /**
+   * Long description of metric. Value is a null terminated const c-string.
+   */
+  CUPTI_METRIC_ATTR_LONG_DESCRIPTION  = 2,
+  /**
+   * Category of the metric. Value is of type CUpti_MetricCategory.
+   */
+  CUPTI_METRIC_ATTR_CATEGORY          = 3,
+  /**
+   * Value type of the metric. Value is of type CUpti_MetricValueKind.
+   */
+  CUPTI_METRIC_ATTR_VALUE_KIND          = 4,
+  /**
+   * Metric evaluation mode. Value is of type CUpti_MetricEvaluationMode.
+   */
+  CUPTI_METRIC_ATTR_EVALUATION_MODE     = 5,
+  CUPTI_METRIC_ATTR_FORCE_INT         = 0x7fffffff,
+} CUpti_MetricAttribute;
+
+/**
+ * \brief A metric value.
+ *
+ * Metric values can be one of several different kinds. Corresponding
+ * to each kind is a member of the CUpti_MetricValue union. The metric
+ * value returned by \ref cuptiMetricGetValue should be accessed using
+ * the appropriate member of that union based on its value kind.
+ */
+typedef union {
+  /**
+   * Value for CUPTI_METRIC_VALUE_KIND_DOUBLE.
+   */
+  double metricValueDouble;
+  /**
+   * Value for CUPTI_METRIC_VALUE_KIND_UINT64.
+   */
+  uint64_t metricValueUint64;
+  /**
+   * Value for CUPTI_METRIC_VALUE_KIND_INT64.
+   */
+  int64_t metricValueInt64;
+  /**
+   * Value for CUPTI_METRIC_VALUE_KIND_PERCENT. For example, 57.5% is
+   * represented by the value 57.5.
+   */
+  double metricValuePercent;
+  /**
+   * Value for CUPTI_METRIC_VALUE_KIND_THROUGHPUT.  The unit for
+   * throughput values is bytes/second.
+   */
+  uint64_t metricValueThroughput;
+  /**
+   * Value for CUPTI_METRIC_VALUE_KIND_UTILIZATION_LEVEL.
+   */
+  CUpti_MetricValueUtilizationLevel metricValueUtilizationLevel;
+} CUpti_MetricValue;
+
+/**
+ * \brief Device class.
+ *
+ * Enumeration of device classes for metric property
+ * CUPTI_METRIC_PROPERTY_DEVICE_CLASS.
+ */
+typedef enum {
+  CUPTI_METRIC_PROPERTY_DEVICE_CLASS_TESLA          = 0,
+  CUPTI_METRIC_PROPERTY_DEVICE_CLASS_QUADRO         = 1,
+  CUPTI_METRIC_PROPERTY_DEVICE_CLASS_GEFORCE        = 2,
+  CUPTI_METRIC_PROPERTY_DEVICE_CLASS_TEGRA          = 3,
+} CUpti_MetricPropertyDeviceClass;
+
+/**
+ * \brief Metric device properties.
+ *
+ * Metric device properties describe device properties which are needed for a metric.
+ * Some of these properties can be collected using cuDeviceGetAttribute.
+ */
+typedef enum {
+  /*
+   * Number of multiprocessors on a device.  This can be collected
+   * using value of \param CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT of
+   * cuDeviceGetAttribute.
+   */
+  CUPTI_METRIC_PROPERTY_MULTIPROCESSOR_COUNT,
+  /*
+   * Maximum number of warps on a multiprocessor. This can be
+   * collected using ratio of value of \param
+   * CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR and \param
+   * CU_DEVICE_ATTRIBUTE_WARP_SIZE of cuDeviceGetAttribute.
+   */
+  CUPTI_METRIC_PROPERTY_WARPS_PER_MULTIPROCESSOR,
+  /*
+   * GPU Time for kernel in ns. This should be profiled using CUPTI
+   * Activity API.
+   */
+  CUPTI_METRIC_PROPERTY_KERNEL_GPU_TIME,
+  /*
+   * Clock rate for device in KHz.  This should be collected using
+   * value of \param CU_DEVICE_ATTRIBUTE_CLOCK_RATE of
+   * cuDeviceGetAttribute.
+   */
+  CUPTI_METRIC_PROPERTY_CLOCK_RATE,
+  /*
+   * Number of Frame buffer units for device. This should be collected
+   * using value of \param CUPTI_DEVICE_ATTR_MAX_FRAME_BUFFERS of
+   * cuptiDeviceGetAttribute.
+   */
+  CUPTI_METRIC_PROPERTY_FRAME_BUFFER_COUNT,
+  /*
+   * Global memory bandwidth in KBytes/sec. This should be collected
+   * using value of \param CUPTI_DEVICE_ATTR_GLOBAL_MEMORY_BANDWIDTH
+   * of cuptiDeviceGetAttribute.
+   */
+  CUPTI_METRIC_PROPERTY_GLOBAL_MEMORY_BANDWIDTH,
+  /*
+   * PCIE link rate in Mega bits/sec. This should be collected using
+   * value of \param CUPTI_DEVICE_ATTR_PCIE_LINK_RATE of
+   * cuptiDeviceGetAttribute.
+   */
+  CUPTI_METRIC_PROPERTY_PCIE_LINK_RATE,
+  /*
+   * PCIE link width for device. This should be collected using
+   * value of \param CUPTI_DEVICE_ATTR_PCIE_LINK_WIDTH of
+   * cuptiDeviceGetAttribute.
+   */
+  CUPTI_METRIC_PROPERTY_PCIE_LINK_WIDTH,
+  /*
+   * PCIE generation for device. This should be collected using
+   * value of \param CUPTI_DEVICE_ATTR_PCIE_GEN of
+   * cuptiDeviceGetAttribute.
+   */
+  CUPTI_METRIC_PROPERTY_PCIE_GEN,
+  /*
+   * The device class. This should be collected using
+   * value of \param CUPTI_DEVICE_ATTR_DEVICE_CLASS of
+   * cuptiDeviceGetAttribute.
+   */
+  CUPTI_METRIC_PROPERTY_DEVICE_CLASS,
+  /*
+   * Peak single precision floating point operations that
+   * can be performed in one cycle by the device.
+   * This should be collected using value of
+   * \param CUPTI_DEVICE_ATTR_FLOP_SP_PER_CYCLE of
+   * cuptiDeviceGetAttribute.
+   */
+  CUPTI_METRIC_PROPERTY_FLOP_SP_PER_CYCLE,
+  /*
+   * Peak double precision floating point operations that
+   * can be performed in one cycle by the device.
+   * This should be collected using value of
+   * \param CUPTI_DEVICE_ATTR_FLOP_DP_PER_CYCLE of
+   * cuptiDeviceGetAttribute.
+   */
+  CUPTI_METRIC_PROPERTY_FLOP_DP_PER_CYCLE,
+  /*
+   * Number of L2 units on a device. This can be collected
+   * using value of \param CUPTI_DEVICE_ATTR_MAX_L2_UNITS of
+   * cuptiDeviceGetAttribute.
+   */
+  CUPTI_METRIC_PROPERTY_L2_UNITS,
+  /*
+   * Whether ECC support is enabled on the device. This can be
+   * collected using value of \param CU_DEVICE_ATTRIBUTE_ECC_ENABLED of
+   * cuDeviceGetAttribute.
+   */
+  CUPTI_METRIC_PROPERTY_ECC_ENABLED,
+  /*
+   * Peak half precision floating point operations that
+   * can be performed in one cycle by the device.
+   * This should be collected using value of
+   * \param CUPTI_DEVICE_ATTR_FLOP_HP_PER_CYCLE of
+   * cuptiDeviceGetAttribute.
+   */
+  CUPTI_METRIC_PROPERTY_FLOP_HP_PER_CYCLE,
+  /*
+   * NVLink bandwidth for device. This should be collected
+   * using value of \param CUPTI_DEVICE_ATTR_GPU_CPU_NVLINK_BW of
+   * cuptiDeviceGetAttribute.
+   */
+  CUPTI_METRIC_PROPERTY_GPU_CPU_NVLINK_BANDWIDTH,
+} CUpti_MetricPropertyID;
+
+/**
+ * \brief Get the total number of metrics available on any device.
+ *
+ * Returns the total number of metrics available on any CUDA-capable
+ * device.
+ *
+ * \param numMetrics Returns the number of metrics
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p numMetrics is NULL
+*/
+CUptiResult CUPTIAPI cuptiGetNumMetrics(uint32_t *numMetrics);
+
+/**
+ * \brief Get all the metrics available on any device.
+ *
+ * Returns the metric IDs in \p metricArray for all CUDA-capable
+ * devices.  The size of the \p metricArray buffer is given by \p
+ * *arraySizeBytes. The size of the \p metricArray buffer must be at
+ * least \p numMetrics * sizeof(CUpti_MetricID) or not all metric IDs
+ * will be returned. The value returned in \p *arraySizeBytes contains
+ * the number of bytes returned in \p metricArray.
+ *
+ * \param arraySizeBytes The size of \p metricArray in bytes, and
+ * returns the number of bytes written to \p metricArray
+ * \param metricArray Returns the IDs of the metrics
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p arraySizeBytes or
+ * \p metricArray are NULL
+*/
+CUptiResult CUPTIAPI cuptiEnumMetrics(size_t *arraySizeBytes,
+                                      CUpti_MetricID *metricArray);
+
+/**
+ * \brief Get the number of metrics for a device.
+ *
+ * Returns the number of metrics available for a device.
+ *
+ * \param device The CUDA device
+ * \param numMetrics Returns the number of metrics available for the
+ * device
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_NOT_INITIALIZED
+ * \retval CUPTI_ERROR_INVALID_DEVICE
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p numMetrics is NULL
+ */
+CUptiResult CUPTIAPI cuptiDeviceGetNumMetrics(CUdevice device,
+                                              uint32_t *numMetrics);
+
+/**
+ * \brief Get the metrics for a device.
+ *
+ * Returns the metric IDs in \p metricArray for a device.  The size of
+ * the \p metricArray buffer is given by \p *arraySizeBytes. The size
+ * of the \p metricArray buffer must be at least \p numMetrics *
+ * sizeof(CUpti_MetricID) or else not all metric IDs will be
+ * returned. The value returned in \p *arraySizeBytes contains the
+ * number of bytes returned in \p metricArray.
+ *
+ * \param device The CUDA device
+ * \param arraySizeBytes The size of \p metricArray in bytes, and
+ * returns the number of bytes written to \p metricArray
+ * \param metricArray Returns the IDs of the metrics for the device
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_NOT_INITIALIZED
+ * \retval CUPTI_ERROR_INVALID_DEVICE
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p arraySizeBytes or
+ * \p metricArray are NULL
+ */
+CUptiResult CUPTIAPI cuptiDeviceEnumMetrics(CUdevice device,
+                                            size_t *arraySizeBytes,
+                                            CUpti_MetricID *metricArray);
+
+/**
+ * \brief Get a metric attribute.
+ *
+ * Returns a metric attribute in \p *value. The size of the \p
+ * value buffer is given by \p *valueSize. The value returned in \p
+ * *valueSize contains the number of bytes returned in \p value.
+ *
+ * If the attribute value is a c-string that is longer than \p
+ * *valueSize, then only the first \p *valueSize characters will be
+ * returned and there will be no terminating null byte.
+ *
+ * \param metric ID of the metric
+ * \param attrib The metric attribute to read
+ * \param valueSize The size of the \p value buffer in bytes, and
+ * returns the number of bytes written to \p value
+ * \param value Returns the attribute's value
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_NOT_INITIALIZED
+ * \retval CUPTI_ERROR_INVALID_METRIC_ID
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p valueSize or \p value
+ * is NULL, or if \p attrib is not a metric attribute
+ * \retval CUPTI_ERROR_PARAMETER_SIZE_NOT_SUFFICIENT For non-c-string
+ * attribute values, indicates that the \p value buffer is too small
+ * to hold the attribute value.
+ */
+CUptiResult CUPTIAPI cuptiMetricGetAttribute(CUpti_MetricID metric,
+                                             CUpti_MetricAttribute attrib,
+                                             size_t *valueSize,
+                                             void *value);
+
+/**
+ * \brief Find a metric by name.
+ *
+ * Find a metric by name and return the metric ID in \p *metric.
+ *
+ * \param device The CUDA device
+ * \param metricName The name of metric to find
+ * \param metric Returns the ID of the found metric or undefined if
+ * unable to find the metric
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_NOT_INITIALIZED
+ * \retval CUPTI_ERROR_INVALID_DEVICE
+ * \retval CUPTI_ERROR_INVALID_METRIC_NAME if unable to find a metric
+ * with name \p metricName. In this case \p *metric is undefined
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p metricName or \p
+ * metric are NULL.
+ */
+CUptiResult CUPTIAPI cuptiMetricGetIdFromName(CUdevice device,
+                                              const char *metricName,
+                                              CUpti_MetricID *metric);
+
+/**
+ * \brief Get number of events required to calculate a metric.
+ *
+ * Returns the number of events in \p numEvents that are required to
+ * calculate a metric.
+ *
+ * \param metric ID of the metric
+ * \param numEvents Returns the number of events required for the metric
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_NOT_INITIALIZED
+ * \retval CUPTI_ERROR_INVALID_METRIC_ID
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p numEvents is NULL
+ */
+CUptiResult CUPTIAPI cuptiMetricGetNumEvents(CUpti_MetricID metric,
+                                             uint32_t *numEvents);
+
+/**
+ * \brief Get the events required to calculate a metric.
+ *
+ * Gets the event IDs in \p eventIdArray required to calculate a \p
+ * metric. The size of the \p eventIdArray buffer is given by \p
+ * *eventIdArraySizeBytes and must be at least \p numEvents *
+ * sizeof(CUpti_EventID) or not all events will be returned. The value
+ * returned in \p *eventIdArraySizeBytes contains the number of bytes
+ * returned in \p eventIdArray.
+ *
+ * \param metric ID of the metric
+ * \param eventIdArraySizeBytes The size of \p eventIdArray in bytes,
+ * and returns the number of bytes written to \p eventIdArray
+ * \param eventIdArray Returns the IDs of the events required to
+ * calculate \p metric
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_NOT_INITIALIZED
+ * \retval CUPTI_ERROR_INVALID_METRIC_ID
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p eventIdArraySizeBytes or \p
+ * eventIdArray are NULL.
+ */
+CUptiResult CUPTIAPI cuptiMetricEnumEvents(CUpti_MetricID metric,
+                                           size_t *eventIdArraySizeBytes,
+                                           CUpti_EventID *eventIdArray);
+
+/**
+ * \brief Get the number of properties required to calculate a metric.
+ *
+ * Returns, in \p numProp, the number of properties that are required to
+ * calculate \p metric.
+ *
+ * \param metric ID of the metric
+ * \param numProp Returns the number of properties required for the
+ * metric
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_NOT_INITIALIZED
+ * \retval CUPTI_ERROR_INVALID_METRIC_ID
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p numProp is NULL
+ */
+CUptiResult CUPTIAPI cuptiMetricGetNumProperties(CUpti_MetricID metric,
+                                                 uint32_t *numProp);
+
+/**
+ * \brief Get the properties required to calculate a metric.
+ *
+ * Gets the property IDs in \p propIdArray required to calculate a \p
+ * metric. The size of the \p propIdArray buffer is given by \p
+ * *propIdArraySizeBytes and must be at least \p numProp *
+ * sizeof(CUpti_MetricPropertyID) or not all properties will be
+ * returned. The value returned in \p *propIdArraySizeBytes contains
+ * the number of bytes returned in \p propIdArray.
+ *
+ * \param metric ID of the metric
+ * \param propIdArraySizeBytes The size of \p propIdArray in bytes,
+ * and returns the number of bytes written to \p propIdArray
+ * \param propIdArray Returns the IDs of the properties required to
+ * calculate \p metric
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_NOT_INITIALIZED
+ * \retval CUPTI_ERROR_INVALID_METRIC_ID
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p propIdArraySizeBytes or \p
+ * propIdArray is NULL.
+ */
+CUptiResult CUPTIAPI cuptiMetricEnumProperties(CUpti_MetricID metric,
+                                               size_t *propIdArraySizeBytes,
+                                               CUpti_MetricPropertyID *propIdArray);
+
+
+/**
+ * \brief For a metric get the groups of events that must be collected
+ * in the same pass.
+ *
+ * For a metric get the groups of events that must be collected in the
+ * same pass to ensure that the metric is calculated correctly. If the
+ * events are not collected as specified then the metric value may be
+ * inaccurate.
+ *
+ * The function returns NULL in \p *eventGroupSets if a metric does not
+ * have any required event group. In this case the events needed for the
+ * metric can be grouped in any manner for collection.
+ *
+ * \param context The context for event collection
+ * \param metric The metric ID
+ * \param eventGroupSets Returns a CUpti_EventGroupSets object that
+ * indicates the events that must be collected in the same pass to
+ * ensure the metric is calculated correctly.  Returns NULL if no
+ * grouping is required for \p metric
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_NOT_INITIALIZED
+ * \retval CUPTI_ERROR_INVALID_METRIC_ID
+ */
+CUptiResult CUPTIAPI cuptiMetricGetRequiredEventGroupSets(CUcontext context,
+                                                          CUpti_MetricID metric,
+                                                          CUpti_EventGroupSets **eventGroupSets);
+
+/**
+ * \brief For a set of metrics, get the grouping that indicates the
+ * number of passes and the event groups necessary to collect the
+ * events required for those metrics.
+ *
+ * For a set of metrics, get the grouping that indicates the number of
+ * passes and the event groups necessary to collect the events
+ * required for those metrics.
+ *
+ * \see cuptiEventGroupSetsCreate for details on event group set
+ * creation.
+ *
+ * \param context The context for event collection
+ * \param metricIdArraySizeBytes Size of \p metricIdArray in bytes
+ * \param metricIdArray Array of metric IDs
+ * \param eventGroupPasses Returns a CUpti_EventGroupSets object that
+ * indicates the number of passes required to collect the events and
+ * the events to collect on each pass
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_NOT_INITIALIZED
+ * \retval CUPTI_ERROR_INVALID_CONTEXT
+ * \retval CUPTI_ERROR_INVALID_METRIC_ID
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p metricIdArray or
+ * \p eventGroupPasses is NULL
+ */
+CUptiResult CUPTIAPI cuptiMetricCreateEventGroupSets(CUcontext context,
+                                                     size_t metricIdArraySizeBytes,
+                                                     CUpti_MetricID *metricIdArray,
+                                                     CUpti_EventGroupSets **eventGroupPasses);
+
+/**
+ * \brief Calculate the value for a metric.
+ *
+ * Use the events collected for a metric to calculate the metric
+ * value. Metric value evaluation depends on the evaluation mode
+ * \ref CUpti_MetricEvaluationMode that the metric supports.
+ * If a metric has evaluation mode as CUPTI_METRIC_EVALUATION_MODE_PER_INSTANCE,
+ * then it assumes that the input event value is for one domain instance.
+ * If a metric has evaluation mode as CUPTI_METRIC_EVALUATION_MODE_AGGREGATE,
+ * it assumes that input event values are
+ * normalized to represent all domain instances on a device. For the
+ * most accurate metric collection, the events required for the metric
+ * should be collected for all profiled domain instances. For example,
+ * to collect all instances of an event, set the
+ * CUPTI_EVENT_GROUP_ATTR_PROFILE_ALL_DOMAIN_INSTANCES attribute on
+ * the group containing the event to 1. The normalized value for the
+ * event is then: (\p sum_event_values * \p totalInstanceCount) / \p
+ * instanceCount, where \p sum_event_values is the summation of the
+ * event values across all profiled domain instances, \p
+ * totalInstanceCount is obtained from querying
+ * CUPTI_EVENT_DOMAIN_ATTR_TOTAL_INSTANCE_COUNT and \p instanceCount
+ * is obtained from querying CUPTI_EVENT_GROUP_ATTR_INSTANCE_COUNT (or
+ * CUPTI_EVENT_DOMAIN_ATTR_INSTANCE_COUNT).
+ *
+ * \param device The CUDA device that the metric is being calculated for
+ * \param metric The metric ID
+ * \param eventIdArraySizeBytes The size of \p eventIdArray in bytes
+ * \param eventIdArray The event IDs required to calculate \p metric
+ * \param eventValueArraySizeBytes The size of \p eventValueArray in bytes
+ * \param eventValueArray The normalized event values required to
+ * calculate \p metric. The values must be ordered to match the order of
+ * events in \p eventIdArray
+ * \param timeDuration The duration over which the events were
+ * collected, in ns
+ * \param metricValue Returns the value for the metric
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_NOT_INITIALIZED
+ * \retval CUPTI_ERROR_INVALID_METRIC_ID
+ * \retval CUPTI_ERROR_INVALID_OPERATION
+ * \retval CUPTI_ERROR_PARAMETER_SIZE_NOT_SUFFICIENT if the
+ * eventIdArray does not contain all the events needed for metric
+ * \retval CUPTI_ERROR_INVALID_EVENT_VALUE if any of the
+ * event values required for the metric is CUPTI_EVENT_OVERFLOW
+ * \retval CUPTI_ERROR_INVALID_METRIC_VALUE if the computed metric value
+ * cannot be represented in the metric's value type. For example,
+ * if the metric value type is unsigned and the computed metric value is negative
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p metricValue,
+ * \p eventIdArray or \p eventValueArray is NULL
+ */
+CUptiResult CUPTIAPI cuptiMetricGetValue(CUdevice device,
+                                         CUpti_MetricID metric,
+                                         size_t eventIdArraySizeBytes,
+                                         CUpti_EventID *eventIdArray,
+                                         size_t eventValueArraySizeBytes,
+                                         uint64_t *eventValueArray,
+                                         uint64_t timeDuration,
+                                         CUpti_MetricValue *metricValue);
+
+/**
+ * \brief Calculate the value for a metric from events and properties.
+ *
+ * Use the events and properties collected for a metric to calculate
+ * the metric value. Metric value evaluation depends on the evaluation
+ * mode \ref CUpti_MetricEvaluationMode that the metric supports.  If
+ * a metric has evaluation mode as
+ * CUPTI_METRIC_EVALUATION_MODE_PER_INSTANCE, then it assumes that the
+ * input event value is for one domain instance.  If a metric has
+ * evaluation mode as CUPTI_METRIC_EVALUATION_MODE_AGGREGATE, it
+ * assumes that input event values are normalized to represent all
+ * domain instances on a device. For the most accurate metric
+ * collection, the events required for the metric should be collected
+ * for all profiled domain instances. For example, to collect all
+ * instances of an event, set the
+ * CUPTI_EVENT_GROUP_ATTR_PROFILE_ALL_DOMAIN_INSTANCES attribute on
+ * the group containing the event to 1. The normalized value for the
+ * event is then: (\p sum_event_values * \p totalInstanceCount) / \p
+ * instanceCount, where \p sum_event_values is the summation of the
+ * event values across all profiled domain instances, \p
+ * totalInstanceCount is obtained from querying
+ * CUPTI_EVENT_DOMAIN_ATTR_TOTAL_INSTANCE_COUNT and \p instanceCount
+ * is obtained from querying CUPTI_EVENT_GROUP_ATTR_INSTANCE_COUNT (or
+ * CUPTI_EVENT_DOMAIN_ATTR_INSTANCE_COUNT).
+ *
+ * \param metric The metric ID
+ * \param eventIdArraySizeBytes The size of \p eventIdArray in bytes
+ * \param eventIdArray The event IDs required to calculate \p metric
+ * \param eventValueArraySizeBytes The size of \p eventValueArray in bytes
+ * \param eventValueArray The normalized event values required to
+ * calculate \p metric. The values must be ordered to match the order of
+ * events in \p eventIdArray
+ * \param propIdArraySizeBytes The size of \p propIdArray in bytes
+ * \param propIdArray The metric property IDs required to calculate \p metric
+ * \param propValueArraySizeBytes The size of \p propValueArray in bytes
+ * \param propValueArray The metric property values required to
+ * calculate \p metric. The values must be ordered to match the order of
+ * metric properties in \p propIdArray
+ * \param metricValue Returns the value for the metric
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_NOT_INITIALIZED
+ * \retval CUPTI_ERROR_INVALID_METRIC_ID
+ * \retval CUPTI_ERROR_INVALID_OPERATION
+ * \retval CUPTI_ERROR_PARAMETER_SIZE_NOT_SUFFICIENT if the
+ * eventIdArray does not contain all the events needed for metric
+ * \retval CUPTI_ERROR_INVALID_EVENT_VALUE if any of the
+ * event values required for the metric is CUPTI_EVENT_OVERFLOW
+ * \retval CUPTI_ERROR_NOT_COMPATIBLE if the computed metric value
+ * cannot be represented in the metric's value type. For example,
+ * if the metric value type is unsigned and the computed metric value is negative
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p metricValue,
+ * \p eventIdArray or \p eventValueArray is NULL
+ */
+CUptiResult CUPTIAPI cuptiMetricGetValue2(CUpti_MetricID metric,
+                                          size_t eventIdArraySizeBytes,
+                                          CUpti_EventID *eventIdArray,
+                                          size_t eventValueArraySizeBytes,
+                                          uint64_t *eventValueArray,
+                                          size_t propIdArraySizeBytes,
+                                          CUpti_MetricPropertyID *propIdArray,
+                                          size_t propValueArraySizeBytes,
+                                          uint64_t *propValueArray,
+                                          CUpti_MetricValue *metricValue);
+
+/** @} */ /* END CUPTI_METRIC_API */
+
+#if defined(__GNUC__) && defined(CUPTI_LIB)
+    #pragma GCC visibility pop
+#endif
+
+#if defined(__cplusplus)
+}
+#endif
+
+#endif /*_CUPTI_METRIC_H_*/
+
+
diff --git a/.venv/lib/python3.11/site-packages/nvidia/cuda_cupti/include/cupti_nvtx_cbid.h b/.venv/lib/python3.11/site-packages/nvidia/cuda_cupti/include/cupti_nvtx_cbid.h
new file mode 100644
index 0000000000000000000000000000000000000000..5ad8c85e6e674b9a016580be88d3c5a2d2619990
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/nvidia/cuda_cupti/include/cupti_nvtx_cbid.h
@@ -0,0 +1,111 @@
+/*
+ * Copyright 2013-2017 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#if defined(__GNUC__) && defined(CUPTI_LIB)
+    #pragma GCC visibility push(default)
+#endif
+
+typedef enum {  /* One ID per NVTX API function named below. NOTE(review): presumably used as CUPTI callback IDs - confirm. */
+  CUPTI_CBID_NVTX_INVALID                               = 0,
+  CUPTI_CBID_NVTX_nvtxMarkA                             = 1,
+  CUPTI_CBID_NVTX_nvtxMarkW                             = 2,
+  CUPTI_CBID_NVTX_nvtxMarkEx                            = 3,
+  CUPTI_CBID_NVTX_nvtxRangeStartA                       = 4,
+  CUPTI_CBID_NVTX_nvtxRangeStartW                       = 5,
+  CUPTI_CBID_NVTX_nvtxRangeStartEx                      = 6,
+  CUPTI_CBID_NVTX_nvtxRangeEnd                          = 7,
+  CUPTI_CBID_NVTX_nvtxRangePushA                        = 8,
+  CUPTI_CBID_NVTX_nvtxRangePushW                        = 9,
+  CUPTI_CBID_NVTX_nvtxRangePushEx                       = 10,
+  CUPTI_CBID_NVTX_nvtxRangePop                          = 11,
+  CUPTI_CBID_NVTX_nvtxNameCategoryA                     = 12,
+  CUPTI_CBID_NVTX_nvtxNameCategoryW                     = 13,
+  CUPTI_CBID_NVTX_nvtxNameOsThreadA                     = 14,
+  CUPTI_CBID_NVTX_nvtxNameOsThreadW                     = 15,
+  CUPTI_CBID_NVTX_nvtxNameCuDeviceA                     = 16,
+  CUPTI_CBID_NVTX_nvtxNameCuDeviceW                     = 17,
+  CUPTI_CBID_NVTX_nvtxNameCuContextA                    = 18,
+  CUPTI_CBID_NVTX_nvtxNameCuContextW                    = 19,
+  CUPTI_CBID_NVTX_nvtxNameCuStreamA                     = 20,
+  CUPTI_CBID_NVTX_nvtxNameCuStreamW                     = 21,
+  CUPTI_CBID_NVTX_nvtxNameCuEventA                      = 22,
+  CUPTI_CBID_NVTX_nvtxNameCuEventW                      = 23,
+  CUPTI_CBID_NVTX_nvtxNameCudaDeviceA                   = 24,
+  CUPTI_CBID_NVTX_nvtxNameCudaDeviceW                   = 25,
+  CUPTI_CBID_NVTX_nvtxNameCudaStreamA                   = 26,
+  CUPTI_CBID_NVTX_nvtxNameCudaStreamW                   = 27,
+  CUPTI_CBID_NVTX_nvtxNameCudaEventA                    = 28,
+  CUPTI_CBID_NVTX_nvtxNameCudaEventW                    = 29,
+  CUPTI_CBID_NVTX_nvtxDomainMarkEx                      = 30,
+  CUPTI_CBID_NVTX_nvtxDomainRangeStartEx                = 31,
+  CUPTI_CBID_NVTX_nvtxDomainRangeEnd                    = 32,
+  CUPTI_CBID_NVTX_nvtxDomainRangePushEx                 = 33,
+  CUPTI_CBID_NVTX_nvtxDomainRangePop                    = 34,
+  CUPTI_CBID_NVTX_nvtxDomainResourceCreate              = 35,
+  CUPTI_CBID_NVTX_nvtxDomainResourceDestroy             = 36,
+  CUPTI_CBID_NVTX_nvtxDomainNameCategoryA               = 37,
+  CUPTI_CBID_NVTX_nvtxDomainNameCategoryW               = 38,
+  CUPTI_CBID_NVTX_nvtxDomainRegisterStringA             = 39,
+  CUPTI_CBID_NVTX_nvtxDomainRegisterStringW             = 40,
+  CUPTI_CBID_NVTX_nvtxDomainCreateA                     = 41,
+  CUPTI_CBID_NVTX_nvtxDomainCreateW                     = 42,
+  CUPTI_CBID_NVTX_nvtxDomainDestroy                     = 43,
+  CUPTI_CBID_NVTX_nvtxDomainSyncUserCreate              = 44,
+  CUPTI_CBID_NVTX_nvtxDomainSyncUserDestroy             = 45,
+  CUPTI_CBID_NVTX_nvtxDomainSyncUserAcquireStart        = 46,
+  CUPTI_CBID_NVTX_nvtxDomainSyncUserAcquireFailed       = 47,
+  CUPTI_CBID_NVTX_nvtxDomainSyncUserAcquireSuccess      = 48,
+  CUPTI_CBID_NVTX_nvtxDomainSyncUserReleasing           = 49,
+  CUPTI_CBID_NVTX_SIZE,                                 /* implicitly 50: one more than the last explicit ID above */
+  CUPTI_CBID_NVTX_FORCE_INT                             = 0x7fffffff
+} CUpti_nvtx_api_trace_cbid;
+
+#if defined(__GNUC__) && defined(CUPTI_LIB)
+    #pragma GCC visibility pop
+#endif    
diff --git a/.venv/lib/python3.11/site-packages/nvidia/cuda_cupti/include/cupti_pcsampling.h b/.venv/lib/python3.11/site-packages/nvidia/cuda_cupti/include/cupti_pcsampling.h
new file mode 100644
index 0000000000000000000000000000000000000000..97f42d14b938204b3b79c4ca1356b88896bcae35
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/nvidia/cuda_cupti/include/cupti_pcsampling.h
@@ -0,0 +1,936 @@
+/*
+ * Copyright 2020-2022 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#if !defined(_CUPTI_PCSAMPLING_H_)
+#define _CUPTI_PCSAMPLING_H_
+
+#include <cuda.h>
+#include <stdint.h>
+#include <stddef.h>
+#include "cupti_result.h"
+#include "cupti_common.h"
+
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+#if defined(__GNUC__) && defined(CUPTI_LIB)
+    #pragma GCC visibility push(default)
+#endif
+
+/**
+ * \defgroup CUPTI_PCSAMPLING_API CUPTI PC Sampling API
+ * Functions, types, and enums that implement the CUPTI PC Sampling API.
+ * @{
+ */
+
+#ifndef CUPTI_PCSAMPLING_STRUCT_SIZE  /* Size in bytes of type_ from its start through the end of member lastfield_ */
+#define CUPTI_PCSAMPLING_STRUCT_SIZE(type_, lastfield_)                     (offsetof(type_, lastfield_) + sizeof(((type_*)0)->lastfield_))
+#endif
+
+#ifndef CUPTI_STALL_REASON_STRING_SIZE  /* NOTE(review): presumably the buffer length for a stall reason name - confirm */
+#define CUPTI_STALL_REASON_STRING_SIZE                                            128
+#endif
+
+/**
+ * \brief PC Sampling collection mode
+ */
+typedef enum
+{
+  /**
+   * Invalid value.
+   */
+  CUPTI_PC_SAMPLING_COLLECTION_MODE_INVALID                   = 0,
+  /**
+   * Continuous mode. Kernels are not serialized in this mode.
+   */
+  CUPTI_PC_SAMPLING_COLLECTION_MODE_CONTINUOUS                = 1,
+  /**
+   * Serialized mode. Kernels are serialized in this mode.
+   */
+  CUPTI_PC_SAMPLING_COLLECTION_MODE_KERNEL_SERIALIZED         = 2,
+} CUpti_PCSamplingCollectionMode;
+
+/**
+ * \brief PC Sampling stall reason: a stall reason index and its sample count
+ */
+typedef struct PACKED_ALIGNMENT
+{
+  /**
+   * [r] Collected stall reason index
+   */
+  uint32_t pcSamplingStallReasonIndex;
+  /**
+   * [r] Number of times the PC was sampled with this stall reason.
+   */
+  uint32_t samples;
+} CUpti_PCSamplingStallReason;
+
+/**
+ * \brief PC Sampling data
+ */
+typedef struct PACKED_ALIGNMENT
+{
+  /**
+   * [w] Size of the data structure.
+   * CUPTI client should set the size of the structure. It will be used in CUPTI to check what fields are
+   * available in the structure. Used to preserve backward compatibility.
+   */
+  size_t size;
+  /**
+   * [r] Unique cubin id
+   */
+  uint64_t cubinCrc;
+  /**
+   * [r] PC offset
+   */
+  uint64_t pcOffset;
+  /**
+   * The function's unique symbol index in the module.
+   */
+  uint32_t functionIndex;
+  /**
+   * Padding
+   */
+  uint32_t pad;
+  /**
+   * [r] The function name. This name string might be shared across all the records
+   * including records from activity APIs representing the same function, and so it should not be
+   * modified or freed until post processing of all the records is done. Once done, it is the user's
+   * responsibility to free the memory using the free() function.
+   */
+  char* functionName;
+  /**
+   * [r] Collected stall reason count
+   */
+  size_t stallReasonCount;
+  /**
+   * [r] Collected stall reasons: each entry holds a stall reason index
+   * and the number of samples recorded for it (see CUpti_PCSamplingStallReason).
+   */
+  CUpti_PCSamplingStallReason *stallReason;
+  /**
+   * The correlation ID of the kernel to which this result is associated. Only valid for serialized mode of pc sampling collection.
+   * For continuous mode of collection the correlationId will be set to 0.
+   */
+  uint32_t correlationId;
+} CUpti_PCSamplingPCData;
+
+/**
+ * \brief PC Sampling output data format
+ */
+typedef enum
+{
+    CUPTI_PC_SAMPLING_OUTPUT_DATA_FORMAT_INVALID          = 0,
+  /**
+   * Hardware (HW) buffer data is parsed while the data is being collected
+   */
+    CUPTI_PC_SAMPLING_OUTPUT_DATA_FORMAT_PARSED           = 1,
+} CUpti_PCSamplingOutputDataFormat;
+
+/**
+ * \brief Collected PC Sampling data
+ *
+ */
+typedef struct PACKED_ALIGNMENT
+{
+  /**
+   * [w] Size of the data structure.
+   * CUPTI client should set the size of the structure. It will be used in CUPTI to check what fields are
+   * available in the structure. Used to preserve backward compatibility.
+   */
+  size_t size;
+  /**
+   * [w] Number of PCs to be collected
+   */
+  size_t collectNumPcs;
+  /**
+   * [r] Number of samples collected across all PCs.
+   * It includes samples for user modules, samples for non-user kernels and dropped samples.
+   * It includes counts for all non-selected stall reasons.
+   * CUPTI does not provide PC records for non-user kernels.
+   * CUPTI does not provide PC records for instructions for which all selected stall reason metrics counts are zero.
+   */
+  uint64_t totalSamples;
+  /**
+   * [r] Number of samples that were dropped by hardware due to backpressure/overflow.
+   */
+  uint64_t droppedSamples;
+  /**
+   * [r] Number of PCs collected
+   */
+  size_t totalNumPcs;
+  /**
+   * [r] Number of PCs available for collection
+   */
+  size_t remainingNumPcs;
+  /**
+   * [r] Unique identifier for each range.
+   * Data collected across multiple ranges in multiple buffers can be identified using range id.
+   */
+  uint64_t rangeId;
+  /**
+   * [r] Profiled PC data
+   * This data struct should have enough memory to collect the number of PCs mentioned in \p collectNumPcs
+   */
+  CUpti_PCSamplingPCData *pPcData;
+  /**
+   * [r] Number of samples collected across all non-user kernel PCs.
+   * It includes samples for non-user kernels.
+   * It includes counts for all non-selected stall reasons as well.
+   * CUPTI does not provide PC records for non-user kernels.
+   */
+  uint64_t nonUsrKernelsTotalSamples;
+
+  /**
+   * [r] Status of the hardware buffer.
+   * CUPTI returns the error code CUPTI_ERROR_OUT_OF_MEMORY when hardware buffer is full.
+   * When hardware buffer is full, user will get pc data as 0. To mitigate this issue, one or more of the below options can be tried:
+   * 1. Increase the hardware buffer size using the attribute CUPTI_PC_SAMPLING_CONFIGURATION_ATTR_TYPE_HARDWARE_BUFFER_SIZE
+   * 2. Decrease the thread sleep span using the attribute CUPTI_PC_SAMPLING_CONFIGURATION_ATTR_TYPE_WORKER_THREAD_PERIODIC_SLEEP_SPAN
+   * 3. Decrease the sampling frequency using the attribute CUPTI_PC_SAMPLING_CONFIGURATION_ATTR_TYPE_SAMPLING_PERIOD
+   */
+  uint8_t hardwareBufferFull;
+} CUpti_PCSamplingData;
+
+/**
+ * \brief PC Sampling configuration attributes
+ *
+ * PC Sampling configuration attribute types. These attributes can be read
+ * using \ref cuptiPCSamplingGetConfigurationAttribute and can be written
+ * using \ref cuptiPCSamplingSetConfigurationAttribute. Attributes marked
+ * [r] can only be read using \ref cuptiPCSamplingGetConfigurationAttribute
+ * [w] can only be written using \ref cuptiPCSamplingSetConfigurationAttribute
+ * [rw] can be read using \ref cuptiPCSamplingGetConfigurationAttribute and
+ * written using \ref cuptiPCSamplingSetConfigurationAttribute
+ */
+typedef enum
+{
+  CUPTI_PC_SAMPLING_CONFIGURATION_ATTR_TYPE_INVALID                            = 0,
+  /**
+   * [rw] Sampling period for PC Sampling.
+   * DEFAULT - CUPTI defined value based on number of SMs
+   * Valid values for the sampling
+   * periods are between 5 to 31 both inclusive. This will set the
+   * sampling period to (2^samplingPeriod) cycles.
+   * E.g. for sampling period = 5 to 31, cycles = 32, 64, 128,..., 2^31
+   * Value is a uint32_t
+   */
+  CUPTI_PC_SAMPLING_CONFIGURATION_ATTR_TYPE_SAMPLING_PERIOD                    = 1,
+  /**
+   * [w] Number of stall reasons to collect.
+   * DEFAULT - All stall reasons will be collected
+   * Value is a size_t
+   * [w] Stall reasons to collect
+   * DEFAULT - All stall reasons will be collected
+   * Input value should be a pointer to an array of stall reason indexes
+   * containing all the stall reason indexes to collect.
+   */
+  CUPTI_PC_SAMPLING_CONFIGURATION_ATTR_TYPE_STALL_REASON                       = 2,
+  /**
+   * [rw] Size of SW buffer for raw PC counter data downloaded from HW buffer
+   * DEFAULT - 1 MB, which can accommodate approximately 5500 PCs
+   * with all stall reasons
+   * Approximately it takes 16 Bytes (and some fixed size memory)
+   * to accommodate one PC with one stall reason
+   * E.g. 1 PC with 1 stall reason = 32 Bytes
+   *      1 PC with 2 stall reason = 48 Bytes
+   *      1 PC with 4 stall reason = 96 Bytes
+   * Value is a size_t
+   */
+  CUPTI_PC_SAMPLING_CONFIGURATION_ATTR_TYPE_SCRATCH_BUFFER_SIZE                = 3,
+  /**
+   * [rw] Size of HW buffer in bytes
+   * DEFAULT - 512 MB
+   * If the sampling period is too small, the HW buffer can overflow
+   * and drop PC data
+   * Value is a size_t
+   */
+  CUPTI_PC_SAMPLING_CONFIGURATION_ATTR_TYPE_HARDWARE_BUFFER_SIZE               = 4,
+  /**
+   * [rw] PC Sampling collection mode
+   * DEFAULT - CUPTI_PC_SAMPLING_COLLECTION_MODE_CONTINUOUS
+   * Input value should be of type \ref CUpti_PCSamplingCollectionMode.
+   */
+  CUPTI_PC_SAMPLING_CONFIGURATION_ATTR_TYPE_COLLECTION_MODE                    = 5,
+  /**
+   * [rw] Control over PC Sampling data collection range
+   * Default - 0
+   * 1 - Allows user to start and stop PC Sampling using APIs -
+   * \ref cuptiPCSamplingStart() - Start PC Sampling
+   * \ref cuptiPCSamplingStop() - Stop PC Sampling
+   * Value is a uint32_t
+   */
+  CUPTI_PC_SAMPLING_CONFIGURATION_ATTR_TYPE_ENABLE_START_STOP_CONTROL          = 6,
+  /**
+   * [w] Value for output data format
+   * Default - CUPTI_PC_SAMPLING_OUTPUT_DATA_FORMAT_PARSED
+   * Input value should be of type \ref CUpti_PCSamplingOutputDataFormat.
+   */
+  CUPTI_PC_SAMPLING_CONFIGURATION_ATTR_TYPE_OUTPUT_DATA_FORMAT                 = 7,
+  /**
+   * [w] Data buffer to hold collected PC Sampling data PARSED_DATA
+   * Default - none.
+   * Buffer type is void * which can point to PARSED_DATA
+   * Refer to \ref CUpti_PCSamplingData for the buffer format for PARSED_DATA
+   */
+  CUPTI_PC_SAMPLING_CONFIGURATION_ATTR_TYPE_SAMPLING_DATA_BUFFER               = 8,
+  /**
+   * [rw] Control sleep time of the worker threads created by CUPTI for various PC sampling operations.
+   * CUPTI creates multiple worker threads to offload certain operations to these threads. This includes decoding of HW data to
+   * the CUPTI PC sampling data and correlating PC data to SASS instructions. CUPTI wakes up these threads periodically.
+   * Default - 100 milliseconds.
+   * Value is a uint32_t
+   */
+  CUPTI_PC_SAMPLING_CONFIGURATION_ATTR_TYPE_WORKER_THREAD_PERIODIC_SLEEP_SPAN  = 9,
+  CUPTI_PC_SAMPLING_CONFIGURATION_ATTR_TYPE_FORCE_INT                          = 0x7fffffff,
+} CUpti_PCSamplingConfigurationAttributeType;
+
+/**
+ * \brief PC sampling configuration information structure
+ *
+ * This structure holds one \ref CUpti_PCSamplingConfigurationAttributeType together with the status
+ * and data used to configure or query that PC sampling attribute
+ */
+typedef struct
+{
+  /**
+   * Refer \ref CUpti_PCSamplingConfigurationAttributeType for all supported attribute types
+   */
+  CUpti_PCSamplingConfigurationAttributeType attributeType;
+  /**
+   * Configure or query status for \p attributeType
+   * CUPTI_SUCCESS for valid \p attributeType and \p attributeData
+   * CUPTI_ERROR_INVALID_OPERATION if \p attributeData is not valid
+   * CUPTI_ERROR_INVALID_PARAMETER if \p attributeType is not valid
+   */
+  CUptiResult attributeStatus;
+  union
+  {
+    /**
+     * Invalid Value
+     */
+    struct
+    {
+      uint64_t data[3];
+    } invalidData;
+    /**
+     * Refer \ref CUPTI_PC_SAMPLING_CONFIGURATION_ATTR_TYPE_SAMPLING_PERIOD
+     */
+    struct
+    {
+      uint32_t samplingPeriod;
+    } samplingPeriodData;
+    /**
+     * Refer \ref CUPTI_PC_SAMPLING_CONFIGURATION_ATTR_TYPE_STALL_REASON
+     */
+    struct
+    {
+      size_t stallReasonCount;
+      uint32_t *pStallReasonIndex;
+    } stallReasonData;
+    /**
+     * Refer \ref CUPTI_PC_SAMPLING_CONFIGURATION_ATTR_TYPE_SCRATCH_BUFFER_SIZE
+     */
+    struct
+    {
+      size_t scratchBufferSize;
+    } scratchBufferSizeData;
+    /**
+     * Refer \ref CUPTI_PC_SAMPLING_CONFIGURATION_ATTR_TYPE_HARDWARE_BUFFER_SIZE
+     */
+    struct
+    {
+      size_t hardwareBufferSize;
+    } hardwareBufferSizeData;
+    /**
+     * Refer \ref CUPTI_PC_SAMPLING_CONFIGURATION_ATTR_TYPE_COLLECTION_MODE
+     */
+    struct
+    {
+      CUpti_PCSamplingCollectionMode collectionMode;
+    } collectionModeData;
+    /**
+     * Refer \ref CUPTI_PC_SAMPLING_CONFIGURATION_ATTR_TYPE_ENABLE_START_STOP_CONTROL
+     */
+    struct
+    {
+      uint32_t enableStartStopControl;
+    } enableStartStopControlData;
+    /**
+     * Refer \ref CUPTI_PC_SAMPLING_CONFIGURATION_ATTR_TYPE_OUTPUT_DATA_FORMAT
+     */
+    struct
+    {
+      CUpti_PCSamplingOutputDataFormat outputDataFormat;
+    } outputDataFormatData;
+    /**
+     * Refer \ref CUPTI_PC_SAMPLING_CONFIGURATION_ATTR_TYPE_SAMPLING_DATA_BUFFER
+     */
+    struct
+    {
+      void *samplingDataBuffer;
+    } samplingDataBufferData;
+    /**
+     * Refer \ref CUPTI_PC_SAMPLING_CONFIGURATION_ATTR_TYPE_WORKER_THREAD_PERIODIC_SLEEP_SPAN
+     */
+    struct
+    {
+      uint32_t workerThreadPeriodicSleepSpan;
+    } workerThreadPeriodicSleepSpanData;
+    
+  } attributeData;
+} CUpti_PCSamplingConfigurationInfo;
+
+/**
+ * \brief PC sampling configuration structure
+ *
+ * This structure configures PC sampling using \ref cuptiPCSamplingSetConfigurationAttribute
+ * and queries PC sampling default configuration using \ref cuptiPCSamplingGetConfigurationAttribute
+ */
+typedef struct
+{
+  /**
+   * [w] Size of the data structure, i.e. CUpti_PCSamplingConfigurationInfoParamsSize.
+   * The CUPTI client should set the size of the structure. CUPTI uses it to check which fields are
+   * available in the structure, which preserves backward compatibility.
+   */
+  size_t size;
+  /**
+   * [w] Assign to NULL
+   */
+  void* pPriv;
+  /**
+   * [w] CUcontext
+   */
+  CUcontext ctx;
+  /**
+   * [w] Number of attributes to configure using \ref cuptiPCSamplingSetConfigurationAttribute or query
+   * using \ref cuptiPCSamplingGetConfigurationAttribute
+   */
+  size_t numAttributes;
+  /**
+   * Attributes to configure or query. Refer \ref CUpti_PCSamplingConfigurationInfo
+   */
+  CUpti_PCSamplingConfigurationInfo *pPCSamplingConfigurationInfo;
+} CUpti_PCSamplingConfigurationInfoParams;
+#define CUpti_PCSamplingConfigurationInfoParamsSize                 CUPTI_PCSAMPLING_STRUCT_SIZE(CUpti_PCSamplingConfigurationInfoParams,pPCSamplingConfigurationInfo)
+
+/**
+ * \brief Write PC Sampling configuration attribute.
+ *
+ * \param pParams A pointer to \ref CUpti_PCSamplingConfigurationInfoParams
+ * containing PC sampling configuration.
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_INVALID_OPERATION if this API is called with
+ * some invalid attribute.
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if an attribute value is not valid
+ * or \p pParams or any of its fields is not valid
+ * \retval CUPTI_ERROR_NOT_SUPPORTED indicates that the system/device
+ * does not support the API
+ */
+CUptiResult CUPTIAPI cuptiPCSamplingSetConfigurationAttribute(CUpti_PCSamplingConfigurationInfoParams *pParams);
+
+/**
+ * \brief Read PC Sampling configuration attribute.
+ *
+ * \param pParams A pointer to \ref CUpti_PCSamplingConfigurationInfoParams
+ * containing PC sampling configuration.
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_INVALID_OPERATION if this API is called with
+ * some invalid attribute.
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if an attribute is not valid
+ * or \p pParams or any of its fields is not valid
+ * \retval CUPTI_ERROR_PARAMETER_SIZE_NOT_SUFFICIENT indicates that
+ * the value buffer is too small to hold the attribute value
+ * \retval CUPTI_ERROR_NOT_SUPPORTED indicates that the system/device
+ * does not support the API
+ */
+CUptiResult CUPTIAPI cuptiPCSamplingGetConfigurationAttribute(CUpti_PCSamplingConfigurationInfoParams *pParams);
+
+/**
+ * \brief Params for cuptiPCSamplingGetData
+ */
+typedef struct
+{
+  /**
+   * [w] Size of the data structure, i.e. CUpti_PCSamplingGetDataParamsSize.
+   * The CUPTI client should set the size of the structure. CUPTI uses it to check which fields are
+   * available in the structure, which preserves backward compatibility.
+   */
+  size_t size;
+  /**
+   * [w] Assign to NULL
+   */
+  void* pPriv;
+  /**
+   * [w] CUcontext
+   */
+  CUcontext ctx;
+  /**
+   * Data buffer to hold collected PC Sampling data PARSED_DATA
+   * Buffer type is void * which can point to PARSED_DATA
+   * Refer \ref CUpti_PCSamplingData for buffer format for PARSED_DATA
+   */
+  void *pcSamplingData;
+} CUpti_PCSamplingGetDataParams;
+#define CUpti_PCSamplingGetDataParamsSize                           CUPTI_PCSAMPLING_STRUCT_SIZE(CUpti_PCSamplingGetDataParams, pcSamplingData)
+/**
+ * \brief Flush GPU PC sampling data periodically.
+ *
+ * Flushing of GPU PC Sampling data is required at the following points to maintain uniqueness of PCs:
+ * For \ref CUPTI_PC_SAMPLING_COLLECTION_MODE_CONTINUOUS, after every module load-unload-load
+ * For \ref CUPTI_PC_SAMPLING_COLLECTION_MODE_KERNEL_SERIALIZED, after every kernel ends
+ * If configuration option \ref CUPTI_PC_SAMPLING_CONFIGURATION_ATTR_TYPE_ENABLE_START_STOP_CONTROL
+ * is enabled, then after every range end i.e. \ref cuptiPCSamplingStop()
+ *
+ * If application is profiled in \ref CUPTI_PC_SAMPLING_COLLECTION_MODE_CONTINUOUS, with disabled
+ * \ref CUPTI_PC_SAMPLING_CONFIGURATION_ATTR_TYPE_ENABLE_START_STOP_CONTROL, and there is no module unload,
+ * user can collect data in two ways:
+ * Use \ref cuptiPCSamplingGetData() API periodically
+ * Use \ref cuptiPCSamplingDisable() on application exit and read GPU PC sampling data from sampling
+ * data buffer passed during configuration.
+ * Note: In case \ref cuptiPCSamplingGetData() API is not called periodically, then sampling data buffer
+ * passed during configuration should be large enough to hold all PCs data.
+ *       \ref cuptiPCSamplingGetData() API never does device synchronization.
+ *       It is possible that when the API is called there is some unconsumed data from the HW buffer. In this case
+ * CUPTI provides only the data available with it at that moment.
+ *
+ * \param pParams A pointer to \ref CUpti_PCSamplingGetDataParams
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_INVALID_OPERATION if this API is called without
+ * enabling PC sampling.
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p pParams or any of its fields is not valid
+ * \retval CUPTI_ERROR_NOT_SUPPORTED indicates that the system/device
+ * does not support the API
+ * \retval CUPTI_ERROR_OUT_OF_MEMORY indicates that the HW buffer is full
+ */
+CUptiResult CUPTIAPI cuptiPCSamplingGetData(CUpti_PCSamplingGetDataParams *pParams);
+
+/**
+ * \brief Params for cuptiPCSamplingEnable
+ */
+typedef struct
+{
+  /**
+   * [w] Size of the data structure, i.e. CUpti_PCSamplingEnableParamsSize.
+   * The CUPTI client should set the size of the structure. CUPTI uses it to check which fields are
+   * available in the structure, which preserves backward compatibility.
+   */
+  size_t size;
+  /**
+   * [w] Assign to NULL
+   */
+  void* pPriv;
+  /**
+   * [w] CUcontext
+   */
+  CUcontext ctx;
+} CUpti_PCSamplingEnableParams;
+#define CUpti_PCSamplingEnableParamsSize                           CUPTI_PCSAMPLING_STRUCT_SIZE(CUpti_PCSamplingEnableParams, ctx)
+
+/**
+ * \brief Enable PC sampling.
+ *
+ * \param pParams A pointer to \ref CUpti_PCSamplingEnableParams
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p pParams or any of its fields is not valid
+ * \retval CUPTI_ERROR_NOT_SUPPORTED indicates that the system/device
+ * does not support the API
+ */
+CUptiResult CUPTIAPI cuptiPCSamplingEnable(CUpti_PCSamplingEnableParams *pParams);
+
+/**
+ * \brief Params for cuptiPCSamplingDisable
+ */
+typedef struct
+{
+  /**
+   * [w] Size of the data structure, i.e. CUpti_PCSamplingDisableParamsSize.
+   * The CUPTI client should set the size of the structure. CUPTI uses it to check which fields are
+   * available in the structure, which preserves backward compatibility.
+   */
+  size_t size;
+  /**
+   * [w] Assign to NULL
+   */
+  void* pPriv;
+  /**
+   * [w] CUcontext
+   */
+  CUcontext ctx;
+} CUpti_PCSamplingDisableParams;
+#define CUpti_PCSamplingDisableParamsSize                           CUPTI_PCSAMPLING_STRUCT_SIZE(CUpti_PCSamplingDisableParams, ctx)
+
+/**
+ * \brief Disable PC sampling.
+ *
+ * For an application that doesn't destroy the CUDA context explicitly,
+ * this API does the PC Sampling tear-down, joins threads and copies PC records into the buffer provided
+ * during the PC sampling configuration. PC records which can't be accommodated in the buffer are discarded.
+ *
+ * \param pParams A pointer to \ref CUpti_PCSamplingDisableParams
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p pParams or any of its fields is not valid
+ * \retval CUPTI_ERROR_NOT_SUPPORTED indicates that the system/device
+ * does not support the API
+ */
+CUptiResult CUPTIAPI cuptiPCSamplingDisable(CUpti_PCSamplingDisableParams *pParams);
+
+/**
+ * \brief Params for cuptiPCSamplingStart
+ */
+typedef struct
+{
+  /**
+   * [w] Size of the data structure, i.e. CUpti_PCSamplingStartParamsSize.
+   * The CUPTI client should set the size of the structure. CUPTI uses it to check which fields are
+   * available in the structure, which preserves backward compatibility.
+   */
+  size_t size;
+  /**
+   * [w] Assign to NULL
+   */
+  void* pPriv;
+  /**
+   * [w] CUcontext
+   */
+  CUcontext ctx;
+} CUpti_PCSamplingStartParams;
+#define CUpti_PCSamplingStartParamsSize                             CUPTI_PCSAMPLING_STRUCT_SIZE(CUpti_PCSamplingStartParams, ctx)
+
+/**
+ * \brief Start PC sampling.
+ *
+ * User can collect PC Sampling data for a user-defined range specified by Start/Stop APIs.
+ * This API can be used to mark the start of a range. Set configuration option
+ * \ref CUPTI_PC_SAMPLING_CONFIGURATION_ATTR_TYPE_ENABLE_START_STOP_CONTROL to use this API.
+ *
+ * \param pParams A pointer to \ref CUpti_PCSamplingStartParams
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_INVALID_OPERATION if this API is called with
+ * incorrect PC Sampling configuration.
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p pParams or any of its fields is not valid
+ * \retval CUPTI_ERROR_NOT_SUPPORTED indicates that the system/device
+ * does not support the API
+ */
+CUptiResult CUPTIAPI cuptiPCSamplingStart(CUpti_PCSamplingStartParams *pParams);
+
+/**
+ * \brief Params for cuptiPCSamplingStop
+ */
+typedef struct
+{
+  /**
+   * [w] Size of the data structure, i.e. CUpti_PCSamplingStopParamsSize.
+   * The CUPTI client should set the size of the structure. CUPTI uses it to check which fields are
+   * available in the structure, which preserves backward compatibility.
+   */
+  size_t size;
+  /**
+   * [w] Assign to NULL
+   */
+  void* pPriv;
+  /**
+   * [w] CUcontext
+   */
+  CUcontext ctx;
+} CUpti_PCSamplingStopParams;
+#define CUpti_PCSamplingStopParamsSize                              CUPTI_PCSAMPLING_STRUCT_SIZE(CUpti_PCSamplingStopParams, ctx)
+
+/**
+ * \brief Stop PC sampling.
+ *
+ * User can collect PC Sampling data for a user-defined range specified by Start/Stop APIs.
+ * This API can be used to mark the end of a range. Set configuration option
+ * \ref CUPTI_PC_SAMPLING_CONFIGURATION_ATTR_TYPE_ENABLE_START_STOP_CONTROL to use this API.
+ *
+ * \param pParams A pointer to \ref CUpti_PCSamplingStopParams
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_INVALID_OPERATION if this API is called with
+ * incorrect PC Sampling configuration.
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p pParams or any of its fields is not valid
+ * \retval CUPTI_ERROR_NOT_SUPPORTED indicates that the system/device
+ * does not support the API
+ */
+CUptiResult CUPTIAPI cuptiPCSamplingStop(CUpti_PCSamplingStopParams *pParams);
+
+/**
+ * \brief Params for cuptiPCSamplingGetNumStallReasons
+ */
+typedef struct
+{
+  /**
+   * [w] Size of the data structure, i.e. CUpti_PCSamplingGetNumStallReasonsParamsSize.
+   * The CUPTI client should set the size of the structure. CUPTI uses it to check which fields are
+   * available in the structure, which preserves backward compatibility.
+   */
+  size_t size;
+  /**
+   * [w] Assign to NULL
+   */
+  void* pPriv;
+  /**
+   * [w] CUcontext
+   */
+  CUcontext ctx;
+  /**
+   * [r] Number of stall reasons
+   */
+  size_t *numStallReasons;
+} CUpti_PCSamplingGetNumStallReasonsParams;
+#define CUpti_PCSamplingGetNumStallReasonsParamsSize                CUPTI_PCSAMPLING_STRUCT_SIZE(CUpti_PCSamplingGetNumStallReasonsParams, numStallReasons)
+
+/**
+ * \brief Get PC sampling stall reason count.
+ *
+ * \param pParams A pointer to \ref CUpti_PCSamplingGetNumStallReasonsParams
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p pParams or any of its fields is not valid
+ * \retval CUPTI_ERROR_NOT_SUPPORTED indicates that the system/device
+ * does not support the API
+ */
+CUptiResult CUPTIAPI cuptiPCSamplingGetNumStallReasons(CUpti_PCSamplingGetNumStallReasonsParams *pParams);
+
+/**
+ * \brief Params for cuptiPCSamplingGetStallReasons
+ */
+typedef struct
+{
+  /**
+   * [w] Size of the data structure, i.e. CUpti_PCSamplingGetStallReasonsParamsSize.
+   * The CUPTI client should set the size of the structure. CUPTI uses it to check which fields are
+   * available in the structure, which preserves backward compatibility.
+   */
+  size_t size;
+  /**
+   * [w] Assign to NULL
+   */
+  void* pPriv;
+  /**
+   * [w] CUcontext
+   */
+  CUcontext ctx;
+  /**
+   * [w] Number of stall reasons
+   */
+  size_t numStallReasons;
+  /**
+   * [r] Stall reason indexes
+   */
+  uint32_t *stallReasonIndex;
+  /**
+   * [r] Stall reason names
+   */
+  char **stallReasons;
+} CUpti_PCSamplingGetStallReasonsParams;
+#define CUpti_PCSamplingGetStallReasonsParamsSize                   CUPTI_PCSAMPLING_STRUCT_SIZE(CUpti_PCSamplingGetStallReasonsParams, stallReasons)
+
+/**
+ * \brief Get PC sampling stall reasons.
+ *
+ * \param pParams A pointer to \ref CUpti_PCSamplingGetStallReasonsParams
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p pParams or any of its fields is not valid
+ * \retval CUPTI_ERROR_NOT_SUPPORTED indicates that the system/device
+ * does not support the API
+ */
+CUptiResult CUPTIAPI cuptiPCSamplingGetStallReasons(CUpti_PCSamplingGetStallReasonsParams *pParams);
+
+
+/**
+ * \brief Params for cuptiGetSassToSourceCorrelation
+ */
+typedef struct CUpti_GetSassToSourceCorrelationParams {
+  /**
+   * [w] Size of the data structure, i.e. CUpti_GetSassToSourceCorrelationParamsSize.
+   * The CUPTI client should set the size of the structure. CUPTI uses it to check which fields are
+   * available in the structure, which preserves backward compatibility.
+   */
+  size_t size;
+  /**
+   * [w] Pointer to the cubin binary to which the function belongs.
+   */
+  const void* cubin;
+  /**
+   * [w] Name of the function to which the PC belongs.
+   */
+  const char *functionName;
+  /**
+   * [w] Size of cubin binary.
+   */
+  size_t cubinSize;
+  /**
+   * [r] Line number in the source code.
+   */
+  uint32_t lineNumber;
+  /**
+   * [w] PC offset
+   */
+  uint64_t pcOffset;
+  /**
+   * [r] Path of the source file.
+   */
+  char *fileName;
+  /**
+   * [r] Path of the directory of the source file.
+   */
+  char *dirName;
+} CUpti_GetSassToSourceCorrelationParams;
+
+#define CUpti_GetSassToSourceCorrelationParamsSize     CUPTI_PCSAMPLING_STRUCT_SIZE(CUpti_GetSassToSourceCorrelationParams, dirName)
+
+/**
+ * \brief SASS to Source correlation.
+ *
+ * \param pParams A pointer to \ref CUpti_GetSassToSourceCorrelationParams
+ *
+ * The user is expected to free the memory allocated for fileName and dirName after use.
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if either of the parameters cubin or functionName
+ * is NULL or cubinSize is zero or size field is not set correctly.
+ * \retval CUPTI_ERROR_INVALID_MODULE provided cubin is invalid.
+ * \retval CUPTI_ERROR_UNKNOWN an internal error occurred.
+ * This error code is also used for cases when the function is not present in the module.
+ * A better error code will be returned in a future release.
+ */
+CUptiResult CUPTIAPI cuptiGetSassToSourceCorrelation(CUpti_GetSassToSourceCorrelationParams *pParams);
+
+/**
+ * \brief Params for cuptiGetCubinCrc
+ */
+typedef struct {
+  /**
+   * [w] Size of the data structure, i.e. CUpti_GetCubinCrcParamsSize.
+   * The CUPTI client should set the size of the structure. CUPTI uses it to check which fields are
+   * available in the structure, which preserves backward compatibility.
+   */
+  size_t size;
+  /**
+   * [w] Size of cubin binary.
+   */
+  size_t cubinSize;
+  /**
+   * [w] Pointer to cubin binary
+   */
+  const void* cubin;
+  /**
+   * [r] The computed CRC is stored in this field.
+   */
+  uint64_t cubinCrc;
+} CUpti_GetCubinCrcParams;
+#define CUpti_GetCubinCrcParamsSize     CUPTI_PCSAMPLING_STRUCT_SIZE(CUpti_GetCubinCrcParams, cubinCrc)
+
+/**
+ * \brief Get the CRC of cubin.
+ *
+ * This function returns the CRC of the provided cubin binary.
+ *
+ * \param pParams A pointer to \ref CUpti_GetCubinCrcParams
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if parameter cubin is NULL or
+ * provided cubinSize is zero or size field is not set.
+ */
+CUptiResult CUPTIAPI cuptiGetCubinCrc(CUpti_GetCubinCrcParams *pParams);
+
+/**
+ * \brief Function type for callback used by CUPTI to request crc of
+ * loaded module.
+ *
+ * This callback function asks for the crc of the provided module.
+ * The provided crc will be stored in PC sampling records i.e. in the field 'cubinCrc' of the PC sampling
+ * struct CUpti_PCSamplingPCData. The CRC is used during the offline source correlation to uniquely identify the module.
+ *
+ * \param cubin The pointer to cubin binary
+ * \param cubinSize The size of cubin binary.
+ * \param cubinCrc Returns the computed crc of cubin.
+ */
+typedef void (CUPTIAPI *CUpti_ComputeCrcCallbackFunc)(
+    const void* cubin,
+    size_t cubinSize,
+    uint64_t *cubinCrc);
+
+/**
+ * \brief Register callback function with CUPTI to use
+ * your own algorithm to compute cubin crc.
+ *
+ * This function registers a callback function that gets called
+ * from CUPTI when a CUDA module is loaded.
+ *
+ * \param funcComputeCubinCrc callback that is invoked when a CUDA module
+ * is loaded.
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p funcComputeCubinCrc is NULL.
+ */
+CUptiResult CUPTIAPI cuptiRegisterComputeCrcCallback(CUpti_ComputeCrcCallbackFunc funcComputeCubinCrc);
+
+/** @} */ /* END CUPTI_PCSAMPLING_API */
+
+#if defined(__GNUC__) && defined(CUPTI_LIB)
+    #pragma GCC visibility pop
+#endif
+
+#if defined(__cplusplus)
+}
+#endif
+
+#endif /*_CUPTI_PCSAMPLING_H_*/
diff --git a/.venv/lib/python3.11/site-packages/nvidia/cuda_cupti/include/cupti_pcsampling_util.h b/.venv/lib/python3.11/site-packages/nvidia/cuda_cupti/include/cupti_pcsampling_util.h
new file mode 100644
index 0000000000000000000000000000000000000000..595d6028fbf2ff9a3bbffaafe90ec80f7d512533
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/nvidia/cuda_cupti/include/cupti_pcsampling_util.h
@@ -0,0 +1,402 @@
+#if !defined(_CUPTI_PCSAMPLING_UTIL_H_)
+#define _CUPTI_PCSAMPLING_UTIL_H_
+
+#include <cupti_pcsampling.h>
+#include <fstream>
+
+#include <cupti_common.h>
+
+#ifndef CUPTI_UTIL_STRUCT_SIZE /* byte size of type_ up to and including lastfield_ */
+#define CUPTI_UTIL_STRUCT_SIZE(type_, lastfield_)                     (offsetof(type_, lastfield_) + sizeof(((type_*)0)->lastfield_))
+#endif
+
+#ifndef CHECK_PC_SAMPLING_STRUCT_FIELD_EXISTS /* non-zero when member starts before structSize bytes; structSize is parenthesized so any expression is safe */
+#define CHECK_PC_SAMPLING_STRUCT_FIELD_EXISTS(type, member, structSize)    \
+    (offsetof(type, member) < (structSize))
+#endif
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+#if defined(__GNUC__)
+    #pragma GCC visibility push(default)
+#endif
+
+namespace CUPTI { namespace PcSamplingUtil {
+
+/**
+ * \defgroup CUPTI_PCSAMPLING_UTILITY CUPTI PC Sampling Utility API
+ * Functions, types, and enums that implement the CUPTI PC Sampling Utility API.
+ * @{
+ */
+
+/**
+ * \brief Header info stored in the file.
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * Version of the file format.
+   */
+  uint32_t version;
+  /**
+   * Total number of buffers present in the file.
+   */
+  uint32_t totalBuffers;
+} Header;
+
+/**
+ * \brief BufferInfo will be stored in the file for every buffer
+ *  i.e. for every call of the \ref CuptiUtilPutPcSampData API.
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * Total number of PC records.
+   */
+  uint64_t recordCount;
+  /**
+   * Count of all stall reasons supported on the GPU
+   */
+  size_t numStallReasons;
+  /**
+   * Total number of stall reasons in a single record.
+   */
+  uint64_t numSelectedStallReasons;
+  /**
+   * Buffer size in bytes.
+   */
+  uint64_t bufferByteSize;
+} BufferInfo;
+
+/**
+ * \brief Names and respective indexes of all available stall reasons
+ * are stored in it.
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * Number of all available stall reasons
+   */
+  size_t numStallReasons;
+  /**
+   * Names of all available stall reasons
+   */
+  char **stallReasons;
+  /**
+   * Indexes of all available stall reasons
+   */
+  uint32_t *stallReasonIndex;
+} PcSamplingStallReasons;
+
+/**
+ * \brief CUPTI PC sampling buffer types.
+ *
+ */
+typedef enum {
+  /**
+   * Invalid buffer type.
+   */
+  PC_SAMPLING_BUFFER_INVALID             = 0,
+  /**
+   * Refers to \ref CUpti_PCSamplingData buffer.
+   */
+  PC_SAMPLING_BUFFER_PC_TO_COUNTER_DATA  = 1
+} PcSamplingBufferType;
+
+/**
+ * \brief CUPTI PC sampling utility API result codes.
+ *
+ * Error and result codes returned by the CUPTI PC sampling utility API.
+ */
+typedef enum {
+  /**
+   * No error
+   */
+  CUPTI_UTIL_SUCCESS                                       = 0,
+  /**
+   * One or more of the parameters are invalid.
+   */
+  CUPTI_UTIL_ERROR_INVALID_PARAMETER                       = 1,
+  /**
+   * Unable to create a new file
+   */
+  CUPTI_UTIL_ERROR_UNABLE_TO_CREATE_FILE                   = 2,
+  /**
+   * Unable to open a file
+   */
+  CUPTI_UTIL_ERROR_UNABLE_TO_OPEN_FILE                     = 3,
+  /**
+   * Read or write operation failed
+   */
+  CUPTI_UTIL_ERROR_READ_WRITE_OPERATION_FAILED             = 4,
+  /**
+   * Provided file handle is corrupted.
+   */
+  CUPTI_UTIL_ERROR_FILE_HANDLE_CORRUPTED                   = 5,
+  /**
+   * Seek operation failed.
+   */
+  CUPTI_UTIL_ERROR_SEEK_OPERATION_FAILED                   = 6,
+  /**
+   * Unable to allocate enough memory to perform the requested
+   * operation.
+   */
+  CUPTI_UTIL_ERROR_OUT_OF_MEMORY                           = 7,
+  /**
+   * An unknown internal error has occurred.
+   */
+  CUPTI_UTIL_ERROR_UNKNOWN                                 = 999,
+  CUPTI_UTIL_ERROR_FORCE_INT                               = 0x7fffffff
+} CUptiUtilResult;
+
+/**
+ * \brief Params for \ref CuptiUtilPutPcSampData
+ */
+typedef struct {
+  /**
+   * Size of the data structure, i.e. CUptiUtil_PutPcSampDataParamsSize.
+   * The CUPTI client should set the size of the structure. CUPTI uses it to check which fields are
+   * available in the structure, which preserves backward compatibility.
+   */
+  size_t size;
+  /**
+   * Type of buffer to store in file
+   */
+  PcSamplingBufferType bufferType;
+  /**
+   * PC sampling buffer.
+   */
+  void *pSamplingData;
+  /**
+   * Number of configured attributes
+   */
+  size_t numAttributes;
+  /**
+   * Refer \ref CUpti_PCSamplingConfigurationInfo
+   * It is expected to provide configuration details of at least
+   * CUPTI_PC_SAMPLING_CONFIGURATION_ATTR_TYPE_STALL_REASON attribute.
+   */
+  CUpti_PCSamplingConfigurationInfo *pPCSamplingConfigurationInfo;
+  /**
+   * Refer \ref PcSamplingStallReasons.
+   */
+  PcSamplingStallReasons *pPcSamplingStallReasons;
+  /**
+   * Name of the file to store the buffer into.
+   */
+  const char* fileName;
+} CUptiUtil_PutPcSampDataParams;
+#define CUptiUtil_PutPcSampDataParamsSize                   CUPTI_UTIL_STRUCT_SIZE(CUptiUtil_PutPcSampDataParams, fileName)
+
+/**
+ * \brief Dump PC sampling data into the file.
+ *
+ * This API can be called multiple times.
+ * It will append the buffer to the file.
+ * For every buffer it will store BufferInfo
+ * so that before retrieving data it will help to allocate buffer
+ * to store retrieved data.
+ * This API creates the file if it is not present.
+ * If stallReasonIndex or stallReasons pointer of \ref PcSamplingStallReasons is NULL
+ * then stall reasons data will not be stored in file.
+ * It is expected to store all available stall reason data at least once to refer it during
+ * offline correlation.
+ *
+ * \retval CUPTI_UTIL_SUCCESS
+ * \retval CUPTI_UTIL_ERROR_INVALID_PARAMETER error out if buffer type is invalid
+ * or if either of pSamplingData, pParams pointer is NULL or stall reason configuration details are not provided
+ * or filename is empty.
+ * \retval CUPTI_UTIL_ERROR_UNABLE_TO_CREATE_FILE
+ * \retval CUPTI_UTIL_ERROR_UNABLE_TO_OPEN_FILE
+ * \retval CUPTI_UTIL_ERROR_READ_WRITE_OPERATION_FAILED
+ */
+CUptiUtilResult CUPTIUTILAPI CuptiUtilPutPcSampData(CUptiUtil_PutPcSampDataParams *pParams);
+
+/**
+ * \brief Params for \ref CuptiUtilGetHeaderData
+ */
+typedef struct {
+  /**
+   * Size of the data structure, i.e. CUptiUtil_GetHeaderDataParamsSize.
+   * The CUPTI client should set the size of the structure. CUPTI uses it to check which fields are
+   * available in the structure, which preserves backward compatibility.
+   */
+  size_t size;
+  /**
+   * File handle.
+   */
+  std::ifstream *fileHandler;
+  /**
+   * Header Info.
+   */
+  Header headerInfo;
+
+} CUptiUtil_GetHeaderDataParams;
+#define CUptiUtil_GetHeaderDataParamsSize                   CUPTI_UTIL_STRUCT_SIZE(CUptiUtil_GetHeaderDataParams, headerInfo)
+
+/**
+ * \brief Get header data of file.
+ *
+ * This API must be called once initially while retrieving data from file.
+ * The \ref Header structure gives info about the total number
+ * of buffers present in the file.
+ *
+ * \retval CUPTI_UTIL_SUCCESS
+ * \retval CUPTI_UTIL_ERROR_INVALID_PARAMETER error out if either of pParams or fileHandler is NULL or param struct size is incorrect.
+ * \retval CUPTI_UTIL_ERROR_FILE_HANDLE_CORRUPTED file handle is not in good state to read data from file
+ * \retval CUPTI_UTIL_ERROR_READ_WRITE_OPERATION_FAILED  failed to read data from file.
+ */
+CUptiUtilResult CUPTIUTILAPI CuptiUtilGetHeaderData(CUptiUtil_GetHeaderDataParams *pParams);
+
+/**
+ * \brief Params for \ref CuptiUtilGetBufferInfo
+ */
+typedef struct {
+  /**
+   * Size of the data structure, i.e. CUptiUtil_GetBufferInfoParamsSize.
+   * The CUPTI client should set the size of the structure. CUPTI uses it to check which fields are
+   * available in the structure, which preserves backward compatibility.
+   */
+  size_t size;
+  /**
+   * File handle.
+   */
+  std::ifstream *fileHandler;
+  /**
+   * Buffer Info.
+   */
+  BufferInfo bufferInfoData;
+} CUptiUtil_GetBufferInfoParams;
+#define CUptiUtil_GetBufferInfoParamsSize                   CUPTI_UTIL_STRUCT_SIZE(CUptiUtil_GetBufferInfoParams, bufferInfoData)
+
+/**
+ * \brief Get buffer info data of file.
+ *
+ * This API must be called every time before calling CuptiUtilGetPcSampData API.
+ * The \ref BufferInfo structure gives info about recordCount and numSelectedStallReasons
+ * of every record in the buffer. This will help to allocate exact buffer to retrieve data into it.
+ *
+ * \retval CUPTI_UTIL_SUCCESS
+ * \retval CUPTI_UTIL_ERROR_INVALID_PARAMETER error out if either of pParams or fileHandler is NULL or param struct size is incorrect.
+ * \retval CUPTI_UTIL_ERROR_FILE_HANDLE_CORRUPTED file handle is not in good state to read data from file.
+ * \retval CUPTI_UTIL_ERROR_READ_WRITE_OPERATION_FAILED failed to read data from file.
+ */
+CUptiUtilResult CUPTIUTILAPI CuptiUtilGetBufferInfo(CUptiUtil_GetBufferInfoParams *pParams);
+
+/**
+ * \brief Params for \ref CuptiUtilGetPcSampData
+ */
+typedef struct {
+  /**
+   * Size of the data structure, i.e. CUptiUtil_GetPcSampDataParamsSize.
+   * The CUPTI client should set the size of the structure. CUPTI uses it to check which fields are
+   * available in the structure, which preserves backward compatibility.
+   */
+  size_t size;
+  /**
+   * File handle.
+   */
+  std::ifstream *fileHandler;
+  /**
+   * Type of buffer to retrieve from file
+   */
+  PcSamplingBufferType bufferType;
+  /**
+   * Pointer to collected buffer info using \ref CuptiUtilGetBufferInfo
+   */
+  BufferInfo *pBufferInfoData;
+  /**
+   * Pointer to allocated memory to store retrieved data from file.
+   */
+  void *pSamplingData;
+  /**
+   * Number of configuration attributes
+   */
+  size_t numAttributes;
+  /**
+   * Refer \ref CUpti_PCSamplingConfigurationInfo
+   */
+  CUpti_PCSamplingConfigurationInfo *pPCSamplingConfigurationInfo;
+  /**
+   * Refer \ref PcSamplingStallReasons.
+   * For stallReasons field of \ref PcSamplingStallReasons it is expected to
+   * allocate memory for each string element of array.
+   */
+  PcSamplingStallReasons *pPcSamplingStallReasons;
+} CUptiUtil_GetPcSampDataParams;
+#define CUptiUtil_GetPcSampDataParamsSize                   CUPTI_UTIL_STRUCT_SIZE(CUptiUtil_GetPcSampDataParams, pPcSamplingStallReasons)
+
+/**
+ * \brief Retrieve PC sampling data from file into allocated buffer.
+ *
+ * This API must be called after CuptiUtilGetBufferInfo API.
+ * It will retrieve data from file into allocated buffer.
+ *
+ * \retval CUPTI_UTIL_SUCCESS
+ * \retval CUPTI_UTIL_ERROR_INVALID_PARAMETER error out if buffer type is invalid
+ * or if either of pSamplingData, pParams is NULL. If pPcSamplingStallReasons is not NULL then
+ * error out if either of stallReasonIndex, stallReasons or stallReasons array element pointer is NULL.
+ * or filename is empty.
+ * \retval CUPTI_UTIL_ERROR_READ_WRITE_OPERATION_FAILED
+ * \retval CUPTI_UTIL_ERROR_FILE_HANDLE_CORRUPTED file handle is not in good state to read data from file.
+ */
+CUptiUtilResult CUPTIUTILAPI CuptiUtilGetPcSampData(CUptiUtil_GetPcSampDataParams *pParams);
+
+/**
+ * \brief Params for \ref CuptiUtilMergePcSampData
+ */
+typedef struct
+{
+  /**
+   * Size of the data structure, i.e. CUptiUtil_MergePcSampDataParamsSize.
+   * The CUPTI client should set the size of the structure. CUPTI uses it to check which fields are
+   * available in the structure, which preserves backward compatibility.
+   */
+  size_t size;
+  /**
+   * Number of buffers to merge.
+   */
+  size_t numberOfBuffers;
+  /**
+   * Pointer to array of buffers to merge
+   */
+  CUpti_PCSamplingData *PcSampDataBuffer;
+  /**
+   * Pointer to array of merged buffers as per the range id.
+   */
+  CUpti_PCSamplingData **MergedPcSampDataBuffers;
+  /**
+   * Number of merged buffers.
+   */
+  size_t *numMergedBuffer;
+} CUptiUtil_MergePcSampDataParams;
+#define CUptiUtil_MergePcSampDataParamsSize                   CUPTI_UTIL_STRUCT_SIZE(CUptiUtil_MergePcSampDataParams, numMergedBuffer)
+
+/**
+ * \brief Merge PC sampling data range id wise.
+ *
+ * This API merges PC sampling data range id wise.
+ * It allocates memory for the merged data, fills the data in it
+ * and provides the buffer pointer in the MergedPcSampDataBuffers field.
+ * The user is expected to free the merged data buffers after use.
+ *
+ * \retval CUPTI_UTIL_SUCCESS
+ * \retval CUPTI_UTIL_ERROR_INVALID_PARAMETER error out if param struct size is invalid
+ * or count of buffers to merge is invalid i.e less than 1
+ * or either of PcSampDataBuffer, MergedPcSampDataBuffers, numMergedBuffer is NULL
+ * \retval CUPTI_UTIL_ERROR_OUT_OF_MEMORY Unable to allocate memory for merged buffer.
+ */
+CUptiUtilResult CUPTIUTILAPI CuptiUtilMergePcSampData(CUptiUtil_MergePcSampDataParams *pParams);
+
+/** @} */ /* END CUPTI_PCSAMPLING_UTILITY */
+
+} }
+
+#if defined(__GNUC__)
+    #pragma GCC visibility pop
+#endif
+
+#if defined(__cplusplus)
+}
+#endif
+
+#endif
diff --git a/.venv/lib/python3.11/site-packages/nvidia/cuda_cupti/include/cupti_profiler_target.h b/.venv/lib/python3.11/site-packages/nvidia/cuda_cupti/include/cupti_profiler_target.h
new file mode 100644
index 0000000000000000000000000000000000000000..af41b55e2226539d69e0631ebb78185399e8b936
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/nvidia/cuda_cupti/include/cupti_profiler_target.h
@@ -0,0 +1,601 @@
+/*
+ * Copyright 2011-2023   NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#if !defined(_CUPTI_PROFILER_TARGET_H_)
+#define _CUPTI_PROFILER_TARGET_H_
+
+#include <cuda.h>
+#include <cupti_result.h>
+#include <stddef.h>
+#include <stdint.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#if defined(__GNUC__) && defined(CUPTI_LIB)
+    #pragma GCC visibility push(default)
+#endif
+
+/**
+ * \defgroup CUPTI_PROFILER_API CUPTI Profiling API
+ * Functions, types, and enums that implement the CUPTI Profiling API.
+ * @{
+ */
+#ifndef CUPTI_PROFILER_STRUCT_SIZE
+#define CUPTI_PROFILER_STRUCT_SIZE(type_, lastfield_)                     (offsetof(type_, lastfield_) + sizeof(((type_*)0)->lastfield_))
+#endif /* CUPTI_PROFILER_STRUCT_SIZE: byte size of type_ up to and including lastfield_ */
+
+/**
+ * \brief Profiler range attribute
+ *
+ * A metric enabled in the session's configuration is collected separately per unique range-stack in the pass.
+ * This attribute selects whether metrics are collected around each kernel in a profiling session or in a user-defined range.
+ */
+typedef enum
+{
+    /**
+     * Invalid value
+     */
+    CUPTI_Range_INVALID,
+    /**
+     * Ranges are automatically defined around each kernel in a profiling session
+     */
+    CUPTI_AutoRange,
+    /**
+     * The range in which metric data is collected is defined by the user
+     */
+    CUPTI_UserRange,
+    /**
+     * Range count
+     */
+    CUPTI_Range_COUNT,
+} CUpti_ProfilerRange;
+
+/**
+ * \brief Profiler replay attribute
+ *
+ * For metrics which require multipass collection, a replay of the GPU kernel(s) is required.
+ * This attribute specifies how the replay of the kernel(s) to be measured is done.
+ */
+typedef enum
+{
+    /**
+     * Invalid value
+     */
+    CUPTI_Replay_INVALID,
+    /**
+     * Replay is done by the CUPTI user around the process
+     */
+    CUPTI_ApplicationReplay,
+    /**
+     * Replay is done around the kernel implicitly by CUPTI
+     */
+    CUPTI_KernelReplay,
+    /**
+     * Replay is done by the CUPTI user within a process
+     */
+    CUPTI_UserReplay,
+    /**
+     * Replay count
+     */
+    CUPTI_Replay_COUNT,
+} CUpti_ProfilerReplayMode;
+
+/**
+ * \brief Params for cuptiProfilerInitialize
+ */
+typedef struct CUpti_Profiler_Initialize_Params
+{
+    size_t structSize;                                      //!< [in] must be CUpti_Profiler_Initialize_Params_STRUCT_SIZE
+    void* pPriv;                                            //!< [in] assign to NULL
+
+} CUpti_Profiler_Initialize_Params;
+#define CUpti_Profiler_Initialize_Params_STRUCT_SIZE                  CUPTI_PROFILER_STRUCT_SIZE(CUpti_Profiler_Initialize_Params, pPriv)
+
+/**
+ * \brief Params for cuptiProfilerDeInitialize
+ */
+typedef struct CUpti_Profiler_DeInitialize_Params
+{
+    size_t structSize;                                      //!< [in] must be CUpti_Profiler_DeInitialize_Params_STRUCT_SIZE
+    void* pPriv;                                            //!< [in] assign to NULL
+
+} CUpti_Profiler_DeInitialize_Params;
+#define CUpti_Profiler_DeInitialize_Params_STRUCT_SIZE                  CUPTI_PROFILER_STRUCT_SIZE(CUpti_Profiler_DeInitialize_Params, pPriv)
+
+/**
+ * \brief Initializes the profiler interface
+ *
+ * Loads the required libraries into the process address space.
+ * Sets up the hooks with the CUDA driver.
+ */
+CUptiResult CUPTIAPI cuptiProfilerInitialize(CUpti_Profiler_Initialize_Params *pParams);
+
+/**
+ * \brief Deinitializes the profiler interface
+ */
+CUptiResult CUPTIAPI cuptiProfilerDeInitialize(CUpti_Profiler_DeInitialize_Params *pParams);
+
+/**
+ * \brief Input parameter to define the counterDataImage
+ */
+typedef struct CUpti_Profiler_CounterDataImageOptions
+{
+    size_t structSize;                                          //!< [in] must be CUpti_Profiler_CounterDataImageOptions_STRUCT_SIZE
+    void* pPriv;                                                //!< [in] assign to NULL
+
+    const uint8_t* pCounterDataPrefix;                          /**< [in] Address of CounterDataPrefix generated from NVPW_CounterDataBuilder_GetCounterDataPrefix().
+                                                                    Must be align(8).*/
+    size_t counterDataPrefixSize;                               //!< [in] Size of CounterDataPrefix generated from NVPW_CounterDataBuilder_GetCounterDataPrefix().
+    uint32_t maxNumRanges;                                      //!< [in] Maximum number of ranges that can be profiled
+    uint32_t maxNumRangeTreeNodes;                              //!< [in] Maximum number of RangeTree nodes; must be >= maxNumRanges
+    uint32_t maxRangeNameLength;                                //!< [in] Maximum string length of each RangeName, including the terminating NUL character
+} CUpti_Profiler_CounterDataImageOptions;
+#define CUpti_Profiler_CounterDataImageOptions_STRUCT_SIZE                       CUPTI_PROFILER_STRUCT_SIZE(CUpti_Profiler_CounterDataImageOptions, maxRangeNameLength)
+
+/**
+ * \brief Params for cuptiProfilerCounterDataImageCalculateSize
+ */
+typedef struct CUpti_Profiler_CounterDataImage_CalculateSize_Params
+{
+    size_t structSize;                                          //!< [in] must be CUpti_Profiler_CounterDataImage_CalculateSize_Params_STRUCT_SIZE
+    void* pPriv;                                                //!< [in] assign to NULL
+
+    size_t sizeofCounterDataImageOptions;                       //!< [in] CUpti_Profiler_CounterDataImageOptions_STRUCT_SIZE
+    const CUpti_Profiler_CounterDataImageOptions* pOptions;     //!< [in] Pointer to Counter Data Image Options
+    size_t counterDataImageSize;                                //!< [out] calculated size of the counterDataImage
+} CUpti_Profiler_CounterDataImage_CalculateSize_Params;
+#define CUpti_Profiler_CounterDataImage_CalculateSize_Params_STRUCT_SIZE         CUPTI_PROFILER_STRUCT_SIZE(CUpti_Profiler_CounterDataImage_CalculateSize_Params, counterDataImageSize)
+
+/**
+ * \brief Params for cuptiProfilerCounterDataImageInitialize
+ */
+typedef struct CUpti_Profiler_CounterDataImage_Initialize_Params
+{
+    size_t structSize;                                          //!< [in] must be CUpti_Profiler_CounterDataImage_Initialize_Params_STRUCT_SIZE
+    void* pPriv;                                                //!< [in] assign to NULL
+
+    size_t sizeofCounterDataImageOptions;                       //!< [in] CUpti_Profiler_CounterDataImageOptions_STRUCT_SIZE
+    const CUpti_Profiler_CounterDataImageOptions* pOptions;     //!< [in] Pointer to Counter Data Image Options
+    size_t counterDataImageSize;                                //!< [in] Size calculated from cuptiProfilerCounterDataImageCalculateSize
+    uint8_t* pCounterDataImage;                                 //!< [in] The counterDataImage buffer to be initialized.
+} CUpti_Profiler_CounterDataImage_Initialize_Params;
+#define CUpti_Profiler_CounterDataImage_Initialize_Params_STRUCT_SIZE            CUPTI_PROFILER_STRUCT_SIZE(CUpti_Profiler_CounterDataImage_Initialize_Params, pCounterDataImage)
+
+/**
+ * \brief A CounterData image allocates space for values for each counter for each range.
+ *
+ * The user bears the responsibility of managing the counterDataImage allocations.
+ * CounterDataPrefix contains metadata about the metrics that will be stored in counterDataImage.
+ * Use these APIs to calculate the allocation size and initialize the counterData image.
+ */
+CUptiResult cuptiProfilerCounterDataImageCalculateSize(CUpti_Profiler_CounterDataImage_CalculateSize_Params* pParams); /* NOTE(review): declared without CUPTIAPI, unlike cuptiProfilerInitialize - confirm intentional */
+CUptiResult cuptiProfilerCounterDataImageInitialize(CUpti_Profiler_CounterDataImage_Initialize_Params* pParams);
+
+/**
+ * \brief Params for cuptiProfilerCounterDataImageCalculateScratchBufferSize
+ */
+typedef struct CUpti_Profiler_CounterDataImage_CalculateScratchBufferSize_Params
+{
+    size_t structSize;                                      //!< [in] must be CUpti_Profiler_CounterDataImage_CalculateScratchBufferSize_Params_STRUCT_SIZE
+    void* pPriv;                                            //!< [in] assign to NULL
+
+    size_t counterDataImageSize;                            //!< [in] size calculated from cuptiProfilerCounterDataImageCalculateSize
+    uint8_t* pCounterDataImage;                             //!< [in] pointer to the counterDataImage
+    size_t counterDataScratchBufferSize;                    //!< [out] calculated size of the counterData scratch buffer
+} CUpti_Profiler_CounterDataImage_CalculateScratchBufferSize_Params;
+#define CUpti_Profiler_CounterDataImage_CalculateScratchBufferSize_Params_STRUCT_SIZE    CUPTI_PROFILER_STRUCT_SIZE(CUpti_Profiler_CounterDataImage_CalculateScratchBufferSize_Params, counterDataScratchBufferSize)
+
+/**
+ * \brief Params for cuptiProfilerCounterDataImageInitializeScratchBuffer
+ */
+typedef struct CUpti_Profiler_CounterDataImage_InitializeScratchBuffer_Params
+{
+    size_t structSize;                                      //!< [in] must be CUpti_Profiler_CounterDataImage_InitializeScratchBuffer_Params_STRUCT_SIZE
+    void* pPriv;                                            //!< [in] assign to NULL
+
+    size_t counterDataImageSize;                            //!< [in] size calculated from cuptiProfilerCounterDataImageCalculateSize
+    uint8_t* pCounterDataImage;                             //!< [in] pointer to the counterDataImage
+    size_t counterDataScratchBufferSize;                    //!< [in] size calculated using cuptiProfilerCounterDataImageCalculateScratchBufferSize
+    uint8_t* pCounterDataScratchBuffer;                     //!< [in] the scratch buffer to be initialized.
+} CUpti_Profiler_CounterDataImage_InitializeScratchBuffer_Params;
+#define CUpti_Profiler_CounterDataImage_InitializeScratchBuffer_Params_STRUCT_SIZE       CUPTI_PROFILER_STRUCT_SIZE(CUpti_Profiler_CounterDataImage_InitializeScratchBuffer_Params, pCounterDataScratchBuffer)
+
+/**
+ * \brief Temporary storage for the CounterData image, needed for internal operations
+ *
+ * Use these APIs to calculate the allocation size and initialize the counterData image scratch buffer.
+ */
+CUptiResult cuptiProfilerCounterDataImageCalculateScratchBufferSize(CUpti_Profiler_CounterDataImage_CalculateScratchBufferSize_Params* pParams); /* NOTE(review): declared without CUPTIAPI, unlike cuptiProfilerInitialize - confirm intentional */
+CUptiResult cuptiProfilerCounterDataImageInitializeScratchBuffer(CUpti_Profiler_CounterDataImage_InitializeScratchBuffer_Params* pParams);
+
+/**
+ * \brief Params for cuptiProfilerBeginSession
+ */
+typedef struct CUpti_Profiler_BeginSession_Params
+{
+    size_t structSize;                                      //!< [in] must be CUpti_Profiler_BeginSession_Params_STRUCT_SIZE
+    void* pPriv;                                            //!< [in] assign to NULL
+
+    CUcontext ctx;                                          //!< [in] if NULL, the current CUcontext is used
+    size_t counterDataImageSize;                            //!< [in] size calculated from cuptiProfilerCounterDataImageCalculateSize
+    uint8_t* pCounterDataImage;                             //!< [in] address of CounterDataImage
+    size_t counterDataScratchBufferSize;                    //!< [in] size calculated from cuptiProfilerCounterDataImageCalculateScratchBufferSize
+    uint8_t* pCounterDataScratchBuffer;                     //!< [in] address of CounterDataImage scratch buffer
+    uint8_t bDumpCounterDataInFile;                          //!< [in] [optional]
+    const char* pCounterDataFilePath;                        //!< [in] [optional]
+    CUpti_ProfilerRange range;                               //!< [in] CUpti_ProfilerRange
+    CUpti_ProfilerReplayMode replayMode;                     //!< [in] CUpti_ProfilerReplayMode
+    /* Replay options, required when replay is done by the CUPTI user */
+    size_t maxRangesPerPass;                                //!< [in] Maximum number of ranges that can be recorded in a single pass.
+    size_t maxLaunchesPerPass;                              //!< [in] Maximum number of kernel launches that can be recorded in a single pass; must be >= maxRangesPerPass.
+
+} CUpti_Profiler_BeginSession_Params;
+#define CUpti_Profiler_BeginSession_Params_STRUCT_SIZE                  CUPTI_PROFILER_STRUCT_SIZE(CUpti_Profiler_BeginSession_Params, maxLaunchesPerPass)
+/**
+ * \brief Params for cuptiProfilerEndSession
+ */
+typedef struct CUpti_Profiler_EndSession_Params
+{
+    size_t structSize;                                      //!< [in] must be CUpti_Profiler_EndSession_Params_STRUCT_SIZE
+    void* pPriv;                                            //!< [in] assign to NULL
+
+    CUcontext ctx;                                          //!< [in] if NULL, the current CUcontext is used
+} CUpti_Profiler_EndSession_Params;
+#define CUpti_Profiler_EndSession_Params_STRUCT_SIZE                  CUPTI_PROFILER_STRUCT_SIZE(CUpti_Profiler_EndSession_Params, ctx)
+
+/**
+ * \brief Begin profiling session sets up the profiling on the device
+ *
+ * It does not start the profiling, but the GPU resources needed for profiling are allocated.
+ * Outside of a session, the GPU will return to its normal operating state.
+ */
+CUptiResult CUPTIAPI cuptiProfilerBeginSession(CUpti_Profiler_BeginSession_Params* pParams);
+/**
+ * \brief Ends profiling session
+ *
+ * Frees up the GPU resources acquired for profiling.
+ * Outside of a session, the GPU will return to its normal operating state.
+ */
+CUptiResult CUPTIAPI cuptiProfilerEndSession(CUpti_Profiler_EndSession_Params* pParams);
+
+/**
+ * \brief Params for cuptiProfilerSetConfig
+ */
+typedef struct CUpti_Profiler_SetConfig_Params
+{
+    size_t structSize;                                      //!< [in] must be CUpti_Profiler_SetConfig_Params_STRUCT_SIZE
+    void* pPriv;                                            //!< [in] assign to NULL
+
+    CUcontext ctx;                                          //!< [in] if NULL, the current CUcontext is used
+    const uint8_t* pConfig;                                 //!< [in] Config created by NVPW_RawMetricsConfig_GetConfigImage(). Must be align(8).
+    size_t configSize;                                      //!< [in] size of config
+    uint16_t minNestingLevel;                               //!< [in] the lowest nesting level to be profiled; must be >= 1
+    uint16_t numNestingLevels;                              //!< [in] the number of nesting levels to profile; must be >= 1
+    size_t passIndex;                                       //!< [in] Set this to zero for in-app replay; set this to the output of EndPass() for application replay
+    uint16_t targetNestingLevel;                            //!< [in] Set this to minNestingLevel for in-app replay; set this to the output of EndPass() for application replay
+} CUpti_Profiler_SetConfig_Params;
+
+#define CUpti_Profiler_SetConfig_Params_STRUCT_SIZE                    CUPTI_PROFILER_STRUCT_SIZE(CUpti_Profiler_SetConfig_Params, targetNestingLevel)
+
+/**
+ * \brief Params for cuptiProfilerUnsetConfig
+ */
+typedef struct CUpti_Profiler_UnsetConfig_Params
+{
+    size_t structSize;                                      //!< [in] must be CUpti_Profiler_UnsetConfig_Params_STRUCT_SIZE
+    void* pPriv;                                            //!< [in] assign to NULL
+
+    CUcontext ctx;                                          //!< [in] if NULL, the current CUcontext is used
+} CUpti_Profiler_UnsetConfig_Params;
+#define CUpti_Profiler_UnsetConfig_Params_STRUCT_SIZE                  CUPTI_PROFILER_STRUCT_SIZE(CUpti_Profiler_UnsetConfig_Params, ctx)
+
+/**
+ * \brief Set metrics configuration to be profiled
+ *
+ * Use these APIs to set the config to profile in a session. It can be used for advanced cases such as where multiple
+ * configurations are collected into a single CounterData Image on an as-needed basis, without restarting the session.
+ */
+CUptiResult CUPTIAPI cuptiProfilerSetConfig(CUpti_Profiler_SetConfig_Params* pParams);
+/**
+ * \brief Unset the profiled metrics configuration
+ *
+ */
+CUptiResult CUPTIAPI cuptiProfilerUnsetConfig(CUpti_Profiler_UnsetConfig_Params* pParams);
+
+/**
+ * \brief Params for cuptiProfilerBeginPass
+ */
+typedef struct CUpti_Profiler_BeginPass_Params
+{
+    size_t structSize;                                      //!< [in] must be CUpti_Profiler_BeginPass_Params_STRUCT_SIZE
+    void* pPriv;                                            //!< [in] assign to NULL
+
+    CUcontext ctx;                                          //!< [in] if NULL, the current CUcontext is used
+} CUpti_Profiler_BeginPass_Params;
+#define CUpti_Profiler_BeginPass_Params_STRUCT_SIZE                  CUPTI_PROFILER_STRUCT_SIZE(CUpti_Profiler_BeginPass_Params, ctx)
+
+/**
+ * \brief Params for cuptiProfilerEndPass
+ */
+typedef struct CUpti_Profiler_EndPass_Params
+{
+    size_t structSize;                                      //!< [in] must be CUpti_Profiler_EndPass_Params_STRUCT_SIZE
+    void* pPriv;                                            //!< [in] assign to NULL
+
+    CUcontext ctx;                                          //!< [in] if NULL, the current CUcontext is used
+    uint16_t targetNestingLevel;                            //!< [out] The targetNestingLevel that will be collected by the *next* BeginPass.
+    size_t passIndex;                                       //!< [out] The passIndex that will be collected by the *next* BeginPass
+    uint8_t allPassesSubmitted;                             //!< [out] becomes true when the last pass has been queued to the GPU
+} CUpti_Profiler_EndPass_Params;
+#define CUpti_Profiler_EndPass_Params_STRUCT_SIZE                    CUPTI_PROFILER_STRUCT_SIZE(CUpti_Profiler_EndPass_Params, allPassesSubmitted)
+
+/**
+ * \brief Replay API: used for multipass collection.
+ *
+ * These APIs are used if the user chooses to do the replay itself (\ref CUPTI_UserReplay or \ref CUPTI_ApplicationReplay)
+ * for multipass collection of the metrics configurations.
+ * It's a no-op in case of \ref CUPTI_KernelReplay.
+ */
+CUptiResult cuptiProfilerBeginPass(CUpti_Profiler_BeginPass_Params* pParams); /* NOTE(review): declared without CUPTIAPI, unlike cuptiProfilerInitialize - confirm intentional */
+
+/**
+ * \brief Replay API: used for multipass collection.
+ *
+ * These APIs are used if the user chooses to do the replay itself (\ref CUPTI_UserReplay or \ref CUPTI_ApplicationReplay)
+ * for multipass collection of the metrics configurations.
+ * It's a no-op in case of \ref CUPTI_KernelReplay.
+ * Returns information for the next pass.
+ */
+CUptiResult cuptiProfilerEndPass(CUpti_Profiler_EndPass_Params* pParams); /* NOTE(review): declared without CUPTIAPI, unlike cuptiProfilerInitialize - confirm intentional */
+
+/**
+ * \brief Params for cuptiProfilerEnableProfiling
+ */
+typedef struct CUpti_Profiler_EnableProfiling_Params
+{
+    size_t structSize;                                      //!< [in] must be CUpti_Profiler_EnableProfiling_Params_STRUCT_SIZE
+    void* pPriv;                                            //!< [in] assign to NULL
+
+    CUcontext ctx;                                          //!< [in] if NULL, the current CUcontext is used
+} CUpti_Profiler_EnableProfiling_Params;
+#define CUpti_Profiler_EnableProfiling_Params_STRUCT_SIZE                  CUPTI_PROFILER_STRUCT_SIZE(CUpti_Profiler_EnableProfiling_Params, ctx)
+
+/**
+ * \brief Params for cuptiProfilerDisableProfiling
+ */
+typedef struct CUpti_Profiler_DisableProfiling_Params
+{
+    size_t structSize;                                      //!< [in] must be CUpti_Profiler_DisableProfiling_Params_STRUCT_SIZE
+    void* pPriv;                                            //!< [in] assign to NULL
+
+    CUcontext ctx;                                          //!< [in] if NULL, the current CUcontext is used
+} CUpti_Profiler_DisableProfiling_Params;
+#define CUpti_Profiler_DisableProfiling_Params_STRUCT_SIZE                  CUPTI_PROFILER_STRUCT_SIZE(CUpti_Profiler_DisableProfiling_Params, ctx)
+
+/**
+ * \brief Enables profiling
+ *
+ * In \ref CUPTI_AutoRange, these APIs are used to enable/disable profiling for the kernels to be executed in
+ * a profiling session.
+ */
+CUptiResult CUPTIAPI cuptiProfilerEnableProfiling(CUpti_Profiler_EnableProfiling_Params* pParams);
+
+/**
+ * \brief Disables profiling
+ *
+ * In \ref CUPTI_AutoRange, these APIs are used to enable/disable profiling for the kernels to be executed in
+ * a profiling session.
+ */
+CUptiResult CUPTIAPI cuptiProfilerDisableProfiling(CUpti_Profiler_DisableProfiling_Params* pParams);
+
+/**
+ * \brief Params for cuptiProfilerIsPassCollected
+ */
+typedef struct CUpti_Profiler_IsPassCollected_Params
+{
+    size_t structSize;                                      //!< [in] must be CUpti_Profiler_IsPassCollected_Params_STRUCT_SIZE
+    void* pPriv;                                            //!< [in] assign to NULL
+
+    CUcontext ctx;                                          //!< [in] if NULL, the current CUcontext is used
+    size_t numRangesDropped;                                //!< [out] number of ranges whose data was dropped in the processed pass
+    size_t numTraceBytesDropped;                            //!< [out] number of bytes not written to TraceBuffer because the buffer was full
+    uint8_t onePassCollected;                               //!< [out] true if a pass was successfully decoded
+    uint8_t allPassesCollected;                             //!< [out] becomes true when the last pass has been decoded
+} CUpti_Profiler_IsPassCollected_Params;
+#define CUpti_Profiler_IsPassCollected_Params_STRUCT_SIZE            CUPTI_PROFILER_STRUCT_SIZE(CUpti_Profiler_IsPassCollected_Params, allPassesCollected)
+
+/**
+ * \brief Asynchronous call to query whether the pass submitted to the GPU has been collected
+ *
+ */
+CUptiResult CUPTIAPI cuptiProfilerIsPassCollected(CUpti_Profiler_IsPassCollected_Params* pParams);
+
+/**
+ * \brief Params for cuptiProfilerFlushCounterData
+ */
+typedef struct CUpti_Profiler_FlushCounterData_Params
+{
+    size_t structSize;                                      //!< [in] must be CUpti_Profiler_FlushCounterData_Params_STRUCT_SIZE
+    void* pPriv;                                            //!< [in] assign to NULL
+
+    CUcontext ctx;                                          //!< [in] if NULL, the current CUcontext is used
+    size_t numRangesDropped;                                //!< [out] number of ranges whose data was dropped in the processed passes
+    size_t numTraceBytesDropped;                            //!< [out] number of bytes not written to TraceBuffer because the buffer was full
+} CUpti_Profiler_FlushCounterData_Params;
+#define CUpti_Profiler_FlushCounterData_Params_STRUCT_SIZE           CUPTI_PROFILER_STRUCT_SIZE(CUpti_Profiler_FlushCounterData_Params, numTraceBytesDropped)
+
+/**
+ * \brief Decode all the submitted passes
+ *
+ * Flush Counter data API to ensure every pass is decoded into the counterDataImage passed at beginSession.
+ * This will cause a CPU/GPU sync to collect all the undecoded passes.
+ */
+CUptiResult CUPTIAPI cuptiProfilerFlushCounterData(CUpti_Profiler_FlushCounterData_Params* pParams);
+/** \brief Params for cuptiProfilerPushRange */
+typedef struct CUpti_Profiler_PushRange_Params
+{
+    size_t structSize;                                      //!< [in] must be CUpti_Profiler_PushRange_Params_STRUCT_SIZE
+    void* pPriv;                                            //!< [in] assign to NULL
+
+    CUcontext ctx;                                          //!< [in] if NULL, the current CUcontext is used
+    const char* pRangeName;                                 //!< [in] specifies the range for subsequent launches; must not be NULL
+    size_t rangeNameLength;                                 //!< [in] assign to strlen(pRangeName) if known; if set to zero, the library will call strlen()
+} CUpti_Profiler_PushRange_Params;
+#define CUpti_Profiler_PushRange_Params_STRUCT_SIZE                  CUPTI_PROFILER_STRUCT_SIZE(CUpti_Profiler_PushRange_Params, rangeNameLength)
+/** \brief Params for cuptiProfilerPopRange */
+typedef struct CUpti_Profiler_PopRange_Params
+{
+    size_t structSize;                                      //!< [in] must be CUpti_Profiler_PopRange_Params_STRUCT_SIZE
+    void* pPriv;                                            //!< [in] assign to NULL
+
+    CUcontext ctx;                                          //!< [in] if NULL, the current CUcontext is used
+} CUpti_Profiler_PopRange_Params;
+#define CUpti_Profiler_PopRange_Params_STRUCT_SIZE                  CUPTI_PROFILER_STRUCT_SIZE(CUpti_Profiler_PopRange_Params, ctx)
+
+
+/**
+ * \brief Range APIs: Push user range
+ *
+ * Counter data is collected per unique range-stack, identified by a string label passed by the user.
+ * It's an invalid operation in case of \ref CUPTI_AutoRange.
+ */
+CUptiResult CUPTIAPI cuptiProfilerPushRange(CUpti_Profiler_PushRange_Params *pParams);
+
+/**
+ * \brief Range APIs: Pop user range
+ *
+ * Counter data is collected per unique range-stack, identified by a string label passed by the user.
+ * It's an invalid operation in case of \ref CUPTI_AutoRange.
+ */
+CUptiResult CUPTIAPI cuptiProfilerPopRange(CUpti_Profiler_PopRange_Params *pParams);
+
+/**
+ * \brief Params for cuptiProfilerGetCounterAvailability
+ */
+typedef struct CUpti_Profiler_GetCounterAvailability_Params
+{
+    size_t structSize;                                  //!< [in] must be CUpti_Profiler_GetCounterAvailability_Params_STRUCT_SIZE
+    void* pPriv;                                        //!< [in] assign to NULL
+    CUcontext ctx;                                      //!< [in] if NULL, the current CUcontext is used
+    size_t counterAvailabilityImageSize;                //!< [in/out] If `pCounterAvailabilityImage` is NULL, then the required size is returned in
+                                                        //!< `counterAvailabilityImageSize`, otherwise `counterAvailabilityImageSize` should be set to the size of
+                                                        //!< `pCounterAvailabilityImage`, and on return it is overwritten with the number of bytes actually copied
+    uint8_t* pCounterAvailabilityImage;                 //!< [in] buffer receiving counter availability image, may be NULL
+} CUpti_Profiler_GetCounterAvailability_Params;
+#define CUpti_Profiler_GetCounterAvailability_Params_STRUCT_SIZE                  CUPTI_PROFILER_STRUCT_SIZE(CUpti_Profiler_GetCounterAvailability_Params, pCounterAvailabilityImage)
+
+/**
+ * \brief Query counter availability
+ *
+ * Use this API to query counter availability information in a buffer which can be used to filter unavailable raw metrics on host.
+ * Note: This API may fail if any profiling or sampling session is active on the specified context or its device.
+ */
+CUptiResult CUPTIAPI cuptiProfilerGetCounterAvailability(CUpti_Profiler_GetCounterAvailability_Params *pParams);
+
+/// Generic support level enum for CUPTI
+typedef enum
+{
+    CUPTI_PROFILER_CONFIGURATION_UNKNOWN = 0, //!< Configuration support level unknown - either the detection code errored out before setting this value, or the level could not be determined
+    CUPTI_PROFILER_CONFIGURATION_UNSUPPORTED, //!< Profiling is unavailable.  For specific feature fields, this means that the current configuration of this feature does not work with profiling.  For instance, SLI-enabled devices do not support profiling, and this value would be returned for SLI on an SLI-enabled device.
+    CUPTI_PROFILER_CONFIGURATION_DISABLED,    //!< Profiling would be available for this configuration, but was disabled by the system
+    CUPTI_PROFILER_CONFIGURATION_SUPPORTED    //!< Profiling is supported.  For specific feature fields, this means that the current configuration of this feature works with profiling.  For instance, SLI-enabled devices do not support profiling, and this value would only be returned for devices which are not SLI-enabled.
+} CUpti_Profiler_Support_Level;
+
+/**
+ * \brief Profiler API types
+ */
+typedef enum
+{
+    CUPTI_PROFILER_RANGE_PROFILING = 0,       //!< CUPTI APIs for range based profiling (cuptiProfiler*)
+    CUPTI_PROFILER_PC_SAMPLING,               //!< CUPTI APIs collecting pc sampling data (cuptiPcSampling*)
+    CUPTI_PROFILER_SASS_METRICS,              //!< CUPTI APIs collecting SASS metrics data (cuptiSassMetrics*)
+    CUPTI_PROFILER_UNKNOWN                    //!< API type unknown (NOTE(review): presumably a sentinel value - confirm)
+} CUpti_Profiler_API;
+
+/**
+ * \brief Params for cuptiProfilerDeviceSupported
+ */
+typedef struct
+{
+    size_t structSize;                                //!< [in] Must be CUpti_Profiler_DeviceSupported_Params_STRUCT_SIZE
+    void *pPriv;                                      //!< [in] assign to NULL
+    CUdevice cuDevice;                                //!< [in] device to check; original note read "if NULL, the current CUcontext is used" - NOTE(review): looks copy-pasted from the ctx fields, confirm
+
+    CUpti_Profiler_Support_Level isSupported;         //!< [out] overall SUPPORTED / UNSUPPORTED flag representing whether Profiling and PC Sampling APIs work on the given device and configuration. SUPPORTED if all following flags are SUPPORTED, UNSUPPORTED otherwise.
+
+    CUpti_Profiler_Support_Level architecture;        //!< [out] SUPPORTED if the device architecture level supports the Profiling API (Compute Capability >= 7.0), UNSUPPORTED otherwise
+    CUpti_Profiler_Support_Level sli;                 //!< [out] SUPPORTED if SLI is not enabled, UNSUPPORTED otherwise
+    CUpti_Profiler_Support_Level vGpu;                //!< [out] SUPPORTED if vGPU is supported and profiling is enabled, DISABLED if profiling is supported but not enabled, UNSUPPORTED otherwise
+    CUpti_Profiler_Support_Level confidentialCompute; //!< [out] SUPPORTED if confidential compute is not enabled, UNSUPPORTED otherwise
+    CUpti_Profiler_Support_Level cmp;                 //!< [out] SUPPORTED if not NVIDIA Crypto Mining Processors (CMP), UNSUPPORTED otherwise
+    CUpti_Profiler_Support_Level wsl;                 //!< [out] SUPPORTED if WSL supported, UNSUPPORTED otherwise
+    CUpti_Profiler_API     api;                       //!< [in] the CUPTI API type for which device support will be checked
+} CUpti_Profiler_DeviceSupported_Params;
+#define CUpti_Profiler_DeviceSupported_Params_STRUCT_SIZE CUPTI_PROFILER_STRUCT_SIZE(CUpti_Profiler_DeviceSupported_Params, api)
+
+/**
+ * \brief Query device compatibility with Profiling API
+ *
+ * Use this call to determine whether a compute device and configuration are compatible with the CUPTI API type given in pParams->api.
+ * If the configuration does not support profiling, one of several flags will indicate why.
+ */
+CUptiResult CUPTIAPI cuptiProfilerDeviceSupported(CUpti_Profiler_DeviceSupported_Params *pParams);
+
+/** @} */ /* END CUPTI_PROFILER_API */
+#if defined(__GNUC__) && defined(CUPTI_LIB)
+    #pragma GCC visibility pop
+#endif
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif
+
+#endif /*_CUPTI_PROFILER_TARGET_H_*/
diff --git a/.venv/lib/python3.11/site-packages/nvidia/cuda_cupti/include/cupti_result.h b/.venv/lib/python3.11/site-packages/nvidia/cuda_cupti/include/cupti_result.h
new file mode 100644
index 0000000000000000000000000000000000000000..7b0dff85c12e8fffd7bc3beb81e6846eaea3a481
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/nvidia/cuda_cupti/include/cupti_result.h
@@ -0,0 +1,346 @@
+/*
+ * Copyright 2010-2021 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#if !defined(_CUPTI_RESULT_H_)
+#define _CUPTI_RESULT_H_
+
+#ifndef CUPTIAPI
+#ifdef _WIN32
+#define CUPTIAPI __stdcall
+#else
+#define CUPTIAPI
+#endif
+#endif
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+#if defined(__GNUC__) && defined(CUPTI_LIB)
+    #pragma GCC visibility push(default)
+#endif
+
+/**
+ * \defgroup CUPTI_RESULT_API CUPTI Result Codes
+ * Error and result codes returned by CUPTI functions.
+ * @{
+ */
+
+/**
+ * \brief CUPTI result codes.
+ *
+ * Error and result codes returned by CUPTI functions.
+ */
+typedef enum {
+    /**
+     * No error.
+     */
+    CUPTI_SUCCESS                                       = 0,
+    /**
+     * One or more of the parameters is invalid.
+     */
+    CUPTI_ERROR_INVALID_PARAMETER                       = 1,
+    /**
+     * The device does not correspond to a valid CUDA device.
+     */
+    CUPTI_ERROR_INVALID_DEVICE                          = 2,
+    /**
+     * The context is NULL or not valid.
+     */
+    CUPTI_ERROR_INVALID_CONTEXT                         = 3,
+    /**
+     * The event domain id is invalid.
+     */
+    CUPTI_ERROR_INVALID_EVENT_DOMAIN_ID                 = 4,
+    /**
+     * The event id is invalid.
+     */
+    CUPTI_ERROR_INVALID_EVENT_ID                        = 5,
+    /**
+     * The event name is invalid.
+     */
+    CUPTI_ERROR_INVALID_EVENT_NAME                      = 6,
+    /**
+     * The current operation cannot be performed due to dependency on
+     * other factors.
+     */
+    CUPTI_ERROR_INVALID_OPERATION                       = 7,
+    /**
+     * Unable to allocate enough memory to perform the requested
+     * operation.
+     */
+    CUPTI_ERROR_OUT_OF_MEMORY                           = 8,
+    /**
+     * An error occurred on the performance monitoring hardware.
+     */
+    CUPTI_ERROR_HARDWARE                                = 9,
+    /**
+     * The output buffer size is not sufficient to return all
+     * requested data.
+     */
+    CUPTI_ERROR_PARAMETER_SIZE_NOT_SUFFICIENT           = 10,
+    /**
+     * API is not implemented.
+     */
+    CUPTI_ERROR_API_NOT_IMPLEMENTED                     = 11,
+    /**
+     * The maximum limit is reached.
+     */
+    CUPTI_ERROR_MAX_LIMIT_REACHED                       = 12,
+    /**
+     * The object is not yet ready to perform the requested operation.
+     */
+    CUPTI_ERROR_NOT_READY                               = 13,
+    /**
+     * The current operation is not compatible with the current state
+     * of the object.
+     */
+    CUPTI_ERROR_NOT_COMPATIBLE                          = 14,
+    /**
+     * CUPTI is unable to initialize its connection to the CUDA
+     * driver.
+     */
+    CUPTI_ERROR_NOT_INITIALIZED                         = 15,
+    /**
+     * The metric id is invalid.
+     */
+    CUPTI_ERROR_INVALID_METRIC_ID                        = 16,
+    /**
+     * The metric name is invalid.
+     */
+    CUPTI_ERROR_INVALID_METRIC_NAME                      = 17,
+    /**
+     * The queue is empty.
+     */
+    CUPTI_ERROR_QUEUE_EMPTY                              = 18,
+    /**
+     * Invalid handle (internal?).
+     */
+    CUPTI_ERROR_INVALID_HANDLE                           = 19,
+    /**
+     * Invalid stream.
+     */
+    CUPTI_ERROR_INVALID_STREAM                           = 20,
+    /**
+     * Invalid kind.
+     */
+    CUPTI_ERROR_INVALID_KIND                             = 21,
+    /**
+     * Invalid event value.
+     */
+    CUPTI_ERROR_INVALID_EVENT_VALUE                      = 22,
+    /**
+     * CUPTI is disabled due to conflicts with other enabled profilers.
+     */
+    CUPTI_ERROR_DISABLED                                 = 23,
+    /**
+     * Invalid module.
+     */
+    CUPTI_ERROR_INVALID_MODULE                           = 24,
+    /**
+     * Invalid metric value.
+     */
+    CUPTI_ERROR_INVALID_METRIC_VALUE                     = 25,
+    /**
+     * The performance monitoring hardware is in use by another client.
+     */
+    CUPTI_ERROR_HARDWARE_BUSY                            = 26,
+    /**
+     * The attempted operation is not supported on the current
+     * system or device.
+     */
+    CUPTI_ERROR_NOT_SUPPORTED                            = 27,
+    /**
+     * Unified memory profiling is not supported on the system.
+     * Potential reason could be unsupported OS or architecture.
+     */
+    CUPTI_ERROR_UM_PROFILING_NOT_SUPPORTED               = 28,
+    /**
+     * Unified memory profiling is not supported on the device.
+     */
+    CUPTI_ERROR_UM_PROFILING_NOT_SUPPORTED_ON_DEVICE     = 29,
+    /**
+     * Unified memory profiling is not supported on a multi-GPU
+     * configuration without P2P support between any pair of devices.
+     */
+    CUPTI_ERROR_UM_PROFILING_NOT_SUPPORTED_ON_NON_P2P_DEVICES = 30,
+    /**
+     * Unified memory profiling is not supported under the
+     * Multi-Process Service (MPS) environment. CUDA 7.5 removes this
+     * restriction.
+     */
+    CUPTI_ERROR_UM_PROFILING_NOT_SUPPORTED_WITH_MPS      = 31,
+    /**
+     * In CUDA 9.0, devices with compute capability 7.0 don't
+     * support CDP tracing.
+     */
+    CUPTI_ERROR_CDP_TRACING_NOT_SUPPORTED                = 32,
+    /**
+     * Profiling on virtualized GPU is not supported.
+     */
+    CUPTI_ERROR_VIRTUALIZED_DEVICE_NOT_SUPPORTED         = 33,
+    /**
+     * Profiling results might be incorrect for CUDA applications
+     * compiled with nvcc version older than 9.0 for devices with
+     * compute capability 6.0 and 6.1.
+     * Profiling session will continue and CUPTI will notify it using this error code.
+     * User is advised to recompile the application code with nvcc version 9.0 or later.
+     * Ignore this warning if code is already compiled with the recommended nvcc version.
+     */
+    CUPTI_ERROR_CUDA_COMPILER_NOT_COMPATIBLE             = 34,
+    /**
+     * User doesn't have sufficient privileges which are required to
+     * start the profiling session.
+     * One possible reason for this may be that the NVIDIA driver or your system
+     * administrator may have restricted access to the NVIDIA GPU performance counters.
+     * To learn how to resolve this issue and find more information, please visit
+     * https://developer.nvidia.com/CUPTI_ERROR_INSUFFICIENT_PRIVILEGES
+     */
+    CUPTI_ERROR_INSUFFICIENT_PRIVILEGES                  = 35,
+    /**
+     * Legacy CUPTI Profiling API i.e. event API from the header cupti_events.h and
+     * metric API from the header cupti_metrics.h are not compatible with the
+     * Profiling API in the header cupti_profiler_target.h and Perfworks metrics API
+     * in the headers nvperf_host.h and nvperf_target.h.
+     */
+    CUPTI_ERROR_OLD_PROFILER_API_INITIALIZED             = 36,
+    /**
+     * Missing definition of the OpenACC API routine in the linked OpenACC library.
+     *
+     * One possible reason is that OpenACC library is linked statically in the
+     * user application, which might not have the definition of all the OpenACC
+     * API routines needed for the OpenACC profiling, as compiler might ignore
+     * definitions for the functions not used in the application. This issue
+     * can be mitigated by linking the OpenACC library dynamically.
+     */
+    CUPTI_ERROR_OPENACC_UNDEFINED_ROUTINE                = 37,
+    /**
+     * Legacy CUPTI Profiling API i.e. event API from the header cupti_events.h and
+     * metric API from the header cupti_metrics.h are not supported on devices with
+     * compute capability 7.5 and higher (i.e. Turing and later GPU architectures).
+     * These APIs will be deprecated in a future CUDA release. These are replaced by
+     * Profiling API in the header cupti_profiler_target.h and Perfworks metrics API
+     * in the headers nvperf_host.h and nvperf_target.h.
+     */
+    CUPTI_ERROR_LEGACY_PROFILER_NOT_SUPPORTED            = 38,
+    /**
+     * CUPTI doesn't allow multiple callback subscribers. Only a single subscriber
+     * can be registered at a time.
+     * Same error code is used when application is launched using NVIDIA tools
+     * like nvprof, Visual Profiler, Nsight Systems, Nsight Compute, cuda-gdb and
+     * cuda-memcheck.
+     */
+    CUPTI_ERROR_MULTIPLE_SUBSCRIBERS_NOT_SUPPORTED       = 39,
+    /**
+     * Profiling on virtualized GPU is not allowed by hypervisor.
+     */
+    CUPTI_ERROR_VIRTUALIZED_DEVICE_INSUFFICIENT_PRIVILEGES = 40,
+    /**
+     * Profiling and tracing are not allowed when confidential computing mode
+     * is enabled.
+     */
+    CUPTI_ERROR_CONFIDENTIAL_COMPUTING_NOT_SUPPORTED = 41,
+    /**
+     * CUPTI does not support NVIDIA Crypto Mining Processors (CMP).
+     * For more information, please visit https://developer.nvidia.com/ERR_NVCMPGPU
+     */
+    CUPTI_ERROR_CMP_DEVICE_NOT_SUPPORTED = 42,
+    /**
+     * An unknown internal error has occurred.
+     */
+    CUPTI_ERROR_UNKNOWN                                  = 999,
+    CUPTI_ERROR_FORCE_INT                                = 0x7fffffff
+} CUptiResult;
+
+/**
+ * \brief Get the descriptive string for a CUptiResult.
+ *
+ * Return the descriptive string for a CUptiResult in \p *str.
+ * \note \b Thread-safety: this function is thread safe.
+ *
+ * \param result The result to get the string for
+ * \param str Out-parameter; \p *str receives the descriptive string
+ *
+ * \retval CUPTI_SUCCESS on success
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p str is NULL or \p
+ * result is not a valid CUptiResult
+ */
+CUptiResult CUPTIAPI cuptiGetResultString(CUptiResult result, const char **str);
+
+/**
+ * \brief Get the descriptive message corresponding to error codes returned
+ * by CUPTI.
+ *
+ * Return the descriptive error message for a CUptiResult in \p *str.
+ * \note \b Thread-safety: this function is thread safe.
+ *
+ * \param result The result to get the descriptive error message for
+ * \param str Out-parameter; \p *str receives the error message string
+ *
+ * \retval CUPTI_SUCCESS on success
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p str is NULL or \p
+ * result is not a valid CUptiResult
+ *
+ */
+
+CUptiResult CUPTIAPI cuptiGetErrorMessage(CUptiResult result, const char **str);
+
+/** @} */ /* END CUPTI_RESULT_API */
+
+#if defined(__GNUC__) && defined(CUPTI_LIB)
+    #pragma GCC visibility pop
+#endif
+
+#if defined(__cplusplus)
+}
+#endif
+
+#endif /*_CUPTI_RESULT_H_*/
+
+
diff --git a/.venv/lib/python3.11/site-packages/nvidia/cuda_cupti/include/cupti_runtime_cbid.h b/.venv/lib/python3.11/site-packages/nvidia/cuda_cupti/include/cupti_runtime_cbid.h
new file mode 100644
index 0000000000000000000000000000000000000000..1db2cea872a5fce3b537df9770f7123d3796f6d6
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/nvidia/cuda_cupti/include/cupti_runtime_cbid.h
@@ -0,0 +1,481 @@
+
+// *************************************************************************
+//      Definitions of indices for API functions, unique across entire API
+// *************************************************************************
+
+// This file is generated.  Any changes you make will be lost during the next clean build.
+// CUDA public interface, for type definitions and cu* function prototypes
+
+typedef enum CUpti_runtime_api_trace_cbid_enum {
+    CUPTI_RUNTIME_TRACE_CBID_INVALID                                                       = 0,
+    CUPTI_RUNTIME_TRACE_CBID_cudaDriverGetVersion_v3020                                    = 1,
+    CUPTI_RUNTIME_TRACE_CBID_cudaRuntimeGetVersion_v3020                                   = 2,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGetDeviceCount_v3020                                      = 3,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGetDeviceProperties_v3020                                 = 4,
+    CUPTI_RUNTIME_TRACE_CBID_cudaChooseDevice_v3020                                        = 5,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGetChannelDesc_v3020                                      = 6,
+    CUPTI_RUNTIME_TRACE_CBID_cudaCreateChannelDesc_v3020                                   = 7,
+    CUPTI_RUNTIME_TRACE_CBID_cudaConfigureCall_v3020                                       = 8,
+    CUPTI_RUNTIME_TRACE_CBID_cudaSetupArgument_v3020                                       = 9,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGetLastError_v3020                                        = 10,
+    CUPTI_RUNTIME_TRACE_CBID_cudaPeekAtLastError_v3020                                     = 11,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGetErrorString_v3020                                      = 12,
+    CUPTI_RUNTIME_TRACE_CBID_cudaLaunch_v3020                                              = 13,
+    CUPTI_RUNTIME_TRACE_CBID_cudaFuncSetCacheConfig_v3020                                  = 14,
+    CUPTI_RUNTIME_TRACE_CBID_cudaFuncGetAttributes_v3020                                   = 15,
+    CUPTI_RUNTIME_TRACE_CBID_cudaSetDevice_v3020                                           = 16,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGetDevice_v3020                                           = 17,
+    CUPTI_RUNTIME_TRACE_CBID_cudaSetValidDevices_v3020                                     = 18,
+    CUPTI_RUNTIME_TRACE_CBID_cudaSetDeviceFlags_v3020                                      = 19,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMalloc_v3020                                              = 20,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMallocPitch_v3020                                         = 21,
+    CUPTI_RUNTIME_TRACE_CBID_cudaFree_v3020                                                = 22,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMallocArray_v3020                                         = 23,
+    CUPTI_RUNTIME_TRACE_CBID_cudaFreeArray_v3020                                           = 24,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMallocHost_v3020                                          = 25,
+    CUPTI_RUNTIME_TRACE_CBID_cudaFreeHost_v3020                                            = 26,
+    CUPTI_RUNTIME_TRACE_CBID_cudaHostAlloc_v3020                                           = 27,
+    CUPTI_RUNTIME_TRACE_CBID_cudaHostGetDevicePointer_v3020                                = 28,
+    CUPTI_RUNTIME_TRACE_CBID_cudaHostGetFlags_v3020                                        = 29,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemGetInfo_v3020                                          = 30,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemcpy_v3020                                              = 31,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemcpy2D_v3020                                            = 32,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemcpyToArray_v3020                                       = 33,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemcpy2DToArray_v3020                                     = 34,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemcpyFromArray_v3020                                     = 35,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemcpy2DFromArray_v3020                                   = 36,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemcpyArrayToArray_v3020                                  = 37,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemcpy2DArrayToArray_v3020                                = 38,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemcpyToSymbol_v3020                                      = 39,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemcpyFromSymbol_v3020                                    = 40,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemcpyAsync_v3020                                         = 41,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemcpyToArrayAsync_v3020                                  = 42,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemcpyFromArrayAsync_v3020                                = 43,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemcpy2DAsync_v3020                                       = 44,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemcpy2DToArrayAsync_v3020                                = 45,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemcpy2DFromArrayAsync_v3020                              = 46,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemcpyToSymbolAsync_v3020                                 = 47,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemcpyFromSymbolAsync_v3020                               = 48,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemset_v3020                                              = 49,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemset2D_v3020                                            = 50,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemsetAsync_v3020                                         = 51,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemset2DAsync_v3020                                       = 52,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGetSymbolAddress_v3020                                    = 53,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGetSymbolSize_v3020                                       = 54,
+    CUPTI_RUNTIME_TRACE_CBID_cudaBindTexture_v3020                                         = 55,
+    CUPTI_RUNTIME_TRACE_CBID_cudaBindTexture2D_v3020                                       = 56,
+    CUPTI_RUNTIME_TRACE_CBID_cudaBindTextureToArray_v3020                                  = 57,
+    CUPTI_RUNTIME_TRACE_CBID_cudaUnbindTexture_v3020                                       = 58,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGetTextureAlignmentOffset_v3020                           = 59,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGetTextureReference_v3020                                 = 60,
+    CUPTI_RUNTIME_TRACE_CBID_cudaBindSurfaceToArray_v3020                                  = 61,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGetSurfaceReference_v3020                                 = 62,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGLSetGLDevice_v3020                                       = 63,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGLRegisterBufferObject_v3020                              = 64,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGLMapBufferObject_v3020                                   = 65,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGLUnmapBufferObject_v3020                                 = 66,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGLUnregisterBufferObject_v3020                            = 67,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGLSetBufferObjectMapFlags_v3020                           = 68,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGLMapBufferObjectAsync_v3020                              = 69,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGLUnmapBufferObjectAsync_v3020                            = 70,
+    CUPTI_RUNTIME_TRACE_CBID_cudaWGLGetDevice_v3020                                        = 71,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphicsGLRegisterImage_v3020                             = 72,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphicsGLRegisterBuffer_v3020                            = 73,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphicsUnregisterResource_v3020                          = 74,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphicsResourceSetMapFlags_v3020                         = 75,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphicsMapResources_v3020                                = 76,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphicsUnmapResources_v3020                              = 77,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphicsResourceGetMappedPointer_v3020                    = 78,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphicsSubResourceGetMappedArray_v3020                   = 79,
+    CUPTI_RUNTIME_TRACE_CBID_cudaVDPAUGetDevice_v3020                                      = 80,
+    CUPTI_RUNTIME_TRACE_CBID_cudaVDPAUSetVDPAUDevice_v3020                                 = 81,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphicsVDPAURegisterVideoSurface_v3020                   = 82,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphicsVDPAURegisterOutputSurface_v3020                  = 83,
+    CUPTI_RUNTIME_TRACE_CBID_cudaD3D11GetDevice_v3020                                      = 84,
+    CUPTI_RUNTIME_TRACE_CBID_cudaD3D11GetDevices_v3020                                     = 85,
+    CUPTI_RUNTIME_TRACE_CBID_cudaD3D11SetDirect3DDevice_v3020                              = 86,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphicsD3D11RegisterResource_v3020                       = 87,
+    CUPTI_RUNTIME_TRACE_CBID_cudaD3D10GetDevice_v3020                                      = 88,
+    CUPTI_RUNTIME_TRACE_CBID_cudaD3D10GetDevices_v3020                                     = 89,
+    CUPTI_RUNTIME_TRACE_CBID_cudaD3D10SetDirect3DDevice_v3020                              = 90,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphicsD3D10RegisterResource_v3020                       = 91,
+    CUPTI_RUNTIME_TRACE_CBID_cudaD3D10RegisterResource_v3020                               = 92,
+    CUPTI_RUNTIME_TRACE_CBID_cudaD3D10UnregisterResource_v3020                             = 93,
+    CUPTI_RUNTIME_TRACE_CBID_cudaD3D10MapResources_v3020                                   = 94,
+    CUPTI_RUNTIME_TRACE_CBID_cudaD3D10UnmapResources_v3020                                 = 95,
+    CUPTI_RUNTIME_TRACE_CBID_cudaD3D10ResourceSetMapFlags_v3020                            = 96,
+    CUPTI_RUNTIME_TRACE_CBID_cudaD3D10ResourceGetSurfaceDimensions_v3020                   = 97,
+    CUPTI_RUNTIME_TRACE_CBID_cudaD3D10ResourceGetMappedArray_v3020                         = 98,
+    CUPTI_RUNTIME_TRACE_CBID_cudaD3D10ResourceGetMappedPointer_v3020                       = 99,
+    CUPTI_RUNTIME_TRACE_CBID_cudaD3D10ResourceGetMappedSize_v3020                          = 100,
+    CUPTI_RUNTIME_TRACE_CBID_cudaD3D10ResourceGetMappedPitch_v3020                         = 101,
+    CUPTI_RUNTIME_TRACE_CBID_cudaD3D9GetDevice_v3020                                       = 102,
+    CUPTI_RUNTIME_TRACE_CBID_cudaD3D9GetDevices_v3020                                      = 103,
+    CUPTI_RUNTIME_TRACE_CBID_cudaD3D9SetDirect3DDevice_v3020                               = 104,
+    CUPTI_RUNTIME_TRACE_CBID_cudaD3D9GetDirect3DDevice_v3020                               = 105,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphicsD3D9RegisterResource_v3020                        = 106,
+    CUPTI_RUNTIME_TRACE_CBID_cudaD3D9RegisterResource_v3020                                = 107,
+    CUPTI_RUNTIME_TRACE_CBID_cudaD3D9UnregisterResource_v3020                              = 108,
+    CUPTI_RUNTIME_TRACE_CBID_cudaD3D9MapResources_v3020                                    = 109,
+    CUPTI_RUNTIME_TRACE_CBID_cudaD3D9UnmapResources_v3020                                  = 110,
+    CUPTI_RUNTIME_TRACE_CBID_cudaD3D9ResourceSetMapFlags_v3020                             = 111,
+    CUPTI_RUNTIME_TRACE_CBID_cudaD3D9ResourceGetSurfaceDimensions_v3020                    = 112,
+    CUPTI_RUNTIME_TRACE_CBID_cudaD3D9ResourceGetMappedArray_v3020                          = 113,
+    CUPTI_RUNTIME_TRACE_CBID_cudaD3D9ResourceGetMappedPointer_v3020                        = 114,
+    CUPTI_RUNTIME_TRACE_CBID_cudaD3D9ResourceGetMappedSize_v3020                           = 115,
+    CUPTI_RUNTIME_TRACE_CBID_cudaD3D9ResourceGetMappedPitch_v3020                          = 116,
+    CUPTI_RUNTIME_TRACE_CBID_cudaD3D9Begin_v3020                                           = 117,
+    CUPTI_RUNTIME_TRACE_CBID_cudaD3D9End_v3020                                             = 118,
+    CUPTI_RUNTIME_TRACE_CBID_cudaD3D9RegisterVertexBuffer_v3020                            = 119,
+    CUPTI_RUNTIME_TRACE_CBID_cudaD3D9UnregisterVertexBuffer_v3020                          = 120,
+    CUPTI_RUNTIME_TRACE_CBID_cudaD3D9MapVertexBuffer_v3020                                 = 121,
+    CUPTI_RUNTIME_TRACE_CBID_cudaD3D9UnmapVertexBuffer_v3020                               = 122,
+    CUPTI_RUNTIME_TRACE_CBID_cudaThreadExit_v3020                                          = 123,
+    CUPTI_RUNTIME_TRACE_CBID_cudaSetDoubleForDevice_v3020                                  = 124,
+    CUPTI_RUNTIME_TRACE_CBID_cudaSetDoubleForHost_v3020                                    = 125,
+    CUPTI_RUNTIME_TRACE_CBID_cudaThreadSynchronize_v3020                                   = 126,
+    CUPTI_RUNTIME_TRACE_CBID_cudaThreadGetLimit_v3020                                      = 127,
+    CUPTI_RUNTIME_TRACE_CBID_cudaThreadSetLimit_v3020                                      = 128,
+    CUPTI_RUNTIME_TRACE_CBID_cudaStreamCreate_v3020                                        = 129,
+    CUPTI_RUNTIME_TRACE_CBID_cudaStreamDestroy_v3020                                       = 130,
+    CUPTI_RUNTIME_TRACE_CBID_cudaStreamSynchronize_v3020                                   = 131,
+    CUPTI_RUNTIME_TRACE_CBID_cudaStreamQuery_v3020                                         = 132,
+    CUPTI_RUNTIME_TRACE_CBID_cudaEventCreate_v3020                                         = 133,
+    CUPTI_RUNTIME_TRACE_CBID_cudaEventCreateWithFlags_v3020                                = 134,
+    CUPTI_RUNTIME_TRACE_CBID_cudaEventRecord_v3020                                         = 135,
+    CUPTI_RUNTIME_TRACE_CBID_cudaEventDestroy_v3020                                        = 136,
+    CUPTI_RUNTIME_TRACE_CBID_cudaEventSynchronize_v3020                                    = 137,
+    CUPTI_RUNTIME_TRACE_CBID_cudaEventQuery_v3020                                          = 138,
+    CUPTI_RUNTIME_TRACE_CBID_cudaEventElapsedTime_v3020                                    = 139,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMalloc3D_v3020                                            = 140,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMalloc3DArray_v3020                                       = 141,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemset3D_v3020                                            = 142,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemset3DAsync_v3020                                       = 143,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemcpy3D_v3020                                            = 144,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemcpy3DAsync_v3020                                       = 145,
+    CUPTI_RUNTIME_TRACE_CBID_cudaThreadSetCacheConfig_v3020                                = 146,
+    CUPTI_RUNTIME_TRACE_CBID_cudaStreamWaitEvent_v3020                                     = 147,
+    CUPTI_RUNTIME_TRACE_CBID_cudaD3D11GetDirect3DDevice_v3020                              = 148,
+    CUPTI_RUNTIME_TRACE_CBID_cudaD3D10GetDirect3DDevice_v3020                              = 149,
+    CUPTI_RUNTIME_TRACE_CBID_cudaThreadGetCacheConfig_v3020                                = 150,
+    CUPTI_RUNTIME_TRACE_CBID_cudaPointerGetAttributes_v4000                                = 151,
+    CUPTI_RUNTIME_TRACE_CBID_cudaHostRegister_v4000                                        = 152,
+    CUPTI_RUNTIME_TRACE_CBID_cudaHostUnregister_v4000                                      = 153,
+    CUPTI_RUNTIME_TRACE_CBID_cudaDeviceCanAccessPeer_v4000                                 = 154,
+    CUPTI_RUNTIME_TRACE_CBID_cudaDeviceEnablePeerAccess_v4000                              = 155,
+    CUPTI_RUNTIME_TRACE_CBID_cudaDeviceDisablePeerAccess_v4000                             = 156,
+    CUPTI_RUNTIME_TRACE_CBID_cudaPeerRegister_v4000                                        = 157,
+    CUPTI_RUNTIME_TRACE_CBID_cudaPeerUnregister_v4000                                      = 158,
+    CUPTI_RUNTIME_TRACE_CBID_cudaPeerGetDevicePointer_v4000                                = 159,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemcpyPeer_v4000                                          = 160,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemcpyPeerAsync_v4000                                     = 161,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemcpy3DPeer_v4000                                        = 162,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemcpy3DPeerAsync_v4000                                   = 163,
+    CUPTI_RUNTIME_TRACE_CBID_cudaDeviceReset_v3020                                         = 164,
+    CUPTI_RUNTIME_TRACE_CBID_cudaDeviceSynchronize_v3020                                   = 165,
+    CUPTI_RUNTIME_TRACE_CBID_cudaDeviceGetLimit_v3020                                      = 166,
+    CUPTI_RUNTIME_TRACE_CBID_cudaDeviceSetLimit_v3020                                      = 167,
+    CUPTI_RUNTIME_TRACE_CBID_cudaDeviceGetCacheConfig_v3020                                = 168,
+    CUPTI_RUNTIME_TRACE_CBID_cudaDeviceSetCacheConfig_v3020                                = 169,
+    CUPTI_RUNTIME_TRACE_CBID_cudaProfilerInitialize_v4000                                  = 170,
+    CUPTI_RUNTIME_TRACE_CBID_cudaProfilerStart_v4000                                       = 171,
+    CUPTI_RUNTIME_TRACE_CBID_cudaProfilerStop_v4000                                        = 172,
+    CUPTI_RUNTIME_TRACE_CBID_cudaDeviceGetByPCIBusId_v4010                                 = 173,
+    CUPTI_RUNTIME_TRACE_CBID_cudaDeviceGetPCIBusId_v4010                                   = 174,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGLGetDevices_v4010                                        = 175,
+    CUPTI_RUNTIME_TRACE_CBID_cudaIpcGetEventHandle_v4010                                   = 176,
+    CUPTI_RUNTIME_TRACE_CBID_cudaIpcOpenEventHandle_v4010                                  = 177,
+    CUPTI_RUNTIME_TRACE_CBID_cudaIpcGetMemHandle_v4010                                     = 178,
+    CUPTI_RUNTIME_TRACE_CBID_cudaIpcOpenMemHandle_v4010                                    = 179,
+    CUPTI_RUNTIME_TRACE_CBID_cudaIpcCloseMemHandle_v4010                                   = 180,
+    CUPTI_RUNTIME_TRACE_CBID_cudaArrayGetInfo_v4010                                        = 181,
+    CUPTI_RUNTIME_TRACE_CBID_cudaFuncSetSharedMemConfig_v4020                              = 182,
+    CUPTI_RUNTIME_TRACE_CBID_cudaDeviceGetSharedMemConfig_v4020                            = 183,
+    CUPTI_RUNTIME_TRACE_CBID_cudaDeviceSetSharedMemConfig_v4020                            = 184,
+    CUPTI_RUNTIME_TRACE_CBID_cudaCreateTextureObject_v5000                                 = 185,
+    CUPTI_RUNTIME_TRACE_CBID_cudaDestroyTextureObject_v5000                                = 186,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGetTextureObjectResourceDesc_v5000                        = 187,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGetTextureObjectTextureDesc_v5000                         = 188,
+    CUPTI_RUNTIME_TRACE_CBID_cudaCreateSurfaceObject_v5000                                 = 189,
+    CUPTI_RUNTIME_TRACE_CBID_cudaDestroySurfaceObject_v5000                                = 190,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGetSurfaceObjectResourceDesc_v5000                        = 191,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMallocMipmappedArray_v5000                                = 192,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGetMipmappedArrayLevel_v5000                              = 193,
+    CUPTI_RUNTIME_TRACE_CBID_cudaFreeMipmappedArray_v5000                                  = 194,
+    CUPTI_RUNTIME_TRACE_CBID_cudaBindTextureToMipmappedArray_v5000                         = 195,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphicsResourceGetMappedMipmappedArray_v5000             = 196,
+    CUPTI_RUNTIME_TRACE_CBID_cudaStreamAddCallback_v5000                                   = 197,
+    CUPTI_RUNTIME_TRACE_CBID_cudaStreamCreateWithFlags_v5000                               = 198,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGetTextureObjectResourceViewDesc_v5000                    = 199,
+    CUPTI_RUNTIME_TRACE_CBID_cudaDeviceGetAttribute_v5000                                  = 200,
+    CUPTI_RUNTIME_TRACE_CBID_cudaStreamDestroy_v5050                                       = 201,
+    CUPTI_RUNTIME_TRACE_CBID_cudaStreamCreateWithPriority_v5050                            = 202,
+    CUPTI_RUNTIME_TRACE_CBID_cudaStreamGetPriority_v5050                                   = 203,
+    CUPTI_RUNTIME_TRACE_CBID_cudaStreamGetFlags_v5050                                      = 204,
+    CUPTI_RUNTIME_TRACE_CBID_cudaDeviceGetStreamPriorityRange_v5050                        = 205,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMallocManaged_v6000                                       = 206,
+    CUPTI_RUNTIME_TRACE_CBID_cudaOccupancyMaxActiveBlocksPerMultiprocessor_v6000           = 207,
+    CUPTI_RUNTIME_TRACE_CBID_cudaStreamAttachMemAsync_v6000                                = 208,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGetErrorName_v6050                                        = 209,
+    CUPTI_RUNTIME_TRACE_CBID_cudaOccupancyMaxActiveBlocksPerMultiprocessor_v6050           = 210,
+    CUPTI_RUNTIME_TRACE_CBID_cudaLaunchKernel_v7000                                        = 211,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGetDeviceFlags_v7000                                      = 212,
+    CUPTI_RUNTIME_TRACE_CBID_cudaLaunch_ptsz_v7000                                         = 213,
+    CUPTI_RUNTIME_TRACE_CBID_cudaLaunchKernel_ptsz_v7000                                   = 214,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemcpy_ptds_v7000                                         = 215,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemcpy2D_ptds_v7000                                       = 216,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemcpyToArray_ptds_v7000                                  = 217,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemcpy2DToArray_ptds_v7000                                = 218,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemcpyFromArray_ptds_v7000                                = 219,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemcpy2DFromArray_ptds_v7000                              = 220,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemcpyArrayToArray_ptds_v7000                             = 221,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemcpy2DArrayToArray_ptds_v7000                           = 222,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemcpyToSymbol_ptds_v7000                                 = 223,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemcpyFromSymbol_ptds_v7000                               = 224,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemcpyAsync_ptsz_v7000                                    = 225,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemcpyToArrayAsync_ptsz_v7000                             = 226,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemcpyFromArrayAsync_ptsz_v7000                           = 227,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemcpy2DAsync_ptsz_v7000                                  = 228,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemcpy2DToArrayAsync_ptsz_v7000                           = 229,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemcpy2DFromArrayAsync_ptsz_v7000                         = 230,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemcpyToSymbolAsync_ptsz_v7000                            = 231,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemcpyFromSymbolAsync_ptsz_v7000                          = 232,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemset_ptds_v7000                                         = 233,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemset2D_ptds_v7000                                       = 234,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemsetAsync_ptsz_v7000                                    = 235,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemset2DAsync_ptsz_v7000                                  = 236,
+    CUPTI_RUNTIME_TRACE_CBID_cudaStreamGetPriority_ptsz_v7000                              = 237,
+    CUPTI_RUNTIME_TRACE_CBID_cudaStreamGetFlags_ptsz_v7000                                 = 238,
+    CUPTI_RUNTIME_TRACE_CBID_cudaStreamSynchronize_ptsz_v7000                              = 239,
+    CUPTI_RUNTIME_TRACE_CBID_cudaStreamQuery_ptsz_v7000                                    = 240,
+    CUPTI_RUNTIME_TRACE_CBID_cudaStreamAttachMemAsync_ptsz_v7000                           = 241,
+    CUPTI_RUNTIME_TRACE_CBID_cudaEventRecord_ptsz_v7000                                    = 242,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemset3D_ptds_v7000                                       = 243,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemset3DAsync_ptsz_v7000                                  = 244,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemcpy3D_ptds_v7000                                       = 245,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemcpy3DAsync_ptsz_v7000                                  = 246,
+    CUPTI_RUNTIME_TRACE_CBID_cudaStreamWaitEvent_ptsz_v7000                                = 247,
+    CUPTI_RUNTIME_TRACE_CBID_cudaStreamAddCallback_ptsz_v7000                              = 248,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemcpy3DPeer_ptds_v7000                                   = 249,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemcpy3DPeerAsync_ptsz_v7000                              = 250,
+    CUPTI_RUNTIME_TRACE_CBID_cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags_v7000  = 251,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemPrefetchAsync_v8000                                    = 252,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemPrefetchAsync_ptsz_v8000                               = 253,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemAdvise_v8000                                           = 254,
+    CUPTI_RUNTIME_TRACE_CBID_cudaDeviceGetP2PAttribute_v8000                               = 255,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphicsEGLRegisterImage_v7000                            = 256,
+    CUPTI_RUNTIME_TRACE_CBID_cudaEGLStreamConsumerConnect_v7000                            = 257,
+    CUPTI_RUNTIME_TRACE_CBID_cudaEGLStreamConsumerDisconnect_v7000                         = 258,
+    CUPTI_RUNTIME_TRACE_CBID_cudaEGLStreamConsumerAcquireFrame_v7000                       = 259,
+    CUPTI_RUNTIME_TRACE_CBID_cudaEGLStreamConsumerReleaseFrame_v7000                       = 260,
+    CUPTI_RUNTIME_TRACE_CBID_cudaEGLStreamProducerConnect_v7000                            = 261,
+    CUPTI_RUNTIME_TRACE_CBID_cudaEGLStreamProducerDisconnect_v7000                         = 262,
+    CUPTI_RUNTIME_TRACE_CBID_cudaEGLStreamProducerPresentFrame_v7000                       = 263,
+    CUPTI_RUNTIME_TRACE_CBID_cudaEGLStreamProducerReturnFrame_v7000                        = 264,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphicsResourceGetMappedEglFrame_v7000                   = 265,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemRangeGetAttribute_v8000                                = 266,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemRangeGetAttributes_v8000                               = 267,
+    CUPTI_RUNTIME_TRACE_CBID_cudaEGLStreamConsumerConnectWithFlags_v7000                   = 268,
+    CUPTI_RUNTIME_TRACE_CBID_cudaLaunchCooperativeKernel_v9000                             = 269,
+    CUPTI_RUNTIME_TRACE_CBID_cudaLaunchCooperativeKernel_ptsz_v9000                        = 270,
+    CUPTI_RUNTIME_TRACE_CBID_cudaEventCreateFromEGLSync_v9000                              = 271,
+    CUPTI_RUNTIME_TRACE_CBID_cudaLaunchCooperativeKernelMultiDevice_v9000                  = 272,
+    CUPTI_RUNTIME_TRACE_CBID_cudaFuncSetAttribute_v9000                                    = 273,
+    CUPTI_RUNTIME_TRACE_CBID_cudaImportExternalMemory_v10000                               = 274,
+    CUPTI_RUNTIME_TRACE_CBID_cudaExternalMemoryGetMappedBuffer_v10000                      = 275,
+    CUPTI_RUNTIME_TRACE_CBID_cudaExternalMemoryGetMappedMipmappedArray_v10000              = 276,
+    CUPTI_RUNTIME_TRACE_CBID_cudaDestroyExternalMemory_v10000                              = 277,
+    CUPTI_RUNTIME_TRACE_CBID_cudaImportExternalSemaphore_v10000                            = 278,
+    CUPTI_RUNTIME_TRACE_CBID_cudaSignalExternalSemaphoresAsync_v10000                      = 279,
+    CUPTI_RUNTIME_TRACE_CBID_cudaSignalExternalSemaphoresAsync_ptsz_v10000                 = 280,
+    CUPTI_RUNTIME_TRACE_CBID_cudaWaitExternalSemaphoresAsync_v10000                        = 281,
+    CUPTI_RUNTIME_TRACE_CBID_cudaWaitExternalSemaphoresAsync_ptsz_v10000                   = 282,
+    CUPTI_RUNTIME_TRACE_CBID_cudaDestroyExternalSemaphore_v10000                           = 283,
+    CUPTI_RUNTIME_TRACE_CBID_cudaLaunchHostFunc_v10000                                     = 284,
+    CUPTI_RUNTIME_TRACE_CBID_cudaLaunchHostFunc_ptsz_v10000                                = 285,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphCreate_v10000                                        = 286,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphKernelNodeGetParams_v10000                           = 287,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphKernelNodeSetParams_v10000                           = 288,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphAddKernelNode_v10000                                 = 289,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphAddMemcpyNode_v10000                                 = 290,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphMemcpyNodeGetParams_v10000                           = 291,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphMemcpyNodeSetParams_v10000                           = 292,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphAddMemsetNode_v10000                                 = 293,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphMemsetNodeGetParams_v10000                           = 294,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphMemsetNodeSetParams_v10000                           = 295,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphAddHostNode_v10000                                   = 296,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphHostNodeGetParams_v10000                             = 297,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphAddChildGraphNode_v10000                             = 298,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphChildGraphNodeGetGraph_v10000                        = 299,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphAddEmptyNode_v10000                                  = 300,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphClone_v10000                                         = 301,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphNodeFindInClone_v10000                               = 302,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphNodeGetType_v10000                                   = 303,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphGetRootNodes_v10000                                  = 304,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphNodeGetDependencies_v10000                           = 305,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphNodeGetDependentNodes_v10000                         = 306,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphAddDependencies_v10000                               = 307,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphRemoveDependencies_v10000                            = 308,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphDestroyNode_v10000                                   = 309,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphInstantiate_v10000                                   = 310,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphLaunch_v10000                                        = 311,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphLaunch_ptsz_v10000                                   = 312,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphExecDestroy_v10000                                   = 313,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphDestroy_v10000                                       = 314,
+    CUPTI_RUNTIME_TRACE_CBID_cudaStreamBeginCapture_v10000                                 = 315,
+    CUPTI_RUNTIME_TRACE_CBID_cudaStreamBeginCapture_ptsz_v10000                            = 316,
+    CUPTI_RUNTIME_TRACE_CBID_cudaStreamIsCapturing_v10000                                  = 317,
+    CUPTI_RUNTIME_TRACE_CBID_cudaStreamIsCapturing_ptsz_v10000                             = 318,
+    CUPTI_RUNTIME_TRACE_CBID_cudaStreamEndCapture_v10000                                   = 319,
+    CUPTI_RUNTIME_TRACE_CBID_cudaStreamEndCapture_ptsz_v10000                              = 320,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphHostNodeSetParams_v10000                             = 321,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphGetNodes_v10000                                      = 322,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphGetEdges_v10000                                      = 323,
+    CUPTI_RUNTIME_TRACE_CBID_cudaStreamGetCaptureInfo_v10010                               = 324,
+    CUPTI_RUNTIME_TRACE_CBID_cudaStreamGetCaptureInfo_ptsz_v10010                          = 325,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphExecKernelNodeSetParams_v10010                       = 326,
+    CUPTI_RUNTIME_TRACE_CBID_cudaThreadExchangeStreamCaptureMode_v10010                    = 327,
+    CUPTI_RUNTIME_TRACE_CBID_cudaDeviceGetNvSciSyncAttributes_v10020                       = 328,
+    CUPTI_RUNTIME_TRACE_CBID_cudaOccupancyAvailableDynamicSMemPerBlock_v10200              = 329,
+    CUPTI_RUNTIME_TRACE_CBID_cudaStreamSetFlags_v10200                                     = 330,
+    CUPTI_RUNTIME_TRACE_CBID_cudaStreamSetFlags_ptsz_v10200                                = 331,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphExecMemcpyNodeSetParams_v10020                       = 332,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphExecMemsetNodeSetParams_v10020                       = 333,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphExecHostNodeSetParams_v10020                         = 334,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphExecUpdate_v10020                                    = 335,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGetFuncBySymbol_v11000                                    = 336,
+    CUPTI_RUNTIME_TRACE_CBID_cudaCtxResetPersistingL2Cache_v11000                          = 337,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphKernelNodeCopyAttributes_v11000                      = 338,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphKernelNodeGetAttribute_v11000                        = 339,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphKernelNodeSetAttribute_v11000                        = 340,
+    CUPTI_RUNTIME_TRACE_CBID_cudaStreamCopyAttributes_v11000                               = 341,
+    CUPTI_RUNTIME_TRACE_CBID_cudaStreamCopyAttributes_ptsz_v11000                          = 342,
+    CUPTI_RUNTIME_TRACE_CBID_cudaStreamGetAttribute_v11000                                 = 343,
+    CUPTI_RUNTIME_TRACE_CBID_cudaStreamGetAttribute_ptsz_v11000                            = 344,
+    CUPTI_RUNTIME_TRACE_CBID_cudaStreamSetAttribute_v11000                                 = 345,
+    CUPTI_RUNTIME_TRACE_CBID_cudaStreamSetAttribute_ptsz_v11000                            = 346,
+    CUPTI_RUNTIME_TRACE_CBID_cudaDeviceGetTexture1DLinearMaxWidth_v11010                   = 347,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphUpload_v10000                                        = 348,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphUpload_ptsz_v10000                                   = 349,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphAddMemcpyNodeToSymbol_v11010                         = 350,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphAddMemcpyNodeFromSymbol_v11010                       = 351,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphAddMemcpyNode1D_v11010                               = 352,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphMemcpyNodeSetParamsToSymbol_v11010                   = 353,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphMemcpyNodeSetParamsFromSymbol_v11010                 = 354,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphMemcpyNodeSetParams1D_v11010                         = 355,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphExecMemcpyNodeSetParamsToSymbol_v11010               = 356,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphExecMemcpyNodeSetParamsFromSymbol_v11010             = 357,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphExecMemcpyNodeSetParams1D_v11010                     = 358,
+    CUPTI_RUNTIME_TRACE_CBID_cudaArrayGetSparseProperties_v11010                           = 359,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMipmappedArrayGetSparseProperties_v11010                  = 360,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphExecChildGraphNodeSetParams_v11010                   = 361,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphAddEventRecordNode_v11010                            = 362,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphEventRecordNodeGetEvent_v11010                       = 363,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphEventRecordNodeSetEvent_v11010                       = 364,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphAddEventWaitNode_v11010                              = 365,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphEventWaitNodeGetEvent_v11010                         = 366,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphEventWaitNodeSetEvent_v11010                         = 367,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphExecEventRecordNodeSetEvent_v11010                   = 368,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphExecEventWaitNodeSetEvent_v11010                     = 369,
+    CUPTI_RUNTIME_TRACE_CBID_cudaEventRecordWithFlags_v11010                               = 370,
+    CUPTI_RUNTIME_TRACE_CBID_cudaEventRecordWithFlags_ptsz_v11010                          = 371,
+    CUPTI_RUNTIME_TRACE_CBID_cudaDeviceGetDefaultMemPool_v11020                            = 372,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMallocAsync_v11020                                        = 373,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMallocAsync_ptsz_v11020                                   = 374,
+    CUPTI_RUNTIME_TRACE_CBID_cudaFreeAsync_v11020                                          = 375,
+    CUPTI_RUNTIME_TRACE_CBID_cudaFreeAsync_ptsz_v11020                                     = 376,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemPoolTrimTo_v11020                                      = 377,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemPoolSetAttribute_v11020                                = 378,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemPoolGetAttribute_v11020                                = 379,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemPoolSetAccess_v11020                                   = 380,
+    CUPTI_RUNTIME_TRACE_CBID_cudaArrayGetPlane_v11020                                      = 381,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemPoolGetAccess_v11020                                   = 382,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemPoolCreate_v11020                                      = 383,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemPoolDestroy_v11020                                     = 384,
+    CUPTI_RUNTIME_TRACE_CBID_cudaDeviceSetMemPool_v11020                                   = 385,
+    CUPTI_RUNTIME_TRACE_CBID_cudaDeviceGetMemPool_v11020                                   = 386,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemPoolExportToShareableHandle_v11020                     = 387,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemPoolImportFromShareableHandle_v11020                   = 388,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemPoolExportPointer_v11020                               = 389,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemPoolImportPointer_v11020                               = 390,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMallocFromPoolAsync_v11020                                = 391,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMallocFromPoolAsync_ptsz_v11020                           = 392,
+    CUPTI_RUNTIME_TRACE_CBID_cudaSignalExternalSemaphoresAsync_v2_v11020                   = 393,
+    CUPTI_RUNTIME_TRACE_CBID_cudaSignalExternalSemaphoresAsync_v2_ptsz_v11020              = 394,
+    CUPTI_RUNTIME_TRACE_CBID_cudaWaitExternalSemaphoresAsync_v2_v11020                     = 395,
+    CUPTI_RUNTIME_TRACE_CBID_cudaWaitExternalSemaphoresAsync_v2_ptsz_v11020                = 396,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphAddExternalSemaphoresSignalNode_v11020               = 397,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphExternalSemaphoresSignalNodeGetParams_v11020         = 398,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphExternalSemaphoresSignalNodeSetParams_v11020         = 399,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphAddExternalSemaphoresWaitNode_v11020                 = 400,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphExternalSemaphoresWaitNodeGetParams_v11020           = 401,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphExternalSemaphoresWaitNodeSetParams_v11020           = 402,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphExecExternalSemaphoresSignalNodeSetParams_v11020     = 403,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphExecExternalSemaphoresWaitNodeSetParams_v11020       = 404,
+    CUPTI_RUNTIME_TRACE_CBID_cudaDeviceFlushGPUDirectRDMAWrites_v11030                     = 405,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGetDriverEntryPoint_v11030                                = 406,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGetDriverEntryPoint_ptsz_v11030                           = 407,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphDebugDotPrint_v11030                                 = 408,
+    CUPTI_RUNTIME_TRACE_CBID_cudaStreamGetCaptureInfo_v2_v11030                            = 409,
+    CUPTI_RUNTIME_TRACE_CBID_cudaStreamGetCaptureInfo_v2_ptsz_v11030                       = 410,
+    CUPTI_RUNTIME_TRACE_CBID_cudaStreamUpdateCaptureDependencies_v11030                    = 411,
+    CUPTI_RUNTIME_TRACE_CBID_cudaStreamUpdateCaptureDependencies_ptsz_v11030               = 412,
+    CUPTI_RUNTIME_TRACE_CBID_cudaUserObjectCreate_v11030                                   = 413,
+    CUPTI_RUNTIME_TRACE_CBID_cudaUserObjectRetain_v11030                                   = 414,
+    CUPTI_RUNTIME_TRACE_CBID_cudaUserObjectRelease_v11030                                  = 415,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphRetainUserObject_v11030                              = 416,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphReleaseUserObject_v11030                             = 417,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphInstantiateWithFlags_v11040                          = 418,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphAddMemAllocNode_v11040                               = 419,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphMemAllocNodeGetParams_v11040                         = 420,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphAddMemFreeNode_v11040                                = 421,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphMemFreeNodeGetParams_v11040                          = 422,
+    CUPTI_RUNTIME_TRACE_CBID_cudaDeviceGraphMemTrim_v11040                                 = 423,
+    CUPTI_RUNTIME_TRACE_CBID_cudaDeviceGetGraphMemAttribute_v11040                         = 424,
+    CUPTI_RUNTIME_TRACE_CBID_cudaDeviceSetGraphMemAttribute_v11040                         = 425,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphNodeSetEnabled_v11060                                = 426,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphNodeGetEnabled_v11060                                = 427,
+    CUPTI_RUNTIME_TRACE_CBID_cudaArrayGetMemoryRequirements_v11060                         = 428,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMipmappedArrayGetMemoryRequirements_v11060                = 429,
+    CUPTI_RUNTIME_TRACE_CBID_cudaLaunchKernelExC_v11060                                    = 430,
+    CUPTI_RUNTIME_TRACE_CBID_cudaLaunchKernelExC_ptsz_v11060                               = 431,
+    CUPTI_RUNTIME_TRACE_CBID_cudaOccupancyMaxPotentialClusterSize_v11070                   = 432,
+    CUPTI_RUNTIME_TRACE_CBID_cudaOccupancyMaxActiveClusters_v11070                         = 433,
+    CUPTI_RUNTIME_TRACE_CBID_cudaCreateTextureObject_v2_v11080                             = 434,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGetTextureObjectTextureDesc_v2_v11080                     = 435,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphInstantiateWithParams_v12000                         = 436,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphInstantiateWithParams_ptsz_v12000                    = 437,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphExecGetFlags_v12000                                  = 438,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGetKernel_v12000                                          = 439,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGetDeviceProperties_v2_v12000                             = 440,
+    CUPTI_RUNTIME_TRACE_CBID_cudaStreamGetId_v12000                                        = 441,
+    CUPTI_RUNTIME_TRACE_CBID_cudaStreamGetId_ptsz_v12000                                   = 442,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphInstantiate_v12000                                   = 443,
+    CUPTI_RUNTIME_TRACE_CBID_cudaInitDevice_v12000                                         = 444,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphAddNode_v12020                                       = 445,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphNodeSetParams_v12020                                 = 446,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphExecNodeSetParams_v12020                             = 447,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemAdvise_v2_v12020                                       = 448,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemPrefetchAsync_v2_v12020                                = 449,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemPrefetchAsync_v2_ptsz_v12020                           = 450,
+    CUPTI_RUNTIME_TRACE_CBID_cudaFuncGetName_v12030                                        = 451,
+    CUPTI_RUNTIME_TRACE_CBID_cudaStreamBeginCaptureToGraph_v12030                          = 452,
+    CUPTI_RUNTIME_TRACE_CBID_cudaStreamBeginCaptureToGraph_ptsz_v12030                     = 453,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphConditionalHandleCreate_v12030                       = 454,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphGetEdges_v2_v12030                                   = 455,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphNodeGetDependencies_v2_v12030                        = 456,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphNodeGetDependentNodes_v2_v12030                      = 457,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphAddDependencies_v2_v12030                            = 458,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphRemoveDependencies_v2_v12030                         = 459,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphAddNode_v2_v12030                                    = 460,
+    CUPTI_RUNTIME_TRACE_CBID_cudaStreamGetCaptureInfo_v3_v12030                            = 461,
+    CUPTI_RUNTIME_TRACE_CBID_cudaStreamGetCaptureInfo_v3_ptsz_v12030                       = 462,
+    CUPTI_RUNTIME_TRACE_CBID_cudaStreamUpdateCaptureDependencies_v2_v12030                 = 463,
+    CUPTI_RUNTIME_TRACE_CBID_cudaStreamUpdateCaptureDependencies_v2_ptsz_v12030            = 464,
+    CUPTI_RUNTIME_TRACE_CBID_cuda465_v12040                                                = 465,
+    CUPTI_RUNTIME_TRACE_CBID_cuda466_v12040                                                = 466,
+    CUPTI_RUNTIME_TRACE_CBID_cudaFuncGetParamInfo_v12040                                   = 467,
+    CUPTI_RUNTIME_TRACE_CBID_SIZE                                                          = 468,
+    CUPTI_RUNTIME_TRACE_CBID_FORCE_INT                                                     = 0x7fffffff
+} CUpti_runtime_api_trace_cbid;
+
diff --git a/.venv/lib/python3.11/site-packages/nvidia/cuda_cupti/include/cupti_sass_metrics.h b/.venv/lib/python3.11/site-packages/nvidia/cuda_cupti/include/cupti_sass_metrics.h
new file mode 100644
index 0000000000000000000000000000000000000000..acb59cf8e5882a5ff13b4a1b0fdc6bc7b0ec47f7
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/nvidia/cuda_cupti/include/cupti_sass_metrics.h
@@ -0,0 +1,436 @@
+/*
+ * Copyright 2023 NVIDIA Corporation. All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#if !defined(_CUPTI_SASS_METRICS_H_)
+#define _CUPTI_SASS_METRICS_H_
+
+#include <cuda.h>
+#include <cupti_result.h>
+#include <cupti_profiler_target.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#if defined(__GNUC__) && defined(CUPTI_LIB)
+    #pragma GCC visibility push(default)
+#endif
+
+/**
+ * \defgroup CUPTI_SASS_METRICS_API CUPTI SASS Metrics API
+ * Functions, types, and enums that implement the CUPTI SASS Metrics API.
+ * @{
+ */
+
+typedef enum
+{
+    /// SASS metric data will be collected at GPU level.
+    /// In the CUpti_SassMetricsGetDataProperties_Params struct, numOfInstances will be equal to 1.
+    CUPTI_SASS_METRICS_OUTPUT_GRANULARITY_GPU = 0,
+
+    /// SASS metric data will be collected at SM level.
+    /// In the CUpti_SassMetricsGetDataProperties_Params struct, numOfInstances will be equal to the number of SMs in the GPU.
+    CUPTI_SASS_METRICS_OUTPUT_GRANULARITY_SM = 1,
+
+    /// SASS metric data will be collected at SM sub-partition level.
+    /// In the CUpti_SassMetricsGetDataProperties_Params struct, numOfInstances will be equal to the number of SM sub-partitions in the GPU.
+    CUPTI_SASS_METRICS_OUTPUT_GRANULARITY_SMSP = 2,
+    /// Not a valid granularity; it has no explicit value, so it is one greater than CUPTI_SASS_METRICS_OUTPUT_GRANULARITY_SMSP.
+    CUPTI_SASS_METRICS_OUTPUT_GRANULARITY_INVALID
+} CUpti_SassMetrics_OutputGranularity;
+
+typedef struct CUpti_SassMetrics_MetricDetails
+{
+    /// unique ID of the SASS metric
+    uint64_t metricId;
+    /// name of the SASS metric
+    const char* pMetricName;
+    /// description of the SASS metric
+    const char* pMetricDescription;
+} CUpti_SassMetrics_MetricDetails;
+
+/**
+ * \brief Params for cuptiSassMetricsGetNumOfMetrics()
+ */
+typedef struct CUpti_SassMetrics_GetNumOfMetrics_Params
+{
+    /// [in] should be equal to CUpti_SassMetrics_GetNumOfMetrics_Params_STRUCT_SIZE
+    size_t structSize;
+    /// [in] assign to NULL
+    void* pPriv;
+    /// [in] name of the chip for which metrics will be queried
+    const char* pChipName;
+    /// [out] number of metrics supported for the queried chip
+    size_t numOfMetrics;
+} CUpti_SassMetrics_GetNumOfMetrics_Params;
+/// Value to assign to CUpti_SassMetrics_GetNumOfMetrics_Params::structSize.
+#define CUpti_SassMetrics_GetNumOfMetrics_Params_STRUCT_SIZE                 CUPTI_PROFILER_STRUCT_SIZE(CUpti_SassMetrics_GetNumOfMetrics_Params, numOfMetrics)
+
+/**
+ * \brief Get the number of supported SASS metrics for the chip.
+ * The count is returned in CUpti_SassMetrics_GetNumOfMetrics_Params::numOfMetrics.
+ * \param pParams A pointer to \ref CUpti_SassMetrics_GetNumOfMetrics_Params
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if any \p pParams is not valid
+ * \retval CUPTI_ERROR_NOT_SUPPORTED indicates that the system/device doesn't support SASS metric collection
+ */
+CUptiResult CUPTIAPI cuptiSassMetricsGetNumOfMetrics(CUpti_SassMetrics_GetNumOfMetrics_Params* pParams);
+
+/**
+ * \brief Params for cuptiSassMetricsGetMetrics()
+ */
+typedef struct CUpti_SassMetrics_GetMetrics_Params
+{
+    /// [in] should be equal to CUpti_SassMetrics_GetMetrics_Params_STRUCT_SIZE
+    size_t structSize;
+    /// [in] assign to NULL
+    void* pPriv;
+    /// [in] name of the chip for which metrics will be queried
+    const char* pChipName;
+    /// [in] number of metrics supported for the queried chip (can be queried using cuptiSassMetricsGetNumOfMetrics())
+    size_t numOfMetrics;
+    /// [out] list of metrics supported for the queried chip (NOTE(review): presumably a caller-provided array of numOfMetrics entries - confirm)
+    CUpti_SassMetrics_MetricDetails* pMetricsList;
+} CUpti_SassMetrics_GetMetrics_Params;
+#define CUpti_SassMetrics_GetMetrics_Params_STRUCT_SIZE                 CUPTI_PROFILER_STRUCT_SIZE(CUpti_SassMetrics_GetMetrics_Params, pMetricsList)
+
+/**
+ * \brief Get the list of all supported SASS metrics for the chip.
+ * The list is returned in CUpti_SassMetrics_GetMetrics_Params::pMetricsList.
+ * \param pParams A pointer to \ref CUpti_SassMetrics_GetMetrics_Params
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if any \p pParams is not valid
+ * \retval CUPTI_ERROR_NOT_SUPPORTED indicates that the system/device doesn't support SASS metric collection
+ */
+CUptiResult CUPTIAPI cuptiSassMetricsGetMetrics(CUpti_SassMetrics_GetMetrics_Params* pParams);
+
+/**
+ * \brief Params for cuptiSassMetricsGetProperties()
+ */
+typedef struct CUpti_SassMetrics_GetProperties_Params
+{
+    /// [in] should be equal to CUpti_SassMetrics_GetProperties_Params_STRUCT_SIZE
+    size_t structSize;
+    /// [in] assign to NULL
+    void* pPriv;
+    /// [in] name of the chip for which the metric will be queried
+    const char* pChipName;
+    /// [in] name of the metric to query
+    const char* pMetricName;
+    /// [out] returns the metric ID and the metric description
+    CUpti_SassMetrics_MetricDetails metric;
+} CUpti_SassMetrics_GetProperties_Params;
+#define CUpti_SassMetrics_GetProperties_Params_STRUCT_SIZE        CUPTI_PROFILER_STRUCT_SIZE(CUpti_SassMetrics_GetProperties_Params, metric)
+
+/**
+ * \brief Get metric properties for the queried metric.
+ * For a given metric, the results are put in a CUpti_SassMetrics_MetricDetails, which
+ * has fields for the metric ID, name and description of the metric.
+ *
+ * \param pParams A pointer to \ref CUpti_SassMetrics_GetProperties_Params
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if any \p pParams is not valid
+ * \retval CUPTI_ERROR_NOT_SUPPORTED indicates that the system/device doesn't support SASS metric data collection
+ */
+CUptiResult CUPTIAPI cuptiSassMetricsGetProperties(CUpti_SassMetrics_GetProperties_Params *pParams);
+
+typedef struct CUpti_SassMetrics_Config
+{
+    /// [in] unique ID of the SASS metric, can be queried using cuptiSassMetricsGetProperties()
+    uint64_t metricId;
+    /// [in] output granularity: a CUpti_SassMetrics_OutputGranularity value stored as uint8_t
+    uint8_t outputGranularity;
+} CUpti_SassMetrics_Config;
+
+/**
+ * \brief Params for cuptiSassMetricsSetConfig()
+ */
+typedef struct CUpti_SassMetricsSetConfig_Params
+{
+    /// [in] equal to CUpti_SassMetricsSetConfig_Params_STRUCT_SIZE
+    size_t structSize;
+    /// [in] assign to NULL
+    void* pPriv;
+    /// [in] number of metric configs, will be equal to the number of metrics queried
+    size_t numOfMetricConfig;
+    /// [in] list of metric configs generated for the given SASS metrics
+    CUpti_SassMetrics_Config* pConfigs;
+    /// [in] device index for which the config will be set; the user can call this once for
+    /// the device on which the SASS metric data will be collected
+    uint32_t deviceIndex;
+} CUpti_SassMetricsSetConfig_Params;
+#define CUpti_SassMetricsSetConfig_Params_STRUCT_SIZE                    CUPTI_PROFILER_STRUCT_SIZE(CUpti_SassMetricsSetConfig_Params, deviceIndex)
+
+/**
+ * \brief Set config for the SASS metric data collection for a device.
+ * The user needs to call this API before calling any of the SASS metric data collection APIs.
+ * Each set config API call needs to be followed by a cuptiSassMetricsUnsetConfig() API call
+ * before calling the cuptiSassMetricsSetConfig() API again for the same device.
+ *
+ * \param pParams A pointer to \ref CUpti_SassMetricsSetConfig_Params
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if any \p pParams is not valid
+ * \retval CUPTI_ERROR_INVALID_CONTEXT if any cuda context has not been created prior to this API call
+ * \retval CUPTI_ERROR_INVALID_OPERATION if this is called multiple times for the device without calling unset config API
+ * \retval CUPTI_ERROR_NOT_SUPPORTED indicates that the system/device doesn't support SASS metric data collection
+ */
+CUptiResult CUPTIAPI cuptiSassMetricsSetConfig(CUpti_SassMetricsSetConfig_Params *pParams);
+
+/**
+ * \brief Params for cuptiSassMetricsUnsetConfig()
+ */
+typedef struct CUpti_SassMetricsUnsetConfig_Params
+{
+    /// [in] equal to CUpti_SassMetricsUnsetConfig_Params_STRUCT_SIZE
+    size_t structSize;
+    /// [in] assign to NULL
+    void* pPriv;
+    /// [in] device index for which the SASS metric data collection config will get reset; the user needs to call this API for
+    /// all the devices on which SASS metric data collection has been configured.
+    uint32_t deviceIndex;
+} CUpti_SassMetricsUnsetConfig_Params;
+#define CUpti_SassMetricsUnsetConfig_Params_STRUCT_SIZE                  CUPTI_PROFILER_STRUCT_SIZE(CUpti_SassMetricsUnsetConfig_Params, deviceIndex)
+
+/**
+ * \brief Unset config API will reset the SASS metric data collection configuration for the device.
+ * Once this API is called, CUPTI will deallocate all the memory allocated and remove all
+ * the configuration for SASS metric data collection. The user can only call this API for a device where
+ * the cuptiSassMetricsSetConfig() API has been called earlier.
+ *
+ * \param pParams A pointer to \ref CUpti_SassMetricsUnsetConfig_Params
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if any \p pParams is not valid
+ * \retval CUPTI_ERROR_INVALID_CONTEXT if any cuda context has not been created prior to this API call
+ * \retval CUPTI_ERROR_INVALID_OPERATION if this is called multiple times for the device without calling set config API
+ * \retval CUPTI_ERROR_NOT_SUPPORTED indicates that the system/device doesn't support SASS metric data collection
+ */
+CUptiResult CUPTIAPI cuptiSassMetricsUnsetConfig(CUpti_SassMetricsUnsetConfig_Params *pParams);
+
+/**
+ * \brief Params for cuptiSassMetricsEnable()
+ */
+typedef struct CUpti_SassMetricsEnable_Params
+{
+    /// [in] equal to CUpti_SassMetricsEnable_Params_STRUCT_SIZE
+    size_t structSize;
+    /// [in] assign to NULL
+    void* pPriv;
+    /// [in] CUDA context on which SASS metric data collection will be enabled.
+    /// If set to NULL, the default context will be considered for SASS metric data collection.
+    CUcontext ctx;
+    /// [in] if false, all the functions will be patched regardless of their execution with the cuptiSassMetricsEnable() API call.
+    /// When this parameter is set to true, metric data collection for a function will be done at its very first execution in the enable/disable
+    /// range.
+    uint8_t enableLazyPatching;
+} CUpti_SassMetricsEnable_Params;
+#define CUpti_SassMetricsEnable_Params_STRUCT_SIZE                       CUPTI_PROFILER_STRUCT_SIZE(CUpti_SassMetricsEnable_Params, enableLazyPatching)
+
+/**
+ * \brief SASS metric data collection enable API will mark the start of a range, between which kernels
+ *  will be profiled for SASS metrics.
+ *
+ * \param pParams A pointer to \ref CUpti_SassMetricsEnable_Params
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if any \p pParams is not valid
+ * \retval CUPTI_ERROR_NOT_SUPPORTED indicates that the system/device doesn't support SASS metric data collection
+ * \retval CUPTI_ERROR_INVALID_CONTEXT if any cuda context has not been created prior to this API call
+ * \retval CUPTI_ERROR_INVALID_OPERATION if this API is called multiple times for a cuda context without calling
+ * the cuptiSassMetricsDisable() API, or is called before the cuptiSassMetricsSetConfig() API call.
+ */
+CUptiResult CUPTIAPI cuptiSassMetricsEnable(CUpti_SassMetricsEnable_Params* pParams);
+
+/**
+ * \brief Params for cuptiSassMetricsDisable()
+ */
+typedef struct CUpti_SassMetricsDisable_Params
+{
+    /// [in] equal to CUpti_SassMetricsDisable_Params_STRUCT_SIZE
+    size_t structSize;
+    /// [in] assign to NULL
+    void* pPriv;
+    /// [in] CUDA context on which SASS metric data collection will be disabled.
+    /// If set to NULL, the default context will be considered for SASS metric data collection.
+    CUcontext ctx;
+    /// [out] The number of dropped SASS records will be equal to numOfPatchedInstructions * numOfInstances.
+    /// The number of dropped records will be zero when data is flushed prior to calling the disable API.
+    size_t numOfDroppedRecords;
+} CUpti_SassMetricsDisable_Params;
+#define CUpti_SassMetricsDisable_Params_STRUCT_SIZE                      CUPTI_PROFILER_STRUCT_SIZE(CUpti_SassMetricsDisable_Params, numOfDroppedRecords)
+
+/**
+ * \brief SASS metric data collection disable API will mark the end of a range; any kernel launched after this
+ * API call will not be profiled for the SASS metrics.
+ *
+ * \param pParams A pointer to \ref CUpti_SassMetricsDisable_Params
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if any \p pParams is not valid
+ * \retval CUPTI_ERROR_NOT_SUPPORTED indicates that the system/device doesn't support SASS metric data collection
+ * \retval CUPTI_ERROR_INVALID_CONTEXT if any cuda context has not been created prior to this API call
+ * \retval CUPTI_ERROR_INVALID_OPERATION if this API is called multiple times for a cuda context without calling
+ * the cuptiSassMetricsEnable() API, or is called before the cuptiSassMetricsSetConfig() API call.
+ */
+CUptiResult CUPTIAPI cuptiSassMetricsDisable(CUpti_SassMetricsDisable_Params* pParams);
+
+/**
+ * \brief Params for cuptiSassMetricsGetDataProperties()
+ */
+typedef struct CUpti_SassMetricsGetDataProperties_Params
+{
+    /// [in] equal to CUpti_SassMetricsGetDataProperties_Params_STRUCT_SIZE
+    size_t structSize;
+    /// [in] assign to NULL
+    void* pPriv;
+    /// [in] CUDA context on which SASS metric data collection was enabled.
+    /// If set to NULL, the default context will be considered for SASS metric data collection.
+    CUcontext ctx;
+    /// [out] total number of SASS records that have been collected
+    size_t numOfPatchedInstructionRecords;
+    /// [out] number of instances for each metric value per instruction.
+    /// This will depend on the CUpti_SassMetrics_OutputGranularity level set for the metric config.
+    size_t numOfInstances;
+} CUpti_SassMetricsGetDataProperties_Params;
+
+#define CUpti_SassMetricsGetDataProperties_Params_STRUCT_SIZE           CUPTI_PROFILER_STRUCT_SIZE(CUpti_SassMetricsGetDataProperties_Params, numOfInstances)
+/**
+ * \brief SASS metric data properties API reports the number of instances of a metric
+ * value and the number of SASS instruction records that have been collected. The number of instances of a metric
+ * varies with the output granularity level the user set with a CUpti_SassMetrics_OutputGranularity value.
+ * The user needs to allocate memory for retrieving the SASS data using the cuptiSassMetricsFlushData() API.
+ *
+ * \param pParams A pointer to \ref CUpti_SassMetricsGetDataProperties_Params
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if any \p pParams is not valid
+ * \retval CUPTI_ERROR_NOT_SUPPORTED indicates that the system/device doesn't support SASS metric data collection
+ * \retval CUPTI_ERROR_INVALID_OPERATION if this API is called outside the enable/disable range.
+ */
+CUptiResult CUPTIAPI cuptiSassMetricsGetDataProperties(CUpti_SassMetricsGetDataProperties_Params* pParams);
+
+typedef struct CUpti_SassMetrics_InstanceValue
+{
+    /// unique ID of the metric
+    uint64_t metricId;
+    /// metric value
+    uint64_t value;
+} CUpti_SassMetrics_InstanceValue;
+#define CUpti_SassMetrics_InstanceValue_STRUCT_SIZE                      CUPTI_PROFILER_STRUCT_SIZE(CUpti_SassMetrics_InstanceValue, value)
+
+typedef struct CUpti_SassMetrics_Data
+{
+    /// [in] documented as equal to CUpti_SassMetricsFlushData_Params_STRUCT_SIZE; NOTE(review): that macro sizes a different struct and this header defines no CUpti_SassMetrics_Data_STRUCT_SIZE - confirm the expected value
+    size_t structSize;
+    /// [in] assign to NULL
+    void* pPriv;
+    /// [out] unique cubin ID
+    uint32_t cubinCrc;
+    /// [out] function's unique symbol index in the module.
+    uint32_t functionIndex;
+    /// [out] the function name
+    const char* functionName;
+    /// [out] pc offset for the function in a module
+    uint32_t pcOffset;
+    /// [out] array of size equal to the number of instances per metric, which contains the metric ID and metric value.
+    CUpti_SassMetrics_InstanceValue* pInstanceValues;
+} CUpti_SassMetrics_Data;
+
+/**
+ * \brief Params for cuptiSassMetricsFlushData()
+ */
+typedef struct CUpti_SassMetricsFlushData_Params
+{
+    /// [in] equal to CUpti_SassMetricsFlushData_Params_STRUCT_SIZE
+    size_t structSize;
+    /// [in] assign to NULL
+    void* pPriv;
+    /// [in] CUDA context on which SASS metric data collection was enabled.
+    /// If set to NULL, the default context will be considered for SASS metric data collection.
+    CUcontext ctx;
+    /// [in] number of patched instruction records to be retrieved; the user can call cuptiSassMetricsGetDataProperties()
+    /// for getting the total number of records available.
+    size_t numOfPatchedInstructionRecords;
+    /// [in] number of patched instruction record instances for a metric; the user can call cuptiSassMetricsGetDataProperties()
+    /// for getting the total number of instances for each record per metric available.
+    size_t numOfInstances;
+    /// [out] user-allocated buffer that receives the flushed SASS metric records (NOTE(review): presumably numOfPatchedInstructionRecords entries - confirm)
+    CUpti_SassMetrics_Data* pMetricsData;
+} CUpti_SassMetricsFlushData_Params;
+#define CUpti_SassMetricsFlushData_Params_STRUCT_SIZE                      CUPTI_PROFILER_STRUCT_SIZE(CUpti_SassMetricsFlushData_Params, numOfInstances) // NOTE(review): the last field is pMetricsData, not numOfInstances, unlike the other *_STRUCT_SIZE macros in this header - confirm against the library before changing (ABI)
+
+/**
+ * \brief Flush SASS metrics data from the CUPTI internal buffer to the user buffer.
+ * The user needs to allocate the buffer for retrieving the data. The number of records collected
+ * can be queried using the cuptiSassMetricsGetDataProperties() API.
+ *
+ * \param pParams A pointer to \ref CUpti_SassMetricsFlushData_Params
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if any \p pParams is not valid
+ * \retval CUPTI_ERROR_NOT_SUPPORTED indicates that the system/device doesn't support SASS metric data collection.
+ * \retval CUPTI_ERROR_INVALID_OPERATION if this API is called outside the enable/disable range.
+ */
+CUptiResult CUPTIAPI cuptiSassMetricsFlushData(CUpti_SassMetricsFlushData_Params* pParams);
+
+/** @} */ /* END CUPTI_SASS_METRICS_API */
+
+#if defined(__GNUC__) && defined(CUPTI_LIB)
+    #pragma GCC visibility pop
+#endif
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif
+
+#endif // _CUPTI_SASS_METRICS_H_
\ No newline at end of file
diff --git a/.venv/lib/python3.11/site-packages/nvidia/cuda_cupti/include/cupti_target.h b/.venv/lib/python3.11/site-packages/nvidia/cuda_cupti/include/cupti_target.h
new file mode 100644
index 0000000000000000000000000000000000000000..e4b625d45c65288fa2ea7dc05819ee4dfc4cbdd3
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/nvidia/cuda_cupti/include/cupti_target.h
@@ -0,0 +1,43 @@
+#if !defined(_CUPTI_TARGET_H_)
+#define _CUPTI_TARGET_H_
+
+/*
+CUPTI profiler target APIs
+This file contains the CUPTI profiling APIs.
+*/
+#include <cupti_result.h>
+#include <stddef.h>
+#include <stdint.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#if defined(__GNUC__) && defined(CUPTI_LIB)
+    #pragma GCC visibility push(default)
+#endif
+
+#ifndef CUPTI_PROFILER_STRUCT_SIZE /* size in bytes of type_ up to and including its member lastfield_; kept as-is if already defined */
+#define CUPTI_PROFILER_STRUCT_SIZE(type_, lastfield_)                     (offsetof(type_, lastfield_) + sizeof(((type_*)0)->lastfield_))
+#endif /* CUPTI_PROFILER_STRUCT_SIZE */
+
+typedef struct CUpti_Device_GetChipName_Params
+{
+    size_t structSize;                                      //!< [in] NOTE(review): presumably CUpti_Device_GetChipName_Params_STRUCT_SIZE - confirm
+    void* pPriv;                                            //!< [in] assign to NULL
+
+    size_t deviceIndex;                                     //!< [in] NOTE(review): presumably the index of the device to query - confirm
+    const char* pChipName;                                  //!< [out] NOTE(review): presumably the chip name of that device - confirm
+} CUpti_Device_GetChipName_Params;
+
+#define CUpti_Device_GetChipName_Params_STRUCT_SIZE                  CUPTI_PROFILER_STRUCT_SIZE(CUpti_Device_GetChipName_Params, pChipName)
+CUptiResult CUPTIAPI cuptiDeviceGetChipName(CUpti_Device_GetChipName_Params *pParams);
+
+#if defined(__GNUC__) && defined(CUPTI_LIB)
+    #pragma GCC visibility pop
+#endif
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif
+#endif
diff --git a/.venv/lib/python3.11/site-packages/nvidia/cuda_cupti/include/cupti_version.h b/.venv/lib/python3.11/site-packages/nvidia/cuda_cupti/include/cupti_version.h
new file mode 100644
index 0000000000000000000000000000000000000000..d5f1f281202308712652cb0fe5e07448e9c74c9d
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/nvidia/cuda_cupti/include/cupti_version.h
@@ -0,0 +1,134 @@
+/*
+ * Copyright 2010-2023 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#if !defined(_CUPTI_VERSION_H_)
+#define _CUPTI_VERSION_H_
+
+#include <cuda_stdint.h>
+#include <cupti_result.h>
+
+#ifndef CUPTIAPI /* decoration placed on CUPTI API function declarations; kept as-is if already defined */
+#ifdef _WIN32
+#define CUPTIAPI __stdcall
+#else
+#define CUPTIAPI /* expands to nothing when _WIN32 is not defined */
+#endif /* _WIN32 */
+#endif /* CUPTIAPI */
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+#if defined(__GNUC__) && defined(CUPTI_LIB)
+    #pragma GCC visibility push(default)
+#endif
+
+/**
+ * \defgroup CUPTI_VERSION_API CUPTI Version
+ * Function and macro to determine the CUPTI version.
+ * @{
+ */
+
+/**
+ * \brief The API version for this implementation of CUPTI.
+ *
+ * The API version for this implementation of CUPTI. This define along
+ * with \ref cuptiGetVersion can be used to dynamically detect if the
+ * version of CUPTI compiled against matches the version of the loaded
+ * CUPTI library.
+ *
+ * v1 : CUDAToolsSDK 4.0
+ * v2 : CUDAToolsSDK 4.1
+ * v3 : CUDA Toolkit 5.0
+ * v4 : CUDA Toolkit 5.5
+ * v5 : CUDA Toolkit 6.0
+ * v6 : CUDA Toolkit 6.5
+ * v7 : CUDA Toolkit 6.5(with sm_52 support)
+ * v8 : CUDA Toolkit 7.0
+ * v9 : CUDA Toolkit 8.0
+ * v10 : CUDA Toolkit 9.0
+ * v11 : CUDA Toolkit 9.1
+ * v12 : CUDA Toolkit 10.0, 10.1 and 10.2
+ * v13 : CUDA Toolkit 11.0
+ * v14 : CUDA Toolkit 11.1
+ * v15 : CUDA Toolkit 11.2, 11.3 and 11.4
+ * v16 : CUDA Toolkit 11.5
+ * v17 : CUDA Toolkit 11.6
+ * v18 : CUDA Toolkit 11.8
+ * v19 : CUDA Toolkit 12.0
+ * v20 : CUDA Toolkit 12.2
+ * v21 : CUDA Toolkit 12.3
+ * v22 : CUDA Toolkit 12.4
+ */
+#define CUPTI_API_VERSION 22
+
+/**
+ * \brief Get the CUPTI API version.
+ *
+ * Return the API version in \p *version.
+ *
+ * \param version Returns the version
+ *
+ * \retval CUPTI_SUCCESS on success
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p version is NULL
+ * \sa CUPTI_API_VERSION
+ */
+CUptiResult CUPTIAPI cuptiGetVersion(uint32_t *version);
+
+/** @} */ /* END CUPTI_VERSION_API */
+
+#if defined(__GNUC__) && defined(CUPTI_LIB)
+    #pragma GCC visibility pop
+#endif
+
+#if defined(__cplusplus)
+}
+#endif
+
+#endif /*_CUPTI_VERSION_H_*/
diff --git a/.venv/lib/python3.11/site-packages/nvidia/cuda_cupti/include/generated_cudaGL_meta.h b/.venv/lib/python3.11/site-packages/nvidia/cuda_cupti/include/generated_cudaGL_meta.h
new file mode 100644
index 0000000000000000000000000000000000000000..7a52e194b265d32f61d47bd3081f4958755bff46
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/nvidia/cuda_cupti/include/generated_cudaGL_meta.h
@@ -0,0 +1,116 @@
+// This file is generated.  Any changes you make will be lost during the next clean build.
+
+// Dependent includes
+#ifdef __APPLE__
+#include <OpenGL/gl.h>
+#else
+#include <GL/gl.h>
+#endif
+
+// CUDA public interface, for type definitions and cu* function prototypes
+#include "cudaGL.h"
+
+
+// *************************************************************************
+//      Definitions of structs to hold parameters for each function
+// *************************************************************************
+
+typedef struct cuGraphicsGLRegisterBuffer_params_st {
+    CUgraphicsResource *pCudaResource;
+    GLuint buffer;
+    unsigned int Flags;
+} cuGraphicsGLRegisterBuffer_params;
+
+typedef struct cuGraphicsGLRegisterImage_params_st {
+    CUgraphicsResource *pCudaResource;
+    GLuint image;
+    GLenum target;
+    unsigned int Flags;
+} cuGraphicsGLRegisterImage_params;
+
+typedef struct cuGLGetDevices_v2_params_st {
+    unsigned int *pCudaDeviceCount;
+    CUdevice *pCudaDevices;
+    unsigned int cudaDeviceCount;
+    CUGLDeviceList deviceList;
+} cuGLGetDevices_v2_params;
+
+typedef struct cuGLCtxCreate_v2_params_st {
+    CUcontext *pCtx;
+    unsigned int Flags;
+    CUdevice device;
+} cuGLCtxCreate_v2_params;
+
+typedef struct cuGLRegisterBufferObject_params_st {
+    GLuint buffer;
+} cuGLRegisterBufferObject_params;
+
+typedef struct cuGLMapBufferObject_v2_ptds_params_st {
+    CUdeviceptr *dptr;
+    size_t *size;
+    GLuint buffer;
+} cuGLMapBufferObject_v2_ptds_params;
+
+typedef struct cuGLUnmapBufferObject_params_st {
+    GLuint buffer;
+} cuGLUnmapBufferObject_params;
+
+typedef struct cuGLUnregisterBufferObject_params_st {
+    GLuint buffer;
+} cuGLUnregisterBufferObject_params;
+
+typedef struct cuGLSetBufferObjectMapFlags_params_st {
+    GLuint buffer;
+    unsigned int Flags;
+} cuGLSetBufferObjectMapFlags_params;
+
+typedef struct cuGLMapBufferObjectAsync_v2_ptsz_params_st {
+    CUdeviceptr *dptr;
+    size_t *size;
+    GLuint buffer;
+    CUstream hStream;
+} cuGLMapBufferObjectAsync_v2_ptsz_params;
+
+typedef struct cuGLUnmapBufferObjectAsync_params_st {
+    GLuint buffer;
+    CUstream hStream;
+} cuGLUnmapBufferObjectAsync_params;
+
+typedef struct cuGLGetDevices_params_st {
+    unsigned int *pCudaDeviceCount;
+    CUdevice *pCudaDevices;
+    unsigned int cudaDeviceCount;
+    CUGLDeviceList deviceList;
+} cuGLGetDevices_params;
+
+typedef struct cuGLMapBufferObject_v2_params_st {
+    CUdeviceptr *dptr;
+    size_t *size;
+    GLuint buffer;
+} cuGLMapBufferObject_v2_params;
+
+typedef struct cuGLMapBufferObjectAsync_v2_params_st {
+    CUdeviceptr *dptr;
+    size_t *size;
+    GLuint buffer;
+    CUstream hStream;
+} cuGLMapBufferObjectAsync_v2_params;
+
+typedef struct cuGLCtxCreate_params_st {
+    CUcontext *pCtx;
+    unsigned int Flags;
+    CUdevice device;
+} cuGLCtxCreate_params;
+
+typedef struct cuGLMapBufferObject_params_st {
+    CUdeviceptr_v1 *dptr;
+    unsigned int *size;
+    GLuint buffer;
+} cuGLMapBufferObject_params;
+
+typedef struct cuGLMapBufferObjectAsync_params_st {
+    CUdeviceptr_v1 *dptr;
+    unsigned int *size;
+    GLuint buffer;
+    CUstream hStream;
+} cuGLMapBufferObjectAsync_params;
diff --git a/.venv/lib/python3.11/site-packages/nvidia/cuda_cupti/include/generated_cudaVDPAU_meta.h b/.venv/lib/python3.11/site-packages/nvidia/cuda_cupti/include/generated_cudaVDPAU_meta.h
new file mode 100644
index 0000000000000000000000000000000000000000..abc603c8d9be21e012a9b1641330c2e203d623b2
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/nvidia/cuda_cupti/include/generated_cudaVDPAU_meta.h
@@ -0,0 +1,46 @@
+// This file is generated.  Any changes you make will be lost during the next clean build.
+
+// Dependent includes
+#include <vdpau/vdpau.h>
+
+// CUDA public interface, for type definitions and cu* function prototypes
+#include "cudaVDPAU.h"
+
+
+// *************************************************************************
+//      Definitions of structs to hold parameters for each function
+// *************************************************************************
+
+typedef struct cuVDPAUGetDevice_params_st {
+    CUdevice *pDevice;
+    VdpDevice vdpDevice;
+    VdpGetProcAddress *vdpGetProcAddress;
+} cuVDPAUGetDevice_params;
+
+typedef struct cuVDPAUCtxCreate_v2_params_st {
+    CUcontext *pCtx;
+    unsigned int flags;
+    CUdevice device;
+    VdpDevice vdpDevice;
+    VdpGetProcAddress *vdpGetProcAddress;
+} cuVDPAUCtxCreate_v2_params;
+
+typedef struct cuGraphicsVDPAURegisterVideoSurface_params_st {
+    CUgraphicsResource *pCudaResource;
+    VdpVideoSurface vdpSurface;
+    unsigned int flags;
+} cuGraphicsVDPAURegisterVideoSurface_params;
+
+typedef struct cuGraphicsVDPAURegisterOutputSurface_params_st {
+    CUgraphicsResource *pCudaResource;
+    VdpOutputSurface vdpSurface;
+    unsigned int flags;
+} cuGraphicsVDPAURegisterOutputSurface_params;
+
+typedef struct cuVDPAUCtxCreate_params_st {
+    CUcontext *pCtx;
+    unsigned int flags;
+    CUdevice device;
+    VdpDevice vdpDevice;
+    VdpGetProcAddress *vdpGetProcAddress;
+} cuVDPAUCtxCreate_params;
diff --git a/.venv/lib/python3.11/site-packages/nvidia/cuda_cupti/include/generated_cuda_gl_interop_meta.h b/.venv/lib/python3.11/site-packages/nvidia/cuda_cupti/include/generated_cuda_gl_interop_meta.h
new file mode 100644
index 0000000000000000000000000000000000000000..eaba3ac5a760e338f1edc191609f6fa2a32adee7
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/nvidia/cuda_cupti/include/generated_cuda_gl_interop_meta.h
@@ -0,0 +1,71 @@
+// This file is generated.  Any changes you make will be lost during the next clean build.
+
+// CUDA public interface, for type definitions and api function prototypes
+#include "cuda_gl_interop.h"
+
+// *************************************************************************
+//      Definitions of structs to hold parameters for each function
+// *************************************************************************
+
+// Currently used parameter trace structures
+typedef struct cudaGLGetDevices_v4010_params_st {
+    unsigned int *pCudaDeviceCount;
+    int *pCudaDevices;
+    unsigned int cudaDeviceCount;
+    enum cudaGLDeviceList deviceList;
+} cudaGLGetDevices_v4010_params;
+
+typedef struct cudaGraphicsGLRegisterImage_v3020_params_st {
+    struct cudaGraphicsResource **resource;
+    GLuint image;
+    GLenum target;
+    unsigned int flags;
+} cudaGraphicsGLRegisterImage_v3020_params;
+
+typedef struct cudaGraphicsGLRegisterBuffer_v3020_params_st {
+    struct cudaGraphicsResource **resource;
+    GLuint buffer;
+    unsigned int flags;
+} cudaGraphicsGLRegisterBuffer_v3020_params;
+
+typedef struct cudaGLSetGLDevice_v3020_params_st {
+    int device;
+} cudaGLSetGLDevice_v3020_params;
+
+typedef struct cudaGLRegisterBufferObject_v3020_params_st {
+    GLuint bufObj;
+} cudaGLRegisterBufferObject_v3020_params;
+
+typedef struct cudaGLMapBufferObject_v3020_params_st {
+    void **devPtr;
+    GLuint bufObj;
+} cudaGLMapBufferObject_v3020_params;
+
+typedef struct cudaGLUnmapBufferObject_v3020_params_st {
+    GLuint bufObj;
+} cudaGLUnmapBufferObject_v3020_params;
+
+typedef struct cudaGLUnregisterBufferObject_v3020_params_st {
+    GLuint bufObj;
+} cudaGLUnregisterBufferObject_v3020_params;
+
+typedef struct cudaGLSetBufferObjectMapFlags_v3020_params_st {
+    GLuint bufObj;
+    unsigned int flags;
+} cudaGLSetBufferObjectMapFlags_v3020_params;
+
+typedef struct cudaGLMapBufferObjectAsync_v3020_params_st {
+    void **devPtr;
+    GLuint bufObj;
+    cudaStream_t stream;
+} cudaGLMapBufferObjectAsync_v3020_params;
+
+typedef struct cudaGLUnmapBufferObjectAsync_v3020_params_st {
+    GLuint bufObj;
+    cudaStream_t stream;
+} cudaGLUnmapBufferObjectAsync_v3020_params;
+
+// Parameter trace structures for removed functions
+
+
+// End of parameter trace structures
diff --git a/.venv/lib/python3.11/site-packages/nvidia/cuda_cupti/include/generated_cuda_meta.h b/.venv/lib/python3.11/site-packages/nvidia/cuda_cupti/include/generated_cuda_meta.h
new file mode 100644
index 0000000000000000000000000000000000000000..ea4a318decd6c18ffa1ad7d761f0398fd136ccaf
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/nvidia/cuda_cupti/include/generated_cuda_meta.h
@@ -0,0 +1,3563 @@
+// This file is generated.  Any changes you make will be lost during the next clean build.
+
+// No dependent includes
+
+// CUDA public interface, for type definitions and cu* function prototypes
+#include "cuda.h"
+
+
+// *************************************************************************
+//      Definitions of structs to hold parameters for each function
+// *************************************************************************
+
+typedef struct cuGetErrorString_params_st {
+    CUresult error;
+    const char **pStr;
+} cuGetErrorString_params;
+
+typedef struct cuGetErrorName_params_st {
+    CUresult error;
+    const char **pStr;
+} cuGetErrorName_params;
+
+typedef struct cuInit_params_st {
+    unsigned int Flags;
+} cuInit_params;
+
+typedef struct cuDriverGetVersion_params_st {
+    int *driverVersion;
+} cuDriverGetVersion_params;
+
+typedef struct cuDeviceGet_params_st {
+    CUdevice *device;
+    int ordinal;
+} cuDeviceGet_params;
+
+typedef struct cuDeviceGetCount_params_st {
+    int *count;
+} cuDeviceGetCount_params;
+
+typedef struct cuDeviceGetName_params_st {
+    char *name;
+    int len;
+    CUdevice dev;
+} cuDeviceGetName_params;
+
+typedef struct cuDeviceGetUuid_params_st {
+    CUuuid *uuid;
+    CUdevice dev;
+} cuDeviceGetUuid_params;
+
+typedef struct cuDeviceGetUuid_v2_params_st {
+    CUuuid *uuid;
+    CUdevice dev;
+} cuDeviceGetUuid_v2_params;
+
+typedef struct cuDeviceGetLuid_params_st {
+    char *luid;
+    unsigned int *deviceNodeMask;
+    CUdevice dev;
+} cuDeviceGetLuid_params;
+
+typedef struct cuDeviceTotalMem_v2_params_st {
+    size_t *bytes;
+    CUdevice dev;
+} cuDeviceTotalMem_v2_params;
+
+typedef struct cuDeviceGetTexture1DLinearMaxWidth_params_st {
+    size_t *maxWidthInElements;
+    CUarray_format format;
+    unsigned numChannels;
+    CUdevice dev;
+} cuDeviceGetTexture1DLinearMaxWidth_params;
+
+typedef struct cuDeviceGetAttribute_params_st {
+    int *pi;
+    CUdevice_attribute attrib;
+    CUdevice dev;
+} cuDeviceGetAttribute_params;
+
+typedef struct cuDeviceGetNvSciSyncAttributes_params_st {
+    void *nvSciSyncAttrList;
+    CUdevice dev;
+    int flags;
+} cuDeviceGetNvSciSyncAttributes_params;
+
+typedef struct cuDeviceSetMemPool_params_st {
+    CUdevice dev;
+    CUmemoryPool pool;
+} cuDeviceSetMemPool_params;
+
+typedef struct cuDeviceGetMemPool_params_st {
+    CUmemoryPool *pool;
+    CUdevice dev;
+} cuDeviceGetMemPool_params;
+
+typedef struct cuDeviceGetDefaultMemPool_params_st {
+    CUmemoryPool *pool_out;
+    CUdevice dev;
+} cuDeviceGetDefaultMemPool_params;
+
+typedef struct cuDeviceGetExecAffinitySupport_params_st {
+    int *pi;
+    CUexecAffinityType type;
+    CUdevice dev;
+} cuDeviceGetExecAffinitySupport_params;
+
+typedef struct cuFlushGPUDirectRDMAWrites_params_st {
+    CUflushGPUDirectRDMAWritesTarget target;
+    CUflushGPUDirectRDMAWritesScope scope;
+} cuFlushGPUDirectRDMAWrites_params;
+
+typedef struct cuDeviceGetProperties_params_st {
+    CUdevprop *prop;
+    CUdevice dev;
+} cuDeviceGetProperties_params;
+
+typedef struct cuDeviceComputeCapability_params_st {
+    int *major;
+    int *minor;
+    CUdevice dev;
+} cuDeviceComputeCapability_params;
+
+typedef struct cuDevicePrimaryCtxRetain_params_st {
+    CUcontext *pctx;
+    CUdevice dev;
+} cuDevicePrimaryCtxRetain_params;
+
+typedef struct cuDevicePrimaryCtxRelease_v2_params_st {
+    CUdevice dev;
+} cuDevicePrimaryCtxRelease_v2_params;
+
+typedef struct cuDevicePrimaryCtxSetFlags_v2_params_st {
+    CUdevice dev;
+    unsigned int flags;
+} cuDevicePrimaryCtxSetFlags_v2_params;
+
+typedef struct cuDevicePrimaryCtxGetState_params_st {
+    CUdevice dev;
+    unsigned int *flags;
+    int *active;
+} cuDevicePrimaryCtxGetState_params;
+
+typedef struct cuDevicePrimaryCtxReset_v2_params_st {
+    CUdevice dev;
+} cuDevicePrimaryCtxReset_v2_params;
+
+typedef struct cuCtxCreate_v2_params_st {
+    CUcontext *pctx;
+    unsigned int flags;
+    CUdevice dev;
+} cuCtxCreate_v2_params;
+
+typedef struct cuCtxCreate_v3_params_st {
+    CUcontext *pctx;
+    CUexecAffinityParam *paramsArray;
+    int numParams;
+    unsigned int flags;
+    CUdevice dev;
+} cuCtxCreate_v3_params;
+
+typedef struct cuCtxDestroy_v2_params_st {
+    CUcontext ctx;
+} cuCtxDestroy_v2_params;
+
+typedef struct cuCtxPushCurrent_v2_params_st {
+    CUcontext ctx;
+} cuCtxPushCurrent_v2_params;
+
+typedef struct cuCtxPopCurrent_v2_params_st {
+    CUcontext *pctx;
+} cuCtxPopCurrent_v2_params;
+
+typedef struct cuCtxSetCurrent_params_st {
+    CUcontext ctx;
+} cuCtxSetCurrent_params;
+
+typedef struct cuCtxGetCurrent_params_st {
+    CUcontext *pctx;
+} cuCtxGetCurrent_params;
+
+typedef struct cuCtxGetDevice_params_st {
+    CUdevice *device;
+} cuCtxGetDevice_params;
+
+typedef struct cuCtxGetFlags_params_st {
+    unsigned int *flags;
+} cuCtxGetFlags_params;
+
+typedef struct cuCtxSetFlags_params_st {
+    unsigned int flags;
+} cuCtxSetFlags_params;
+
+typedef struct cuCtxGetId_params_st {
+    CUcontext ctx;
+    unsigned long long *ctxId;
+} cuCtxGetId_params;
+
+typedef struct cuCtxSetLimit_params_st {
+    CUlimit limit;
+    size_t value;
+} cuCtxSetLimit_params;
+
+typedef struct cuCtxGetLimit_params_st {
+    size_t *pvalue;
+    CUlimit limit;
+} cuCtxGetLimit_params;
+
+typedef struct cuCtxGetCacheConfig_params_st {
+    CUfunc_cache *pconfig;
+} cuCtxGetCacheConfig_params;
+
+typedef struct cuCtxSetCacheConfig_params_st {
+    CUfunc_cache config;
+} cuCtxSetCacheConfig_params;
+
+typedef struct cuCtxGetApiVersion_params_st {
+    CUcontext ctx;
+    unsigned int *version;
+} cuCtxGetApiVersion_params;
+
+typedef struct cuCtxGetStreamPriorityRange_params_st {
+    int *leastPriority;
+    int *greatestPriority;
+} cuCtxGetStreamPriorityRange_params;
+
+typedef struct cuCtxGetExecAffinity_params_st {
+    CUexecAffinityParam *pExecAffinity;
+    CUexecAffinityType type;
+} cuCtxGetExecAffinity_params;
+
+typedef struct cuCtxAttach_params_st {
+    CUcontext *pctx;
+    unsigned int flags;
+} cuCtxAttach_params;
+
+typedef struct cuCtxDetach_params_st {
+    CUcontext ctx;
+} cuCtxDetach_params;
+
+typedef struct cuCtxGetSharedMemConfig_params_st {
+    CUsharedconfig *pConfig;
+} cuCtxGetSharedMemConfig_params;
+
+typedef struct cuCtxSetSharedMemConfig_params_st {
+    CUsharedconfig config;
+} cuCtxSetSharedMemConfig_params;
+
+typedef struct cuModuleLoad_params_st {
+    CUmodule *module;
+    const char *fname;
+} cuModuleLoad_params;
+
+typedef struct cuModuleLoadData_params_st {
+    CUmodule *module;
+    const void *image;
+} cuModuleLoadData_params;
+
+typedef struct cuModuleLoadDataEx_params_st {
+    CUmodule *module;
+    const void *image;
+    unsigned int numOptions;
+    CUjit_option *options;
+    void **optionValues;
+} cuModuleLoadDataEx_params;
+
+typedef struct cuModuleLoadFatBinary_params_st {
+    CUmodule *module;
+    const void *fatCubin;
+} cuModuleLoadFatBinary_params;
+
+typedef struct cuModuleUnload_params_st {
+    CUmodule hmod;
+} cuModuleUnload_params;
+
+typedef struct cuModuleGetLoadingMode_params_st {
+    CUmoduleLoadingMode *mode;
+} cuModuleGetLoadingMode_params;
+
+typedef struct cuModuleGetFunction_params_st {
+    CUfunction *hfunc;
+    CUmodule hmod;
+    const char *name;
+} cuModuleGetFunction_params;
+
+typedef struct cuModuleGetFunctionCount_params_st {
+    unsigned int *count;
+    CUmodule mod;
+} cuModuleGetFunctionCount_params;
+
+typedef struct cuModuleEnumerateFunctions_params_st {
+    CUfunction *functions;
+    unsigned int numFunctions;
+    CUmodule mod;
+} cuModuleEnumerateFunctions_params;
+
+typedef struct cuModuleGetGlobal_v2_params_st {
+    CUdeviceptr *dptr;
+    size_t *bytes;
+    CUmodule hmod;
+    const char *name;
+} cuModuleGetGlobal_v2_params;
+
+typedef struct cuLinkCreate_v2_params_st {
+    unsigned int numOptions;
+    CUjit_option *options;
+    void **optionValues;
+    CUlinkState *stateOut;
+} cuLinkCreate_v2_params;
+
+typedef struct cuLinkAddData_v2_params_st {
+    CUlinkState state;
+    CUjitInputType type;
+    void *data;
+    size_t size;
+    const char *name;
+    unsigned int numOptions;
+    CUjit_option *options;
+    void **optionValues;
+} cuLinkAddData_v2_params;
+
+typedef struct cuLinkAddFile_v2_params_st {
+    CUlinkState state;
+    CUjitInputType type;
+    const char *path;
+    unsigned int numOptions;
+    CUjit_option *options;
+    void **optionValues;
+} cuLinkAddFile_v2_params;
+
+typedef struct cuLinkComplete_params_st {
+    CUlinkState state;
+    void **cubinOut;
+    size_t *sizeOut;
+} cuLinkComplete_params;
+
+typedef struct cuLinkDestroy_params_st {
+    CUlinkState state;
+} cuLinkDestroy_params;
+
+typedef struct cuModuleGetTexRef_params_st {
+    CUtexref *pTexRef;
+    CUmodule hmod;
+    const char *name;
+} cuModuleGetTexRef_params;
+
+typedef struct cuModuleGetSurfRef_params_st {
+    CUsurfref *pSurfRef;
+    CUmodule hmod;
+    const char *name;
+} cuModuleGetSurfRef_params;
+
+typedef struct cuLibraryLoadData_params_st {
+    CUlibrary *library;
+    const void *code;
+    CUjit_option *jitOptions;
+    void **jitOptionsValues;
+    unsigned int numJitOptions;
+    CUlibraryOption *libraryOptions;
+    void **libraryOptionValues;
+    unsigned int numLibraryOptions;
+} cuLibraryLoadData_params;
+
+typedef struct cuLibraryLoadFromFile_params_st {
+    CUlibrary *library;
+    const char *fileName;
+    CUjit_option *jitOptions;
+    void **jitOptionsValues;
+    unsigned int numJitOptions;
+    CUlibraryOption *libraryOptions;
+    void **libraryOptionValues;
+    unsigned int numLibraryOptions;
+} cuLibraryLoadFromFile_params;
+
+typedef struct cuLibraryUnload_params_st {
+    CUlibrary library;
+} cuLibraryUnload_params;
+
+typedef struct cuLibraryGetKernel_params_st {
+    CUkernel *pKernel;
+    CUlibrary library;
+    const char *name;
+} cuLibraryGetKernel_params;
+
+typedef struct cuLibraryGetKernelCount_params_st {
+    unsigned int *count;
+    CUlibrary lib;
+} cuLibraryGetKernelCount_params;
+
+typedef struct cuLibraryEnumerateKernels_params_st {
+    CUkernel *kernels;
+    unsigned int numKernels;
+    CUlibrary lib;
+} cuLibraryEnumerateKernels_params;
+
+typedef struct cuLibraryGetModule_params_st {
+    CUmodule *pMod;
+    CUlibrary library;
+} cuLibraryGetModule_params;
+
+typedef struct cuKernelGetFunction_params_st {
+    CUfunction *pFunc;
+    CUkernel kernel;
+} cuKernelGetFunction_params;
+
+typedef struct cuLibraryGetGlobal_params_st {
+    CUdeviceptr *dptr;
+    size_t *bytes;
+    CUlibrary library;
+    const char *name;
+} cuLibraryGetGlobal_params;
+
+typedef struct cuLibraryGetManaged_params_st {
+    CUdeviceptr *dptr;
+    size_t *bytes;
+    CUlibrary library;
+    const char *name;
+} cuLibraryGetManaged_params;
+
+typedef struct cuLibraryGetUnifiedFunction_params_st {
+    void **fptr;
+    CUlibrary library;
+    const char *symbol;
+} cuLibraryGetUnifiedFunction_params;
+
+typedef struct cuKernelGetAttribute_params_st {
+    int *pi;
+    CUfunction_attribute attrib;
+    CUkernel kernel;
+    CUdevice dev;
+} cuKernelGetAttribute_params;
+
+typedef struct cuKernelSetAttribute_params_st {
+    CUfunction_attribute attrib;
+    int val;
+    CUkernel kernel;
+    CUdevice dev;
+} cuKernelSetAttribute_params;
+
+typedef struct cuKernelSetCacheConfig_params_st {
+    CUkernel kernel;
+    CUfunc_cache config;
+    CUdevice dev;
+} cuKernelSetCacheConfig_params;
+
+typedef struct cuKernelGetName_params_st {
+    const char **name;
+    CUkernel hfunc;
+} cuKernelGetName_params;
+
+typedef struct cuKernelGetParamInfo_params_st {
+    CUkernel kernel;
+    size_t paramIndex;
+    size_t *paramOffset;
+    size_t *paramSize;
+} cuKernelGetParamInfo_params;
+
+typedef struct cuMemGetInfo_v2_params_st {
+    size_t *free;
+    size_t *total;
+} cuMemGetInfo_v2_params;
+
+typedef struct cuMemAlloc_v2_params_st {
+    CUdeviceptr *dptr;
+    size_t bytesize;
+} cuMemAlloc_v2_params;
+
+typedef struct cuMemAllocPitch_v2_params_st {
+    CUdeviceptr *dptr;
+    size_t *pPitch;
+    size_t WidthInBytes;
+    size_t Height;
+    unsigned int ElementSizeBytes;
+} cuMemAllocPitch_v2_params;
+
+typedef struct cuMemFree_v2_params_st {
+    CUdeviceptr dptr;
+} cuMemFree_v2_params;
+
+typedef struct cuMemGetAddressRange_v2_params_st {
+    CUdeviceptr *pbase;
+    size_t *psize;
+    CUdeviceptr dptr;
+} cuMemGetAddressRange_v2_params;
+
+typedef struct cuMemAllocHost_v2_params_st {
+    void **pp;
+    size_t bytesize;
+} cuMemAllocHost_v2_params;
+
+typedef struct cuMemFreeHost_params_st {
+    void *p;
+} cuMemFreeHost_params;
+
+typedef struct cuMemHostAlloc_params_st {
+    void **pp;
+    size_t bytesize;
+    unsigned int Flags;
+} cuMemHostAlloc_params;
+
+typedef struct cuMemHostGetDevicePointer_v2_params_st {
+    CUdeviceptr *pdptr;
+    void *p;
+    unsigned int Flags;
+} cuMemHostGetDevicePointer_v2_params;
+
+typedef struct cuMemHostGetFlags_params_st {
+    unsigned int *pFlags;
+    void *p;
+} cuMemHostGetFlags_params;
+
+typedef struct cuMemAllocManaged_params_st {
+    CUdeviceptr *dptr;
+    size_t bytesize;
+    unsigned int flags;
+} cuMemAllocManaged_params;
+
+typedef struct cuDeviceGetByPCIBusId_params_st {
+    CUdevice *dev;
+    const char *pciBusId;
+} cuDeviceGetByPCIBusId_params;
+
+typedef struct cuDeviceGetPCIBusId_params_st {
+    char *pciBusId;
+    int len;
+    CUdevice dev;
+} cuDeviceGetPCIBusId_params;
+
+typedef struct cuIpcGetEventHandle_params_st {
+    CUipcEventHandle *pHandle;
+    CUevent event;
+} cuIpcGetEventHandle_params;
+
+typedef struct cuIpcOpenEventHandle_params_st {
+    CUevent *phEvent;
+    CUipcEventHandle handle;
+} cuIpcOpenEventHandle_params;
+
+typedef struct cuIpcGetMemHandle_params_st {
+    CUipcMemHandle *pHandle;
+    CUdeviceptr dptr;
+} cuIpcGetMemHandle_params;
+
+typedef struct cuIpcOpenMemHandle_v2_params_st {
+    CUdeviceptr *pdptr;
+    CUipcMemHandle handle;
+    unsigned int Flags;
+} cuIpcOpenMemHandle_v2_params;
+
+typedef struct cuIpcCloseMemHandle_params_st {
+    CUdeviceptr dptr;
+} cuIpcCloseMemHandle_params;
+
+typedef struct cuMemHostRegister_v2_params_st {
+    void *p;
+    size_t bytesize;
+    unsigned int Flags;
+} cuMemHostRegister_v2_params;
+
+typedef struct cuMemHostUnregister_params_st {
+    void *p;
+} cuMemHostUnregister_params;
+
+typedef struct cuMemcpy_ptds_params_st {
+    CUdeviceptr dst;
+    CUdeviceptr src;
+    size_t ByteCount;
+} cuMemcpy_ptds_params;
+
+typedef struct cuMemcpyPeer_ptds_params_st {
+    CUdeviceptr dstDevice;
+    CUcontext dstContext;
+    CUdeviceptr srcDevice;
+    CUcontext srcContext;
+    size_t ByteCount;
+} cuMemcpyPeer_ptds_params;
+
+typedef struct cuMemcpyHtoD_v2_ptds_params_st {
+    CUdeviceptr dstDevice;
+    const void *srcHost;
+    size_t ByteCount;
+} cuMemcpyHtoD_v2_ptds_params;
+
+typedef struct cuMemcpyDtoH_v2_ptds_params_st {
+    void *dstHost;
+    CUdeviceptr srcDevice;
+    size_t ByteCount;
+} cuMemcpyDtoH_v2_ptds_params;
+
+typedef struct cuMemcpyDtoD_v2_ptds_params_st {
+    CUdeviceptr dstDevice;
+    CUdeviceptr srcDevice;
+    size_t ByteCount;
+} cuMemcpyDtoD_v2_ptds_params;
+
+typedef struct cuMemcpyDtoA_v2_ptds_params_st {
+    CUarray dstArray;
+    size_t dstOffset;
+    CUdeviceptr srcDevice;
+    size_t ByteCount;
+} cuMemcpyDtoA_v2_ptds_params;
+
+typedef struct cuMemcpyAtoD_v2_ptds_params_st {
+    CUdeviceptr dstDevice;
+    CUarray srcArray;
+    size_t srcOffset;
+    size_t ByteCount;
+} cuMemcpyAtoD_v2_ptds_params;
+
+typedef struct cuMemcpyHtoA_v2_ptds_params_st {
+    CUarray dstArray;
+    size_t dstOffset;
+    const void *srcHost;
+    size_t ByteCount;
+} cuMemcpyHtoA_v2_ptds_params;
+
+typedef struct cuMemcpyAtoH_v2_ptds_params_st {
+    void *dstHost;
+    CUarray srcArray;
+    size_t srcOffset;
+    size_t ByteCount;
+} cuMemcpyAtoH_v2_ptds_params;
+
+typedef struct cuMemcpyAtoA_v2_ptds_params_st {
+    CUarray dstArray;
+    size_t dstOffset;
+    CUarray srcArray;
+    size_t srcOffset;
+    size_t ByteCount;
+} cuMemcpyAtoA_v2_ptds_params;
+
+typedef struct cuMemcpy2D_v2_ptds_params_st {
+    const CUDA_MEMCPY2D *pCopy;
+} cuMemcpy2D_v2_ptds_params;
+
+typedef struct cuMemcpy2DUnaligned_v2_ptds_params_st {
+    const CUDA_MEMCPY2D *pCopy;
+} cuMemcpy2DUnaligned_v2_ptds_params;
+
+typedef struct cuMemcpy3D_v2_ptds_params_st {
+    const CUDA_MEMCPY3D *pCopy;
+} cuMemcpy3D_v2_ptds_params;
+
+typedef struct cuMemcpy3DPeer_ptds_params_st {
+    const CUDA_MEMCPY3D_PEER *pCopy;
+} cuMemcpy3DPeer_ptds_params;
+
+typedef struct cuMemcpyAsync_ptsz_params_st {
+    CUdeviceptr dst;
+    CUdeviceptr src;
+    size_t ByteCount;
+    CUstream hStream;
+} cuMemcpyAsync_ptsz_params;
+
+typedef struct cuMemcpyPeerAsync_ptsz_params_st {
+    CUdeviceptr dstDevice;
+    CUcontext dstContext;
+    CUdeviceptr srcDevice;
+    CUcontext srcContext;
+    size_t ByteCount;
+    CUstream hStream;
+} cuMemcpyPeerAsync_ptsz_params;
+
+typedef struct cuMemcpyHtoDAsync_v2_ptsz_params_st {
+    CUdeviceptr dstDevice;
+    const void *srcHost;
+    size_t ByteCount;
+    CUstream hStream;
+} cuMemcpyHtoDAsync_v2_ptsz_params;
+
+typedef struct cuMemcpyDtoHAsync_v2_ptsz_params_st {
+    void *dstHost;
+    CUdeviceptr srcDevice;
+    size_t ByteCount;
+    CUstream hStream;
+} cuMemcpyDtoHAsync_v2_ptsz_params;
+
+typedef struct cuMemcpyDtoDAsync_v2_ptsz_params_st {
+    CUdeviceptr dstDevice;
+    CUdeviceptr srcDevice;
+    size_t ByteCount;
+    CUstream hStream;
+} cuMemcpyDtoDAsync_v2_ptsz_params;
+
+typedef struct cuMemcpyHtoAAsync_v2_ptsz_params_st {
+    CUarray dstArray;
+    size_t dstOffset;
+    const void *srcHost;
+    size_t ByteCount;
+    CUstream hStream;
+} cuMemcpyHtoAAsync_v2_ptsz_params;
+
+typedef struct cuMemcpyAtoHAsync_v2_ptsz_params_st {
+    void *dstHost;
+    CUarray srcArray;
+    size_t srcOffset;
+    size_t ByteCount;
+    CUstream hStream;
+} cuMemcpyAtoHAsync_v2_ptsz_params;
+
+typedef struct cuMemcpy2DAsync_v2_ptsz_params_st {
+    const CUDA_MEMCPY2D *pCopy;
+    CUstream hStream;
+} cuMemcpy2DAsync_v2_ptsz_params;
+
+typedef struct cuMemcpy3DAsync_v2_ptsz_params_st {
+    const CUDA_MEMCPY3D *pCopy;
+    CUstream hStream;
+} cuMemcpy3DAsync_v2_ptsz_params;
+
+typedef struct cuMemcpy3DPeerAsync_ptsz_params_st {
+    const CUDA_MEMCPY3D_PEER *pCopy;
+    CUstream hStream;
+} cuMemcpy3DPeerAsync_ptsz_params;
+
+typedef struct cuMemsetD8_v2_ptds_params_st {
+    CUdeviceptr dstDevice;
+    unsigned char uc;
+    size_t N;
+} cuMemsetD8_v2_ptds_params;
+
+typedef struct cuMemsetD16_v2_ptds_params_st {
+    CUdeviceptr dstDevice;
+    unsigned short us;
+    size_t N;
+} cuMemsetD16_v2_ptds_params;
+
+typedef struct cuMemsetD32_v2_ptds_params_st {
+    CUdeviceptr dstDevice;
+    unsigned int ui;
+    size_t N;
+} cuMemsetD32_v2_ptds_params;
+
+typedef struct cuMemsetD2D8_v2_ptds_params_st {
+    CUdeviceptr dstDevice;
+    size_t dstPitch;
+    unsigned char uc;
+    size_t Width;
+    size_t Height;
+} cuMemsetD2D8_v2_ptds_params;
+
+typedef struct cuMemsetD2D16_v2_ptds_params_st {
+    CUdeviceptr dstDevice;
+    size_t dstPitch;
+    unsigned short us;
+    size_t Width;
+    size_t Height;
+} cuMemsetD2D16_v2_ptds_params;
+
+typedef struct cuMemsetD2D32_v2_ptds_params_st {
+    CUdeviceptr dstDevice;
+    size_t dstPitch;
+    unsigned int ui;
+    size_t Width;
+    size_t Height;
+} cuMemsetD2D32_v2_ptds_params;
+
+typedef struct cuMemsetD8Async_ptsz_params_st {
+    CUdeviceptr dstDevice;
+    unsigned char uc;
+    size_t N;
+    CUstream hStream;
+} cuMemsetD8Async_ptsz_params;
+
+typedef struct cuMemsetD16Async_ptsz_params_st {
+    CUdeviceptr dstDevice;
+    unsigned short us;
+    size_t N;
+    CUstream hStream;
+} cuMemsetD16Async_ptsz_params;
+
+typedef struct cuMemsetD32Async_ptsz_params_st {
+    CUdeviceptr dstDevice;
+    unsigned int ui;
+    size_t N;
+    CUstream hStream;
+} cuMemsetD32Async_ptsz_params;
+
+typedef struct cuMemsetD2D8Async_ptsz_params_st {
+    CUdeviceptr dstDevice;
+    size_t dstPitch;
+    unsigned char uc;
+    size_t Width;
+    size_t Height;
+    CUstream hStream;
+} cuMemsetD2D8Async_ptsz_params;
+
+typedef struct cuMemsetD2D16Async_ptsz_params_st {
+    CUdeviceptr dstDevice;
+    size_t dstPitch;
+    unsigned short us;
+    size_t Width;
+    size_t Height;
+    CUstream hStream;
+} cuMemsetD2D16Async_ptsz_params;
+
+typedef struct cuMemsetD2D32Async_ptsz_params_st {
+    CUdeviceptr dstDevice;
+    size_t dstPitch;
+    unsigned int ui;
+    size_t Width;
+    size_t Height;
+    CUstream hStream;
+} cuMemsetD2D32Async_ptsz_params;
+
+typedef struct cuArrayCreate_v2_params_st {
+    CUarray *pHandle;
+    const CUDA_ARRAY_DESCRIPTOR *pAllocateArray;
+} cuArrayCreate_v2_params;
+
+typedef struct cuArrayGetDescriptor_v2_params_st {
+    CUDA_ARRAY_DESCRIPTOR *pArrayDescriptor;
+    CUarray hArray;
+} cuArrayGetDescriptor_v2_params;
+
+typedef struct cuArrayGetSparseProperties_params_st {
+    CUDA_ARRAY_SPARSE_PROPERTIES *sparseProperties;
+    CUarray array;
+} cuArrayGetSparseProperties_params;
+
+typedef struct cuMipmappedArrayGetSparseProperties_params_st {
+    CUDA_ARRAY_SPARSE_PROPERTIES *sparseProperties;
+    CUmipmappedArray mipmap;
+} cuMipmappedArrayGetSparseProperties_params;
+
+typedef struct cuArrayGetMemoryRequirements_params_st {
+    CUDA_ARRAY_MEMORY_REQUIREMENTS *memoryRequirements;
+    CUarray array;
+    CUdevice device;
+} cuArrayGetMemoryRequirements_params;
+
+typedef struct cuMipmappedArrayGetMemoryRequirements_params_st {
+    CUDA_ARRAY_MEMORY_REQUIREMENTS *memoryRequirements;
+    CUmipmappedArray mipmap;
+    CUdevice device;
+} cuMipmappedArrayGetMemoryRequirements_params;
+
+typedef struct cuArrayGetPlane_params_st {
+    CUarray *pPlaneArray;
+    CUarray hArray;
+    unsigned int planeIdx;
+} cuArrayGetPlane_params;
+
+typedef struct cuArrayDestroy_params_st {
+    CUarray hArray;
+} cuArrayDestroy_params;
+
+typedef struct cuArray3DCreate_v2_params_st {
+    CUarray *pHandle;
+    const CUDA_ARRAY3D_DESCRIPTOR *pAllocateArray;
+} cuArray3DCreate_v2_params;
+
+typedef struct cuArray3DGetDescriptor_v2_params_st {
+    CUDA_ARRAY3D_DESCRIPTOR *pArrayDescriptor;
+    CUarray hArray;
+} cuArray3DGetDescriptor_v2_params;
+
+typedef struct cuMipmappedArrayCreate_params_st {
+    CUmipmappedArray *pHandle;
+    const CUDA_ARRAY3D_DESCRIPTOR *pMipmappedArrayDesc;
+    unsigned int numMipmapLevels;
+} cuMipmappedArrayCreate_params;
+
+typedef struct cuMipmappedArrayGetLevel_params_st {
+    CUarray *pLevelArray;
+    CUmipmappedArray hMipmappedArray;
+    unsigned int level;
+} cuMipmappedArrayGetLevel_params;
+
+typedef struct cuMipmappedArrayDestroy_params_st {
+    CUmipmappedArray hMipmappedArray;
+} cuMipmappedArrayDestroy_params;
+
+typedef struct cuMemGetHandleForAddressRange_params_st {
+    void *handle;
+    CUdeviceptr dptr;
+    size_t size;
+    CUmemRangeHandleType handleType;
+    unsigned long long flags;
+} cuMemGetHandleForAddressRange_params;
+
+typedef struct cuMemAddressReserve_params_st {
+    CUdeviceptr *ptr;
+    size_t size;
+    size_t alignment;
+    CUdeviceptr addr;
+    unsigned long long flags;
+} cuMemAddressReserve_params;
+
+typedef struct cuMemAddressFree_params_st {
+    CUdeviceptr ptr;
+    size_t size;
+} cuMemAddressFree_params;
+
+typedef struct cuMemCreate_params_st {
+    CUmemGenericAllocationHandle *handle;
+    size_t size;
+    const CUmemAllocationProp *prop;
+    unsigned long long flags;
+} cuMemCreate_params;
+
+typedef struct cuMemRelease_params_st {
+    CUmemGenericAllocationHandle handle;
+} cuMemRelease_params;
+
+typedef struct cuMemMap_params_st {
+    CUdeviceptr ptr;
+    size_t size;
+    size_t offset;
+    CUmemGenericAllocationHandle handle;
+    unsigned long long flags;
+} cuMemMap_params;
+
+typedef struct cuMemMapArrayAsync_ptsz_params_st {
+    CUarrayMapInfo *mapInfoList;
+    unsigned int count;
+    CUstream hStream;
+} cuMemMapArrayAsync_ptsz_params;
+
+typedef struct cuMemUnmap_params_st {
+    CUdeviceptr ptr;
+    size_t size;
+} cuMemUnmap_params;
+
+typedef struct cuMemSetAccess_params_st {
+    CUdeviceptr ptr;
+    size_t size;
+    const CUmemAccessDesc *desc;
+    size_t count;
+} cuMemSetAccess_params;
+
+typedef struct cuMemGetAccess_params_st {
+    unsigned long long *flags;
+    const CUmemLocation *location;
+    CUdeviceptr ptr;
+} cuMemGetAccess_params;
+
+typedef struct cuMemExportToShareableHandle_params_st {
+    void *shareableHandle;
+    CUmemGenericAllocationHandle handle;
+    CUmemAllocationHandleType handleType;
+    unsigned long long flags;
+} cuMemExportToShareableHandle_params;
+
+typedef struct cuMemImportFromShareableHandle_params_st {
+    CUmemGenericAllocationHandle *handle;
+    void *osHandle;
+    CUmemAllocationHandleType shHandleType;
+} cuMemImportFromShareableHandle_params;
+
+typedef struct cuMemGetAllocationGranularity_params_st {
+    size_t *granularity;
+    const CUmemAllocationProp *prop;
+    CUmemAllocationGranularity_flags option;
+} cuMemGetAllocationGranularity_params;
+
+typedef struct cuMemGetAllocationPropertiesFromHandle_params_st {
+    CUmemAllocationProp *prop;
+    CUmemGenericAllocationHandle handle;
+} cuMemGetAllocationPropertiesFromHandle_params;
+
+typedef struct cuMemRetainAllocationHandle_params_st {
+    CUmemGenericAllocationHandle *handle;
+    void *addr;
+} cuMemRetainAllocationHandle_params;
+
+typedef struct cuMemFreeAsync_ptsz_params_st {
+    CUdeviceptr dptr;
+    CUstream hStream;
+} cuMemFreeAsync_ptsz_params;
+
+typedef struct cuMemAllocAsync_ptsz_params_st {
+    CUdeviceptr *dptr;
+    size_t bytesize;
+    CUstream hStream;
+} cuMemAllocAsync_ptsz_params;
+
+typedef struct cuMemPoolTrimTo_params_st {
+    CUmemoryPool pool;
+    size_t minBytesToKeep;
+} cuMemPoolTrimTo_params;
+
+typedef struct cuMemPoolSetAttribute_params_st {
+    CUmemoryPool pool;
+    CUmemPool_attribute attr;
+    void *value;
+} cuMemPoolSetAttribute_params;
+
+typedef struct cuMemPoolGetAttribute_params_st {
+    CUmemoryPool pool;
+    CUmemPool_attribute attr;
+    void *value;
+} cuMemPoolGetAttribute_params;
+
+typedef struct cuMemPoolSetAccess_params_st {
+    CUmemoryPool pool;
+    const CUmemAccessDesc *map;
+    size_t count;
+} cuMemPoolSetAccess_params;
+
+typedef struct cuMemPoolGetAccess_params_st {
+    CUmemAccess_flags *flags;
+    CUmemoryPool memPool;
+    CUmemLocation *location;
+} cuMemPoolGetAccess_params;
+
+typedef struct cuMemPoolCreate_params_st {
+    CUmemoryPool *pool;
+    const CUmemPoolProps *poolProps;
+} cuMemPoolCreate_params;
+
+typedef struct cuMemPoolDestroy_params_st {
+    CUmemoryPool pool;
+} cuMemPoolDestroy_params;
+
+typedef struct cuMemAllocFromPoolAsync_ptsz_params_st {
+    CUdeviceptr *dptr;
+    size_t bytesize;
+    CUmemoryPool pool;
+    CUstream hStream;
+} cuMemAllocFromPoolAsync_ptsz_params;
+
+typedef struct cuMemPoolExportToShareableHandle_params_st {
+    void *handle_out;
+    CUmemoryPool pool;
+    CUmemAllocationHandleType handleType;
+    unsigned long long flags;
+} cuMemPoolExportToShareableHandle_params;
+
+typedef struct cuMemPoolImportFromShareableHandle_params_st {
+    CUmemoryPool *pool_out;
+    void *handle;
+    CUmemAllocationHandleType handleType;
+    unsigned long long flags;
+} cuMemPoolImportFromShareableHandle_params;
+
+typedef struct cuMemPoolExportPointer_params_st {
+    CUmemPoolPtrExportData *shareData_out;
+    CUdeviceptr ptr;
+} cuMemPoolExportPointer_params;
+
+typedef struct cuMemPoolImportPointer_params_st {
+    CUdeviceptr *ptr_out;
+    CUmemoryPool pool;
+    CUmemPoolPtrExportData *shareData;
+} cuMemPoolImportPointer_params;
+
+typedef struct cuMulticastCreate_params_st {
+    CUmemGenericAllocationHandle *mcHandle;
+    const CUmulticastObjectProp *prop;
+} cuMulticastCreate_params;
+
+typedef struct cuMulticastAddDevice_params_st {
+    CUmemGenericAllocationHandle mcHandle;
+    CUdevice dev;
+} cuMulticastAddDevice_params;
+
+typedef struct cuMulticastBindMem_params_st {
+    CUmemGenericAllocationHandle mcHandle;
+    size_t mcOffset;
+    CUmemGenericAllocationHandle memHandle;
+    size_t memOffset;
+    size_t size;
+    unsigned long long flags;
+} cuMulticastBindMem_params;
+
+typedef struct cuMulticastBindAddr_params_st {
+    CUmemGenericAllocationHandle mcHandle;
+    size_t mcOffset;
+    CUdeviceptr memptr;
+    size_t size;
+    unsigned long long flags;
+} cuMulticastBindAddr_params;
+
+typedef struct cuMulticastUnbind_params_st {
+    CUmemGenericAllocationHandle mcHandle;
+    CUdevice dev;
+    size_t mcOffset;
+    size_t size;
+} cuMulticastUnbind_params;
+
+typedef struct cuMulticastGetGranularity_params_st {
+    size_t *granularity;
+    const CUmulticastObjectProp *prop;
+    CUmulticastGranularity_flags option;
+} cuMulticastGetGranularity_params;
+
+typedef struct cuPointerGetAttribute_params_st {
+    void *data;
+    CUpointer_attribute attribute;
+    CUdeviceptr ptr;
+} cuPointerGetAttribute_params;
+
+typedef struct cuMemPrefetchAsync_ptsz_params_st {
+    CUdeviceptr devPtr;
+    size_t count;
+    CUdevice dstDevice;
+    CUstream hStream;
+} cuMemPrefetchAsync_ptsz_params;
+
+typedef struct cuMemPrefetchAsync_v2_ptsz_params_st {
+    CUdeviceptr devPtr;
+    size_t count;
+    CUmemLocation location;
+    unsigned int flags;
+    CUstream hStream;
+} cuMemPrefetchAsync_v2_ptsz_params;
+
+typedef struct cuMemAdvise_params_st {
+    CUdeviceptr devPtr;
+    size_t count;
+    CUmem_advise advice;
+    CUdevice device;
+} cuMemAdvise_params;
+
+typedef struct cuMemAdvise_v2_params_st {
+    CUdeviceptr devPtr;
+    size_t count;
+    CUmem_advise advice;
+    CUmemLocation location;
+} cuMemAdvise_v2_params;
+
+typedef struct cuMemRangeGetAttribute_params_st {
+    void *data;
+    size_t dataSize;
+    CUmem_range_attribute attribute;
+    CUdeviceptr devPtr;
+    size_t count;
+} cuMemRangeGetAttribute_params;
+
+typedef struct cuMemRangeGetAttributes_params_st {
+    void **data;
+    size_t *dataSizes;
+    CUmem_range_attribute *attributes;
+    size_t numAttributes;
+    CUdeviceptr devPtr;
+    size_t count;
+} cuMemRangeGetAttributes_params;
+
+typedef struct cuPointerSetAttribute_params_st {
+    const void *value;
+    CUpointer_attribute attribute;
+    CUdeviceptr ptr;
+} cuPointerSetAttribute_params;
+
+typedef struct cuPointerGetAttributes_params_st {
+    unsigned int numAttributes;
+    CUpointer_attribute *attributes;
+    void **data;
+    CUdeviceptr ptr;
+} cuPointerGetAttributes_params;
+
+typedef struct cuStreamCreate_params_st {
+    CUstream *phStream;
+    unsigned int Flags;
+} cuStreamCreate_params;
+
+typedef struct cuStreamCreateWithPriority_params_st {
+    CUstream *phStream;
+    unsigned int flags;
+    int priority;
+} cuStreamCreateWithPriority_params;
+
+typedef struct cuStreamGetPriority_ptsz_params_st {
+    CUstream hStream;
+    int *priority;
+} cuStreamGetPriority_ptsz_params;
+
+typedef struct cuStreamGetFlags_ptsz_params_st {
+    CUstream hStream;
+    unsigned int *flags;
+} cuStreamGetFlags_ptsz_params;
+
+typedef struct cuStreamGetId_ptsz_params_st {
+    CUstream hStream;
+    unsigned long long *streamId;
+} cuStreamGetId_ptsz_params;
+
+typedef struct cuStreamGetCtx_ptsz_params_st {
+    CUstream hStream;
+    CUcontext *pctx;
+} cuStreamGetCtx_ptsz_params;
+
+typedef struct cuStreamWaitEvent_ptsz_params_st {
+    CUstream hStream;
+    CUevent hEvent;
+    unsigned int Flags;
+} cuStreamWaitEvent_ptsz_params;
+
+typedef struct cuStreamAddCallback_ptsz_params_st {
+    CUstream hStream;
+    CUstreamCallback callback;
+    void *userData;
+    unsigned int flags;
+} cuStreamAddCallback_ptsz_params;
+
+typedef struct cuStreamBeginCapture_v2_ptsz_params_st {
+    CUstream hStream;
+    CUstreamCaptureMode mode;
+} cuStreamBeginCapture_v2_ptsz_params;
+
+typedef struct cuStreamBeginCaptureToGraph_ptsz_params_st {
+    CUstream hStream;
+    CUgraph hGraph;
+    const CUgraphNode *dependencies;
+    const CUgraphEdgeData *dependencyData;
+    size_t numDependencies;
+    CUstreamCaptureMode mode;
+} cuStreamBeginCaptureToGraph_ptsz_params;
+
+typedef struct cuThreadExchangeStreamCaptureMode_params_st {
+    CUstreamCaptureMode *mode;
+} cuThreadExchangeStreamCaptureMode_params;
+
+typedef struct cuStreamEndCapture_ptsz_params_st {
+    CUstream hStream;
+    CUgraph *phGraph;
+} cuStreamEndCapture_ptsz_params;
+
+typedef struct cuStreamIsCapturing_ptsz_params_st {
+    CUstream hStream;
+    CUstreamCaptureStatus *captureStatus;
+} cuStreamIsCapturing_ptsz_params;
+
+typedef struct cuStreamGetCaptureInfo_v2_ptsz_params_st {
+    CUstream hStream;
+    CUstreamCaptureStatus *captureStatus_out;
+    cuuint64_t *id_out;
+    CUgraph *graph_out;
+    const CUgraphNode **dependencies_out;
+    size_t *numDependencies_out;
+} cuStreamGetCaptureInfo_v2_ptsz_params;
+
+typedef struct cuStreamGetCaptureInfo_v3_ptsz_params_st {
+    CUstream hStream;
+    CUstreamCaptureStatus *captureStatus_out;
+    cuuint64_t *id_out;
+    CUgraph *graph_out;
+    const CUgraphNode **dependencies_out;
+    const CUgraphEdgeData **edgeData_out;
+    size_t *numDependencies_out;
+} cuStreamGetCaptureInfo_v3_ptsz_params;
+
+typedef struct cuStreamUpdateCaptureDependencies_ptsz_params_st {
+    CUstream hStream;
+    CUgraphNode *dependencies;
+    size_t numDependencies;
+    unsigned int flags;
+} cuStreamUpdateCaptureDependencies_ptsz_params;
+
+typedef struct cuStreamUpdateCaptureDependencies_v2_ptsz_params_st {
+    CUstream hStream;
+    CUgraphNode *dependencies;
+    const CUgraphEdgeData *dependencyData;
+    size_t numDependencies;
+    unsigned int flags;
+} cuStreamUpdateCaptureDependencies_v2_ptsz_params;
+
+typedef struct cuStreamAttachMemAsync_ptsz_params_st {
+    CUstream hStream;
+    CUdeviceptr dptr;
+    size_t length;
+    unsigned int flags;
+} cuStreamAttachMemAsync_ptsz_params;
+
+typedef struct cuStreamQuery_ptsz_params_st {
+    CUstream hStream;
+} cuStreamQuery_ptsz_params;
+
+typedef struct cuStreamSynchronize_ptsz_params_st {
+    CUstream hStream;
+} cuStreamSynchronize_ptsz_params;
+
+typedef struct cuStreamDestroy_v2_params_st {
+    CUstream hStream;
+} cuStreamDestroy_v2_params;
+
+typedef struct cuStreamCopyAttributes_ptsz_params_st {
+    CUstream dst;
+    CUstream src;
+} cuStreamCopyAttributes_ptsz_params;
+
+typedef struct cuStreamGetAttribute_ptsz_params_st {
+    CUstream hStream;
+    CUstreamAttrID attr;
+    CUstreamAttrValue *value_out;
+} cuStreamGetAttribute_ptsz_params;
+
+typedef struct cuStreamSetAttribute_ptsz_params_st {
+    CUstream hStream;
+    CUstreamAttrID attr;
+    const CUstreamAttrValue *value;
+} cuStreamSetAttribute_ptsz_params;
+
+typedef struct cuEventCreate_params_st {
+    CUevent *phEvent;
+    unsigned int Flags;
+} cuEventCreate_params;
+
+typedef struct cuEventRecord_ptsz_params_st {
+    CUevent hEvent;
+    CUstream hStream;
+} cuEventRecord_ptsz_params;
+
+typedef struct cuEventRecordWithFlags_ptsz_params_st {
+    CUevent hEvent;
+    CUstream hStream;
+    unsigned int flags;
+} cuEventRecordWithFlags_ptsz_params;
+
+typedef struct cuEventQuery_params_st {
+    CUevent hEvent;
+} cuEventQuery_params;
+
+typedef struct cuEventSynchronize_params_st {
+    CUevent hEvent;
+} cuEventSynchronize_params;
+
+typedef struct cuEventDestroy_v2_params_st {
+    CUevent hEvent;
+} cuEventDestroy_v2_params;
+
+typedef struct cuEventElapsedTime_params_st {
+    float *pMilliseconds;
+    CUevent hStart;
+    CUevent hEnd;
+} cuEventElapsedTime_params;
+
+typedef struct cuImportExternalMemory_params_st {
+    CUexternalMemory *extMem_out;
+    const CUDA_EXTERNAL_MEMORY_HANDLE_DESC *memHandleDesc;
+} cuImportExternalMemory_params;
+
+typedef struct cuExternalMemoryGetMappedBuffer_params_st {
+    CUdeviceptr *devPtr;
+    CUexternalMemory extMem;
+    const CUDA_EXTERNAL_MEMORY_BUFFER_DESC *bufferDesc;
+} cuExternalMemoryGetMappedBuffer_params;
+
+typedef struct cuExternalMemoryGetMappedMipmappedArray_params_st {
+    CUmipmappedArray *mipmap;
+    CUexternalMemory extMem;
+    const CUDA_EXTERNAL_MEMORY_MIPMAPPED_ARRAY_DESC *mipmapDesc;
+} cuExternalMemoryGetMappedMipmappedArray_params;
+
+typedef struct cuDestroyExternalMemory_params_st {
+    CUexternalMemory extMem;
+} cuDestroyExternalMemory_params;
+
+typedef struct cuImportExternalSemaphore_params_st {
+    CUexternalSemaphore *extSem_out;
+    const CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC *semHandleDesc;
+} cuImportExternalSemaphore_params;
+
+typedef struct cuSignalExternalSemaphoresAsync_ptsz_params_st {
+    const CUexternalSemaphore *extSemArray;
+    const CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS *paramsArray;
+    unsigned int numExtSems;
+    CUstream stream;
+} cuSignalExternalSemaphoresAsync_ptsz_params;
+
+typedef struct cuWaitExternalSemaphoresAsync_ptsz_params_st {
+    const CUexternalSemaphore *extSemArray;
+    const CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS *paramsArray;
+    unsigned int numExtSems;
+    CUstream stream;
+} cuWaitExternalSemaphoresAsync_ptsz_params;
+
+typedef struct cuDestroyExternalSemaphore_params_st {
+    CUexternalSemaphore extSem;
+} cuDestroyExternalSemaphore_params;
+
+typedef struct cuStreamWaitValue32_v2_ptsz_params_st {
+    CUstream stream;
+    CUdeviceptr addr;
+    cuuint32_t value;
+    unsigned int flags;
+} cuStreamWaitValue32_v2_ptsz_params;
+
+typedef struct cuStreamWaitValue64_v2_ptsz_params_st {
+    CUstream stream;
+    CUdeviceptr addr;
+    cuuint64_t value;
+    unsigned int flags;
+} cuStreamWaitValue64_v2_ptsz_params;
+
+typedef struct cuStreamWriteValue32_v2_ptsz_params_st {
+    CUstream stream;
+    CUdeviceptr addr;
+    cuuint32_t value;
+    unsigned int flags;
+} cuStreamWriteValue32_v2_ptsz_params;
+
+typedef struct cuStreamWriteValue64_v2_ptsz_params_st {
+    CUstream stream;
+    CUdeviceptr addr;
+    cuuint64_t value;
+    unsigned int flags;
+} cuStreamWriteValue64_v2_ptsz_params;
+
+typedef struct cuStreamBatchMemOp_v2_ptsz_params_st {
+    CUstream stream;
+    unsigned int count;
+    CUstreamBatchMemOpParams *paramArray;
+    unsigned int flags;
+} cuStreamBatchMemOp_v2_ptsz_params;
+
+typedef struct cuFuncGetAttribute_params_st {
+    int *pi;
+    CUfunction_attribute attrib;
+    CUfunction hfunc;
+} cuFuncGetAttribute_params;
+
+typedef struct cuFuncSetAttribute_params_st {
+    CUfunction hfunc;
+    CUfunction_attribute attrib;
+    int value;
+} cuFuncSetAttribute_params;
+
+typedef struct cuFuncSetCacheConfig_params_st {
+    CUfunction hfunc;
+    CUfunc_cache config;
+} cuFuncSetCacheConfig_params;
+
+typedef struct cuFuncGetModule_params_st {
+    CUmodule *hmod;
+    CUfunction hfunc;
+} cuFuncGetModule_params;
+
+typedef struct cuFuncGetName_params_st {
+    const char **name;
+    CUfunction hfunc;
+} cuFuncGetName_params;
+
+typedef struct cuFuncGetParamInfo_params_st {
+    CUfunction func;
+    size_t paramIndex;
+    size_t *paramOffset;
+    size_t *paramSize;
+} cuFuncGetParamInfo_params;
+
+typedef struct cuFuncIsLoaded_params_st {
+    CUfunctionLoadingState *state;
+    CUfunction function;
+} cuFuncIsLoaded_params;
+
+typedef struct cuFuncLoad_params_st {
+    CUfunction function;
+} cuFuncLoad_params;
+
+typedef struct cuLaunchKernel_ptsz_params_st {
+    CUfunction f;
+    unsigned int gridDimX;
+    unsigned int gridDimY;
+    unsigned int gridDimZ;
+    unsigned int blockDimX;
+    unsigned int blockDimY;
+    unsigned int blockDimZ;
+    unsigned int sharedMemBytes;
+    CUstream hStream;
+    void **kernelParams;
+    void **extra;
+} cuLaunchKernel_ptsz_params;
+
+typedef struct cuLaunchKernelEx_ptsz_params_st {
+    const CUlaunchConfig *config;
+    CUfunction f;
+    void **kernelParams;
+    void **extra;
+} cuLaunchKernelEx_ptsz_params;
+
+typedef struct cuLaunchCooperativeKernel_ptsz_params_st {
+    CUfunction f;
+    unsigned int gridDimX;
+    unsigned int gridDimY;
+    unsigned int gridDimZ;
+    unsigned int blockDimX;
+    unsigned int blockDimY;
+    unsigned int blockDimZ;
+    unsigned int sharedMemBytes;
+    CUstream hStream;
+    void **kernelParams;
+} cuLaunchCooperativeKernel_ptsz_params;
+
+typedef struct cuLaunchCooperativeKernelMultiDevice_params_st {
+    CUDA_LAUNCH_PARAMS *launchParamsList;
+    unsigned int numDevices;
+    unsigned int flags;
+} cuLaunchCooperativeKernelMultiDevice_params;
+
+typedef struct cuLaunchHostFunc_ptsz_params_st {
+    CUstream hStream;
+    CUhostFn fn;
+    void *userData;
+} cuLaunchHostFunc_ptsz_params;
+
+typedef struct cuFuncSetBlockShape_params_st {
+    CUfunction hfunc;
+    int x;
+    int y;
+    int z;
+} cuFuncSetBlockShape_params;
+
+typedef struct cuFuncSetSharedSize_params_st {
+    CUfunction hfunc;
+    unsigned int bytes;
+} cuFuncSetSharedSize_params;
+
+typedef struct cuParamSetSize_params_st {
+    CUfunction hfunc;
+    unsigned int numbytes;
+} cuParamSetSize_params;
+
+typedef struct cuParamSeti_params_st {
+    CUfunction hfunc;
+    int offset;
+    unsigned int value;
+} cuParamSeti_params;
+
+typedef struct cuParamSetf_params_st {
+    CUfunction hfunc;
+    int offset;
+    float value;
+} cuParamSetf_params;
+
+typedef struct cuParamSetv_params_st {
+    CUfunction hfunc;
+    int offset;
+    void *ptr;
+    unsigned int numbytes;
+} cuParamSetv_params;
+
+typedef struct cuLaunch_params_st {
+    CUfunction f;
+} cuLaunch_params;
+
+typedef struct cuLaunchGrid_params_st {
+    CUfunction f;
+    int grid_width;
+    int grid_height;
+} cuLaunchGrid_params;
+
+typedef struct cuLaunchGridAsync_params_st {
+    CUfunction f;
+    int grid_width;
+    int grid_height;
+    CUstream hStream;
+} cuLaunchGridAsync_params;
+
+typedef struct cuParamSetTexRef_params_st {
+    CUfunction hfunc;
+    int texunit;
+    CUtexref hTexRef;
+} cuParamSetTexRef_params;
+
+typedef struct cuFuncSetSharedMemConfig_params_st {
+    CUfunction hfunc;
+    CUsharedconfig config;
+} cuFuncSetSharedMemConfig_params;
+
+typedef struct cuGraphCreate_params_st {
+    CUgraph *phGraph;
+    unsigned int flags;
+} cuGraphCreate_params;
+
+typedef struct cuGraphAddKernelNode_v2_params_st {
+    CUgraphNode *phGraphNode;
+    CUgraph hGraph;
+    const CUgraphNode *dependencies;
+    size_t numDependencies;
+    const CUDA_KERNEL_NODE_PARAMS *nodeParams;
+} cuGraphAddKernelNode_v2_params;
+
+typedef struct cuGraphKernelNodeGetParams_v2_params_st {
+    CUgraphNode hNode;
+    CUDA_KERNEL_NODE_PARAMS *nodeParams;
+} cuGraphKernelNodeGetParams_v2_params;
+
+typedef struct cuGraphKernelNodeSetParams_v2_params_st {
+    CUgraphNode hNode;
+    const CUDA_KERNEL_NODE_PARAMS *nodeParams;
+} cuGraphKernelNodeSetParams_v2_params;
+
+typedef struct cuGraphAddMemcpyNode_params_st {
+    CUgraphNode *phGraphNode;
+    CUgraph hGraph;
+    const CUgraphNode *dependencies;
+    size_t numDependencies;
+    const CUDA_MEMCPY3D *copyParams;
+    CUcontext ctx;
+} cuGraphAddMemcpyNode_params;
+
+typedef struct cuGraphMemcpyNodeGetParams_params_st {
+    CUgraphNode hNode;
+    CUDA_MEMCPY3D *nodeParams;
+} cuGraphMemcpyNodeGetParams_params;
+
+typedef struct cuGraphMemcpyNodeSetParams_params_st {
+    CUgraphNode hNode;
+    const CUDA_MEMCPY3D *nodeParams;
+} cuGraphMemcpyNodeSetParams_params;
+
+typedef struct cuGraphAddMemsetNode_params_st {
+    CUgraphNode *phGraphNode;
+    CUgraph hGraph;
+    const CUgraphNode *dependencies;
+    size_t numDependencies;
+    const CUDA_MEMSET_NODE_PARAMS *memsetParams;
+    CUcontext ctx;
+} cuGraphAddMemsetNode_params;
+
+typedef struct cuGraphMemsetNodeGetParams_params_st {
+    CUgraphNode hNode;
+    CUDA_MEMSET_NODE_PARAMS *nodeParams;
+} cuGraphMemsetNodeGetParams_params;
+
+typedef struct cuGraphMemsetNodeSetParams_params_st {
+    CUgraphNode hNode;
+    const CUDA_MEMSET_NODE_PARAMS *nodeParams;
+} cuGraphMemsetNodeSetParams_params;
+
+typedef struct cuGraphAddHostNode_params_st {
+    CUgraphNode *phGraphNode;
+    CUgraph hGraph;
+    const CUgraphNode *dependencies;
+    size_t numDependencies;
+    const CUDA_HOST_NODE_PARAMS *nodeParams;
+} cuGraphAddHostNode_params;
+
+typedef struct cuGraphHostNodeGetParams_params_st {
+    CUgraphNode hNode;
+    CUDA_HOST_NODE_PARAMS *nodeParams;
+} cuGraphHostNodeGetParams_params;
+
+typedef struct cuGraphHostNodeSetParams_params_st {
+    CUgraphNode hNode;
+    const CUDA_HOST_NODE_PARAMS *nodeParams;
+} cuGraphHostNodeSetParams_params;
+
+typedef struct cuGraphAddChildGraphNode_params_st {
+    CUgraphNode *phGraphNode;
+    CUgraph hGraph;
+    const CUgraphNode *dependencies;
+    size_t numDependencies;
+    CUgraph childGraph;
+} cuGraphAddChildGraphNode_params;
+
+typedef struct cuGraphChildGraphNodeGetGraph_params_st {
+    CUgraphNode hNode;
+    CUgraph *phGraph;
+} cuGraphChildGraphNodeGetGraph_params;
+
+typedef struct cuGraphAddEmptyNode_params_st {
+    CUgraphNode *phGraphNode;
+    CUgraph hGraph;
+    const CUgraphNode *dependencies;
+    size_t numDependencies;
+} cuGraphAddEmptyNode_params;
+
+typedef struct cuGraphAddEventRecordNode_params_st {
+    CUgraphNode *phGraphNode;
+    CUgraph hGraph;
+    const CUgraphNode *dependencies;
+    size_t numDependencies;
+    CUevent event;
+} cuGraphAddEventRecordNode_params;
+
+typedef struct cuGraphEventRecordNodeGetEvent_params_st {
+    CUgraphNode hNode;
+    CUevent *event_out;
+} cuGraphEventRecordNodeGetEvent_params;
+
+typedef struct cuGraphEventRecordNodeSetEvent_params_st {
+    CUgraphNode hNode;
+    CUevent event;
+} cuGraphEventRecordNodeSetEvent_params;
+
+typedef struct cuGraphAddEventWaitNode_params_st {
+    CUgraphNode *phGraphNode;
+    CUgraph hGraph;
+    const CUgraphNode *dependencies;
+    size_t numDependencies;
+    CUevent event;
+} cuGraphAddEventWaitNode_params;
+
+typedef struct cuGraphEventWaitNodeGetEvent_params_st {
+    CUgraphNode hNode;
+    CUevent *event_out;
+} cuGraphEventWaitNodeGetEvent_params;
+
+typedef struct cuGraphEventWaitNodeSetEvent_params_st {
+    CUgraphNode hNode;
+    CUevent event;
+} cuGraphEventWaitNodeSetEvent_params;
+
+typedef struct cuGraphAddExternalSemaphoresSignalNode_params_st {
+    CUgraphNode *phGraphNode;
+    CUgraph hGraph;
+    const CUgraphNode *dependencies;
+    size_t numDependencies;
+    const CUDA_EXT_SEM_SIGNAL_NODE_PARAMS *nodeParams;
+} cuGraphAddExternalSemaphoresSignalNode_params;
+
+typedef struct cuGraphExternalSemaphoresSignalNodeGetParams_params_st {
+    CUgraphNode hNode;
+    CUDA_EXT_SEM_SIGNAL_NODE_PARAMS *params_out;
+} cuGraphExternalSemaphoresSignalNodeGetParams_params;
+
+typedef struct cuGraphExternalSemaphoresSignalNodeSetParams_params_st {
+    CUgraphNode hNode;
+    const CUDA_EXT_SEM_SIGNAL_NODE_PARAMS *nodeParams;
+} cuGraphExternalSemaphoresSignalNodeSetParams_params;
+
+typedef struct cuGraphAddExternalSemaphoresWaitNode_params_st {
+    CUgraphNode *phGraphNode;
+    CUgraph hGraph;
+    const CUgraphNode *dependencies;
+    size_t numDependencies;
+    const CUDA_EXT_SEM_WAIT_NODE_PARAMS *nodeParams;
+} cuGraphAddExternalSemaphoresWaitNode_params;
+
+typedef struct cuGraphExternalSemaphoresWaitNodeGetParams_params_st {
+    CUgraphNode hNode;
+    CUDA_EXT_SEM_WAIT_NODE_PARAMS *params_out;
+} cuGraphExternalSemaphoresWaitNodeGetParams_params;
+
+typedef struct cuGraphExternalSemaphoresWaitNodeSetParams_params_st {
+    CUgraphNode hNode;
+    const CUDA_EXT_SEM_WAIT_NODE_PARAMS *nodeParams;
+} cuGraphExternalSemaphoresWaitNodeSetParams_params;
+
+typedef struct cuGraphAddBatchMemOpNode_params_st {
+    CUgraphNode *phGraphNode;
+    CUgraph hGraph;
+    const CUgraphNode *dependencies;
+    size_t numDependencies;
+    const CUDA_BATCH_MEM_OP_NODE_PARAMS *nodeParams;
+} cuGraphAddBatchMemOpNode_params;
+
+typedef struct cuGraphBatchMemOpNodeGetParams_params_st {
+    CUgraphNode hNode;
+    CUDA_BATCH_MEM_OP_NODE_PARAMS *nodeParams_out;
+} cuGraphBatchMemOpNodeGetParams_params;
+
+typedef struct cuGraphBatchMemOpNodeSetParams_params_st {
+    CUgraphNode hNode;
+    const CUDA_BATCH_MEM_OP_NODE_PARAMS *nodeParams;
+} cuGraphBatchMemOpNodeSetParams_params;
+
+typedef struct cuGraphExecBatchMemOpNodeSetParams_params_st {
+    CUgraphExec hGraphExec;
+    CUgraphNode hNode;
+    const CUDA_BATCH_MEM_OP_NODE_PARAMS *nodeParams;
+} cuGraphExecBatchMemOpNodeSetParams_params;
+
+typedef struct cuGraphAddMemAllocNode_params_st {
+    CUgraphNode *phGraphNode;
+    CUgraph hGraph;
+    const CUgraphNode *dependencies;
+    size_t numDependencies;
+    CUDA_MEM_ALLOC_NODE_PARAMS *nodeParams;
+} cuGraphAddMemAllocNode_params;
+
+typedef struct cuGraphMemAllocNodeGetParams_params_st {
+    CUgraphNode hNode;
+    CUDA_MEM_ALLOC_NODE_PARAMS *params_out;
+} cuGraphMemAllocNodeGetParams_params;
+
+typedef struct cuGraphAddMemFreeNode_params_st {
+    CUgraphNode *phGraphNode;
+    CUgraph hGraph;
+    const CUgraphNode *dependencies;
+    size_t numDependencies;
+    CUdeviceptr dptr;
+} cuGraphAddMemFreeNode_params;
+
+typedef struct cuGraphMemFreeNodeGetParams_params_st {
+    CUgraphNode hNode;
+    CUdeviceptr *dptr_out;
+} cuGraphMemFreeNodeGetParams_params;
+
+typedef struct cuDeviceGraphMemTrim_params_st {
+    CUdevice device;
+} cuDeviceGraphMemTrim_params;
+
+typedef struct cuDeviceGetGraphMemAttribute_params_st {
+    CUdevice device;
+    CUgraphMem_attribute attr;
+    void *value;
+} cuDeviceGetGraphMemAttribute_params;
+
+typedef struct cuDeviceSetGraphMemAttribute_params_st {
+    CUdevice device;
+    CUgraphMem_attribute attr;
+    void *value;
+} cuDeviceSetGraphMemAttribute_params;
+
+typedef struct cuGraphClone_params_st {
+    CUgraph *phGraphClone;
+    CUgraph originalGraph;
+} cuGraphClone_params;
+
+typedef struct cuGraphNodeFindInClone_params_st {
+    CUgraphNode *phNode;
+    CUgraphNode hOriginalNode;
+    CUgraph hClonedGraph;
+} cuGraphNodeFindInClone_params;
+
+typedef struct cuGraphNodeGetType_params_st {
+    CUgraphNode hNode;
+    CUgraphNodeType *type;
+} cuGraphNodeGetType_params;
+
+typedef struct cuGraphGetNodes_params_st {
+    CUgraph hGraph;
+    CUgraphNode *nodes;
+    size_t *numNodes;
+} cuGraphGetNodes_params;
+
+typedef struct cuGraphGetRootNodes_params_st {
+    CUgraph hGraph;
+    CUgraphNode *rootNodes;
+    size_t *numRootNodes;
+} cuGraphGetRootNodes_params;
+
+typedef struct cuGraphGetEdges_params_st {
+    CUgraph hGraph;
+    CUgraphNode *from;
+    CUgraphNode *to;
+    size_t *numEdges;
+} cuGraphGetEdges_params;
+
+typedef struct cuGraphGetEdges_v2_params_st {
+    CUgraph hGraph;
+    CUgraphNode *from;
+    CUgraphNode *to;
+    CUgraphEdgeData *edgeData;
+    size_t *numEdges;
+} cuGraphGetEdges_v2_params;
+
+typedef struct cuGraphNodeGetDependencies_params_st {
+    CUgraphNode hNode;
+    CUgraphNode *dependencies;
+    size_t *numDependencies;
+} cuGraphNodeGetDependencies_params;
+
+typedef struct cuGraphNodeGetDependencies_v2_params_st {
+    CUgraphNode hNode;
+    CUgraphNode *dependencies;
+    CUgraphEdgeData *edgeData;
+    size_t *numDependencies;
+} cuGraphNodeGetDependencies_v2_params;
+
+typedef struct cuGraphNodeGetDependentNodes_params_st {
+    CUgraphNode hNode;
+    CUgraphNode *dependentNodes;
+    size_t *numDependentNodes;
+} cuGraphNodeGetDependentNodes_params;
+
+typedef struct cuGraphNodeGetDependentNodes_v2_params_st {
+    CUgraphNode hNode;
+    CUgraphNode *dependentNodes;
+    CUgraphEdgeData *edgeData;
+    size_t *numDependentNodes;
+} cuGraphNodeGetDependentNodes_v2_params;
+
+typedef struct cuGraphAddDependencies_params_st {
+    CUgraph hGraph;
+    const CUgraphNode *from;
+    const CUgraphNode *to;
+    size_t numDependencies;
+} cuGraphAddDependencies_params;
+
+typedef struct cuGraphAddDependencies_v2_params_st {
+    CUgraph hGraph;
+    const CUgraphNode *from;
+    const CUgraphNode *to;
+    const CUgraphEdgeData *edgeData;
+    size_t numDependencies;
+} cuGraphAddDependencies_v2_params;
+
+typedef struct cuGraphRemoveDependencies_params_st {
+    CUgraph hGraph;
+    const CUgraphNode *from;
+    const CUgraphNode *to;
+    size_t numDependencies;
+} cuGraphRemoveDependencies_params;
+
+typedef struct cuGraphRemoveDependencies_v2_params_st {
+    CUgraph hGraph;
+    const CUgraphNode *from;
+    const CUgraphNode *to;
+    const CUgraphEdgeData *edgeData;
+    size_t numDependencies;
+} cuGraphRemoveDependencies_v2_params;
+
+typedef struct cuGraphDestroyNode_params_st {
+    CUgraphNode hNode;
+} cuGraphDestroyNode_params;
+
+typedef struct cuGraphInstantiateWithFlags_params_st {
+    CUgraphExec *phGraphExec;
+    CUgraph hGraph;
+    unsigned long long flags;
+} cuGraphInstantiateWithFlags_params;
+
+typedef struct cuGraphInstantiateWithParams_ptsz_params_st {
+    CUgraphExec *phGraphExec;
+    CUgraph hGraph;
+    CUDA_GRAPH_INSTANTIATE_PARAMS *instantiateParams;
+} cuGraphInstantiateWithParams_ptsz_params;
+
+typedef struct cuGraphExecGetFlags_params_st {
+    CUgraphExec hGraphExec;
+    cuuint64_t *flags;
+} cuGraphExecGetFlags_params;
+
+typedef struct cuGraphExecKernelNodeSetParams_v2_params_st {
+    CUgraphExec hGraphExec;
+    CUgraphNode hNode;
+    const CUDA_KERNEL_NODE_PARAMS *nodeParams;
+} cuGraphExecKernelNodeSetParams_v2_params;
+
+typedef struct cuGraphExecMemcpyNodeSetParams_params_st {
+    CUgraphExec hGraphExec;
+    CUgraphNode hNode;
+    const CUDA_MEMCPY3D *copyParams;
+    CUcontext ctx;
+} cuGraphExecMemcpyNodeSetParams_params;
+
+typedef struct cuGraphExecMemsetNodeSetParams_params_st {
+    CUgraphExec hGraphExec;
+    CUgraphNode hNode;
+    const CUDA_MEMSET_NODE_PARAMS *memsetParams;
+    CUcontext ctx;
+} cuGraphExecMemsetNodeSetParams_params;
+
+typedef struct cuGraphExecHostNodeSetParams_params_st {
+    CUgraphExec hGraphExec;
+    CUgraphNode hNode;
+    const CUDA_HOST_NODE_PARAMS *nodeParams;
+} cuGraphExecHostNodeSetParams_params;
+
+typedef struct cuGraphExecChildGraphNodeSetParams_params_st {
+    CUgraphExec hGraphExec;
+    CUgraphNode hNode;
+    CUgraph childGraph;
+} cuGraphExecChildGraphNodeSetParams_params;
+
+typedef struct cuGraphExecEventRecordNodeSetEvent_params_st {
+    CUgraphExec hGraphExec;
+    CUgraphNode hNode;
+    CUevent event;
+} cuGraphExecEventRecordNodeSetEvent_params;
+
+typedef struct cuGraphExecEventWaitNodeSetEvent_params_st {
+    CUgraphExec hGraphExec;
+    CUgraphNode hNode;
+    CUevent event;
+} cuGraphExecEventWaitNodeSetEvent_params;
+
+typedef struct cuGraphExecExternalSemaphoresSignalNodeSetParams_params_st {
+    CUgraphExec hGraphExec;
+    CUgraphNode hNode;
+    const CUDA_EXT_SEM_SIGNAL_NODE_PARAMS *nodeParams;
+} cuGraphExecExternalSemaphoresSignalNodeSetParams_params;
+
+typedef struct cuGraphExecExternalSemaphoresWaitNodeSetParams_params_st {
+    CUgraphExec hGraphExec;
+    CUgraphNode hNode;
+    const CUDA_EXT_SEM_WAIT_NODE_PARAMS *nodeParams;
+} cuGraphExecExternalSemaphoresWaitNodeSetParams_params;
+
+typedef struct cuGraphNodeSetEnabled_params_st {
+    CUgraphExec hGraphExec;
+    CUgraphNode hNode;
+    unsigned int isEnabled;
+} cuGraphNodeSetEnabled_params;
+
+typedef struct cuGraphNodeGetEnabled_params_st {
+    CUgraphExec hGraphExec;
+    CUgraphNode hNode;
+    unsigned int *isEnabled;
+} cuGraphNodeGetEnabled_params;
+
+typedef struct cuGraphUpload_ptsz_params_st {
+    CUgraphExec hGraphExec;
+    CUstream hStream;
+} cuGraphUpload_ptsz_params;
+
+typedef struct cuGraphLaunch_ptsz_params_st {
+    CUgraphExec hGraphExec;
+    CUstream hStream;
+} cuGraphLaunch_ptsz_params;
+
+typedef struct cuGraphExecDestroy_params_st {
+    CUgraphExec hGraphExec;
+} cuGraphExecDestroy_params;
+
+typedef struct cuGraphDestroy_params_st {
+    CUgraph hGraph;
+} cuGraphDestroy_params;
+
+typedef struct cuGraphExecUpdate_v2_params_st {
+    CUgraphExec hGraphExec;
+    CUgraph hGraph;
+    CUgraphExecUpdateResultInfo *resultInfo;
+} cuGraphExecUpdate_v2_params;
+
+typedef struct cuGraphKernelNodeCopyAttributes_params_st {
+    CUgraphNode dst;
+    CUgraphNode src;
+} cuGraphKernelNodeCopyAttributes_params;
+
+typedef struct cuGraphKernelNodeGetAttribute_params_st {
+    CUgraphNode hNode;
+    CUkernelNodeAttrID attr;
+    CUkernelNodeAttrValue *value_out;
+} cuGraphKernelNodeGetAttribute_params;
+
+typedef struct cuGraphKernelNodeSetAttribute_params_st {
+    CUgraphNode hNode;
+    CUkernelNodeAttrID attr;
+    const CUkernelNodeAttrValue *value;
+} cuGraphKernelNodeSetAttribute_params;
+
+typedef struct cuGraphDebugDotPrint_params_st {
+    CUgraph hGraph;
+    const char *path;
+    unsigned int flags;
+} cuGraphDebugDotPrint_params;
+
+typedef struct cuUserObjectCreate_params_st {
+    CUuserObject *object_out;
+    void *ptr;
+    CUhostFn destroy;
+    unsigned int initialRefcount;
+    unsigned int flags;
+} cuUserObjectCreate_params;
+
+typedef struct cuUserObjectRetain_params_st {
+    CUuserObject object;
+    unsigned int count;
+} cuUserObjectRetain_params;
+
+typedef struct cuUserObjectRelease_params_st {
+    CUuserObject object;
+    unsigned int count;
+} cuUserObjectRelease_params;
+
+typedef struct cuGraphRetainUserObject_params_st {
+    CUgraph graph;
+    CUuserObject object;
+    unsigned int count;
+    unsigned int flags;
+} cuGraphRetainUserObject_params;
+
+typedef struct cuGraphReleaseUserObject_params_st {
+    CUgraph graph;
+    CUuserObject object;
+    unsigned int count;
+} cuGraphReleaseUserObject_params;
+
+typedef struct cuGraphAddNode_params_st {
+    CUgraphNode *phGraphNode;
+    CUgraph hGraph;
+    const CUgraphNode *dependencies;
+    size_t numDependencies;
+    CUgraphNodeParams *nodeParams;
+} cuGraphAddNode_params;
+
+typedef struct cuGraphAddNode_v2_params_st {
+    CUgraphNode *phGraphNode;
+    CUgraph hGraph;
+    const CUgraphNode *dependencies;
+    const CUgraphEdgeData *dependencyData;
+    size_t numDependencies;
+    CUgraphNodeParams *nodeParams;
+} cuGraphAddNode_v2_params;
+
+typedef struct cuGraphNodeSetParams_params_st {
+    CUgraphNode hNode;
+    CUgraphNodeParams *nodeParams;
+} cuGraphNodeSetParams_params;
+
+typedef struct cuGraphExecNodeSetParams_params_st {
+    CUgraphExec hGraphExec;
+    CUgraphNode hNode;
+    CUgraphNodeParams *nodeParams;
+} cuGraphExecNodeSetParams_params;
+
+typedef struct cuGraphConditionalHandleCreate_params_st {
+    CUgraphConditionalHandle *pHandle_out;
+    CUgraph hGraph;
+    CUcontext ctx;
+    unsigned int defaultLaunchValue;
+    unsigned int flags;
+} cuGraphConditionalHandleCreate_params;
+
+typedef struct cuOccupancyMaxActiveBlocksPerMultiprocessor_params_st {
+    int *numBlocks;
+    CUfunction func;
+    int blockSize;
+    size_t dynamicSMemSize;
+} cuOccupancyMaxActiveBlocksPerMultiprocessor_params;
+
+typedef struct cuOccupancyMaxActiveBlocksPerMultiprocessorWithFlags_params_st {
+    int *numBlocks;
+    CUfunction func;
+    int blockSize;
+    size_t dynamicSMemSize;
+    unsigned int flags;
+} cuOccupancyMaxActiveBlocksPerMultiprocessorWithFlags_params;
+
+typedef struct cuOccupancyMaxPotentialBlockSize_params_st {
+    int *minGridSize;
+    int *blockSize;
+    CUfunction func;
+    CUoccupancyB2DSize blockSizeToDynamicSMemSize;
+    size_t dynamicSMemSize;
+    int blockSizeLimit;
+} cuOccupancyMaxPotentialBlockSize_params;
+
+typedef struct cuOccupancyMaxPotentialBlockSizeWithFlags_params_st {
+    int *minGridSize;
+    int *blockSize;
+    CUfunction func;
+    CUoccupancyB2DSize blockSizeToDynamicSMemSize;
+    size_t dynamicSMemSize;
+    int blockSizeLimit;
+    unsigned int flags;
+} cuOccupancyMaxPotentialBlockSizeWithFlags_params;
+
+typedef struct cuOccupancyAvailableDynamicSMemPerBlock_params_st {
+    size_t *dynamicSmemSize;
+    CUfunction func;
+    int numBlocks;
+    int blockSize;
+} cuOccupancyAvailableDynamicSMemPerBlock_params;
+
+typedef struct cuOccupancyMaxPotentialClusterSize_params_st {
+    int *clusterSize;
+    CUfunction func;
+    const CUlaunchConfig *config;
+} cuOccupancyMaxPotentialClusterSize_params;
+
+typedef struct cuOccupancyMaxActiveClusters_params_st {
+    int *numClusters;
+    CUfunction func;
+    const CUlaunchConfig *config;
+} cuOccupancyMaxActiveClusters_params;
+
+typedef struct cuTexRefSetArray_params_st {
+    CUtexref hTexRef;
+    CUarray hArray;
+    unsigned int Flags;
+} cuTexRefSetArray_params;
+
+typedef struct cuTexRefSetMipmappedArray_params_st {
+    CUtexref hTexRef;
+    CUmipmappedArray hMipmappedArray;
+    unsigned int Flags;
+} cuTexRefSetMipmappedArray_params;
+
+typedef struct cuTexRefSetAddress_v2_params_st {
+    size_t *ByteOffset;
+    CUtexref hTexRef;
+    CUdeviceptr dptr;
+    size_t bytes;
+} cuTexRefSetAddress_v2_params;
+
+typedef struct cuTexRefSetAddress2D_v3_params_st {
+    CUtexref hTexRef;
+    const CUDA_ARRAY_DESCRIPTOR *desc;
+    CUdeviceptr dptr;
+    size_t Pitch;
+} cuTexRefSetAddress2D_v3_params;
+
+typedef struct cuTexRefSetFormat_params_st {
+    CUtexref hTexRef;
+    CUarray_format fmt;
+    int NumPackedComponents;
+} cuTexRefSetFormat_params;
+
+typedef struct cuTexRefSetAddressMode_params_st {
+    CUtexref hTexRef;
+    int dim;
+    CUaddress_mode am;
+} cuTexRefSetAddressMode_params;
+
+typedef struct cuTexRefSetFilterMode_params_st {
+    CUtexref hTexRef;
+    CUfilter_mode fm;
+} cuTexRefSetFilterMode_params;
+
+typedef struct cuTexRefSetMipmapFilterMode_params_st {
+    CUtexref hTexRef;
+    CUfilter_mode fm;
+} cuTexRefSetMipmapFilterMode_params;
+
+typedef struct cuTexRefSetMipmapLevelBias_params_st {
+    CUtexref hTexRef;
+    float bias;
+} cuTexRefSetMipmapLevelBias_params;
+
+typedef struct cuTexRefSetMipmapLevelClamp_params_st {
+    CUtexref hTexRef;
+    float minMipmapLevelClamp;
+    float maxMipmapLevelClamp;
+} cuTexRefSetMipmapLevelClamp_params;
+
+typedef struct cuTexRefSetMaxAnisotropy_params_st {
+    CUtexref hTexRef;
+    unsigned int maxAniso;
+} cuTexRefSetMaxAnisotropy_params;
+
+typedef struct cuTexRefSetBorderColor_params_st {
+    CUtexref hTexRef;
+    float *pBorderColor;
+} cuTexRefSetBorderColor_params;
+
+typedef struct cuTexRefSetFlags_params_st {
+    CUtexref hTexRef;
+    unsigned int Flags;
+} cuTexRefSetFlags_params;
+
+typedef struct cuTexRefGetAddress_v2_params_st {
+    CUdeviceptr *pdptr;
+    CUtexref hTexRef;
+} cuTexRefGetAddress_v2_params;
+
+typedef struct cuTexRefGetArray_params_st {
+    CUarray *phArray;
+    CUtexref hTexRef;
+} cuTexRefGetArray_params;
+
+typedef struct cuTexRefGetMipmappedArray_params_st {
+    CUmipmappedArray *phMipmappedArray;
+    CUtexref hTexRef;
+} cuTexRefGetMipmappedArray_params;
+
+typedef struct cuTexRefGetAddressMode_params_st {
+    CUaddress_mode *pam;
+    CUtexref hTexRef;
+    int dim;
+} cuTexRefGetAddressMode_params;
+
+typedef struct cuTexRefGetFilterMode_params_st {
+    CUfilter_mode *pfm;
+    CUtexref hTexRef;
+} cuTexRefGetFilterMode_params;
+
+typedef struct cuTexRefGetFormat_params_st {
+    CUarray_format *pFormat;
+    int *pNumChannels;
+    CUtexref hTexRef;
+} cuTexRefGetFormat_params;
+
+typedef struct cuTexRefGetMipmapFilterMode_params_st {
+    CUfilter_mode *pfm;
+    CUtexref hTexRef;
+} cuTexRefGetMipmapFilterMode_params;
+
+typedef struct cuTexRefGetMipmapLevelBias_params_st {
+    float *pbias;
+    CUtexref hTexRef;
+} cuTexRefGetMipmapLevelBias_params;
+
+typedef struct cuTexRefGetMipmapLevelClamp_params_st {
+    float *pminMipmapLevelClamp;
+    float *pmaxMipmapLevelClamp;
+    CUtexref hTexRef;
+} cuTexRefGetMipmapLevelClamp_params;
+
+typedef struct cuTexRefGetMaxAnisotropy_params_st {
+    int *pmaxAniso;
+    CUtexref hTexRef;
+} cuTexRefGetMaxAnisotropy_params;
+
+typedef struct cuTexRefGetBorderColor_params_st {
+    float *pBorderColor;
+    CUtexref hTexRef;
+} cuTexRefGetBorderColor_params;
+
+typedef struct cuTexRefGetFlags_params_st {
+    unsigned int *pFlags;
+    CUtexref hTexRef;
+} cuTexRefGetFlags_params;
+
+typedef struct cuTexRefCreate_params_st {
+    CUtexref *pTexRef;
+} cuTexRefCreate_params;
+
+typedef struct cuTexRefDestroy_params_st {
+    CUtexref hTexRef;
+} cuTexRefDestroy_params;
+
+typedef struct cuSurfRefSetArray_params_st {
+    CUsurfref hSurfRef;
+    CUarray hArray;
+    unsigned int Flags;
+} cuSurfRefSetArray_params;
+
+typedef struct cuSurfRefGetArray_params_st {
+    CUarray *phArray;
+    CUsurfref hSurfRef;
+} cuSurfRefGetArray_params;
+
+typedef struct cuTexObjectCreate_params_st {
+    CUtexObject *pTexObject;
+    const CUDA_RESOURCE_DESC *pResDesc;
+    const CUDA_TEXTURE_DESC *pTexDesc;
+    const CUDA_RESOURCE_VIEW_DESC *pResViewDesc;
+} cuTexObjectCreate_params;
+
+typedef struct cuTexObjectDestroy_params_st {
+    CUtexObject texObject;
+} cuTexObjectDestroy_params;
+
+typedef struct cuTexObjectGetResourceDesc_params_st {
+    CUDA_RESOURCE_DESC *pResDesc;
+    CUtexObject texObject;
+} cuTexObjectGetResourceDesc_params;
+
+typedef struct cuTexObjectGetTextureDesc_params_st {
+    CUDA_TEXTURE_DESC *pTexDesc;
+    CUtexObject texObject;
+} cuTexObjectGetTextureDesc_params;
+
+typedef struct cuTexObjectGetResourceViewDesc_params_st {
+    CUDA_RESOURCE_VIEW_DESC *pResViewDesc;
+    CUtexObject texObject;
+} cuTexObjectGetResourceViewDesc_params;
+
+typedef struct cuSurfObjectCreate_params_st {
+    CUsurfObject *pSurfObject;
+    const CUDA_RESOURCE_DESC *pResDesc;
+} cuSurfObjectCreate_params;
+
+typedef struct cuSurfObjectDestroy_params_st {
+    CUsurfObject surfObject;
+} cuSurfObjectDestroy_params;
+
+typedef struct cuSurfObjectGetResourceDesc_params_st {
+    CUDA_RESOURCE_DESC *pResDesc;
+    CUsurfObject surfObject;
+} cuSurfObjectGetResourceDesc_params;
+
+typedef struct cuTensorMapEncodeTiled_params_st {
+    CUtensorMap *tensorMap;
+    CUtensorMapDataType tensorDataType;
+    cuuint32_t tensorRank;
+    void *globalAddress;
+    const cuuint64_t *globalDim;
+    const cuuint64_t *globalStrides;
+    const cuuint32_t *boxDim;
+    const cuuint32_t *elementStrides;
+    CUtensorMapInterleave interleave;
+    CUtensorMapSwizzle swizzle;
+    CUtensorMapL2promotion l2Promotion;
+    CUtensorMapFloatOOBfill oobFill;
+} cuTensorMapEncodeTiled_params;
+
+typedef struct cuTensorMapEncodeIm2col_params_st {
+    CUtensorMap *tensorMap;
+    CUtensorMapDataType tensorDataType;
+    cuuint32_t tensorRank;
+    void *globalAddress;
+    const cuuint64_t *globalDim;
+    const cuuint64_t *globalStrides;
+    const int *pixelBoxLowerCorner;
+    const int *pixelBoxUpperCorner;
+    cuuint32_t channelsPerPixel;
+    cuuint32_t pixelsPerColumn;
+    const cuuint32_t *elementStrides;
+    CUtensorMapInterleave interleave;
+    CUtensorMapSwizzle swizzle;
+    CUtensorMapL2promotion l2Promotion;
+    CUtensorMapFloatOOBfill oobFill;
+} cuTensorMapEncodeIm2col_params;
+
+typedef struct cuTensorMapReplaceAddress_params_st {
+    CUtensorMap *tensorMap;
+    void *globalAddress;
+} cuTensorMapReplaceAddress_params;
+
+typedef struct cuDeviceCanAccessPeer_params_st {
+    int *canAccessPeer;
+    CUdevice dev;
+    CUdevice peerDev;
+} cuDeviceCanAccessPeer_params;
+
+typedef struct cuCtxEnablePeerAccess_params_st {
+    CUcontext peerContext;
+    unsigned int Flags;
+} cuCtxEnablePeerAccess_params;
+
+typedef struct cuCtxDisablePeerAccess_params_st {
+    CUcontext peerContext;
+} cuCtxDisablePeerAccess_params;
+
+typedef struct cuDeviceGetP2PAttribute_params_st {
+    int *value;
+    CUdevice_P2PAttribute attrib;
+    CUdevice srcDevice;
+    CUdevice dstDevice;
+} cuDeviceGetP2PAttribute_params;
+
+typedef struct cuGraphicsUnregisterResource_params_st {
+    CUgraphicsResource resource;
+} cuGraphicsUnregisterResource_params;
+
+typedef struct cuGraphicsSubResourceGetMappedArray_params_st {
+    CUarray *pArray;
+    CUgraphicsResource resource;
+    unsigned int arrayIndex;
+    unsigned int mipLevel;
+} cuGraphicsSubResourceGetMappedArray_params;
+
+typedef struct cuGraphicsResourceGetMappedMipmappedArray_params_st {
+    CUmipmappedArray *pMipmappedArray;
+    CUgraphicsResource resource;
+} cuGraphicsResourceGetMappedMipmappedArray_params;
+
+typedef struct cuGraphicsResourceGetMappedPointer_v2_params_st {
+    CUdeviceptr *pDevPtr;
+    size_t *pSize;
+    CUgraphicsResource resource;
+} cuGraphicsResourceGetMappedPointer_v2_params;
+
+typedef struct cuGraphicsResourceSetMapFlags_v2_params_st {
+    CUgraphicsResource resource;
+    unsigned int flags;
+} cuGraphicsResourceSetMapFlags_v2_params;
+
+typedef struct cuGraphicsMapResources_ptsz_params_st {
+    unsigned int count;
+    CUgraphicsResource *resources;
+    CUstream hStream;
+} cuGraphicsMapResources_ptsz_params;
+
+typedef struct cuGraphicsUnmapResources_ptsz_params_st {
+    unsigned int count;
+    CUgraphicsResource *resources;
+    CUstream hStream;
+} cuGraphicsUnmapResources_ptsz_params;
+
+typedef struct cuGetProcAddress_v2_params_st {
+    const char *symbol;
+    void **pfn;
+    int cudaVersion;
+    cuuint64_t flags;
+    CUdriverProcAddressQueryResult *symbolStatus;
+} cuGetProcAddress_v2_params;
+
+typedef struct cuCoredumpGetAttribute_params_st {
+    CUcoredumpSettings attrib;
+    void *value;
+    size_t *size;
+} cuCoredumpGetAttribute_params;
+
+typedef struct cuCoredumpGetAttributeGlobal_params_st {
+    CUcoredumpSettings attrib;
+    void *value;
+    size_t *size;
+} cuCoredumpGetAttributeGlobal_params;
+
+typedef struct cuCoredumpSetAttribute_params_st {
+    CUcoredumpSettings attrib;
+    void *value;
+    size_t *size;
+} cuCoredumpSetAttribute_params;
+
+typedef struct cuCoredumpSetAttributeGlobal_params_st {
+    CUcoredumpSettings attrib;
+    void *value;
+    size_t *size;
+} cuCoredumpSetAttributeGlobal_params;
+
+typedef struct cuGetExportTable_params_st {
+    const void **ppExportTable;
+    const CUuuid *pExportTableId;
+} cuGetExportTable_params;
+
+typedef struct cuGreenCtxCreate_params_st {
+    CUgreenCtx *phCtx;
+    CUdevResourceDesc desc;
+    CUdevice dev;
+    unsigned int flags;
+} cuGreenCtxCreate_params;
+
+typedef struct cuGreenCtxDestroy_params_st {
+    CUgreenCtx hCtx;
+} cuGreenCtxDestroy_params;
+
+typedef struct cuCtxFromGreenCtx_params_st {
+    CUcontext *pContext;
+    CUgreenCtx hCtx;
+} cuCtxFromGreenCtx_params;
+
+typedef struct cuDeviceGetDevResource_params_st {
+    CUdevice device;
+    CUdevResource *resource;
+    CUdevResourceType type;
+} cuDeviceGetDevResource_params;
+
+typedef struct cuCtxGetDevResource_params_st {
+    CUcontext hCtx;
+    CUdevResource *resource;
+    CUdevResourceType type;
+} cuCtxGetDevResource_params;
+
+typedef struct cuGreenCtxGetDevResource_params_st {
+    CUgreenCtx hCtx;
+    CUdevResource *resource;
+    CUdevResourceType type;
+} cuGreenCtxGetDevResource_params;
+
+typedef struct cuDevSmResourceSplitByCount_params_st {
+    CUdevResource *result;
+    unsigned int *nbGroups;
+    const CUdevResource *input;
+    CUdevResource *remaining;
+    unsigned int useFlags;
+    unsigned int minCount;
+} cuDevSmResourceSplitByCount_params;
+
+typedef struct cuDevResourceGenerateDesc_params_st {
+    CUdevResourceDesc *phDesc;
+    CUdevResource *resources;
+    unsigned int nbResources;
+} cuDevResourceGenerateDesc_params;
+
+typedef struct cuGreenCtxRecordEvent_params_st {
+    CUgreenCtx hCtx;
+    CUevent hEvent;
+} cuGreenCtxRecordEvent_params;
+
+typedef struct cuGreenCtxWaitEvent_params_st {
+    CUgreenCtx hCtx;
+    CUevent hEvent;
+} cuGreenCtxWaitEvent_params;
+
+typedef struct cuStreamGetGreenCtx_params_st {
+    CUstream hStream;
+    CUgreenCtx *phCtx;
+} cuStreamGetGreenCtx_params;
+
+typedef struct cuMemHostRegister_params_st {
+    void *p;
+    size_t bytesize;
+    unsigned int Flags;
+} cuMemHostRegister_params;
+
+typedef struct cuGraphicsResourceSetMapFlags_params_st {
+    CUgraphicsResource resource;
+    unsigned int flags;
+} cuGraphicsResourceSetMapFlags_params;
+
+typedef struct cuLinkCreate_params_st {
+    unsigned int numOptions;
+    CUjit_option *options;
+    void **optionValues;
+    CUlinkState *stateOut;
+} cuLinkCreate_params;
+
+typedef struct cuLinkAddData_params_st {
+    CUlinkState state;
+    CUjitInputType type;
+    void *data;
+    size_t size;
+    const char *name;
+    unsigned int numOptions;
+    CUjit_option *options;
+    void **optionValues;
+} cuLinkAddData_params;
+
+typedef struct cuLinkAddFile_params_st {
+    CUlinkState state;
+    CUjitInputType type;
+    const char *path;
+    unsigned int numOptions;
+    CUjit_option *options;
+    void **optionValues;
+} cuLinkAddFile_params;
+
+typedef struct cuTexRefSetAddress2D_v2_params_st {
+    CUtexref hTexRef;
+    const CUDA_ARRAY_DESCRIPTOR *desc;
+    CUdeviceptr dptr;
+    size_t Pitch;
+} cuTexRefSetAddress2D_v2_params;
+
+typedef struct cuDeviceTotalMem_params_st {
+    unsigned int *bytes;
+    CUdevice dev;
+} cuDeviceTotalMem_params;
+
+typedef struct cuCtxCreate_params_st {
+    CUcontext *pctx;
+    unsigned int flags;
+    CUdevice dev;
+} cuCtxCreate_params;
+
+typedef struct cuModuleGetGlobal_params_st {
+    CUdeviceptr_v1 *dptr;
+    unsigned int *bytes;
+    CUmodule hmod;
+    const char *name;
+} cuModuleGetGlobal_params;
+
+typedef struct cuMemGetInfo_params_st {
+    unsigned int *free;
+    unsigned int *total;
+} cuMemGetInfo_params;
+
+typedef struct cuMemAlloc_params_st {
+    CUdeviceptr_v1 *dptr;
+    unsigned int bytesize;
+} cuMemAlloc_params;
+
+typedef struct cuMemAllocPitch_params_st {
+    CUdeviceptr_v1 *dptr;
+    unsigned int *pPitch;
+    unsigned int WidthInBytes;
+    unsigned int Height;
+    unsigned int ElementSizeBytes;
+} cuMemAllocPitch_params;
+
+typedef struct cuMemFree_params_st {
+    CUdeviceptr_v1 dptr;
+} cuMemFree_params;
+
+typedef struct cuMemGetAddressRange_params_st {
+    CUdeviceptr_v1 *pbase;
+    unsigned int *psize;
+    CUdeviceptr_v1 dptr;
+} cuMemGetAddressRange_params;
+
+typedef struct cuMemAllocHost_params_st {
+    void **pp;
+    unsigned int bytesize;
+} cuMemAllocHost_params;
+
+typedef struct cuMemHostGetDevicePointer_params_st {
+    CUdeviceptr_v1 *pdptr;
+    void *p;
+    unsigned int Flags;
+} cuMemHostGetDevicePointer_params;
+
+typedef struct cuMemcpyHtoD_params_st {
+    CUdeviceptr_v1 dstDevice;
+    const void *srcHost;
+    unsigned int ByteCount;
+} cuMemcpyHtoD_params;
+
+typedef struct cuMemcpyDtoH_params_st {
+    void *dstHost;
+    CUdeviceptr_v1 srcDevice;
+    unsigned int ByteCount;
+} cuMemcpyDtoH_params;
+
+typedef struct cuMemcpyDtoD_params_st {
+    CUdeviceptr_v1 dstDevice;
+    CUdeviceptr_v1 srcDevice;
+    unsigned int ByteCount;
+} cuMemcpyDtoD_params;
+
+typedef struct cuMemcpyDtoA_params_st {
+    CUarray dstArray;
+    unsigned int dstOffset;
+    CUdeviceptr_v1 srcDevice;
+    unsigned int ByteCount;
+} cuMemcpyDtoA_params;
+
+typedef struct cuMemcpyAtoD_params_st {
+    CUdeviceptr_v1 dstDevice;
+    CUarray srcArray;
+    unsigned int srcOffset;
+    unsigned int ByteCount;
+} cuMemcpyAtoD_params;
+
+typedef struct cuMemcpyHtoA_params_st {
+    CUarray dstArray;
+    unsigned int dstOffset;
+    const void *srcHost;
+    unsigned int ByteCount;
+} cuMemcpyHtoA_params;
+
+typedef struct cuMemcpyAtoH_params_st {
+    void *dstHost;
+    CUarray srcArray;
+    unsigned int srcOffset;
+    unsigned int ByteCount;
+} cuMemcpyAtoH_params;
+
+typedef struct cuMemcpyAtoA_params_st {
+    CUarray dstArray;
+    unsigned int dstOffset;
+    CUarray srcArray;
+    unsigned int srcOffset;
+    unsigned int ByteCount;
+} cuMemcpyAtoA_params;
+
+typedef struct cuMemcpyHtoAAsync_params_st {
+    CUarray dstArray;
+    unsigned int dstOffset;
+    const void *srcHost;
+    unsigned int ByteCount;
+    CUstream hStream;
+} cuMemcpyHtoAAsync_params;
+
+typedef struct cuMemcpyAtoHAsync_params_st {
+    void *dstHost;
+    CUarray srcArray;
+    unsigned int srcOffset;
+    unsigned int ByteCount;
+    CUstream hStream;
+} cuMemcpyAtoHAsync_params;
+
+typedef struct cuMemcpy2D_params_st {
+    const CUDA_MEMCPY2D_v1 *pCopy;
+} cuMemcpy2D_params;
+
+typedef struct cuMemcpy2DUnaligned_params_st {
+    const CUDA_MEMCPY2D_v1 *pCopy;
+} cuMemcpy2DUnaligned_params;
+
+typedef struct cuMemcpy3D_params_st {
+    const CUDA_MEMCPY3D_v1 *pCopy;
+} cuMemcpy3D_params;
+
+typedef struct cuMemcpyHtoDAsync_params_st {
+    CUdeviceptr_v1 dstDevice;
+    const void *srcHost;
+    unsigned int ByteCount;
+    CUstream hStream;
+} cuMemcpyHtoDAsync_params;
+
+typedef struct cuMemcpyDtoHAsync_params_st {
+    void *dstHost;
+    CUdeviceptr_v1 srcDevice;
+    unsigned int ByteCount;
+    CUstream hStream;
+} cuMemcpyDtoHAsync_params;
+
+typedef struct cuMemcpyDtoDAsync_params_st {
+    CUdeviceptr_v1 dstDevice;
+    CUdeviceptr_v1 srcDevice;
+    unsigned int ByteCount;
+    CUstream hStream;
+} cuMemcpyDtoDAsync_params;
+
+typedef struct cuMemcpy2DAsync_params_st {
+    const CUDA_MEMCPY2D_v1 *pCopy;
+    CUstream hStream;
+} cuMemcpy2DAsync_params;
+
+typedef struct cuMemcpy3DAsync_params_st {
+    const CUDA_MEMCPY3D_v1 *pCopy;
+    CUstream hStream;
+} cuMemcpy3DAsync_params;
+
+typedef struct cuMemsetD8_params_st {
+    CUdeviceptr_v1 dstDevice;
+    unsigned char uc;
+    unsigned int N;
+} cuMemsetD8_params;
+
+typedef struct cuMemsetD16_params_st {
+    CUdeviceptr_v1 dstDevice;
+    unsigned short us;
+    unsigned int N;
+} cuMemsetD16_params;
+
+typedef struct cuMemsetD32_params_st {
+    CUdeviceptr_v1 dstDevice;
+    unsigned int ui;
+    unsigned int N;
+} cuMemsetD32_params;
+
+typedef struct cuMemsetD2D8_params_st {
+    CUdeviceptr_v1 dstDevice;
+    unsigned int dstPitch;
+    unsigned char uc;
+    unsigned int Width;
+    unsigned int Height;
+} cuMemsetD2D8_params;
+
+typedef struct cuMemsetD2D16_params_st {
+    CUdeviceptr_v1 dstDevice;
+    unsigned int dstPitch;
+    unsigned short us;
+    unsigned int Width;
+    unsigned int Height;
+} cuMemsetD2D16_params;
+
+typedef struct cuMemsetD2D32_params_st {
+    CUdeviceptr_v1 dstDevice;
+    unsigned int dstPitch;
+    unsigned int ui;
+    unsigned int Width;
+    unsigned int Height;
+} cuMemsetD2D32_params;
+
+typedef struct cuArrayCreate_params_st {
+    CUarray *pHandle;
+    const CUDA_ARRAY_DESCRIPTOR_v1 *pAllocateArray;
+} cuArrayCreate_params;
+
+typedef struct cuArrayGetDescriptor_params_st {
+    CUDA_ARRAY_DESCRIPTOR_v1 *pArrayDescriptor;
+    CUarray hArray;
+} cuArrayGetDescriptor_params;
+
+typedef struct cuArray3DCreate_params_st {
+    CUarray *pHandle;
+    const CUDA_ARRAY3D_DESCRIPTOR_v1 *pAllocateArray;
+} cuArray3DCreate_params;
+
+typedef struct cuArray3DGetDescriptor_params_st {
+    CUDA_ARRAY3D_DESCRIPTOR_v1 *pArrayDescriptor;
+    CUarray hArray;
+} cuArray3DGetDescriptor_params;
+
+typedef struct cuTexRefSetAddress_params_st {
+    unsigned int *ByteOffset;
+    CUtexref hTexRef;
+    CUdeviceptr_v1 dptr;
+    unsigned int bytes;
+} cuTexRefSetAddress_params;
+
+typedef struct cuTexRefSetAddress2D_params_st {
+    CUtexref hTexRef;
+    const CUDA_ARRAY_DESCRIPTOR_v1 *desc;
+    CUdeviceptr_v1 dptr;
+    unsigned int Pitch;
+} cuTexRefSetAddress2D_params;
+
+typedef struct cuTexRefGetAddress_params_st {
+    CUdeviceptr_v1 *pdptr;
+    CUtexref hTexRef;
+} cuTexRefGetAddress_params;
+
+typedef struct cuGraphicsResourceGetMappedPointer_params_st {
+    CUdeviceptr_v1 *pDevPtr;
+    unsigned int *pSize;
+    CUgraphicsResource resource;
+} cuGraphicsResourceGetMappedPointer_params;
+
+typedef struct cuCtxDestroy_params_st {
+    CUcontext ctx;
+} cuCtxDestroy_params;
+
+typedef struct cuCtxPopCurrent_params_st {
+    CUcontext *pctx;
+} cuCtxPopCurrent_params;
+
+typedef struct cuCtxPushCurrent_params_st {
+    CUcontext ctx;
+} cuCtxPushCurrent_params;
+
+typedef struct cuStreamDestroy_params_st {
+    CUstream hStream;
+} cuStreamDestroy_params;
+
+typedef struct cuEventDestroy_params_st {
+    CUevent hEvent;
+} cuEventDestroy_params;
+
+typedef struct cuDevicePrimaryCtxRelease_params_st {
+    CUdevice dev;
+} cuDevicePrimaryCtxRelease_params;
+
+typedef struct cuDevicePrimaryCtxReset_params_st {
+    CUdevice dev;
+} cuDevicePrimaryCtxReset_params;
+
+typedef struct cuDevicePrimaryCtxSetFlags_params_st {
+    CUdevice dev;
+    unsigned int flags;
+} cuDevicePrimaryCtxSetFlags_params;
+
+typedef struct cuMemcpyHtoD_v2_params_st {
+    CUdeviceptr dstDevice;
+    const void *srcHost;
+    size_t ByteCount;
+} cuMemcpyHtoD_v2_params;
+
+typedef struct cuMemcpyDtoH_v2_params_st {
+    void *dstHost;
+    CUdeviceptr srcDevice;
+    size_t ByteCount;
+} cuMemcpyDtoH_v2_params;
+
+typedef struct cuMemcpyDtoD_v2_params_st {
+    CUdeviceptr dstDevice;
+    CUdeviceptr srcDevice;
+    size_t ByteCount;
+} cuMemcpyDtoD_v2_params;
+
+typedef struct cuMemcpyDtoA_v2_params_st {
+    CUarray dstArray;
+    size_t dstOffset;
+    CUdeviceptr srcDevice;
+    size_t ByteCount;
+} cuMemcpyDtoA_v2_params;
+
+typedef struct cuMemcpyAtoD_v2_params_st {
+    CUdeviceptr dstDevice;
+    CUarray srcArray;
+    size_t srcOffset;
+    size_t ByteCount;
+} cuMemcpyAtoD_v2_params;
+
+typedef struct cuMemcpyHtoA_v2_params_st {
+    CUarray dstArray;
+    size_t dstOffset;
+    const void *srcHost;
+    size_t ByteCount;
+} cuMemcpyHtoA_v2_params;
+
+typedef struct cuMemcpyAtoH_v2_params_st {
+    void *dstHost;
+    CUarray srcArray;
+    size_t srcOffset;
+    size_t ByteCount;
+} cuMemcpyAtoH_v2_params;
+
+typedef struct cuMemcpyAtoA_v2_params_st {
+    CUarray dstArray;
+    size_t dstOffset;
+    CUarray srcArray;
+    size_t srcOffset;
+    size_t ByteCount;
+} cuMemcpyAtoA_v2_params;
+
+typedef struct cuMemcpyHtoAAsync_v2_params_st {
+    CUarray dstArray;
+    size_t dstOffset;
+    const void *srcHost;
+    size_t ByteCount;
+    CUstream hStream;
+} cuMemcpyHtoAAsync_v2_params;
+
+typedef struct cuMemcpyAtoHAsync_v2_params_st {
+    void *dstHost;
+    CUarray srcArray;
+    size_t srcOffset;
+    size_t ByteCount;
+    CUstream hStream;
+} cuMemcpyAtoHAsync_v2_params;
+
+typedef struct cuMemcpy2D_v2_params_st {
+    const CUDA_MEMCPY2D *pCopy;
+} cuMemcpy2D_v2_params;
+
+typedef struct cuMemcpy2DUnaligned_v2_params_st {
+    const CUDA_MEMCPY2D *pCopy;
+} cuMemcpy2DUnaligned_v2_params;
+
+typedef struct cuMemcpy3D_v2_params_st {
+    const CUDA_MEMCPY3D *pCopy;
+} cuMemcpy3D_v2_params;
+
+typedef struct cuMemcpyHtoDAsync_v2_params_st {
+    CUdeviceptr dstDevice;
+    const void *srcHost;
+    size_t ByteCount;
+    CUstream hStream;
+} cuMemcpyHtoDAsync_v2_params;
+
+typedef struct cuMemcpyDtoHAsync_v2_params_st {
+    void *dstHost;
+    CUdeviceptr srcDevice;
+    size_t ByteCount;
+    CUstream hStream;
+} cuMemcpyDtoHAsync_v2_params;
+
+typedef struct cuMemcpyDtoDAsync_v2_params_st {
+    CUdeviceptr dstDevice;
+    CUdeviceptr srcDevice;
+    size_t ByteCount;
+    CUstream hStream;
+} cuMemcpyDtoDAsync_v2_params;
+
+typedef struct cuMemcpy2DAsync_v2_params_st {
+    const CUDA_MEMCPY2D *pCopy;
+    CUstream hStream;
+} cuMemcpy2DAsync_v2_params;
+
+typedef struct cuMemcpy3DAsync_v2_params_st {
+    const CUDA_MEMCPY3D *pCopy;
+    CUstream hStream;
+} cuMemcpy3DAsync_v2_params;
+
+typedef struct cuMemsetD8_v2_params_st {
+    CUdeviceptr dstDevice;
+    unsigned char uc;
+    size_t N;
+} cuMemsetD8_v2_params;
+
+typedef struct cuMemsetD16_v2_params_st {
+    CUdeviceptr dstDevice;
+    unsigned short us;
+    size_t N;
+} cuMemsetD16_v2_params;
+
+typedef struct cuMemsetD32_v2_params_st {
+    CUdeviceptr dstDevice;
+    unsigned int ui;
+    size_t N;
+} cuMemsetD32_v2_params;
+
+typedef struct cuMemsetD2D8_v2_params_st {
+    CUdeviceptr dstDevice;
+    size_t dstPitch;
+    unsigned char uc;
+    size_t Width;
+    size_t Height;
+} cuMemsetD2D8_v2_params;
+
+typedef struct cuMemsetD2D16_v2_params_st {
+    CUdeviceptr dstDevice;
+    size_t dstPitch;
+    unsigned short us;
+    size_t Width;
+    size_t Height;
+} cuMemsetD2D16_v2_params;
+
+typedef struct cuMemsetD2D32_v2_params_st {
+    CUdeviceptr dstDevice;
+    size_t dstPitch;
+    unsigned int ui;
+    size_t Width;
+    size_t Height;
+} cuMemsetD2D32_v2_params;
+
+typedef struct cuMemcpy_params_st {
+    CUdeviceptr dst;
+    CUdeviceptr src;
+    size_t ByteCount;
+} cuMemcpy_params;
+
+typedef struct cuMemcpyAsync_params_st {
+    CUdeviceptr dst;
+    CUdeviceptr src;
+    size_t ByteCount;
+    CUstream hStream;
+} cuMemcpyAsync_params;
+
+typedef struct cuMemcpyPeer_params_st {
+    CUdeviceptr dstDevice;
+    CUcontext dstContext;
+    CUdeviceptr srcDevice;
+    CUcontext srcContext;
+    size_t ByteCount;
+} cuMemcpyPeer_params;
+
+typedef struct cuMemcpyPeerAsync_params_st {
+    CUdeviceptr dstDevice;
+    CUcontext dstContext;
+    CUdeviceptr srcDevice;
+    CUcontext srcContext;
+    size_t ByteCount;
+    CUstream hStream;
+} cuMemcpyPeerAsync_params;
+
+typedef struct cuMemcpy3DPeer_params_st {
+    const CUDA_MEMCPY3D_PEER *pCopy;
+} cuMemcpy3DPeer_params;
+
+typedef struct cuMemcpy3DPeerAsync_params_st {
+    const CUDA_MEMCPY3D_PEER *pCopy;
+    CUstream hStream;
+} cuMemcpy3DPeerAsync_params;
+
+typedef struct cuMemsetD8Async_params_st {
+    CUdeviceptr dstDevice;
+    unsigned char uc;
+    size_t N;
+    CUstream hStream;
+} cuMemsetD8Async_params;
+
+typedef struct cuMemsetD16Async_params_st {
+    CUdeviceptr dstDevice;
+    unsigned short us;
+    size_t N;
+    CUstream hStream;
+} cuMemsetD16Async_params;
+
+typedef struct cuMemsetD32Async_params_st {
+    CUdeviceptr dstDevice;
+    unsigned int ui;
+    size_t N;
+    CUstream hStream;
+} cuMemsetD32Async_params;
+
+typedef struct cuMemsetD2D8Async_params_st {
+    CUdeviceptr dstDevice;
+    size_t dstPitch;
+    unsigned char uc;
+    size_t Width;
+    size_t Height;
+    CUstream hStream;
+} cuMemsetD2D8Async_params;
+
+typedef struct cuMemsetD2D16Async_params_st {
+    CUdeviceptr dstDevice;
+    size_t dstPitch;
+    unsigned short us;
+    size_t Width;
+    size_t Height;
+    CUstream hStream;
+} cuMemsetD2D16Async_params;
+
+typedef struct cuMemsetD2D32Async_params_st {
+    CUdeviceptr dstDevice;
+    size_t dstPitch;
+    unsigned int ui;
+    size_t Width;
+    size_t Height;
+    CUstream hStream;
+} cuMemsetD2D32Async_params;
+
+typedef struct cuStreamGetPriority_params_st {
+    CUstream hStream;
+    int *priority;
+} cuStreamGetPriority_params;
+
+typedef struct cuStreamGetId_params_st {
+    CUstream hStream;
+    unsigned long long *streamId;
+} cuStreamGetId_params;
+
+typedef struct cuStreamGetFlags_params_st {
+    CUstream hStream;
+    unsigned int *flags;
+} cuStreamGetFlags_params;
+
+typedef struct cuStreamGetCtx_params_st {
+    CUstream hStream;
+    CUcontext *pctx;
+} cuStreamGetCtx_params;
+
+typedef struct cuStreamWaitEvent_params_st {
+    CUstream hStream;
+    CUevent hEvent;
+    unsigned int Flags;
+} cuStreamWaitEvent_params;
+
+typedef struct cuStreamAddCallback_params_st {
+    CUstream hStream;
+    CUstreamCallback callback;
+    void *userData;
+    unsigned int flags;
+} cuStreamAddCallback_params;
+
+typedef struct cuStreamAttachMemAsync_params_st {
+    CUstream hStream;
+    CUdeviceptr dptr;
+    size_t length;
+    unsigned int flags;
+} cuStreamAttachMemAsync_params;
+
+typedef struct cuStreamQuery_params_st {
+    CUstream hStream;
+} cuStreamQuery_params;
+
+typedef struct cuStreamSynchronize_params_st {
+    CUstream hStream;
+} cuStreamSynchronize_params;
+
+typedef struct cuEventRecord_params_st {
+    CUevent hEvent;
+    CUstream hStream;
+} cuEventRecord_params;
+
+typedef struct cuEventRecordWithFlags_params_st {
+    CUevent hEvent;
+    CUstream hStream;
+    unsigned int flags;
+} cuEventRecordWithFlags_params;
+
+typedef struct cuLaunchKernel_params_st {
+    CUfunction f;
+    unsigned int gridDimX;
+    unsigned int gridDimY;
+    unsigned int gridDimZ;
+    unsigned int blockDimX;
+    unsigned int blockDimY;
+    unsigned int blockDimZ;
+    unsigned int sharedMemBytes;
+    CUstream hStream;
+    void **kernelParams;
+    void **extra;
+} cuLaunchKernel_params;
+
+typedef struct cuLaunchKernelEx_params_st {
+    const CUlaunchConfig *config;
+    CUfunction f;
+    void **kernelParams;
+    void **extra;
+} cuLaunchKernelEx_params;
+
+typedef struct cuLaunchHostFunc_params_st {
+    CUstream hStream;
+    CUhostFn fn;
+    void *userData;
+} cuLaunchHostFunc_params;
+
+typedef struct cuGraphicsMapResources_params_st {
+    unsigned int count;
+    CUgraphicsResource *resources;
+    CUstream hStream;
+} cuGraphicsMapResources_params;
+
+typedef struct cuGraphicsUnmapResources_params_st {
+    unsigned int count;
+    CUgraphicsResource *resources;
+    CUstream hStream;
+} cuGraphicsUnmapResources_params;
+
+typedef struct cuStreamWriteValue32_params_st {
+    CUstream stream;
+    CUdeviceptr addr;
+    cuuint32_t value;
+    unsigned int flags;
+} cuStreamWriteValue32_params;
+
+typedef struct cuStreamWaitValue32_params_st {
+    CUstream stream;
+    CUdeviceptr addr;
+    cuuint32_t value;
+    unsigned int flags;
+} cuStreamWaitValue32_params;
+
+typedef struct cuStreamWriteValue64_params_st {
+    CUstream stream;
+    CUdeviceptr addr;
+    cuuint64_t value;
+    unsigned int flags;
+} cuStreamWriteValue64_params;
+
+typedef struct cuStreamWaitValue64_params_st {
+    CUstream stream;
+    CUdeviceptr addr;
+    cuuint64_t value;
+    unsigned int flags;
+} cuStreamWaitValue64_params;
+
+typedef struct cuStreamBatchMemOp_params_st {
+    CUstream stream;
+    unsigned int count;
+    CUstreamBatchMemOpParams *paramArray;
+    unsigned int flags;
+} cuStreamBatchMemOp_params;
+
+typedef struct cuStreamWriteValue32_ptsz_params_st {
+    CUstream stream;
+    CUdeviceptr addr;
+    cuuint32_t value;
+    unsigned int flags;
+} cuStreamWriteValue32_ptsz_params;
+
+typedef struct cuStreamWaitValue32_ptsz_params_st {
+    CUstream stream;
+    CUdeviceptr addr;
+    cuuint32_t value;
+    unsigned int flags;
+} cuStreamWaitValue32_ptsz_params;
+
+typedef struct cuStreamWriteValue64_ptsz_params_st {
+    CUstream stream;
+    CUdeviceptr addr;
+    cuuint64_t value;
+    unsigned int flags;
+} cuStreamWriteValue64_ptsz_params;
+
+typedef struct cuStreamWaitValue64_ptsz_params_st {
+    CUstream stream;
+    CUdeviceptr addr;
+    cuuint64_t value;
+    unsigned int flags;
+} cuStreamWaitValue64_ptsz_params;
+
+typedef struct cuStreamBatchMemOp_ptsz_params_st {
+    CUstream stream;
+    unsigned int count;
+    CUstreamBatchMemOpParams *paramArray;
+    unsigned int flags;
+} cuStreamBatchMemOp_ptsz_params;
+
+typedef struct cuStreamWriteValue32_v2_params_st {
+    CUstream stream;
+    CUdeviceptr addr;
+    cuuint32_t value;
+    unsigned int flags;
+} cuStreamWriteValue32_v2_params;
+
+typedef struct cuStreamWaitValue32_v2_params_st {
+    CUstream stream;
+    CUdeviceptr addr;
+    cuuint32_t value;
+    unsigned int flags;
+} cuStreamWaitValue32_v2_params;
+
+typedef struct cuStreamWriteValue64_v2_params_st {
+    CUstream stream;
+    CUdeviceptr addr;
+    cuuint64_t value;
+    unsigned int flags;
+} cuStreamWriteValue64_v2_params;
+
+typedef struct cuStreamWaitValue64_v2_params_st {
+    CUstream stream;
+    CUdeviceptr addr;
+    cuuint64_t value;
+    unsigned int flags;
+} cuStreamWaitValue64_v2_params;
+
+typedef struct cuStreamBatchMemOp_v2_params_st {
+    CUstream stream;
+    unsigned int count;
+    CUstreamBatchMemOpParams *paramArray;
+    unsigned int flags;
+} cuStreamBatchMemOp_v2_params;
+
+typedef struct cuMemPrefetchAsync_params_st {
+    CUdeviceptr devPtr;
+    size_t count;
+    CUdevice dstDevice;
+    CUstream hStream;
+} cuMemPrefetchAsync_params;
+
+typedef struct cuMemPrefetchAsync_v2_params_st {
+    CUdeviceptr devPtr;
+    size_t count;
+    CUmemLocation location;
+    unsigned int flags;
+    CUstream hStream;
+} cuMemPrefetchAsync_v2_params;
+
+typedef struct cuLaunchCooperativeKernel_params_st {
+    CUfunction f;
+    unsigned int gridDimX;
+    unsigned int gridDimY;
+    unsigned int gridDimZ;
+    unsigned int blockDimX;
+    unsigned int blockDimY;
+    unsigned int blockDimZ;
+    unsigned int sharedMemBytes;
+    CUstream hStream;
+    void **kernelParams;
+} cuLaunchCooperativeKernel_params;
+
+typedef struct cuSignalExternalSemaphoresAsync_params_st {
+    const CUexternalSemaphore *extSemArray;
+    const CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS *paramsArray;
+    unsigned int numExtSems;
+    CUstream stream;
+} cuSignalExternalSemaphoresAsync_params;
+
+typedef struct cuWaitExternalSemaphoresAsync_params_st {
+    const CUexternalSemaphore *extSemArray;
+    const CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS *paramsArray;
+    unsigned int numExtSems;
+    CUstream stream;
+} cuWaitExternalSemaphoresAsync_params;
+
+typedef struct cuStreamBeginCapture_params_st {
+    CUstream hStream;
+} cuStreamBeginCapture_params;
+
+typedef struct cuStreamBeginCapture_ptsz_params_st {
+    CUstream hStream;
+} cuStreamBeginCapture_ptsz_params;
+
+typedef struct cuStreamBeginCapture_v2_params_st {
+    CUstream hStream;
+    CUstreamCaptureMode mode;
+} cuStreamBeginCapture_v2_params;
+
+typedef struct cuStreamBeginCaptureToGraph_params_st {
+    CUstream hStream;
+    CUgraph hGraph;
+    const CUgraphNode *dependencies;
+    const CUgraphEdgeData *dependencyData;
+    size_t numDependencies;
+    CUstreamCaptureMode mode;
+} cuStreamBeginCaptureToGraph_params;
+
+typedef struct cuStreamEndCapture_params_st {
+    CUstream hStream;
+    CUgraph *phGraph;
+} cuStreamEndCapture_params;
+
+typedef struct cuStreamIsCapturing_params_st {  // Argument record for cuStreamIsCapturing.
+    CUstream hStream;
+    CUstreamCaptureStatus *captureStatus;  // pointer to the status value; presumably the output slot
+} cuStreamIsCapturing_params;
+
+typedef struct cuStreamGetCaptureInfo_params_st {  // Argument record for cuStreamGetCaptureInfo; stream plus two pointer fields.
+    CUstream hStream;
+    CUstreamCaptureStatus *captureStatus_out;
+    cuuint64_t *id_out;
+} cuStreamGetCaptureInfo_params;
+
+typedef struct cuStreamGetCaptureInfo_ptsz_params_st {  // Argument record for cuStreamGetCaptureInfo_ptsz; same fields as cuStreamGetCaptureInfo_params.
+    CUstream hStream;
+    CUstreamCaptureStatus *captureStatus_out;
+    cuuint64_t *id_out;
+} cuStreamGetCaptureInfo_ptsz_params;
+
+typedef struct cuStreamGetCaptureInfo_v2_params_st {  // Argument record for cuStreamGetCaptureInfo_v2.
+    CUstream hStream;
+    CUstreamCaptureStatus *captureStatus_out;
+    cuuint64_t *id_out;
+    CUgraph *graph_out;  // this and the two fields below are not present in cuStreamGetCaptureInfo_params
+    const CUgraphNode **dependencies_out;
+    size_t *numDependencies_out;
+} cuStreamGetCaptureInfo_v2_params;
+
+typedef struct cuStreamGetCaptureInfo_v3_params_st {  // Argument record for cuStreamGetCaptureInfo_v3.
+    CUstream hStream;
+    CUstreamCaptureStatus *captureStatus_out;
+    cuuint64_t *id_out;
+    CUgraph *graph_out;
+    const CUgraphNode **dependencies_out;
+    const CUgraphEdgeData **edgeData_out;  // the only field not present in cuStreamGetCaptureInfo_v2_params
+    size_t *numDependencies_out;
+} cuStreamGetCaptureInfo_v3_params;
+
+typedef struct cuGraphAddKernelNode_params_st {  // Argument record for cuGraphAddKernelNode.
+    CUgraphNode *phGraphNode;
+    CUgraph hGraph;
+    const CUgraphNode *dependencies;
+    size_t numDependencies;
+    const CUDA_KERNEL_NODE_PARAMS_v1 *nodeParams;  // note: typed as the _v1 node-params struct
+} cuGraphAddKernelNode_params;
+
+typedef struct cuGraphKernelNodeGetParams_params_st {  // Argument record for cuGraphKernelNodeGetParams.
+    CUgraphNode hNode;
+    CUDA_KERNEL_NODE_PARAMS_v1 *nodeParams;  // non-const here; the SetParams record uses a const pointer
+} cuGraphKernelNodeGetParams_params;
+
+typedef struct cuGraphKernelNodeSetParams_params_st {  // Argument record for cuGraphKernelNodeSetParams.
+    CUgraphNode hNode;
+    const CUDA_KERNEL_NODE_PARAMS_v1 *nodeParams;  // const pointer to the _v1 node-params struct
+} cuGraphKernelNodeSetParams_params;
+
+typedef struct cuGraphExecKernelNodeSetParams_params_st {  // Argument record for cuGraphExecKernelNodeSetParams.
+    CUgraphExec hGraphExec;
+    CUgraphNode hNode;
+    const CUDA_KERNEL_NODE_PARAMS_v1 *nodeParams;  // const pointer to the _v1 node-params struct
+} cuGraphExecKernelNodeSetParams_params;
+
+typedef struct cuGraphInstantiateWithParams_params_st {  // Argument record for cuGraphInstantiateWithParams.
+    CUgraphExec *phGraphExec;
+    CUgraph hGraph;
+    CUDA_GRAPH_INSTANTIATE_PARAMS *instantiateParams;  // non-const pointer
+} cuGraphInstantiateWithParams_params;
+
+typedef struct cuGraphExecUpdate_params_st {  // Argument record for cuGraphExecUpdate.
+    CUgraphExec hGraphExec;
+    CUgraph hGraph;
+    CUgraphNode *hErrorNode_out;
+    CUgraphExecUpdateResult *updateResult_out;
+} cuGraphExecUpdate_params;
+
+typedef struct cuGraphUpload_params_st {  // Argument record for cuGraphUpload.
+    CUgraphExec hGraph;  // note: named hGraph but typed CUgraphExec, not CUgraph
+    CUstream hStream;
+} cuGraphUpload_params;
+
+typedef struct cuGraphLaunch_params_st {  // Argument record for cuGraphLaunch.
+    CUgraphExec hGraph;  // note: named hGraph but typed CUgraphExec, not CUgraph
+    CUstream hStream;
+} cuGraphLaunch_params;
+
+typedef struct cuStreamCopyAttributes_params_st {  // Argument record for cuStreamCopyAttributes; destination stream comes first.
+    CUstream dstStream;
+    CUstream srcStream;
+} cuStreamCopyAttributes_params;
+
+typedef struct cuStreamGetAttribute_params_st {  // Argument record for cuStreamGetAttribute.
+    CUstream hStream;
+    CUstreamAttrID attr;
+    CUstreamAttrValue *value;  // non-const pointer; the SetAttribute record uses a const pointer named param
+} cuStreamGetAttribute_params;
+
+typedef struct cuStreamSetAttribute_params_st {  // Argument record for cuStreamSetAttribute.
+    CUstream hStream;
+    CUstreamAttrID attr;
+    const CUstreamAttrValue *param;  // const pointer to the attribute value
+} cuStreamSetAttribute_params;
+
+typedef struct cuIpcOpenMemHandle_params_st {  // Argument record for cuIpcOpenMemHandle.
+    CUdeviceptr *pdptr;
+    CUipcMemHandle handle;  // stored by value, not by pointer
+    unsigned int Flags;  // capitalised field name, unlike the lowercase flags used in other records here
+} cuIpcOpenMemHandle_params;
+
+typedef struct cuGraphInstantiate_params_st {  // Argument record for cuGraphInstantiate; same field list as cuGraphInstantiate_v2_params.
+    CUgraphExec *phGraphExec;
+    CUgraph hGraph;
+    CUgraphNode *phErrorNode;
+    char *logBuffer;
+    size_t bufferSize;  // presumably the capacity of logBuffer - confirm against the driver API docs
+} cuGraphInstantiate_params;
+
+typedef struct cuGraphInstantiate_v2_params_st {  // Argument record for cuGraphInstantiate_v2; same field list as cuGraphInstantiate_params.
+    CUgraphExec *phGraphExec;
+    CUgraph hGraph;
+    CUgraphNode *phErrorNode;
+    char *logBuffer;
+    size_t bufferSize;  // presumably the capacity of logBuffer - confirm against the driver API docs
+} cuGraphInstantiate_v2_params;
+
+typedef struct cuMemMapArrayAsync_params_st {  // Argument record for cuMemMapArrayAsync.
+    CUarrayMapInfo *mapInfoList;
+    unsigned int count;  // presumably the number of entries in mapInfoList - confirm against the driver API docs
+    CUstream hStream;
+} cuMemMapArrayAsync_params;
+
+typedef struct cuMemFreeAsync_params_st {  // Argument record for cuMemFreeAsync.
+    CUdeviceptr dptr;  // passed by value here; the alloc records hold a CUdeviceptr pointer
+    CUstream hStream;
+} cuMemFreeAsync_params;
+
+typedef struct cuMemAllocAsync_params_st {  // Argument record for cuMemAllocAsync.
+    CUdeviceptr *dptr;  // pointer to a device pointer; presumably the output slot
+    size_t bytesize;
+    CUstream hStream;
+} cuMemAllocAsync_params;
+
+typedef struct cuMemAllocFromPoolAsync_params_st {  // Argument record for cuMemAllocFromPoolAsync.
+    CUdeviceptr *dptr;  // pointer to a device pointer; presumably the output slot
+    size_t bytesize;
+    CUmemoryPool pool;  // the only field not present in cuMemAllocAsync_params
+    CUstream hStream;
+} cuMemAllocFromPoolAsync_params;
+
+typedef struct cuStreamUpdateCaptureDependencies_params_st {  // Argument record for cuStreamUpdateCaptureDependencies.
+    CUstream hStream;
+    CUgraphNode *dependencies;  // non-const pointer
+    size_t numDependencies;
+    unsigned int flags;
+} cuStreamUpdateCaptureDependencies_params;
+
+typedef struct cuStreamUpdateCaptureDependencies_v2_params_st {  // Argument record for cuStreamUpdateCaptureDependencies_v2.
+    CUstream hStream;
+    CUgraphNode *dependencies;
+    const CUgraphEdgeData *dependencyData;  // the only field not present in cuStreamUpdateCaptureDependencies_params
+    size_t numDependencies;
+    unsigned int flags;
+} cuStreamUpdateCaptureDependencies_v2_params;
+
+typedef struct cuGetProcAddress_params_st {  // Argument record for cuGetProcAddress.
+    const char *symbol;
+    void **pfn;  // pointer to a void*; presumably receives the looked-up address
+    int cudaVersion;
+    cuuint64_t flags;  // 64-bit flags, unlike the unsigned int flags used in other records here
+} cuGetProcAddress_params;
diff --git a/.venv/lib/python3.11/site-packages/nvidia/cuda_cupti/include/generated_cuda_runtime_api_meta.h b/.venv/lib/python3.11/site-packages/nvidia/cuda_cupti/include/generated_cuda_runtime_api_meta.h
new file mode 100644
index 0000000000000000000000000000000000000000..4476ca9e13cb2e18cb1d634f2979e5df561d0594
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/nvidia/cuda_cupti/include/generated_cuda_runtime_api_meta.h
@@ -0,0 +1,2288 @@
+// This file is generated.  Any changes you make will be lost during the next clean build.
+
+// CUDA public interface, for type definitions and api function prototypes
+#include "cuda_runtime_api.h"
+
+// *************************************************************************
+//      Definitions of structs to hold parameters for each function
+// *************************************************************************
+
+// Currently used parameter trace structures
+typedef struct cudaDeviceSetLimit_v3020_params_st {  // Argument record for cudaDeviceSetLimit (the _v3020 tag presumably encodes a CUDA version - confirm).
+    enum cudaLimit limit;
+    size_t value;
+} cudaDeviceSetLimit_v3020_params;
+
+typedef struct cudaDeviceGetLimit_v3020_params_st {  // Argument record for cudaDeviceGetLimit.
+    size_t *pValue;  // pointer to the value; presumably the output slot
+    enum cudaLimit limit;
+} cudaDeviceGetLimit_v3020_params;
+
+typedef struct cudaDeviceGetTexture1DLinearMaxWidth_v11010_params_st {  // Argument record for cudaDeviceGetTexture1DLinearMaxWidth.
+    size_t *maxWidthInElements;
+    const struct cudaChannelFormatDesc *fmtDesc;
+    int device;
+} cudaDeviceGetTexture1DLinearMaxWidth_v11010_params;
+
+typedef struct cudaDeviceGetCacheConfig_v3020_params_st {  // Argument record for cudaDeviceGetCacheConfig.
+    enum cudaFuncCache *pCacheConfig;  // pointer; the Set record stores the enum by value
+} cudaDeviceGetCacheConfig_v3020_params;
+
+typedef struct cudaDeviceGetStreamPriorityRange_v5050_params_st {  // Argument record for cudaDeviceGetStreamPriorityRange; two int pointers.
+    int *leastPriority;
+    int *greatestPriority;
+} cudaDeviceGetStreamPriorityRange_v5050_params;
+
+typedef struct cudaDeviceSetCacheConfig_v3020_params_st {  // Argument record for cudaDeviceSetCacheConfig.
+    enum cudaFuncCache cacheConfig;  // by value; the Get record holds a pointer
+} cudaDeviceSetCacheConfig_v3020_params;
+
+typedef struct cudaDeviceGetByPCIBusId_v4010_params_st {  // Argument record for cudaDeviceGetByPCIBusId.
+    int *device;
+    const char *pciBusId;  // const string here; cudaDeviceGetPCIBusId's record holds a writable char buffer
+} cudaDeviceGetByPCIBusId_v4010_params;
+
+typedef struct cudaDeviceGetPCIBusId_v4010_params_st {  // Argument record for cudaDeviceGetPCIBusId.
+    char *pciBusId;  // non-const char buffer
+    int len;  // presumably the capacity of pciBusId - confirm against the runtime API docs
+    int device;
+} cudaDeviceGetPCIBusId_v4010_params;
+
+typedef struct cudaIpcGetEventHandle_v4010_params_st {  // Argument record for cudaIpcGetEventHandle.
+    cudaIpcEventHandle_t *handle;  // pointer here; cudaIpcOpenEventHandle's record stores the handle by value
+    cudaEvent_t event;
+} cudaIpcGetEventHandle_v4010_params;
+
+typedef struct cudaIpcOpenEventHandle_v4010_params_st {  // Argument record for cudaIpcOpenEventHandle.
+    cudaEvent_t *event;
+    cudaIpcEventHandle_t handle;  // stored by value
+} cudaIpcOpenEventHandle_v4010_params;
+
+typedef struct cudaIpcGetMemHandle_v4010_params_st {  // Argument record for cudaIpcGetMemHandle.
+    cudaIpcMemHandle_t *handle;  // pointer here; cudaIpcOpenMemHandle's record stores the handle by value
+    void *devPtr;
+} cudaIpcGetMemHandle_v4010_params;
+
+typedef struct cudaIpcOpenMemHandle_v4010_params_st {  // Argument record for cudaIpcOpenMemHandle.
+    void **devPtr;
+    cudaIpcMemHandle_t handle;  // stored by value
+    unsigned int flags;
+} cudaIpcOpenMemHandle_v4010_params;
+
+typedef struct cudaIpcCloseMemHandle_v4010_params_st {  // Argument record for cudaIpcCloseMemHandle; single pointer field.
+    void *devPtr;
+} cudaIpcCloseMemHandle_v4010_params;
+
+typedef struct cudaDeviceFlushGPUDirectRDMAWrites_v11030_params_st {  // Argument record for cudaDeviceFlushGPUDirectRDMAWrites; two enums by value.
+    enum cudaFlushGPUDirectRDMAWritesTarget target;
+    enum cudaFlushGPUDirectRDMAWritesScope scope;
+} cudaDeviceFlushGPUDirectRDMAWrites_v11030_params;
+
+typedef struct cudaDeviceGetSharedMemConfig_v4020_params_st {  // Argument record for cudaDeviceGetSharedMemConfig.
+    enum cudaSharedMemConfig *pConfig;  // pointer; the Set record stores the enum by value
+} cudaDeviceGetSharedMemConfig_v4020_params;
+
+typedef struct cudaDeviceSetSharedMemConfig_v4020_params_st {  // Argument record for cudaDeviceSetSharedMemConfig.
+    enum cudaSharedMemConfig config;  // by value; the Get record holds a pointer
+} cudaDeviceSetSharedMemConfig_v4020_params;
+
+typedef struct cudaGetErrorName_v6050_params_st {  // Argument record for cudaGetErrorName; holds only the error code.
+    cudaError_t error;
+} cudaGetErrorName_v6050_params;
+
+typedef struct cudaGetErrorString_v3020_params_st {  // Argument record for cudaGetErrorString; holds only the error code.
+    cudaError_t error;
+} cudaGetErrorString_v3020_params;
+
+typedef struct cudaGetDeviceCount_v3020_params_st {  // Argument record for cudaGetDeviceCount.
+    int *count;  // pointer to int; presumably the output slot
+} cudaGetDeviceCount_v3020_params;
+
+typedef struct cudaGetDeviceProperties_v2_v12000_params_st {  // Argument record for cudaGetDeviceProperties_v2.
+    struct cudaDeviceProp *prop;  // non-const pointer; cudaChooseDevice's record uses a const one
+    int device;
+} cudaGetDeviceProperties_v2_v12000_params;
+
+typedef struct cudaDeviceGetAttribute_v5000_params_st {  // Argument record for cudaDeviceGetAttribute.
+    int *value;
+    enum cudaDeviceAttr attr;
+    int device;
+} cudaDeviceGetAttribute_v5000_params;
+
+typedef struct cudaDeviceGetDefaultMemPool_v11020_params_st {  // Argument record for cudaDeviceGetDefaultMemPool; same field list as cudaDeviceGetMemPool's record.
+    cudaMemPool_t *memPool;
+    int device;
+} cudaDeviceGetDefaultMemPool_v11020_params;
+
+typedef struct cudaDeviceSetMemPool_v11020_params_st {  // Argument record for cudaDeviceSetMemPool; device comes first here, unlike the Get records.
+    int device;
+    cudaMemPool_t memPool;  // by value
+} cudaDeviceSetMemPool_v11020_params;
+
+typedef struct cudaDeviceGetMemPool_v11020_params_st {  // Argument record for cudaDeviceGetMemPool.
+    cudaMemPool_t *memPool;  // pointer; the Set record stores the pool by value
+    int device;
+} cudaDeviceGetMemPool_v11020_params;
+
+typedef struct cudaDeviceGetNvSciSyncAttributes_v10020_params_st {  // Argument record for cudaDeviceGetNvSciSyncAttributes.
+    void *nvSciSyncAttrList;  // untyped pointer
+    int device;
+    int flags;  // signed int, unlike the unsigned int flags used in most other records here
+} cudaDeviceGetNvSciSyncAttributes_v10020_params;
+
+typedef struct cudaDeviceGetP2PAttribute_v8000_params_st {  // Argument record for cudaDeviceGetP2PAttribute.
+    int *value;
+    enum cudaDeviceP2PAttr attr;
+    int srcDevice;
+    int dstDevice;
+} cudaDeviceGetP2PAttribute_v8000_params;
+
+typedef struct cudaChooseDevice_v3020_params_st {  // Argument record for cudaChooseDevice.
+    int *device;
+    const struct cudaDeviceProp *prop;  // const pointer
+} cudaChooseDevice_v3020_params;
+
+typedef struct cudaInitDevice_v12000_params_st {  // Argument record for cudaInitDevice; carries two separate flag words.
+    int device;
+    unsigned int deviceFlags;
+    unsigned int flags;
+} cudaInitDevice_v12000_params;
+
+typedef struct cudaSetDevice_v3020_params_st {  // Argument record for cudaSetDevice.
+    int device;  // by value; cudaGetDevice's record holds an int pointer
+} cudaSetDevice_v3020_params;
+
+typedef struct cudaGetDevice_v3020_params_st {  // Argument record for cudaGetDevice.
+    int *device;  // pointer to int; presumably the output slot
+} cudaGetDevice_v3020_params;
+
+typedef struct cudaSetValidDevices_v3020_params_st {  // Argument record for cudaSetValidDevices.
+    int *device_arr;
+    int len;  // presumably the number of entries in device_arr - confirm against the runtime API docs
+} cudaSetValidDevices_v3020_params;
+
+typedef struct cudaSetDeviceFlags_v3020_params_st {  // Argument record for cudaSetDeviceFlags.
+    unsigned int flags;  // by value; cudaGetDeviceFlags's record holds a pointer
+} cudaSetDeviceFlags_v3020_params;
+
+typedef struct cudaGetDeviceFlags_v7000_params_st {  // Argument record for cudaGetDeviceFlags.
+    unsigned int *flags;  // pointer; presumably the output slot
+} cudaGetDeviceFlags_v7000_params;
+
+typedef struct cudaStreamCreate_v3020_params_st {  // Argument record for cudaStreamCreate.
+    cudaStream_t *pStream;  // pointer to a stream handle; presumably the output slot
+} cudaStreamCreate_v3020_params;
+
+typedef struct cudaStreamCreateWithFlags_v5000_params_st {  // Argument record for cudaStreamCreateWithFlags.
+    cudaStream_t *pStream;
+    unsigned int flags;  // not present in cudaStreamCreate's record
+} cudaStreamCreateWithFlags_v5000_params;
+
+typedef struct cudaStreamCreateWithPriority_v5050_params_st {  // Argument record for cudaStreamCreateWithPriority.
+    cudaStream_t *pStream;
+    unsigned int flags;
+    int priority;  // signed; not present in cudaStreamCreateWithFlags's record
+} cudaStreamCreateWithPriority_v5050_params;
+
+typedef struct cudaStreamGetPriority_ptsz_v7000_params_st {  // Argument record for cudaStreamGetPriority (_ptsz presumably marks the per-thread-stream entry point - confirm).
+    cudaStream_t hStream;
+    int *priority;
+} cudaStreamGetPriority_ptsz_v7000_params;
+
+typedef struct cudaStreamGetFlags_ptsz_v7000_params_st {  // Argument record for cudaStreamGetFlags.
+    cudaStream_t hStream;
+    unsigned int *flags;  // pointer; presumably the output slot
+} cudaStreamGetFlags_ptsz_v7000_params;
+
+typedef struct cudaStreamGetId_ptsz_v12000_params_st {  // Argument record for cudaStreamGetId.
+    cudaStream_t hStream;
+    unsigned long long *streamId;  // pointer to a 64-bit-or-wider unsigned id; presumably the output slot
+} cudaStreamGetId_ptsz_v12000_params;
+
+typedef struct cudaStreamCopyAttributes_ptsz_v11000_params_st {  // Argument record for cudaStreamCopyAttributes; destination stream comes first.
+    cudaStream_t dst;
+    cudaStream_t src;
+} cudaStreamCopyAttributes_ptsz_v11000_params;
+
+typedef struct cudaStreamGetAttribute_ptsz_v11000_params_st {  // Argument record for cudaStreamGetAttribute.
+    cudaStream_t hStream;
+    cudaStreamAttrID attr;
+    cudaStreamAttrValue *value_out;  // non-const pointer; the Set record uses a const pointer named value
+} cudaStreamGetAttribute_ptsz_v11000_params;
+
+typedef struct cudaStreamSetAttribute_ptsz_v11000_params_st {  // Argument record for cudaStreamSetAttribute.
+    cudaStream_t hStream;
+    cudaStreamAttrID attr;
+    const cudaStreamAttrValue *value;  // const pointer to the attribute value
+} cudaStreamSetAttribute_ptsz_v11000_params;
+
+typedef struct cudaStreamDestroy_v5050_params_st {  // Argument record for cudaStreamDestroy; holds only the stream handle.
+    cudaStream_t stream;
+} cudaStreamDestroy_v5050_params;
+
+typedef struct cudaStreamWaitEvent_ptsz_v7000_params_st {  // Argument record for cudaStreamWaitEvent.
+    cudaStream_t stream;
+    cudaEvent_t event;
+    unsigned int flags;
+} cudaStreamWaitEvent_ptsz_v7000_params;
+
+typedef struct cudaStreamAddCallback_ptsz_v7000_params_st {  // Argument record for cudaStreamAddCallback.
+    cudaStream_t stream;
+    cudaStreamCallback_t callback;
+    void *userData;  // untyped pointer; presumably forwarded to callback - confirm against the runtime API docs
+    unsigned int flags;
+} cudaStreamAddCallback_ptsz_v7000_params;
+
+typedef struct cudaStreamSynchronize_ptsz_v7000_params_st {  // Argument record for cudaStreamSynchronize; holds only the stream handle.
+    cudaStream_t stream;
+} cudaStreamSynchronize_ptsz_v7000_params;
+
+typedef struct cudaStreamQuery_ptsz_v7000_params_st {  // Argument record for cudaStreamQuery; holds only the stream handle.
+    cudaStream_t stream;
+} cudaStreamQuery_ptsz_v7000_params;
+
+typedef struct cudaStreamAttachMemAsync_ptsz_v7000_params_st {  // Argument record for cudaStreamAttachMemAsync.
+    cudaStream_t stream;
+    void *devPtr;
+    size_t length;
+    unsigned int flags;
+} cudaStreamAttachMemAsync_ptsz_v7000_params;
+
+typedef struct cudaStreamBeginCapture_ptsz_v10000_params_st {  // Argument record for cudaStreamBeginCapture.
+    cudaStream_t stream;
+    enum cudaStreamCaptureMode mode;  // by value
+} cudaStreamBeginCapture_ptsz_v10000_params;
+
+typedef struct cudaStreamBeginCaptureToGraph_ptsz_v12030_params_st {  // Argument record for cudaStreamBeginCaptureToGraph.
+    cudaStream_t stream;
+    cudaGraph_t graph;
+    const cudaGraphNode_t *dependencies;
+    const cudaGraphEdgeData *dependencyData;
+    size_t numDependencies;  // presumably the length of the two arrays above - confirm against the runtime API docs
+    enum cudaStreamCaptureMode mode;
+} cudaStreamBeginCaptureToGraph_ptsz_v12030_params;
+
+typedef struct cudaThreadExchangeStreamCaptureMode_v10010_params_st {  // Argument record for cudaThreadExchangeStreamCaptureMode.
+    enum cudaStreamCaptureMode *mode;  // pointer to the mode; given the "Exchange" name presumably read and written - confirm
+} cudaThreadExchangeStreamCaptureMode_v10010_params;
+
+typedef struct cudaStreamEndCapture_ptsz_v10000_params_st {  // Argument record for cudaStreamEndCapture.
+    cudaStream_t stream;
+    cudaGraph_t *pGraph;  // pointer to a graph handle; presumably the output slot
+} cudaStreamEndCapture_ptsz_v10000_params;
+
+typedef struct cudaStreamIsCapturing_ptsz_v10000_params_st {  // Argument record for cudaStreamIsCapturing.
+    cudaStream_t stream;
+    enum cudaStreamCaptureStatus *pCaptureStatus;  // pointer to the status enum; presumably the output slot
+} cudaStreamIsCapturing_ptsz_v10000_params;
+
+typedef struct cudaStreamGetCaptureInfo_v2_ptsz_v11030_params_st {  // Argument record for cudaStreamGetCaptureInfo_v2; stream plus five pointer fields.
+    cudaStream_t stream;
+    enum cudaStreamCaptureStatus *captureStatus_out;
+    unsigned long long *id_out;
+    cudaGraph_t *graph_out;
+    const cudaGraphNode_t **dependencies_out;
+    size_t *numDependencies_out;
+} cudaStreamGetCaptureInfo_v2_ptsz_v11030_params;
+
+typedef struct cudaStreamGetCaptureInfo_v3_ptsz_v12030_params_st {  // Argument record for cudaStreamGetCaptureInfo_v3.
+    cudaStream_t stream;
+    enum cudaStreamCaptureStatus *captureStatus_out;
+    unsigned long long *id_out;
+    cudaGraph_t *graph_out;
+    const cudaGraphNode_t **dependencies_out;
+    const cudaGraphEdgeData **edgeData_out;  // the only field not present in the _v2 record
+    size_t *numDependencies_out;
+} cudaStreamGetCaptureInfo_v3_ptsz_v12030_params;
+
+typedef struct cudaStreamUpdateCaptureDependencies_ptsz_v11030_params_st {  // Argument record for cudaStreamUpdateCaptureDependencies.
+    cudaStream_t stream;
+    cudaGraphNode_t *dependencies;  // non-const pointer
+    size_t numDependencies;
+    unsigned int flags;
+} cudaStreamUpdateCaptureDependencies_ptsz_v11030_params;
+
+typedef struct cudaStreamUpdateCaptureDependencies_v2_ptsz_v12030_params_st {  // Argument record for cudaStreamUpdateCaptureDependencies_v2.
+    cudaStream_t stream;
+    cudaGraphNode_t *dependencies;
+    const cudaGraphEdgeData *dependencyData;  // the only field not present in the non-v2 record
+    size_t numDependencies;
+    unsigned int flags;
+} cudaStreamUpdateCaptureDependencies_v2_ptsz_v12030_params;
+
+typedef struct cudaEventCreate_v3020_params_st {  // Argument record for cudaEventCreate.
+    cudaEvent_t *event;  // pointer to an event handle; presumably the output slot
+} cudaEventCreate_v3020_params;
+
+typedef struct cudaEventCreateWithFlags_v3020_params_st {  // Argument record for cudaEventCreateWithFlags.
+    cudaEvent_t *event;
+    unsigned int flags;  // not present in cudaEventCreate's record
+} cudaEventCreateWithFlags_v3020_params;
+
+typedef struct cudaEventRecord_ptsz_v7000_params_st {  // Argument record for cudaEventRecord; event first, then stream.
+    cudaEvent_t event;
+    cudaStream_t stream;
+} cudaEventRecord_ptsz_v7000_params;
+
+typedef struct cudaEventRecordWithFlags_ptsz_v11010_params_st {  // Argument record for cudaEventRecordWithFlags.
+    cudaEvent_t event;
+    cudaStream_t stream;
+    unsigned int flags;  // not present in cudaEventRecord's record
+} cudaEventRecordWithFlags_ptsz_v11010_params;
+
+typedef struct cudaEventQuery_v3020_params_st {  // Argument record for cudaEventQuery; holds only the event handle.
+    cudaEvent_t event;
+} cudaEventQuery_v3020_params;
+
+typedef struct cudaEventSynchronize_v3020_params_st {  // Argument record for cudaEventSynchronize; holds only the event handle.
+    cudaEvent_t event;
+} cudaEventSynchronize_v3020_params;
+
+typedef struct cudaEventDestroy_v3020_params_st {  // Argument record for cudaEventDestroy; holds only the event handle.
+    cudaEvent_t event;
+} cudaEventDestroy_v3020_params;
+
+typedef struct cudaEventElapsedTime_v3020_params_st {  // Argument record for cudaEventElapsedTime.
+    float *ms;  // pointer to float; name suggests milliseconds - confirm against the runtime API docs
+    cudaEvent_t start;
+    cudaEvent_t end;
+} cudaEventElapsedTime_v3020_params;
+
+typedef struct cudaImportExternalMemory_v10000_params_st {  // Argument record for cudaImportExternalMemory.
+    cudaExternalMemory_t *extMem_out;
+    const struct cudaExternalMemoryHandleDesc *memHandleDesc;  // const descriptor pointer
+} cudaImportExternalMemory_v10000_params;
+
+typedef struct cudaExternalMemoryGetMappedBuffer_v10000_params_st {  // Argument record for cudaExternalMemoryGetMappedBuffer.
+    void **devPtr;
+    cudaExternalMemory_t extMem;
+    const struct cudaExternalMemoryBufferDesc *bufferDesc;  // const descriptor pointer
+} cudaExternalMemoryGetMappedBuffer_v10000_params;
+
+typedef struct cudaExternalMemoryGetMappedMipmappedArray_v10000_params_st {  // Argument record for cudaExternalMemoryGetMappedMipmappedArray.
+    cudaMipmappedArray_t *mipmap;
+    cudaExternalMemory_t extMem;
+    const struct cudaExternalMemoryMipmappedArrayDesc *mipmapDesc;  // const descriptor pointer
+} cudaExternalMemoryGetMappedMipmappedArray_v10000_params;
+
+typedef struct cudaDestroyExternalMemory_v10000_params_st {  // Argument record for cudaDestroyExternalMemory; holds only the handle.
+    cudaExternalMemory_t extMem;
+} cudaDestroyExternalMemory_v10000_params;
+
+typedef struct cudaImportExternalSemaphore_v10000_params_st {  // Argument record for cudaImportExternalSemaphore.
+    cudaExternalSemaphore_t *extSem_out;
+    const struct cudaExternalSemaphoreHandleDesc *semHandleDesc;  // const descriptor pointer
+} cudaImportExternalSemaphore_v10000_params;
+
+typedef struct cudaSignalExternalSemaphoresAsync_v2_ptsz_v11020_params_st {  // Argument record for cudaSignalExternalSemaphoresAsync_v2.
+    const cudaExternalSemaphore_t *extSemArray;
+    const struct cudaExternalSemaphoreSignalParams *paramsArray;
+    unsigned int numExtSems;  // presumably the element count of both arrays above - confirm against the runtime API docs
+    cudaStream_t stream;
+} cudaSignalExternalSemaphoresAsync_v2_ptsz_v11020_params;
+
+typedef struct cudaWaitExternalSemaphoresAsync_v2_ptsz_v11020_params_st {  // Argument record for cudaWaitExternalSemaphoresAsync_v2.
+    const cudaExternalSemaphore_t *extSemArray;
+    const struct cudaExternalSemaphoreWaitParams *paramsArray;
+    unsigned int numExtSems;  // presumably the element count of both arrays above - confirm against the runtime API docs
+    cudaStream_t stream;
+} cudaWaitExternalSemaphoresAsync_v2_ptsz_v11020_params;
+
+typedef struct cudaDestroyExternalSemaphore_v10000_params_st {  // Argument record for cudaDestroyExternalSemaphore; holds only the handle.
+    cudaExternalSemaphore_t extSem;
+} cudaDestroyExternalSemaphore_v10000_params;
+
+typedef struct cudaLaunchKernel_ptsz_v7000_params_st {  // Argument record for cudaLaunchKernel; same field list as cudaLaunchCooperativeKernel's record.
+    const void *func;
+    dim3 gridDim;  // dim3 by value
+    dim3 blockDim;
+    void **args;
+    size_t sharedMem;
+    cudaStream_t stream;
+} cudaLaunchKernel_ptsz_v7000_params;
+
+typedef struct cudaLaunchKernelExC_ptsz_v11060_params_st {  // Argument record for cudaLaunchKernelExC.
+    const cudaLaunchConfig_t *config;  // launch configuration passed as one const struct pointer
+    const void *func;
+    void **args;
+} cudaLaunchKernelExC_ptsz_v11060_params;
+
+typedef struct cudaLaunchCooperativeKernel_ptsz_v9000_params_st {  // Argument record for cudaLaunchCooperativeKernel; same field list as cudaLaunchKernel's record.
+    const void *func;
+    dim3 gridDim;  // dim3 by value
+    dim3 blockDim;
+    void **args;
+    size_t sharedMem;
+    cudaStream_t stream;
+} cudaLaunchCooperativeKernel_ptsz_v9000_params;
+
+typedef struct cudaLaunchCooperativeKernelMultiDevice_v9000_params_st {  // Argument record for cudaLaunchCooperativeKernelMultiDevice.
+    struct cudaLaunchParams *launchParamsList;
+    unsigned int numDevices;  // presumably the number of entries in launchParamsList - confirm against the runtime API docs
+    unsigned int flags;
+} cudaLaunchCooperativeKernelMultiDevice_v9000_params;
+
+typedef struct cudaFuncSetCacheConfig_v3020_params_st {  // Argument record for cudaFuncSetCacheConfig.
+    const void *func;
+    enum cudaFuncCache cacheConfig;
+} cudaFuncSetCacheConfig_v3020_params;
+
+typedef struct cudaFuncGetAttributes_v3020_params_st {  // Argument record for cudaFuncGetAttributes.
+    struct cudaFuncAttributes *attr;  // non-const struct pointer; presumably the output slot
+    const void *func;
+} cudaFuncGetAttributes_v3020_params;
+
+typedef struct cudaFuncSetAttribute_v9000_params_st {  // Argument record for cudaFuncSetAttribute.
+    const void *func;
+    enum cudaFuncAttribute attr;
+    int value;
+} cudaFuncSetAttribute_v9000_params;
+
+typedef struct cudaFuncGetName_v12030_params_st {  // Argument record for cudaFuncGetName.
+    const char **name;  // pointer to a const char*; presumably receives the name string
+    const void *func;
+} cudaFuncGetName_v12030_params;
+
+typedef struct cudaFuncGetParamInfo_v12040_params_st {  // Argument record for cudaFuncGetParamInfo.
+    const void *func;
+    size_t paramIndex;  // by value
+    size_t *paramOffset;  // pointer; presumably an output slot
+    size_t *paramSize;  // pointer; presumably an output slot
+} cudaFuncGetParamInfo_v12040_params;
+
+typedef struct cudaLaunchHostFunc_ptsz_v10000_params_st {  // Argument record for cudaLaunchHostFunc.
+    cudaStream_t stream;
+    cudaHostFn_t fn;
+    void *userData;  // untyped pointer; presumably forwarded to fn - confirm against the runtime API docs
+} cudaLaunchHostFunc_ptsz_v10000_params;
+
+typedef struct cudaFuncSetSharedMemConfig_v4020_params_st {  // Argument record for cudaFuncSetSharedMemConfig.
+    const void *func;
+    enum cudaSharedMemConfig config;
+} cudaFuncSetSharedMemConfig_v4020_params;
+
+typedef struct cudaOccupancyMaxActiveBlocksPerMultiprocessor_v6050_params_st {  // Argument record for cudaOccupancyMaxActiveBlocksPerMultiprocessor.
+    int *numBlocks;  // pointer to int; presumably the output slot
+    const void *func;
+    int blockSize;
+    size_t dynamicSMemSize;
+} cudaOccupancyMaxActiveBlocksPerMultiprocessor_v6050_params;
+
+typedef struct cudaOccupancyAvailableDynamicSMemPerBlock_v10200_params_st {  // Argument record for cudaOccupancyAvailableDynamicSMemPerBlock.
+    size_t *dynamicSmemSize;  // pointer; note the lowercase "mem", where sibling records spell a by-value dynamicSMemSize
+    const void *func;
+    int numBlocks;
+    int blockSize;
+} cudaOccupancyAvailableDynamicSMemPerBlock_v10200_params;
+
+typedef struct cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags_v7000_params_st {  // Argument record for cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags.
+    int *numBlocks;
+    const void *func;
+    int blockSize;
+    size_t dynamicSMemSize;
+    unsigned int flags;  // the only field not present in the record without flags
+} cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags_v7000_params;
+
+typedef struct cudaOccupancyMaxPotentialClusterSize_v11070_params_st {  // Argument record for cudaOccupancyMaxPotentialClusterSize.
+    int *clusterSize;  // pointer to int; presumably the output slot
+    const void *func;
+    const cudaLaunchConfig_t *launchConfig;
+} cudaOccupancyMaxPotentialClusterSize_v11070_params;
+
+typedef struct cudaOccupancyMaxActiveClusters_v11070_params_st {  // Argument record for cudaOccupancyMaxActiveClusters.
+    int *numClusters;  // pointer to int; presumably the output slot
+    const void *func;
+    const cudaLaunchConfig_t *launchConfig;
+} cudaOccupancyMaxActiveClusters_v11070_params;
+
+typedef struct cudaMallocManaged_v6000_params_st {  // Argument record for cudaMallocManaged.
+    void **devPtr;
+    size_t size;
+    unsigned int flags;  // not present in cudaMalloc's record
+} cudaMallocManaged_v6000_params;
+
+typedef struct cudaMalloc_v3020_params_st {  // Argument record for cudaMalloc.
+    void **devPtr;  // pointer to a void*; presumably receives the allocation
+    size_t size;
+} cudaMalloc_v3020_params;
+
+typedef struct cudaMallocHost_v3020_params_st {  // Argument record for cudaMallocHost.
+    void **ptr;  // pointer to a void*; presumably receives the allocation
+    size_t size;
+} cudaMallocHost_v3020_params;
+
+typedef struct cudaMallocPitch_v3020_params_st {  // Argument record for cudaMallocPitch.
+    void **devPtr;
+    size_t *pitch;  // pointer; width and height below are passed by value
+    size_t width;
+    size_t height;
+} cudaMallocPitch_v3020_params;
+
+typedef struct cudaMallocArray_v3020_params_st {  // Argument record for cudaMallocArray.
+    cudaArray_t *array;
+    const struct cudaChannelFormatDesc *desc;
+    size_t width;
+    size_t height;
+    unsigned int flags;
+} cudaMallocArray_v3020_params;
+
+typedef struct cudaFree_v3020_params_st {  // Argument record for cudaFree; single pointer field.
+    void *devPtr;
+} cudaFree_v3020_params;
+
+typedef struct cudaFreeHost_v3020_params_st {  // Argument record for cudaFreeHost; single pointer field.
+    void *ptr;
+} cudaFreeHost_v3020_params;
+
+typedef struct cudaFreeArray_v3020_params_st {  // Argument record for cudaFreeArray; holds only the array handle.
+    cudaArray_t array;
+} cudaFreeArray_v3020_params;
+
+typedef struct cudaFreeMipmappedArray_v5000_params_st {  // Argument record for cudaFreeMipmappedArray; holds only the handle.
+    cudaMipmappedArray_t mipmappedArray;
+} cudaFreeMipmappedArray_v5000_params;
+
+typedef struct cudaHostAlloc_v3020_params_st {  // Argument record for cudaHostAlloc.
+    void **pHost;  // pointer to a void*; presumably receives the allocation
+    size_t size;
+    unsigned int flags;
+} cudaHostAlloc_v3020_params;
+
+typedef struct cudaHostRegister_v4000_params_st {  // Argument record for cudaHostRegister.
+    void *ptr;
+    size_t size;
+    unsigned int flags;
+} cudaHostRegister_v4000_params;
+
+typedef struct cudaHostUnregister_v4000_params_st {  // Argument record for cudaHostUnregister; single pointer field.
+    void *ptr;
+} cudaHostUnregister_v4000_params;
+
+typedef struct cudaHostGetDevicePointer_v3020_params_st {  // Argument record for cudaHostGetDevicePointer.
+    void **pDevice;  // pointer to a void*; presumably the output slot
+    void *pHost;
+    unsigned int flags;
+} cudaHostGetDevicePointer_v3020_params;
+
+typedef struct cudaHostGetFlags_v3020_params_st {  // Argument record for cudaHostGetFlags.
+    unsigned int *pFlags;  // pointer; presumably the output slot
+    void *pHost;
+} cudaHostGetFlags_v3020_params;
+
+typedef struct cudaMalloc3D_v3020_params_st {  // Argument record for cudaMalloc3D.
+    struct cudaPitchedPtr *pitchedDevPtr;  // pointer; presumably the output slot
+    struct cudaExtent extent;  // struct embedded by value
+} cudaMalloc3D_v3020_params;
+
+typedef struct cudaMalloc3DArray_v3020_params_st {  // Argument record for cudaMalloc3DArray.
+    cudaArray_t *array;
+    const struct cudaChannelFormatDesc *desc;
+    struct cudaExtent extent;  // struct embedded by value
+    unsigned int flags;
+} cudaMalloc3DArray_v3020_params;
+
+typedef struct cudaMallocMipmappedArray_v5000_params_st {  // Argument record for cudaMallocMipmappedArray.
+    cudaMipmappedArray_t *mipmappedArray;
+    const struct cudaChannelFormatDesc *desc;
+    struct cudaExtent extent;  // struct embedded by value
+    unsigned int numLevels;
+    unsigned int flags;
+} cudaMallocMipmappedArray_v5000_params;
+
+typedef struct cudaGetMipmappedArrayLevel_v5000_params_st {  // Argument record for cudaGetMipmappedArrayLevel.
+    cudaArray_t *levelArray;
+    cudaMipmappedArray_const_t mipmappedArray;  // the _const_t handle type
+    unsigned int level;
+} cudaGetMipmappedArrayLevel_v5000_params;
+
+typedef struct cudaMemcpy3D_ptds_v7000_params_st {  // Argument record for cudaMemcpy3D (_ptds presumably marks the per-thread-default-stream entry point - confirm).
+    const struct cudaMemcpy3DParms *p;
+} cudaMemcpy3D_ptds_v7000_params;
+
+typedef struct cudaMemcpy3DPeer_ptds_v7000_params_st {  // Argument record for cudaMemcpy3DPeer; single const struct pointer.
+    const struct cudaMemcpy3DPeerParms *p;
+} cudaMemcpy3DPeer_ptds_v7000_params;
+
+typedef struct cudaMemcpy3DAsync_ptsz_v7000_params_st {  // Argument record for cudaMemcpy3DAsync.
+    const struct cudaMemcpy3DParms *p;
+    cudaStream_t stream;  // not present in cudaMemcpy3D's record
+} cudaMemcpy3DAsync_ptsz_v7000_params;
+
+typedef struct cudaMemcpy3DPeerAsync_ptsz_v7000_params_st {  // Argument record for cudaMemcpy3DPeerAsync.
+    const struct cudaMemcpy3DPeerParms *p;
+    cudaStream_t stream;  // not present in cudaMemcpy3DPeer's record
+} cudaMemcpy3DPeerAsync_ptsz_v7000_params;
+
+typedef struct cudaMemGetInfo_v3020_params_st {  // Argument record for cudaMemGetInfo; two size_t pointers.
+    size_t *free;
+    size_t *total;
+} cudaMemGetInfo_v3020_params;
+
+typedef struct cudaArrayGetInfo_v4010_params_st {  // Argument record for cudaArrayGetInfo; three pointer fields followed by the array handle.
+    struct cudaChannelFormatDesc *desc;
+    struct cudaExtent *extent;
+    unsigned int *flags;
+    cudaArray_t array;
+} cudaArrayGetInfo_v4010_params;
+
+typedef struct cudaArrayGetPlane_v11020_params_st {  // Argument record for cudaArrayGetPlane.
+    cudaArray_t *pPlaneArray;  // pointer to an array handle; presumably the output slot
+    cudaArray_t hArray;
+    unsigned int planeIdx;
+} cudaArrayGetPlane_v11020_params;
+
+typedef struct cudaArrayGetMemoryRequirements_v11060_params_st {  // Argument record for cudaArrayGetMemoryRequirements.
+    struct cudaArrayMemoryRequirements *memoryRequirements;
+    cudaArray_t array;
+    int device;
+} cudaArrayGetMemoryRequirements_v11060_params;
+
+typedef struct cudaMipmappedArrayGetMemoryRequirements_v11060_params_st {  // Argument record for cudaMipmappedArrayGetMemoryRequirements.
+    struct cudaArrayMemoryRequirements *memoryRequirements;  // same struct type as in cudaArrayGetMemoryRequirements's record
+    cudaMipmappedArray_t mipmap;
+    int device;
+} cudaMipmappedArrayGetMemoryRequirements_v11060_params;
+
+typedef struct cudaArrayGetSparseProperties_v11010_params_st {  // Argument record for cudaArrayGetSparseProperties.
+    struct cudaArraySparseProperties *sparseProperties;
+    cudaArray_t array;
+} cudaArrayGetSparseProperties_v11010_params;
+
+typedef struct cudaMipmappedArrayGetSparseProperties_v11010_params_st {  // Argument record for cudaMipmappedArrayGetSparseProperties.
+    struct cudaArraySparseProperties *sparseProperties;  // same struct type as in cudaArrayGetSparseProperties's record
+    cudaMipmappedArray_t mipmap;
+} cudaMipmappedArrayGetSparseProperties_v11010_params;
+
+typedef struct cudaMemcpy_ptds_v7000_params_st {  // Argument record for cudaMemcpy.
+    void *dst;
+    const void *src;  // source is const, destination is not
+    size_t count;
+    enum cudaMemcpyKind kind;
+} cudaMemcpy_ptds_v7000_params;
+
+typedef struct cudaMemcpyPeer_v4000_params_st {  // Argument record for cudaMemcpyPeer; each pointer is followed by its device ordinal, and there is no cudaMemcpyKind field.
+    void *dst;
+    int dstDevice;
+    const void *src;
+    int srcDevice;
+    size_t count;
+} cudaMemcpyPeer_v4000_params;
+
+typedef struct cudaMemcpy2D_ptds_v7000_params_st {  // Argument record for cudaMemcpy2D.
+    void *dst;
+    size_t dpitch;  // pitch paired with dst
+    const void *src;
+    size_t spitch;  // pitch paired with src
+    size_t width;
+    size_t height;
+    enum cudaMemcpyKind kind;
+} cudaMemcpy2D_ptds_v7000_params;
+
+typedef struct cudaMemcpy2DToArray_ptds_v7000_params_st {  // Argument record for cudaMemcpy2DToArray.
+    cudaArray_t dst;  // destination is an array handle, followed by its two offsets
+    size_t wOffset;
+    size_t hOffset;
+    const void *src;
+    size_t spitch;
+    size_t width;
+    size_t height;
+    enum cudaMemcpyKind kind;
+} cudaMemcpy2DToArray_ptds_v7000_params;
+
+typedef struct cudaMemcpy2DFromArray_ptds_v7000_params_st {  // Argument record for cudaMemcpy2DFromArray.
+    void *dst;
+    size_t dpitch;
+    cudaArray_const_t src;  // source is a const array handle, followed by its two offsets
+    size_t wOffset;
+    size_t hOffset;
+    size_t width;
+    size_t height;
+    enum cudaMemcpyKind kind;
+} cudaMemcpy2DFromArray_ptds_v7000_params;
+
+typedef struct cudaMemcpy2DArrayToArray_ptds_v7000_params_st {  // Argument record for cudaMemcpy2DArrayToArray; both ends are array handles with their own offsets.
+    cudaArray_t dst;
+    size_t wOffsetDst;
+    size_t hOffsetDst;
+    cudaArray_const_t src;
+    size_t wOffsetSrc;
+    size_t hOffsetSrc;
+    size_t width;
+    size_t height;
+    enum cudaMemcpyKind kind;
+} cudaMemcpy2DArrayToArray_ptds_v7000_params;
+
+typedef struct cudaMemcpyToSymbol_ptds_v7000_params_st {  // Argument record for cudaMemcpyToSymbol.
+    const void *symbol;  // the symbol sits in the destination position and is typed const void*
+    const void *src;
+    size_t count;
+    size_t offset;
+    enum cudaMemcpyKind kind;
+} cudaMemcpyToSymbol_ptds_v7000_params;
+
+typedef struct cudaMemcpyFromSymbol_ptds_v7000_params_st {  // Argument record for cudaMemcpyFromSymbol.
+    void *dst;
+    const void *symbol;  // the symbol sits in the source position
+    size_t count;
+    size_t offset;
+    enum cudaMemcpyKind kind;
+} cudaMemcpyFromSymbol_ptds_v7000_params;
+
+typedef struct cudaMemcpyAsync_ptsz_v7000_params_st {  // Argument record for cudaMemcpyAsync.
+    void *dst;
+    const void *src;
+    size_t count;
+    enum cudaMemcpyKind kind;
+    cudaStream_t stream;  // the only field not present in cudaMemcpy's record
+} cudaMemcpyAsync_ptsz_v7000_params;
+
+typedef struct cudaMemcpyPeerAsync_v4000_params_st {  // Argument record for cudaMemcpyPeerAsync.
+    void *dst;
+    int dstDevice;
+    const void *src;
+    int srcDevice;
+    size_t count;
+    cudaStream_t stream;  // the only field not present in cudaMemcpyPeer's record
+} cudaMemcpyPeerAsync_v4000_params;
+
+typedef struct cudaMemcpy2DAsync_ptsz_v7000_params_st {  // Argument record for cudaMemcpy2DAsync.
+    void *dst;
+    size_t dpitch;
+    const void *src;
+    size_t spitch;
+    size_t width;
+    size_t height;
+    enum cudaMemcpyKind kind;
+    cudaStream_t stream;  // the only field not present in cudaMemcpy2D's record
+} cudaMemcpy2DAsync_ptsz_v7000_params;
+
+typedef struct cudaMemcpy2DToArrayAsync_ptsz_v7000_params_st {  // Argument record for cudaMemcpy2DToArrayAsync.
+    cudaArray_t dst;
+    size_t wOffset;
+    size_t hOffset;
+    const void *src;
+    size_t spitch;
+    size_t width;
+    size_t height;
+    enum cudaMemcpyKind kind;
+    cudaStream_t stream;  // the only field not present in cudaMemcpy2DToArray's record
+} cudaMemcpy2DToArrayAsync_ptsz_v7000_params;
+
+typedef struct cudaMemcpy2DFromArrayAsync_ptsz_v7000_params_st {  // Argument record for cudaMemcpy2DFromArrayAsync.
+    void *dst;
+    size_t dpitch;
+    cudaArray_const_t src;
+    size_t wOffset;
+    size_t hOffset;
+    size_t width;
+    size_t height;
+    enum cudaMemcpyKind kind;
+    cudaStream_t stream;  // the only field not present in cudaMemcpy2DFromArray's record
+} cudaMemcpy2DFromArrayAsync_ptsz_v7000_params;
+
+typedef struct cudaMemcpyToSymbolAsync_ptsz_v7000_params_st {  // Argument record for cudaMemcpyToSymbolAsync.
+    const void *symbol;
+    const void *src;
+    size_t count;
+    size_t offset;
+    enum cudaMemcpyKind kind;
+    cudaStream_t stream;  // the only field not present in cudaMemcpyToSymbol's record
+} cudaMemcpyToSymbolAsync_ptsz_v7000_params;
+
+typedef struct cudaMemcpyFromSymbolAsync_ptsz_v7000_params_st {  // Argument record for cudaMemcpyFromSymbolAsync.
+    void *dst;
+    const void *symbol;
+    size_t count;
+    size_t offset;
+    enum cudaMemcpyKind kind;
+    cudaStream_t stream;  // the only field not present in cudaMemcpyFromSymbol's record
+} cudaMemcpyFromSymbolAsync_ptsz_v7000_params;
+
+typedef struct cudaMemset_ptds_v7000_params_st {  // Argument record for cudaMemset.
+    void *devPtr;
+    int value;  // typed int
+    size_t count;
+} cudaMemset_ptds_v7000_params;
+
+typedef struct cudaMemset2D_ptds_v7000_params_st {  // Argument record for cudaMemset2D.
+    void *devPtr;
+    size_t pitch;
+    int value;
+    size_t width;
+    size_t height;
+} cudaMemset2D_ptds_v7000_params;
+
+typedef struct cudaMemset3D_ptds_v7000_params_st {  // Argument record for cudaMemset3D.
+    struct cudaPitchedPtr pitchedDevPtr;  // struct embedded by value; cudaMalloc3D's record holds a pointer to it
+    int value;
+    struct cudaExtent extent;  // struct embedded by value
+} cudaMemset3D_ptds_v7000_params;
+
+typedef struct cudaMemsetAsync_ptsz_v7000_params_st {  // Argument record for cudaMemsetAsync.
+    void *devPtr;
+    int value;
+    size_t count;
+    cudaStream_t stream;  // the only field not present in cudaMemset's record
+} cudaMemsetAsync_ptsz_v7000_params;
+
+typedef struct cudaMemset2DAsync_ptsz_v7000_params_st {  // Argument record for cudaMemset2DAsync.
+    void *devPtr;
+    size_t pitch;
+    int value;
+    size_t width;
+    size_t height;
+    cudaStream_t stream;  // the only field not present in cudaMemset2D's record
+} cudaMemset2DAsync_ptsz_v7000_params;
+
+typedef struct cudaMemset3DAsync_ptsz_v7000_params_st {  // Argument record for cudaMemset3DAsync.
+    struct cudaPitchedPtr pitchedDevPtr;  // struct embedded by value
+    int value;
+    struct cudaExtent extent;  // struct embedded by value
+    cudaStream_t stream;  // the only field not present in cudaMemset3D's record
+} cudaMemset3DAsync_ptsz_v7000_params;
+
+typedef struct cudaGetSymbolAddress_v3020_params_st {  // Argument record for cudaGetSymbolAddress.
+    void **devPtr;  // pointer to a void*; presumably receives the address
+    const void *symbol;
+} cudaGetSymbolAddress_v3020_params;
+
+typedef struct cudaGetSymbolSize_v3020_params_st {  // Argument record for cudaGetSymbolSize.
+    size_t *size;  // pointer; presumably the output slot
+    const void *symbol;
+} cudaGetSymbolSize_v3020_params;
+
+typedef struct cudaMemPrefetchAsync_ptsz_v8000_params_st {  // Argument record for cudaMemPrefetchAsync; the destination is an int device ordinal.
+    const void *devPtr;
+    size_t count;
+    int dstDevice;
+    cudaStream_t stream;
+} cudaMemPrefetchAsync_ptsz_v8000_params;
+
+typedef struct cudaMemPrefetchAsync_v2_ptsz_v12020_params_st {  // Argument record for cudaMemPrefetchAsync_v2.
+    const void *devPtr;
+    size_t count;
+    struct cudaMemLocation location;  // takes the place of the int dstDevice field of the non-v2 record
+    unsigned int flags;  // not present in the non-v2 record
+    cudaStream_t stream;
+} cudaMemPrefetchAsync_v2_ptsz_v12020_params;
+
+typedef struct cudaMemAdvise_v8000_params_st {  // Argument record for cudaMemAdvise; the target is an int device ordinal.
+    const void *devPtr;
+    size_t count;
+    enum cudaMemoryAdvise advice;
+    int device;
+} cudaMemAdvise_v8000_params;
+
+typedef struct cudaMemAdvise_v2_v12020_params_st {
+    const void *devPtr;
+    size_t count;
+    enum cudaMemoryAdvise advice;
+    struct cudaMemLocation location;
+} cudaMemAdvise_v2_v12020_params;
+
+typedef struct cudaMemRangeGetAttribute_v8000_params_st {
+    void *data;
+    size_t dataSize;
+    enum cudaMemRangeAttribute attribute;
+    const void *devPtr;
+    size_t count;
+} cudaMemRangeGetAttribute_v8000_params;
+
+typedef struct cudaMemRangeGetAttributes_v8000_params_st {
+    void **data;
+    size_t *dataSizes;
+    enum cudaMemRangeAttribute *attributes;
+    size_t numAttributes;
+    const void *devPtr;
+    size_t count;
+} cudaMemRangeGetAttributes_v8000_params;
+
+typedef struct cudaMemcpyToArray_ptds_v7000_params_st {
+    cudaArray_t dst;
+    size_t wOffset;
+    size_t hOffset;
+    const void *src;
+    size_t count;
+    enum cudaMemcpyKind kind;
+} cudaMemcpyToArray_ptds_v7000_params;
+
+typedef struct cudaMemcpyFromArray_ptds_v7000_params_st {
+    void *dst;
+    cudaArray_const_t src;
+    size_t wOffset;
+    size_t hOffset;
+    size_t count;
+    enum cudaMemcpyKind kind;
+} cudaMemcpyFromArray_ptds_v7000_params;
+
+typedef struct cudaMemcpyArrayToArray_ptds_v7000_params_st {
+    cudaArray_t dst;
+    size_t wOffsetDst;
+    size_t hOffsetDst;
+    cudaArray_const_t src;
+    size_t wOffsetSrc;
+    size_t hOffsetSrc;
+    size_t count;
+    enum cudaMemcpyKind kind;
+} cudaMemcpyArrayToArray_ptds_v7000_params;
+
+typedef struct cudaMemcpyToArrayAsync_ptsz_v7000_params_st {
+    cudaArray_t dst;
+    size_t wOffset;
+    size_t hOffset;
+    const void *src;
+    size_t count;
+    enum cudaMemcpyKind kind;
+    cudaStream_t stream;
+} cudaMemcpyToArrayAsync_ptsz_v7000_params;
+
+typedef struct cudaMemcpyFromArrayAsync_ptsz_v7000_params_st {
+    void *dst;
+    cudaArray_const_t src;
+    size_t wOffset;
+    size_t hOffset;
+    size_t count;
+    enum cudaMemcpyKind kind;
+    cudaStream_t stream;
+} cudaMemcpyFromArrayAsync_ptsz_v7000_params;
+
+typedef struct cudaMallocAsync_ptsz_v11020_params_st {
+    void **devPtr;
+    size_t size;
+    cudaStream_t hStream;
+} cudaMallocAsync_ptsz_v11020_params;
+
+typedef struct cudaFreeAsync_ptsz_v11020_params_st {
+    void *devPtr;
+    cudaStream_t hStream;
+} cudaFreeAsync_ptsz_v11020_params;
+
+typedef struct cudaMemPoolTrimTo_v11020_params_st {
+    cudaMemPool_t memPool;
+    size_t minBytesToKeep;
+} cudaMemPoolTrimTo_v11020_params;
+
+typedef struct cudaMemPoolSetAttribute_v11020_params_st {
+    cudaMemPool_t memPool;
+    enum cudaMemPoolAttr attr;
+    void *value;
+} cudaMemPoolSetAttribute_v11020_params;
+
+typedef struct cudaMemPoolGetAttribute_v11020_params_st {
+    cudaMemPool_t memPool;
+    enum cudaMemPoolAttr attr;
+    void *value;
+} cudaMemPoolGetAttribute_v11020_params;
+
+typedef struct cudaMemPoolSetAccess_v11020_params_st {
+    cudaMemPool_t memPool;
+    const struct cudaMemAccessDesc *descList;
+    size_t count;
+} cudaMemPoolSetAccess_v11020_params;
+
+typedef struct cudaMemPoolGetAccess_v11020_params_st {
+    enum cudaMemAccessFlags *flags;
+    cudaMemPool_t memPool;
+    struct cudaMemLocation *location;
+} cudaMemPoolGetAccess_v11020_params;
+
+typedef struct cudaMemPoolCreate_v11020_params_st {
+    cudaMemPool_t *memPool;
+    const struct cudaMemPoolProps *poolProps;
+} cudaMemPoolCreate_v11020_params;
+
+typedef struct cudaMemPoolDestroy_v11020_params_st {
+    cudaMemPool_t memPool;
+} cudaMemPoolDestroy_v11020_params;
+
+typedef struct cudaMallocFromPoolAsync_ptsz_v11020_params_st {
+    void **ptr;
+    size_t size;
+    cudaMemPool_t memPool;
+    cudaStream_t stream;
+} cudaMallocFromPoolAsync_ptsz_v11020_params;
+
+typedef struct cudaMemPoolExportToShareableHandle_v11020_params_st {
+    void *shareableHandle;
+    cudaMemPool_t memPool;
+    enum cudaMemAllocationHandleType handleType;
+    unsigned int flags;
+} cudaMemPoolExportToShareableHandle_v11020_params;
+
+typedef struct cudaMemPoolImportFromShareableHandle_v11020_params_st {
+    cudaMemPool_t *memPool;
+    void *shareableHandle;
+    enum cudaMemAllocationHandleType handleType;
+    unsigned int flags;
+} cudaMemPoolImportFromShareableHandle_v11020_params;
+
+typedef struct cudaMemPoolExportPointer_v11020_params_st {
+    struct cudaMemPoolPtrExportData *exportData;
+    void *ptr;
+} cudaMemPoolExportPointer_v11020_params;
+
+typedef struct cudaMemPoolImportPointer_v11020_params_st {
+    void **ptr;
+    cudaMemPool_t memPool;
+    struct cudaMemPoolPtrExportData *exportData;
+} cudaMemPoolImportPointer_v11020_params;
+
+typedef struct cudaPointerGetAttributes_v4000_params_st {
+    struct cudaPointerAttributes *attributes;
+    const void *ptr;
+} cudaPointerGetAttributes_v4000_params;
+
+typedef struct cudaDeviceCanAccessPeer_v4000_params_st {
+    int *canAccessPeer;
+    int device;
+    int peerDevice;
+} cudaDeviceCanAccessPeer_v4000_params;
+
+typedef struct cudaDeviceEnablePeerAccess_v4000_params_st {
+    int peerDevice;
+    unsigned int flags;
+} cudaDeviceEnablePeerAccess_v4000_params;
+
+typedef struct cudaDeviceDisablePeerAccess_v4000_params_st {
+    int peerDevice;
+} cudaDeviceDisablePeerAccess_v4000_params;
+
+typedef struct cudaGraphicsUnregisterResource_v3020_params_st {
+    cudaGraphicsResource_t resource;
+} cudaGraphicsUnregisterResource_v3020_params;
+
+typedef struct cudaGraphicsResourceSetMapFlags_v3020_params_st {
+    cudaGraphicsResource_t resource;
+    unsigned int flags;
+} cudaGraphicsResourceSetMapFlags_v3020_params;
+
+typedef struct cudaGraphicsMapResources_v3020_params_st {
+    int count;
+    cudaGraphicsResource_t *resources;
+    cudaStream_t stream;
+} cudaGraphicsMapResources_v3020_params;
+
+typedef struct cudaGraphicsUnmapResources_v3020_params_st {
+    int count;
+    cudaGraphicsResource_t *resources;
+    cudaStream_t stream;
+} cudaGraphicsUnmapResources_v3020_params;
+
+typedef struct cudaGraphicsResourceGetMappedPointer_v3020_params_st {
+    void **devPtr;
+    size_t *size;
+    cudaGraphicsResource_t resource;
+} cudaGraphicsResourceGetMappedPointer_v3020_params;
+
+typedef struct cudaGraphicsSubResourceGetMappedArray_v3020_params_st {
+    cudaArray_t *array;
+    cudaGraphicsResource_t resource;
+    unsigned int arrayIndex;
+    unsigned int mipLevel;
+} cudaGraphicsSubResourceGetMappedArray_v3020_params;
+
+typedef struct cudaGraphicsResourceGetMappedMipmappedArray_v5000_params_st {
+    cudaMipmappedArray_t *mipmappedArray;
+    cudaGraphicsResource_t resource;
+} cudaGraphicsResourceGetMappedMipmappedArray_v5000_params;
+
+typedef struct cudaGetChannelDesc_v3020_params_st {
+    struct cudaChannelFormatDesc *desc;
+    cudaArray_const_t array;
+} cudaGetChannelDesc_v3020_params;
+
+typedef struct cudaCreateChannelDesc_v3020_params_st {
+    int x;
+    int y;
+    int z;
+    int w;
+    enum cudaChannelFormatKind f;
+} cudaCreateChannelDesc_v3020_params;
+
+typedef struct cudaCreateTextureObject_v5000_params_st {
+    cudaTextureObject_t *pTexObject;
+    const struct cudaResourceDesc *pResDesc;
+    const struct cudaTextureDesc *pTexDesc;
+    const struct cudaResourceViewDesc *pResViewDesc;
+} cudaCreateTextureObject_v5000_params;
+
+typedef struct cudaDestroyTextureObject_v5000_params_st {
+    cudaTextureObject_t texObject;
+} cudaDestroyTextureObject_v5000_params;
+
+typedef struct cudaGetTextureObjectResourceDesc_v5000_params_st {
+    struct cudaResourceDesc *pResDesc;
+    cudaTextureObject_t texObject;
+} cudaGetTextureObjectResourceDesc_v5000_params;
+
+typedef struct cudaGetTextureObjectTextureDesc_v5000_params_st {
+    struct cudaTextureDesc *pTexDesc;
+    cudaTextureObject_t texObject;
+} cudaGetTextureObjectTextureDesc_v5000_params;
+
+typedef struct cudaGetTextureObjectResourceViewDesc_v5000_params_st {
+    struct cudaResourceViewDesc *pResViewDesc;
+    cudaTextureObject_t texObject;
+} cudaGetTextureObjectResourceViewDesc_v5000_params;
+
+typedef struct cudaCreateSurfaceObject_v5000_params_st {
+    cudaSurfaceObject_t *pSurfObject;
+    const struct cudaResourceDesc *pResDesc;
+} cudaCreateSurfaceObject_v5000_params;
+
+typedef struct cudaDestroySurfaceObject_v5000_params_st {
+    cudaSurfaceObject_t surfObject;
+} cudaDestroySurfaceObject_v5000_params;
+
+typedef struct cudaGetSurfaceObjectResourceDesc_v5000_params_st {
+    struct cudaResourceDesc *pResDesc;
+    cudaSurfaceObject_t surfObject;
+} cudaGetSurfaceObjectResourceDesc_v5000_params;
+
+typedef struct cudaDriverGetVersion_v3020_params_st {
+    int *driverVersion;
+} cudaDriverGetVersion_v3020_params;
+
+typedef struct cudaRuntimeGetVersion_v3020_params_st {
+    int *runtimeVersion;
+} cudaRuntimeGetVersion_v3020_params;
+
+typedef struct cudaGraphCreate_v10000_params_st {
+    cudaGraph_t *pGraph;
+    unsigned int flags;
+} cudaGraphCreate_v10000_params;
+
+typedef struct cudaGraphAddKernelNode_v10000_params_st {
+    cudaGraphNode_t *pGraphNode;
+    cudaGraph_t graph;
+    const cudaGraphNode_t *pDependencies;
+    size_t numDependencies;
+    const struct cudaKernelNodeParams *pNodeParams;
+} cudaGraphAddKernelNode_v10000_params;
+
+typedef struct cudaGraphKernelNodeGetParams_v10000_params_st {
+    cudaGraphNode_t node;
+    struct cudaKernelNodeParams *pNodeParams;
+} cudaGraphKernelNodeGetParams_v10000_params;
+
+typedef struct cudaGraphKernelNodeSetParams_v10000_params_st {
+    cudaGraphNode_t node;
+    const struct cudaKernelNodeParams *pNodeParams;
+} cudaGraphKernelNodeSetParams_v10000_params;
+
+typedef struct cudaGraphKernelNodeCopyAttributes_v11000_params_st {
+    cudaGraphNode_t hSrc;
+    cudaGraphNode_t hDst;
+} cudaGraphKernelNodeCopyAttributes_v11000_params;
+
+typedef struct cudaGraphKernelNodeGetAttribute_v11000_params_st {
+    cudaGraphNode_t hNode;
+    cudaKernelNodeAttrID attr;
+    cudaKernelNodeAttrValue *value_out;
+} cudaGraphKernelNodeGetAttribute_v11000_params;
+
+typedef struct cudaGraphKernelNodeSetAttribute_v11000_params_st {
+    cudaGraphNode_t hNode;
+    cudaKernelNodeAttrID attr;
+    const cudaKernelNodeAttrValue *value;
+} cudaGraphKernelNodeSetAttribute_v11000_params;
+
+typedef struct cudaGraphAddMemcpyNode_v10000_params_st {
+    cudaGraphNode_t *pGraphNode;
+    cudaGraph_t graph;
+    const cudaGraphNode_t *pDependencies;
+    size_t numDependencies;
+    const struct cudaMemcpy3DParms *pCopyParams;
+} cudaGraphAddMemcpyNode_v10000_params;
+
+typedef struct cudaGraphAddMemcpyNodeToSymbol_v11010_params_st {
+    cudaGraphNode_t *pGraphNode;
+    cudaGraph_t graph;
+    const cudaGraphNode_t *pDependencies;
+    size_t numDependencies;
+    const void *symbol;
+    const void *src;
+    size_t count;
+    size_t offset;
+    enum cudaMemcpyKind kind;
+} cudaGraphAddMemcpyNodeToSymbol_v11010_params;
+
+typedef struct cudaGraphAddMemcpyNodeFromSymbol_v11010_params_st {
+    cudaGraphNode_t *pGraphNode;
+    cudaGraph_t graph;
+    const cudaGraphNode_t *pDependencies;
+    size_t numDependencies;
+    void *dst;
+    const void *symbol;
+    size_t count;
+    size_t offset;
+    enum cudaMemcpyKind kind;
+} cudaGraphAddMemcpyNodeFromSymbol_v11010_params;
+
+typedef struct cudaGraphAddMemcpyNode1D_v11010_params_st {
+    cudaGraphNode_t *pGraphNode;
+    cudaGraph_t graph;
+    const cudaGraphNode_t *pDependencies;
+    size_t numDependencies;
+    void *dst;
+    const void *src;
+    size_t count;
+    enum cudaMemcpyKind kind;
+} cudaGraphAddMemcpyNode1D_v11010_params;
+
+typedef struct cudaGraphMemcpyNodeGetParams_v10000_params_st {
+    cudaGraphNode_t node;
+    struct cudaMemcpy3DParms *pNodeParams;
+} cudaGraphMemcpyNodeGetParams_v10000_params;
+
+typedef struct cudaGraphMemcpyNodeSetParams_v10000_params_st {
+    cudaGraphNode_t node;
+    const struct cudaMemcpy3DParms *pNodeParams;
+} cudaGraphMemcpyNodeSetParams_v10000_params;
+
+typedef struct cudaGraphMemcpyNodeSetParamsToSymbol_v11010_params_st {
+    cudaGraphNode_t node;
+    const void *symbol;
+    const void *src;
+    size_t count;
+    size_t offset;
+    enum cudaMemcpyKind kind;
+} cudaGraphMemcpyNodeSetParamsToSymbol_v11010_params;
+
+typedef struct cudaGraphMemcpyNodeSetParamsFromSymbol_v11010_params_st {
+    cudaGraphNode_t node;
+    void *dst;
+    const void *symbol;
+    size_t count;
+    size_t offset;
+    enum cudaMemcpyKind kind;
+} cudaGraphMemcpyNodeSetParamsFromSymbol_v11010_params;
+
+typedef struct cudaGraphMemcpyNodeSetParams1D_v11010_params_st {
+    cudaGraphNode_t node;
+    void *dst;
+    const void *src;
+    size_t count;
+    enum cudaMemcpyKind kind;
+} cudaGraphMemcpyNodeSetParams1D_v11010_params;
+
+typedef struct cudaGraphAddMemsetNode_v10000_params_st {
+    cudaGraphNode_t *pGraphNode;
+    cudaGraph_t graph;
+    const cudaGraphNode_t *pDependencies;
+    size_t numDependencies;
+    const struct cudaMemsetParams *pMemsetParams;
+} cudaGraphAddMemsetNode_v10000_params;
+
+typedef struct cudaGraphMemsetNodeGetParams_v10000_params_st {
+    cudaGraphNode_t node;
+    struct cudaMemsetParams *pNodeParams;
+} cudaGraphMemsetNodeGetParams_v10000_params;
+
+typedef struct cudaGraphMemsetNodeSetParams_v10000_params_st {
+    cudaGraphNode_t node;
+    const struct cudaMemsetParams *pNodeParams;
+} cudaGraphMemsetNodeSetParams_v10000_params;
+
+typedef struct cudaGraphAddHostNode_v10000_params_st {
+    cudaGraphNode_t *pGraphNode;
+    cudaGraph_t graph;
+    const cudaGraphNode_t *pDependencies;
+    size_t numDependencies;
+    const struct cudaHostNodeParams *pNodeParams;
+} cudaGraphAddHostNode_v10000_params;
+
+typedef struct cudaGraphHostNodeGetParams_v10000_params_st {
+    cudaGraphNode_t node;
+    struct cudaHostNodeParams *pNodeParams;
+} cudaGraphHostNodeGetParams_v10000_params;
+
+typedef struct cudaGraphHostNodeSetParams_v10000_params_st {
+    cudaGraphNode_t node;
+    const struct cudaHostNodeParams *pNodeParams;
+} cudaGraphHostNodeSetParams_v10000_params;
+
+typedef struct cudaGraphAddChildGraphNode_v10000_params_st {
+    cudaGraphNode_t *pGraphNode;
+    cudaGraph_t graph;
+    const cudaGraphNode_t *pDependencies;
+    size_t numDependencies;
+    cudaGraph_t childGraph;
+} cudaGraphAddChildGraphNode_v10000_params;
+
+typedef struct cudaGraphChildGraphNodeGetGraph_v10000_params_st {
+    cudaGraphNode_t node;
+    cudaGraph_t *pGraph;
+} cudaGraphChildGraphNodeGetGraph_v10000_params;
+
+typedef struct cudaGraphAddEmptyNode_v10000_params_st {
+    cudaGraphNode_t *pGraphNode;
+    cudaGraph_t graph;
+    const cudaGraphNode_t *pDependencies;
+    size_t numDependencies;
+} cudaGraphAddEmptyNode_v10000_params;
+
+typedef struct cudaGraphAddEventRecordNode_v11010_params_st {
+    cudaGraphNode_t *pGraphNode;
+    cudaGraph_t graph;
+    const cudaGraphNode_t *pDependencies;
+    size_t numDependencies;
+    cudaEvent_t event;
+} cudaGraphAddEventRecordNode_v11010_params;
+
+typedef struct cudaGraphEventRecordNodeGetEvent_v11010_params_st {
+    cudaGraphNode_t node;
+    cudaEvent_t *event_out;
+} cudaGraphEventRecordNodeGetEvent_v11010_params;
+
+typedef struct cudaGraphEventRecordNodeSetEvent_v11010_params_st {
+    cudaGraphNode_t node;
+    cudaEvent_t event;
+} cudaGraphEventRecordNodeSetEvent_v11010_params;
+
+typedef struct cudaGraphAddEventWaitNode_v11010_params_st {
+    cudaGraphNode_t *pGraphNode;
+    cudaGraph_t graph;
+    const cudaGraphNode_t *pDependencies;
+    size_t numDependencies;
+    cudaEvent_t event;
+} cudaGraphAddEventWaitNode_v11010_params;
+
+typedef struct cudaGraphEventWaitNodeGetEvent_v11010_params_st {
+    cudaGraphNode_t node;
+    cudaEvent_t *event_out;
+} cudaGraphEventWaitNodeGetEvent_v11010_params;
+
+typedef struct cudaGraphEventWaitNodeSetEvent_v11010_params_st {
+    cudaGraphNode_t node;
+    cudaEvent_t event;
+} cudaGraphEventWaitNodeSetEvent_v11010_params;
+
+typedef struct cudaGraphAddExternalSemaphoresSignalNode_v11020_params_st {
+    cudaGraphNode_t *pGraphNode;
+    cudaGraph_t graph;
+    const cudaGraphNode_t *pDependencies;
+    size_t numDependencies;
+    const struct cudaExternalSemaphoreSignalNodeParams *nodeParams;
+} cudaGraphAddExternalSemaphoresSignalNode_v11020_params;
+
+typedef struct cudaGraphExternalSemaphoresSignalNodeGetParams_v11020_params_st {
+    cudaGraphNode_t hNode;
+    struct cudaExternalSemaphoreSignalNodeParams *params_out;
+} cudaGraphExternalSemaphoresSignalNodeGetParams_v11020_params;
+
+typedef struct cudaGraphExternalSemaphoresSignalNodeSetParams_v11020_params_st {
+    cudaGraphNode_t hNode;
+    const struct cudaExternalSemaphoreSignalNodeParams *nodeParams;
+} cudaGraphExternalSemaphoresSignalNodeSetParams_v11020_params;
+
+typedef struct cudaGraphAddExternalSemaphoresWaitNode_v11020_params_st {
+    cudaGraphNode_t *pGraphNode;
+    cudaGraph_t graph;
+    const cudaGraphNode_t *pDependencies;
+    size_t numDependencies;
+    const struct cudaExternalSemaphoreWaitNodeParams *nodeParams;
+} cudaGraphAddExternalSemaphoresWaitNode_v11020_params;
+
+typedef struct cudaGraphExternalSemaphoresWaitNodeGetParams_v11020_params_st {
+    cudaGraphNode_t hNode;
+    struct cudaExternalSemaphoreWaitNodeParams *params_out;
+} cudaGraphExternalSemaphoresWaitNodeGetParams_v11020_params;
+
+typedef struct cudaGraphExternalSemaphoresWaitNodeSetParams_v11020_params_st {
+    cudaGraphNode_t hNode;
+    const struct cudaExternalSemaphoreWaitNodeParams *nodeParams;
+} cudaGraphExternalSemaphoresWaitNodeSetParams_v11020_params;
+
+typedef struct cudaGraphAddMemAllocNode_v11040_params_st {
+    cudaGraphNode_t *pGraphNode;
+    cudaGraph_t graph;
+    const cudaGraphNode_t *pDependencies;
+    size_t numDependencies;
+    struct cudaMemAllocNodeParams *nodeParams;
+} cudaGraphAddMemAllocNode_v11040_params;
+
+typedef struct cudaGraphMemAllocNodeGetParams_v11040_params_st {
+    cudaGraphNode_t node;
+    struct cudaMemAllocNodeParams *params_out;
+} cudaGraphMemAllocNodeGetParams_v11040_params;
+
+typedef struct cudaGraphAddMemFreeNode_v11040_params_st {
+    cudaGraphNode_t *pGraphNode;
+    cudaGraph_t graph;
+    const cudaGraphNode_t *pDependencies;
+    size_t numDependencies;
+    void *dptr;
+} cudaGraphAddMemFreeNode_v11040_params;
+
+typedef struct cudaGraphMemFreeNodeGetParams_v11040_params_st {
+    cudaGraphNode_t node;
+    void *dptr_out;
+} cudaGraphMemFreeNodeGetParams_v11040_params;
+
+typedef struct cudaDeviceGraphMemTrim_v11040_params_st {
+    int device;
+} cudaDeviceGraphMemTrim_v11040_params;
+
+typedef struct cudaDeviceGetGraphMemAttribute_v11040_params_st {
+    int device;
+    enum cudaGraphMemAttributeType attr;
+    void *value;
+} cudaDeviceGetGraphMemAttribute_v11040_params;
+
+typedef struct cudaDeviceSetGraphMemAttribute_v11040_params_st {
+    int device;
+    enum cudaGraphMemAttributeType attr;
+    void *value;
+} cudaDeviceSetGraphMemAttribute_v11040_params;
+
+typedef struct cudaGraphClone_v10000_params_st {
+    cudaGraph_t *pGraphClone;
+    cudaGraph_t originalGraph;
+} cudaGraphClone_v10000_params;
+
+typedef struct cudaGraphNodeFindInClone_v10000_params_st {
+    cudaGraphNode_t *pNode;
+    cudaGraphNode_t originalNode;
+    cudaGraph_t clonedGraph;
+} cudaGraphNodeFindInClone_v10000_params;
+
+typedef struct cudaGraphNodeGetType_v10000_params_st {
+    cudaGraphNode_t node;
+    enum cudaGraphNodeType *pType;
+} cudaGraphNodeGetType_v10000_params;
+
+typedef struct cudaGraphGetNodes_v10000_params_st {
+    cudaGraph_t graph;
+    cudaGraphNode_t *nodes;
+    size_t *numNodes;
+} cudaGraphGetNodes_v10000_params;
+
+typedef struct cudaGraphGetRootNodes_v10000_params_st {
+    cudaGraph_t graph;
+    cudaGraphNode_t *pRootNodes;
+    size_t *pNumRootNodes;
+} cudaGraphGetRootNodes_v10000_params;
+
+typedef struct cudaGraphGetEdges_v10000_params_st {
+    cudaGraph_t graph;
+    cudaGraphNode_t *from;
+    cudaGraphNode_t *to;
+    size_t *numEdges;
+} cudaGraphGetEdges_v10000_params;
+
+typedef struct cudaGraphGetEdges_v2_v12030_params_st {
+    cudaGraph_t graph;
+    cudaGraphNode_t *from;
+    cudaGraphNode_t *to;
+    cudaGraphEdgeData *edgeData;
+    size_t *numEdges;
+} cudaGraphGetEdges_v2_v12030_params;
+
+typedef struct cudaGraphNodeGetDependencies_v10000_params_st {
+    cudaGraphNode_t node;
+    cudaGraphNode_t *pDependencies;
+    size_t *pNumDependencies;
+} cudaGraphNodeGetDependencies_v10000_params;
+
+typedef struct cudaGraphNodeGetDependencies_v2_v12030_params_st {
+    cudaGraphNode_t node;
+    cudaGraphNode_t *pDependencies;
+    cudaGraphEdgeData *edgeData;
+    size_t *pNumDependencies;
+} cudaGraphNodeGetDependencies_v2_v12030_params;
+
+typedef struct cudaGraphNodeGetDependentNodes_v10000_params_st {
+    cudaGraphNode_t node;
+    cudaGraphNode_t *pDependentNodes;
+    size_t *pNumDependentNodes;
+} cudaGraphNodeGetDependentNodes_v10000_params;
+
+typedef struct cudaGraphNodeGetDependentNodes_v2_v12030_params_st {
+    cudaGraphNode_t node;
+    cudaGraphNode_t *pDependentNodes;
+    cudaGraphEdgeData *edgeData;
+    size_t *pNumDependentNodes;
+} cudaGraphNodeGetDependentNodes_v2_v12030_params;
+
+typedef struct cudaGraphAddDependencies_v10000_params_st {
+    cudaGraph_t graph;
+    const cudaGraphNode_t *from;
+    const cudaGraphNode_t *to;
+    size_t numDependencies;
+} cudaGraphAddDependencies_v10000_params;
+
+typedef struct cudaGraphAddDependencies_v2_v12030_params_st {
+    cudaGraph_t graph;
+    const cudaGraphNode_t *from;
+    const cudaGraphNode_t *to;
+    const cudaGraphEdgeData *edgeData;
+    size_t numDependencies;
+} cudaGraphAddDependencies_v2_v12030_params;
+
+typedef struct cudaGraphRemoveDependencies_v10000_params_st {
+    cudaGraph_t graph;
+    const cudaGraphNode_t *from;
+    const cudaGraphNode_t *to;
+    size_t numDependencies;
+} cudaGraphRemoveDependencies_v10000_params;
+
+typedef struct cudaGraphRemoveDependencies_v2_v12030_params_st {
+    cudaGraph_t graph;
+    const cudaGraphNode_t *from;
+    const cudaGraphNode_t *to;
+    const cudaGraphEdgeData *edgeData;
+    size_t numDependencies;
+} cudaGraphRemoveDependencies_v2_v12030_params;
+
+typedef struct cudaGraphDestroyNode_v10000_params_st {
+    cudaGraphNode_t node;
+} cudaGraphDestroyNode_v10000_params;
+
+typedef struct cudaGraphInstantiate_v12000_params_st {
+    cudaGraphExec_t *pGraphExec;
+    cudaGraph_t graph;
+    unsigned long long flags;
+} cudaGraphInstantiate_v12000_params;
+
+typedef struct cudaGraphInstantiateWithFlags_v11040_params_st {
+    cudaGraphExec_t *pGraphExec;
+    cudaGraph_t graph;
+    unsigned long long flags;
+} cudaGraphInstantiateWithFlags_v11040_params;
+
+typedef struct cudaGraphInstantiateWithParams_ptsz_v12000_params_st {
+    cudaGraphExec_t *pGraphExec;
+    cudaGraph_t graph;
+    cudaGraphInstantiateParams *instantiateParams;
+} cudaGraphInstantiateWithParams_ptsz_v12000_params;
+
+typedef struct cudaGraphExecGetFlags_v12000_params_st {
+    cudaGraphExec_t graphExec;
+    unsigned long long *flags;
+} cudaGraphExecGetFlags_v12000_params;
+
+typedef struct cudaGraphExecKernelNodeSetParams_v10010_params_st {
+    cudaGraphExec_t hGraphExec;
+    cudaGraphNode_t node;
+    const struct cudaKernelNodeParams *pNodeParams;
+} cudaGraphExecKernelNodeSetParams_v10010_params;
+
+typedef struct cudaGraphExecMemcpyNodeSetParams_v10020_params_st {
+    cudaGraphExec_t hGraphExec;
+    cudaGraphNode_t node;
+    const struct cudaMemcpy3DParms *pNodeParams;
+} cudaGraphExecMemcpyNodeSetParams_v10020_params;
+
+typedef struct cudaGraphExecMemcpyNodeSetParamsToSymbol_v11010_params_st {
+    cudaGraphExec_t hGraphExec;
+    cudaGraphNode_t node;
+    const void *symbol;
+    const void *src;
+    size_t count;
+    size_t offset;
+    enum cudaMemcpyKind kind;
+} cudaGraphExecMemcpyNodeSetParamsToSymbol_v11010_params;
+
+typedef struct cudaGraphExecMemcpyNodeSetParamsFromSymbol_v11010_params_st {
+    cudaGraphExec_t hGraphExec;
+    cudaGraphNode_t node;
+    void *dst;
+    const void *symbol;
+    size_t count;
+    size_t offset;
+    enum cudaMemcpyKind kind;
+} cudaGraphExecMemcpyNodeSetParamsFromSymbol_v11010_params;
+
+typedef struct cudaGraphExecMemcpyNodeSetParams1D_v11010_params_st {
+    cudaGraphExec_t hGraphExec;
+    cudaGraphNode_t node;
+    void *dst;
+    const void *src;
+    size_t count;
+    enum cudaMemcpyKind kind;
+} cudaGraphExecMemcpyNodeSetParams1D_v11010_params;
+
+typedef struct cudaGraphExecMemsetNodeSetParams_v10020_params_st {
+    cudaGraphExec_t hGraphExec;
+    cudaGraphNode_t node;
+    const struct cudaMemsetParams *pNodeParams;
+} cudaGraphExecMemsetNodeSetParams_v10020_params;
+
+typedef struct cudaGraphExecHostNodeSetParams_v10020_params_st {
+    cudaGraphExec_t hGraphExec;
+    cudaGraphNode_t node;
+    const struct cudaHostNodeParams *pNodeParams;
+} cudaGraphExecHostNodeSetParams_v10020_params;
+
+typedef struct cudaGraphExecChildGraphNodeSetParams_v11010_params_st {
+    cudaGraphExec_t hGraphExec;
+    cudaGraphNode_t node;
+    cudaGraph_t childGraph;
+} cudaGraphExecChildGraphNodeSetParams_v11010_params;
+
+typedef struct cudaGraphExecEventRecordNodeSetEvent_v11010_params_st {
+    cudaGraphExec_t hGraphExec;
+    cudaGraphNode_t hNode;
+    cudaEvent_t event;
+} cudaGraphExecEventRecordNodeSetEvent_v11010_params;
+
+typedef struct cudaGraphExecEventWaitNodeSetEvent_v11010_params_st {
+    cudaGraphExec_t hGraphExec;
+    cudaGraphNode_t hNode;
+    cudaEvent_t event;
+} cudaGraphExecEventWaitNodeSetEvent_v11010_params;
+
+typedef struct cudaGraphExecExternalSemaphoresSignalNodeSetParams_v11020_params_st {
+    cudaGraphExec_t hGraphExec;
+    cudaGraphNode_t hNode;
+    const struct cudaExternalSemaphoreSignalNodeParams *nodeParams;
+} cudaGraphExecExternalSemaphoresSignalNodeSetParams_v11020_params;
+
+typedef struct cudaGraphExecExternalSemaphoresWaitNodeSetParams_v11020_params_st {
+    cudaGraphExec_t hGraphExec;
+    cudaGraphNode_t hNode;
+    const struct cudaExternalSemaphoreWaitNodeParams *nodeParams;
+} cudaGraphExecExternalSemaphoresWaitNodeSetParams_v11020_params;
+
+typedef struct cudaGraphNodeSetEnabled_v11060_params_st {
+    cudaGraphExec_t hGraphExec;
+    cudaGraphNode_t hNode;
+    unsigned int isEnabled;
+} cudaGraphNodeSetEnabled_v11060_params;
+
+typedef struct cudaGraphNodeGetEnabled_v11060_params_st {
+    cudaGraphExec_t hGraphExec;
+    cudaGraphNode_t hNode;
+    unsigned int *isEnabled;
+} cudaGraphNodeGetEnabled_v11060_params;
+
+typedef struct cudaGraphExecUpdate_v10020_params_st {
+    cudaGraphExec_t hGraphExec;
+    cudaGraph_t hGraph;
+    cudaGraphExecUpdateResultInfo *resultInfo;
+} cudaGraphExecUpdate_v10020_params;
+
+typedef struct cudaGraphUpload_ptsz_v10000_params_st {
+    cudaGraphExec_t graphExec;
+    cudaStream_t stream;
+} cudaGraphUpload_ptsz_v10000_params;
+
+typedef struct cudaGraphLaunch_ptsz_v10000_params_st {
+    cudaGraphExec_t graphExec;
+    cudaStream_t stream;
+} cudaGraphLaunch_ptsz_v10000_params;
+
+typedef struct cudaGraphExecDestroy_v10000_params_st {
+    cudaGraphExec_t graphExec;
+} cudaGraphExecDestroy_v10000_params;
+
+typedef struct cudaGraphDestroy_v10000_params_st {
+    cudaGraph_t graph;
+} cudaGraphDestroy_v10000_params;
+
+typedef struct cudaGraphDebugDotPrint_v11030_params_st {
+    cudaGraph_t graph;
+    const char *path;
+    unsigned int flags;
+} cudaGraphDebugDotPrint_v11030_params;
+
+typedef struct cudaUserObjectCreate_v11030_params_st {
+    cudaUserObject_t *object_out;
+    void *ptr;
+    cudaHostFn_t destroy;
+    unsigned int initialRefcount;
+    unsigned int flags;
+} cudaUserObjectCreate_v11030_params;
+
+typedef struct cudaUserObjectRetain_v11030_params_st {
+    cudaUserObject_t object;
+    unsigned int count;
+} cudaUserObjectRetain_v11030_params;
+
+typedef struct cudaUserObjectRelease_v11030_params_st {
+    cudaUserObject_t object;
+    unsigned int count;
+} cudaUserObjectRelease_v11030_params;
+
+typedef struct cudaGraphRetainUserObject_v11030_params_st {
+    cudaGraph_t graph;
+    cudaUserObject_t object;
+    unsigned int count;
+    unsigned int flags;
+} cudaGraphRetainUserObject_v11030_params;
+
+typedef struct cudaGraphReleaseUserObject_v11030_params_st {
+    cudaGraph_t graph;
+    cudaUserObject_t object;
+    unsigned int count;
+} cudaGraphReleaseUserObject_v11030_params;
+
+typedef struct cudaGraphAddNode_v12020_params_st {
+    cudaGraphNode_t *pGraphNode;
+    cudaGraph_t graph;
+    const cudaGraphNode_t *pDependencies;
+    size_t numDependencies;
+    struct cudaGraphNodeParams *nodeParams;
+} cudaGraphAddNode_v12020_params;
+
+typedef struct cudaGraphAddNode_v2_v12030_params_st {
+    cudaGraphNode_t *pGraphNode;
+    cudaGraph_t graph;
+    const cudaGraphNode_t *pDependencies;
+    const cudaGraphEdgeData *dependencyData;
+    size_t numDependencies;
+    struct cudaGraphNodeParams *nodeParams;
+} cudaGraphAddNode_v2_v12030_params;
+
+typedef struct cudaGraphNodeSetParams_v12020_params_st {
+    cudaGraphNode_t node;
+    struct cudaGraphNodeParams *nodeParams;
+} cudaGraphNodeSetParams_v12020_params;
+
+typedef struct cudaGraphExecNodeSetParams_v12020_params_st {
+    cudaGraphExec_t graphExec;
+    cudaGraphNode_t node;
+    struct cudaGraphNodeParams *nodeParams;
+} cudaGraphExecNodeSetParams_v12020_params;
+
+typedef struct cudaGraphConditionalHandleCreate_v12030_params_st {
+    cudaGraphConditionalHandle *pHandle_out;
+    cudaGraph_t graph;
+    unsigned int defaultLaunchValue;
+    unsigned int flags;
+} cudaGraphConditionalHandleCreate_v12030_params;
+
+typedef struct cudaGetDriverEntryPoint_ptsz_v11030_params_st {
+    const char *symbol;
+    void **funcPtr;
+    unsigned long long flags;
+    enum cudaDriverEntryPointQueryResult *driverStatus;
+} cudaGetDriverEntryPoint_ptsz_v11030_params;
+
+typedef struct cudaGetFuncBySymbol_v11000_params_st {
+    cudaFunction_t *functionPtr;
+    const void *symbolPtr;
+} cudaGetFuncBySymbol_v11000_params;
+
+typedef struct cudaGetKernel_v12000_params_st {
+    cudaKernel_t *kernelPtr;
+    const void *entryFuncAddr;
+} cudaGetKernel_v12000_params;
+
+typedef struct cudaMemcpy_v3020_params_st {
+    void *dst;
+    const void *src;
+    size_t count;
+    enum cudaMemcpyKind kind;
+} cudaMemcpy_v3020_params;
+
+typedef struct cudaMemcpyToSymbol_v3020_params_st {
+    const void *symbol;
+    const void *src;
+    size_t count;
+    size_t offset;
+    enum cudaMemcpyKind kind;
+} cudaMemcpyToSymbol_v3020_params;
+
+typedef struct cudaMemcpyFromSymbol_v3020_params_st {
+    void *dst;
+    const void *symbol;
+    size_t count;
+    size_t offset;
+    enum cudaMemcpyKind kind;
+} cudaMemcpyFromSymbol_v3020_params;
+
+typedef struct cudaMemcpy2D_v3020_params_st {
+    void *dst;
+    size_t dpitch;
+    const void *src;
+    size_t spitch;
+    size_t width;
+    size_t height;
+    enum cudaMemcpyKind kind;
+} cudaMemcpy2D_v3020_params;
+
+typedef struct cudaMemcpyToArray_v3020_params_st {
+    cudaArray_t dst;
+    size_t wOffset;
+    size_t hOffset;
+    const void *src;
+    size_t count;
+    enum cudaMemcpyKind kind;
+} cudaMemcpyToArray_v3020_params;
+
+typedef struct cudaMemcpy2DToArray_v3020_params_st {
+    cudaArray_t dst;
+    size_t wOffset;
+    size_t hOffset;
+    const void *src;
+    size_t spitch;
+    size_t width;
+    size_t height;
+    enum cudaMemcpyKind kind;
+} cudaMemcpy2DToArray_v3020_params;
+
+typedef struct cudaMemcpyFromArray_v3020_params_st {
+    void *dst;
+    cudaArray_const_t src;
+    size_t wOffset;
+    size_t hOffset;
+    size_t count;
+    enum cudaMemcpyKind kind;
+} cudaMemcpyFromArray_v3020_params;
+
+typedef struct cudaMemcpy2DFromArray_v3020_params_st {
+    void *dst;
+    size_t dpitch;
+    cudaArray_const_t src;
+    size_t wOffset;
+    size_t hOffset;
+    size_t width;
+    size_t height;
+    enum cudaMemcpyKind kind;
+} cudaMemcpy2DFromArray_v3020_params;
+
+typedef struct cudaMemcpyArrayToArray_v3020_params_st {
+    cudaArray_t dst;
+    size_t wOffsetDst;
+    size_t hOffsetDst;
+    cudaArray_const_t src;
+    size_t wOffsetSrc;
+    size_t hOffsetSrc;
+    size_t count;
+    enum cudaMemcpyKind kind;
+} cudaMemcpyArrayToArray_v3020_params;
+
+typedef struct cudaMemcpy2DArrayToArray_v3020_params_st {
+    cudaArray_t dst;
+    size_t wOffsetDst;
+    size_t hOffsetDst;
+    cudaArray_const_t src;
+    size_t wOffsetSrc;
+    size_t hOffsetSrc;
+    size_t width;
+    size_t height;
+    enum cudaMemcpyKind kind;
+} cudaMemcpy2DArrayToArray_v3020_params;
+
+typedef struct cudaMemcpy3D_v3020_params_st {
+    const struct cudaMemcpy3DParms *p;
+} cudaMemcpy3D_v3020_params;
+
+typedef struct cudaMemcpy3DPeer_v4000_params_st {
+    const struct cudaMemcpy3DPeerParms *p;
+} cudaMemcpy3DPeer_v4000_params;
+
+typedef struct cudaMemset_v3020_params_st {
+    void *devPtr;
+    int value;
+    size_t count;
+} cudaMemset_v3020_params;
+
+typedef struct cudaMemset2D_v3020_params_st {
+    void *devPtr;
+    size_t pitch;
+    int value;
+    size_t width;
+    size_t height;
+} cudaMemset2D_v3020_params;
+
+typedef struct cudaMemset3D_v3020_params_st {
+    struct cudaPitchedPtr pitchedDevPtr;
+    int value;
+    struct cudaExtent extent;
+} cudaMemset3D_v3020_params;
+
+typedef struct cudaMemcpyAsync_v3020_params_st {
+    void *dst;
+    const void *src;
+    size_t count;
+    enum cudaMemcpyKind kind;
+    cudaStream_t stream;
+} cudaMemcpyAsync_v3020_params;
+
+typedef struct cudaMemcpyToSymbolAsync_v3020_params_st {
+    const void *symbol;
+    const void *src;
+    size_t count;
+    size_t offset;
+    enum cudaMemcpyKind kind;
+    cudaStream_t stream;
+} cudaMemcpyToSymbolAsync_v3020_params;
+
+typedef struct cudaMemcpyFromSymbolAsync_v3020_params_st {
+    void *dst;
+    const void *symbol;
+    size_t count;
+    size_t offset;
+    enum cudaMemcpyKind kind;
+    cudaStream_t stream;
+} cudaMemcpyFromSymbolAsync_v3020_params;
+
+typedef struct cudaMemcpy2DAsync_v3020_params_st {
+    void *dst;
+    size_t dpitch;
+    const void *src;
+    size_t spitch;
+    size_t width;
+    size_t height;
+    enum cudaMemcpyKind kind;
+    cudaStream_t stream;
+} cudaMemcpy2DAsync_v3020_params;
+
+typedef struct cudaMemcpyToArrayAsync_v3020_params_st {
+    cudaArray_t dst;
+    size_t wOffset;
+    size_t hOffset;
+    const void *src;
+    size_t count;
+    enum cudaMemcpyKind kind;
+    cudaStream_t stream;
+} cudaMemcpyToArrayAsync_v3020_params;
+
+typedef struct cudaMemcpy2DToArrayAsync_v3020_params_st {
+    cudaArray_t dst;
+    size_t wOffset;
+    size_t hOffset;
+    const void *src;
+    size_t spitch;
+    size_t width;
+    size_t height;
+    enum cudaMemcpyKind kind;
+    cudaStream_t stream;
+} cudaMemcpy2DToArrayAsync_v3020_params;
+
+typedef struct cudaMemcpyFromArrayAsync_v3020_params_st {
+    void *dst;
+    cudaArray_const_t src;
+    size_t wOffset;
+    size_t hOffset;
+    size_t count;
+    enum cudaMemcpyKind kind;
+    cudaStream_t stream;
+} cudaMemcpyFromArrayAsync_v3020_params;
+
+typedef struct cudaMemcpy2DFromArrayAsync_v3020_params_st {
+    void *dst;
+    size_t dpitch;
+    cudaArray_const_t src;
+    size_t wOffset;
+    size_t hOffset;
+    size_t width;
+    size_t height;
+    enum cudaMemcpyKind kind;
+    cudaStream_t stream;
+} cudaMemcpy2DFromArrayAsync_v3020_params;
+
+typedef struct cudaMemcpy3DAsync_v3020_params_st {
+    const struct cudaMemcpy3DParms *p;
+    cudaStream_t stream;
+} cudaMemcpy3DAsync_v3020_params;
+
+typedef struct cudaMemcpy3DPeerAsync_v4000_params_st {
+    const struct cudaMemcpy3DPeerParms *p;
+    cudaStream_t stream;
+} cudaMemcpy3DPeerAsync_v4000_params;
+
+typedef struct cudaMemsetAsync_v3020_params_st {
+    void *devPtr;
+    int value;
+    size_t count;
+    cudaStream_t stream;
+} cudaMemsetAsync_v3020_params;
+
+typedef struct cudaMemset2DAsync_v3020_params_st {
+    void *devPtr;
+    size_t pitch;
+    int value;
+    size_t width;
+    size_t height;
+    cudaStream_t stream;
+} cudaMemset2DAsync_v3020_params;
+
+typedef struct cudaMemset3DAsync_v3020_params_st {
+    struct cudaPitchedPtr pitchedDevPtr;
+    int value;
+    struct cudaExtent extent;
+    cudaStream_t stream;
+} cudaMemset3DAsync_v3020_params;
+
+typedef struct cudaStreamQuery_v3020_params_st {
+    cudaStream_t stream;
+} cudaStreamQuery_v3020_params;
+
+typedef struct cudaStreamGetFlags_v5050_params_st {
+    cudaStream_t hStream;
+    unsigned int *flags;
+} cudaStreamGetFlags_v5050_params;
+
+typedef struct cudaStreamGetId_v12000_params_st {
+    cudaStream_t hStream;
+    unsigned long long *streamId;
+} cudaStreamGetId_v12000_params;
+
+typedef struct cudaStreamGetPriority_v5050_params_st {
+    cudaStream_t hStream;
+    int *priority;
+} cudaStreamGetPriority_v5050_params;
+
+typedef struct cudaEventRecord_v3020_params_st {
+    cudaEvent_t event;
+    cudaStream_t stream;
+} cudaEventRecord_v3020_params;
+
+typedef struct cudaEventRecordWithFlags_v11010_params_st {
+    cudaEvent_t event;
+    cudaStream_t stream;
+    unsigned int flags;
+} cudaEventRecordWithFlags_v11010_params;
+
+typedef struct cudaStreamWaitEvent_v3020_params_st {
+    cudaStream_t stream;
+    cudaEvent_t event;
+    unsigned int flags;
+} cudaStreamWaitEvent_v3020_params;
+
+typedef struct cudaStreamAddCallback_v5000_params_st {
+    cudaStream_t stream;
+    cudaStreamCallback_t callback;
+    void *userData;
+    unsigned int flags;
+} cudaStreamAddCallback_v5000_params;
+
+typedef struct cudaStreamAttachMemAsync_v6000_params_st {
+    cudaStream_t stream;
+    void *devPtr;
+    size_t length;
+    unsigned int flags;
+} cudaStreamAttachMemAsync_v6000_params;
+
+typedef struct cudaStreamSynchronize_v3020_params_st {
+    cudaStream_t stream;
+} cudaStreamSynchronize_v3020_params;
+
+typedef struct cudaLaunchKernel_v7000_params_st {
+    const void *func;
+    dim3 gridDim;
+    dim3 blockDim;
+    void **args;
+    size_t sharedMem;
+    cudaStream_t stream;
+} cudaLaunchKernel_v7000_params;
+
+typedef struct cudaLaunchKernelExC_v11060_params_st {
+    const cudaLaunchConfig_t *config;
+    const void *func;
+    void **args;
+} cudaLaunchKernelExC_v11060_params;
+
+typedef struct cudaLaunchCooperativeKernel_v9000_params_st {
+    const void *func;
+    dim3 gridDim;
+    dim3 blockDim;
+    void **args;
+    size_t sharedMem;
+    cudaStream_t stream;
+} cudaLaunchCooperativeKernel_v9000_params;
+
+typedef struct cudaLaunchHostFunc_v10000_params_st {
+    cudaStream_t stream;
+    cudaHostFn_t fn;
+    void *userData;
+} cudaLaunchHostFunc_v10000_params;
+
+typedef struct cudaMemPrefetchAsync_v8000_params_st {
+    const void *devPtr;
+    size_t count;
+    int dstDevice;
+    cudaStream_t stream;
+} cudaMemPrefetchAsync_v8000_params;
+
+typedef struct cudaMemPrefetchAsync_v2_v12020_params_st {
+    const void *devPtr;
+    size_t count;
+    struct cudaMemLocation location;
+    unsigned int flags;
+    cudaStream_t stream;
+} cudaMemPrefetchAsync_v2_v12020_params;
+
+typedef struct cudaSignalExternalSemaphoresAsync_v10000_params_st {
+    const cudaExternalSemaphore_t *extSemArray;
+    const struct cudaExternalSemaphoreSignalParams_v1 *paramsArray;
+    unsigned int numExtSems;
+    cudaStream_t stream;
+} cudaSignalExternalSemaphoresAsync_v10000_params;
+
+typedef struct cudaSignalExternalSemaphoresAsync_ptsz_v10000_params_st {
+    const cudaExternalSemaphore_t *extSemArray;
+    const struct cudaExternalSemaphoreSignalParams_v1 *paramsArray;
+    unsigned int numExtSems;
+    cudaStream_t stream;
+} cudaSignalExternalSemaphoresAsync_ptsz_v10000_params;
+
+typedef struct cudaSignalExternalSemaphoresAsync_v2_v11020_params_st {
+    const cudaExternalSemaphore_t *extSemArray;
+    const struct cudaExternalSemaphoreSignalParams *paramsArray;
+    unsigned int numExtSems;
+    cudaStream_t stream;
+} cudaSignalExternalSemaphoresAsync_v2_v11020_params;
+
+typedef struct cudaWaitExternalSemaphoresAsync_v10000_params_st {
+    const cudaExternalSemaphore_t *extSemArray;
+    const struct cudaExternalSemaphoreWaitParams_v1 *paramsArray;
+    unsigned int numExtSems;
+    cudaStream_t stream;
+} cudaWaitExternalSemaphoresAsync_v10000_params;
+
+typedef struct cudaWaitExternalSemaphoresAsync_ptsz_v10000_params_st {
+    const cudaExternalSemaphore_t *extSemArray;
+    const struct cudaExternalSemaphoreWaitParams_v1 *paramsArray;
+    unsigned int numExtSems;
+    cudaStream_t stream;
+} cudaWaitExternalSemaphoresAsync_ptsz_v10000_params;
+
+typedef struct cudaWaitExternalSemaphoresAsync_v2_v11020_params_st {
+    const cudaExternalSemaphore_t *extSemArray;
+    const struct cudaExternalSemaphoreWaitParams *paramsArray;
+    unsigned int numExtSems;
+    cudaStream_t stream;
+} cudaWaitExternalSemaphoresAsync_v2_v11020_params;
+
+typedef struct cudaGraphInstantiateWithParams_v12000_params_st {
+    cudaGraphExec_t *pGraphExec;
+    cudaGraph_t graph;
+    cudaGraphInstantiateParams *instantiateParams;
+} cudaGraphInstantiateWithParams_v12000_params;
+
+typedef struct cudaGraphUpload_v10000_params_st {
+    cudaGraphExec_t graphExec;
+    cudaStream_t stream;
+} cudaGraphUpload_v10000_params;
+
+typedef struct cudaGraphLaunch_v10000_params_st {
+    cudaGraphExec_t graphExec;
+    cudaStream_t stream;
+} cudaGraphLaunch_v10000_params;
+
+typedef struct cudaStreamBeginCapture_v10000_params_st {
+    cudaStream_t stream;
+    enum cudaStreamCaptureMode mode;
+} cudaStreamBeginCapture_v10000_params;
+
+typedef struct cudaStreamBeginCaptureToGraph_v12030_params_st {
+    cudaStream_t stream;
+    cudaGraph_t graph;
+    const cudaGraphNode_t *dependencies;
+    const cudaGraphEdgeData *dependencyData;
+    size_t numDependencies;
+    enum cudaStreamCaptureMode mode;
+} cudaStreamBeginCaptureToGraph_v12030_params;
+
+typedef struct cudaStreamEndCapture_v10000_params_st {
+    cudaStream_t stream;
+    cudaGraph_t *pGraph;
+} cudaStreamEndCapture_v10000_params;
+
+typedef struct cudaStreamIsCapturing_v10000_params_st {
+    cudaStream_t stream;
+    enum cudaStreamCaptureStatus *pCaptureStatus;
+} cudaStreamIsCapturing_v10000_params;
+
+typedef struct cudaStreamGetCaptureInfo_v10010_params_st {
+    cudaStream_t stream;
+    enum cudaStreamCaptureStatus *captureStatus_out;
+    unsigned long long *id_out;
+} cudaStreamGetCaptureInfo_v10010_params;
+
+typedef struct cudaStreamGetCaptureInfo_ptsz_v10010_params_st {
+    cudaStream_t stream;
+    enum cudaStreamCaptureStatus *captureStatus_out;
+    unsigned long long *id_out;
+} cudaStreamGetCaptureInfo_ptsz_v10010_params;
+
+typedef struct cudaStreamGetCaptureInfo_v2_v11030_params_st {
+    cudaStream_t stream;
+    enum cudaStreamCaptureStatus *captureStatus_out;
+    unsigned long long *id_out;
+    cudaGraph_t *graph_out;
+    const cudaGraphNode_t **dependencies_out;
+    size_t *numDependencies_out;
+} cudaStreamGetCaptureInfo_v2_v11030_params;
+
+typedef struct cudaStreamGetCaptureInfo_v3_v12030_params_st {
+    cudaStream_t stream;
+    enum cudaStreamCaptureStatus *captureStatus_out;
+    unsigned long long *id_out;
+    cudaGraph_t *graph_out;
+    const cudaGraphNode_t **dependencies_out;
+    const cudaGraphEdgeData **edgeData_out;
+    size_t *numDependencies_out;
+} cudaStreamGetCaptureInfo_v3_v12030_params;
+
+typedef struct cudaStreamUpdateCaptureDependencies_v11030_params_st {
+    cudaStream_t stream;
+    cudaGraphNode_t *dependencies;
+    size_t numDependencies;
+    unsigned int flags;
+} cudaStreamUpdateCaptureDependencies_v11030_params;
+
+typedef struct cudaStreamUpdateCaptureDependencies_v2_v12030_params_st {
+    cudaStream_t stream;
+    cudaGraphNode_t *dependencies;
+    const cudaGraphEdgeData *dependencyData;
+    size_t numDependencies;
+    unsigned int flags;
+} cudaStreamUpdateCaptureDependencies_v2_v12030_params;
+
+typedef struct cudaStreamCopyAttributes_v11000_params_st {
+    cudaStream_t dstStream;
+    cudaStream_t srcStream;
+} cudaStreamCopyAttributes_v11000_params;
+
+typedef struct cudaStreamGetAttribute_v11000_params_st {
+    cudaStream_t stream;
+    cudaStreamAttrID attr;
+    cudaStreamAttrValue *value;
+} cudaStreamGetAttribute_v11000_params;
+
+typedef struct cudaStreamSetAttribute_v11000_params_st {
+    cudaStream_t stream;
+    cudaStreamAttrID attr;
+    const cudaStreamAttrValue *param;
+} cudaStreamSetAttribute_v11000_params;
+
+typedef struct cudaMallocAsync_v11020_params_st {
+    void **devPtr;
+    size_t size;
+    cudaStream_t hStream;
+} cudaMallocAsync_v11020_params;
+
+typedef struct cudaFreeAsync_v11020_params_st {
+    void *devPtr;
+    cudaStream_t hStream;
+} cudaFreeAsync_v11020_params;
+
+typedef struct cudaMallocFromPoolAsync_v11020_params_st {
+    void **ptr;
+    size_t size;
+    cudaMemPool_t memPool;
+    cudaStream_t stream;
+} cudaMallocFromPoolAsync_v11020_params;
+
+typedef struct cudaGetDriverEntryPoint_v11030_params_st {
+    const char *symbol;
+    void **funcPtr;
+    unsigned long long flags;
+    enum cudaDriverEntryPointQueryResult *driverStatus;
+} cudaGetDriverEntryPoint_v11030_params;
+
+typedef struct cudaGetDeviceProperties_v3020_params_st {
+    struct cudaDeviceProp *prop;
+    int device;
+} cudaGetDeviceProperties_v3020_params;
+
+// Parameter trace structures for removed functions
+
+
+// End of parameter trace structures
diff --git a/.venv/lib/python3.11/site-packages/nvidia/cuda_cupti/include/generated_cuda_vdpau_interop_meta.h b/.venv/lib/python3.11/site-packages/nvidia/cuda_cupti/include/generated_cuda_vdpau_interop_meta.h
new file mode 100644
index 0000000000000000000000000000000000000000..88e79d1957925c4bbacd381e9461d5072de88f24
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/nvidia/cuda_cupti/include/generated_cuda_vdpau_interop_meta.h
@@ -0,0 +1,38 @@
+// This file is generated.  Any changes you make will be lost during the next clean build.
+
+// CUDA public interface, for type definitions and api function prototypes
+#include "cuda_vdpau_interop.h"
+
+// *************************************************************************
+//      Definitions of structs to hold parameters for each function
+// *************************************************************************
+
+// Currently used parameter trace structures
+typedef struct cudaVDPAUGetDevice_v3020_params_st {
+    int *device;
+    VdpDevice vdpDevice;
+    VdpGetProcAddress *vdpGetProcAddress;
+} cudaVDPAUGetDevice_v3020_params;
+
+typedef struct cudaVDPAUSetVDPAUDevice_v3020_params_st {
+    int device;
+    VdpDevice vdpDevice;
+    VdpGetProcAddress *vdpGetProcAddress;
+} cudaVDPAUSetVDPAUDevice_v3020_params;
+
+typedef struct cudaGraphicsVDPAURegisterVideoSurface_v3020_params_st {
+    struct cudaGraphicsResource **resource;
+    VdpVideoSurface vdpSurface;
+    unsigned int flags;
+} cudaGraphicsVDPAURegisterVideoSurface_v3020_params;
+
+typedef struct cudaGraphicsVDPAURegisterOutputSurface_v3020_params_st {
+    struct cudaGraphicsResource **resource;
+    VdpOutputSurface vdpSurface;
+    unsigned int flags;
+} cudaGraphicsVDPAURegisterOutputSurface_v3020_params;
+
+// Parameter trace structures for removed functions
+
+
+// End of parameter trace structures
diff --git a/.venv/lib/python3.11/site-packages/nvidia/cuda_cupti/include/generated_cudart_removed_meta.h b/.venv/lib/python3.11/site-packages/nvidia/cuda_cupti/include/generated_cudart_removed_meta.h
new file mode 100644
index 0000000000000000000000000000000000000000..a0fc27a71bb3fc883db9fe7562eea3f28145430d
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/nvidia/cuda_cupti/include/generated_cudart_removed_meta.h
@@ -0,0 +1,162 @@
+// This file is generated.  Any changes you make will be lost during the next clean build.
+
+// CUDA public interface, for type definitions and api function prototypes
+#include "cudart_removed.h"
+
+// *************************************************************************
+//      Definitions of structs to hold parameters for each function
+// *************************************************************************
+
+// Currently used parameter trace structures
+typedef struct cudaStreamDestroy_v3020_params_st {
+    cudaStream_t stream;
+} cudaStreamDestroy_v3020_params;
+
+typedef struct cudaOccupancyMaxActiveBlocksPerMultiprocessor_v6000_params_st {
+    int *numBlocks;
+    const void *func;
+    size_t numDynamicSmemBytes;
+} cudaOccupancyMaxActiveBlocksPerMultiprocessor_v6000_params;
+
+typedef struct cudaConfigureCall_v3020_params_st {
+    dim3 gridDim;
+    dim3 blockDim;
+    size_t sharedMem  __dv;
+    cudaStream_t stream  __dv;
+} cudaConfigureCall_v3020_params;
+
+typedef struct cudaSetupArgument_v3020_params_st {
+    const void *arg;
+    size_t size;
+    size_t offset;
+} cudaSetupArgument_v3020_params;
+
+typedef struct cudaLaunch_v3020_params_st {
+    const void *func;
+} cudaLaunch_v3020_params;
+
+typedef struct cudaLaunch_ptsz_v7000_params_st {
+    const void *func;
+} cudaLaunch_ptsz_v7000_params;
+
+typedef struct cudaStreamSetFlags_v10200_params_st {
+    cudaStream_t hStream;
+    unsigned int flags;
+} cudaStreamSetFlags_v10200_params;
+
+typedef struct cudaStreamSetFlags_ptsz_v10200_params_st {
+    cudaStream_t hStream;
+    unsigned int flags;
+} cudaStreamSetFlags_ptsz_v10200_params;
+
+typedef struct cudaProfilerInitialize_v4000_params_st {
+    const char *configFile;
+    const char *outputFile;
+    cudaOutputMode_t outputMode;
+} cudaProfilerInitialize_v4000_params;
+
+typedef struct cudaThreadSetLimit_v3020_params_st {
+    enum cudaLimit limit;
+    size_t value;
+} cudaThreadSetLimit_v3020_params;
+
+typedef struct cudaThreadGetLimit_v3020_params_st {
+    size_t *pValue;
+    enum cudaLimit limit;
+} cudaThreadGetLimit_v3020_params;
+
+typedef struct cudaThreadGetCacheConfig_v3020_params_st {
+    enum cudaFuncCache *pCacheConfig;
+} cudaThreadGetCacheConfig_v3020_params;
+
+typedef struct cudaThreadSetCacheConfig_v3020_params_st {
+    enum cudaFuncCache cacheConfig;
+} cudaThreadSetCacheConfig_v3020_params;
+
+typedef struct cudaSetDoubleForDevice_v3020_params_st {
+    double *d;
+} cudaSetDoubleForDevice_v3020_params;
+
+typedef struct cudaSetDoubleForHost_v3020_params_st {
+    double *d;
+} cudaSetDoubleForHost_v3020_params;
+
+typedef struct cudaCreateTextureObject_v2_v11080_params_st {
+    cudaTextureObject_t *pTexObject;
+    const struct cudaResourceDesc *pResDesc;
+    const struct cudaTextureDesc *pTexDesc;
+    const struct cudaResourceViewDesc *pResViewDesc;
+} cudaCreateTextureObject_v2_v11080_params;
+
+typedef struct cudaGetTextureObjectTextureDesc_v2_v11080_params_st {
+    struct cudaTextureDesc *pTexDesc;
+    cudaTextureObject_t texObject;
+} cudaGetTextureObjectTextureDesc_v2_v11080_params;
+
+typedef struct cudaBindTexture_v3020_params_st {
+    size_t *offset;
+    const struct textureReference *texref;
+    const void *devPtr;
+    const struct cudaChannelFormatDesc *desc;
+    size_t size  __dv;
+} cudaBindTexture_v3020_params;
+
+typedef struct cudaBindTexture2D_v3020_params_st {
+    size_t *offset;
+    const struct textureReference *texref;
+    const void *devPtr;
+    const struct cudaChannelFormatDesc *desc;
+    size_t width;
+    size_t height;
+    size_t pitch;
+} cudaBindTexture2D_v3020_params;
+
+typedef struct cudaBindTextureToArray_v3020_params_st {
+    const struct textureReference *texref;
+    cudaArray_const_t array;
+    const struct cudaChannelFormatDesc *desc;
+} cudaBindTextureToArray_v3020_params;
+
+typedef struct cudaBindTextureToMipmappedArray_v5000_params_st {
+    const struct textureReference *texref;
+    cudaMipmappedArray_const_t mipmappedArray;
+    const struct cudaChannelFormatDesc *desc;
+} cudaBindTextureToMipmappedArray_v5000_params;
+
+typedef struct cudaUnbindTexture_v3020_params_st {
+    const struct textureReference *texref;
+} cudaUnbindTexture_v3020_params;
+
+typedef struct cudaGetTextureAlignmentOffset_v3020_params_st {
+    size_t *offset;
+    const struct textureReference *texref;
+} cudaGetTextureAlignmentOffset_v3020_params;
+
+typedef struct cudaGetTextureReference_v3020_params_st {
+    const struct textureReference **texref;
+    const void *symbol;
+} cudaGetTextureReference_v3020_params;
+
+typedef struct cudaBindSurfaceToArray_v3020_params_st {
+    const struct surfaceReference *surfref;
+    cudaArray_const_t array;
+    const struct cudaChannelFormatDesc *desc;
+} cudaBindSurfaceToArray_v3020_params;
+
+typedef struct cudaGetSurfaceReference_v3020_params_st {
+    const struct surfaceReference **surfref;
+    const void *symbol;
+} cudaGetSurfaceReference_v3020_params;
+
+typedef struct cudaGraphInstantiate_v10000_params_st {
+    cudaGraphExec_t *pGraphExec;
+    cudaGraph_t graph;
+    cudaGraphNode_t *pErrorNode;
+    char *pLogBuffer;
+    size_t bufferSize;
+} cudaGraphInstantiate_v10000_params;
+
+// Parameter trace structures for removed functions
+
+
+// End of parameter trace structures
diff --git a/.venv/lib/python3.11/site-packages/nvidia/cuda_cupti/include/generated_nvtx_meta.h b/.venv/lib/python3.11/site-packages/nvidia/cuda_cupti/include/generated_nvtx_meta.h
new file mode 100644
index 0000000000000000000000000000000000000000..ed8877e21f0651fe1564151090850694eb495cfb
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/nvidia/cuda_cupti/include/generated_nvtx_meta.h
@@ -0,0 +1,247 @@
+/*
+ * Copyright 2013-2018 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#if defined(__GNUC__) && defined(CUPTI_LIB)
+    #pragma GCC visibility push(default)
+#endif
+
+// *************************************************************************
+//      Definitions of structs to hold parameters for each function
+// *************************************************************************
+
+typedef struct nvtxMarkEx_params_st {
+  const nvtxEventAttributes_t* eventAttrib;
+} nvtxMarkEx_params;
+
+typedef struct nvtxMarkA_params_st {
+  const char* message;
+} nvtxMarkA_params;
+
+typedef struct nvtxMarkW_params_st {
+  const wchar_t* message;
+} nvtxMarkW_params;
+
+typedef struct nvtxRangeStartEx_params_st {
+  const nvtxEventAttributes_t* eventAttrib;
+} nvtxRangeStartEx_params;
+
+typedef struct nvtxRangeStartA_params_st {
+  const char* message;
+} nvtxRangeStartA_params;
+
+typedef struct nvtxRangeStartW_params_st {
+  const wchar_t* message;
+} nvtxRangeStartW_params;
+
+typedef struct nvtxRangeEnd_params_st {
+  nvtxRangeId_t id;
+} nvtxRangeEnd_params;
+
+typedef struct nvtxRangePushEx_params_st {
+  const nvtxEventAttributes_t* eventAttrib;
+} nvtxRangePushEx_params;
+
+typedef struct nvtxRangePushA_params_st {
+  const char* message;
+} nvtxRangePushA_params;
+
+typedef struct nvtxRangePushW_params_st {
+  const wchar_t* message;
+} nvtxRangePushW_params;
+
+typedef struct nvtxRangePop_params_st {
+  /* WAR: Windows compiler doesn't allow empty structs */
+  /* This field shouldn't be used */
+  void *dummy;
+} nvtxRangePop_params;
+
+typedef struct nvtxNameCategoryA_params_st {
+  uint32_t category;
+  const char* name;
+} nvtxNameCategoryA_params;
+
+typedef struct nvtxNameCategoryW_params_st {
+  uint32_t category;
+  const wchar_t* name;
+} nvtxNameCategoryW_params;
+
+typedef struct nvtxNameOsThreadA_params_st {
+  uint32_t threadId;
+  const char* name;
+} nvtxNameOsThreadA_params;
+
+typedef struct nvtxNameOsThreadW_params_st {
+  uint32_t threadId;
+  const wchar_t* name;
+} nvtxNameOsThreadW_params;
+
+typedef struct nvtxNameCuDeviceA_params_st {
+  CUdevice device;
+  const char* name;
+} nvtxNameCuDeviceA_params;
+
+typedef struct nvtxNameCuDeviceW_params_st {
+  CUdevice device;
+  const wchar_t* name;
+} nvtxNameCuDeviceW_params;
+
+typedef struct nvtxNameCuContextA_params_st {
+  CUcontext context;
+  const char* name;
+} nvtxNameCuContextA_params;
+
+typedef struct nvtxNameCuContextW_params_st {
+  CUcontext context;
+  const wchar_t* name;
+} nvtxNameCuContextW_params;
+
+typedef struct nvtxNameCuStreamA_params_st {
+  CUstream stream;
+  const char* name;
+} nvtxNameCuStreamA_params;
+
+typedef struct nvtxNameCuStreamW_params_st {
+  CUstream stream;
+  const wchar_t* name;
+} nvtxNameCuStreamW_params;
+
+typedef struct nvtxNameCuEventA_params_st {
+  CUevent event;
+  const char* name;
+} nvtxNameCuEventA_params;
+
+typedef struct nvtxNameCuEventW_params_st {
+  CUevent event;
+  const wchar_t* name;
+} nvtxNameCuEventW_params;
+
+typedef struct nvtxNameCudaDeviceA_params_st {
+  int device;
+  const char* name;
+} nvtxNameCudaDeviceA_params;
+
+typedef struct nvtxNameCudaDeviceW_params_st {
+  int device;
+  const wchar_t* name;
+} nvtxNameCudaDeviceW_params;
+
+typedef struct nvtxNameCudaStreamA_params_st {
+  cudaStream_t stream;
+  const char* name;
+} nvtxNameCudaStreamA_params;
+
+typedef struct nvtxNameCudaStreamW_params_st {
+  cudaStream_t stream;
+  const wchar_t* name;
+} nvtxNameCudaStreamW_params;
+
+typedef struct nvtxNameCudaEventA_params_st {
+  cudaEvent_t event;
+  const char* name;
+} nvtxNameCudaEventA_params;
+
+typedef struct nvtxNameCudaEventW_params_st {
+  cudaEvent_t event;
+  const wchar_t* name;
+} nvtxNameCudaEventW_params;
+
+typedef struct nvtxDomainCreateA_params_st {
+  const char* name;
+} nvtxDomainCreateA_params;
+
+typedef struct nvtxDomainDestroy_params_st {
+  nvtxDomainHandle_t domain;
+} nvtxDomainDestroy_params;
+
+typedef struct nvtxDomainMarkEx_params_st {
+  nvtxDomainHandle_t domain;
+  nvtxMarkEx_params core;
+} nvtxDomainMarkEx_params;
+
+typedef struct nvtxDomainRangeStartEx_params_st {
+  nvtxDomainHandle_t domain;
+  nvtxRangeStartEx_params core;
+} nvtxDomainRangeStartEx_params;
+
+typedef struct nvtxDomainRangeEnd_params_st {
+  nvtxDomainHandle_t domain;
+  nvtxRangeEnd_params core;
+} nvtxDomainRangeEnd_params;
+
+typedef struct nvtxDomainRangePushEx_params_st {
+  nvtxDomainHandle_t domain;
+  nvtxRangePushEx_params core;
+} nvtxDomainRangePushEx_params;
+
+typedef struct nvtxDomainRangePop_params_st {
+  nvtxDomainHandle_t domain;
+} nvtxDomainRangePop_params;
+
+typedef struct nvtxSyncUserCreate_params_st {
+  nvtxDomainHandle_t domain;
+  const nvtxSyncUserAttributes_t* attribs;
+} nvtxSyncUserCreate_params;
+
+typedef struct nvtxSyncUserCommon_params_st {
+  nvtxSyncUser_t handle;
+} nvtxSyncUserCommon_params;
+
+typedef struct nvtxDomainRegisterStringA_params_st {
+    nvtxDomainHandle_t domain;
+    const char* string;
+} nvtxDomainRegisterStringA_params;
+
+typedef struct nvtxDomainRegisterStringW_params_st {
+    nvtxDomainHandle_t domain;
+    const char* string;
+} nvtxDomainRegisterStringW_params;
+
+#if defined(__GNUC__) && defined(CUPTI_LIB)
+    #pragma GCC visibility pop
+#endif
diff --git a/.venv/lib/python3.11/site-packages/nvidia/cuda_cupti/include/nvperf_common.h b/.venv/lib/python3.11/site-packages/nvidia/cuda_cupti/include/nvperf_common.h
new file mode 100644
index 0000000000000000000000000000000000000000..731d6ad38d663ee8bc0c59d8c87ddaf9b105cec3
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/nvidia/cuda_cupti/include/nvperf_common.h
@@ -0,0 +1,393 @@
+#ifndef NVPERF_COMMON_H
+#define NVPERF_COMMON_H
+
+/*
+ * Copyright 2014-2023 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO USER:
+ *
+ * This source code is subject to NVIDIA ownership rights under U.S. and
+ * international Copyright laws.
+ *
+ * This software and the information contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and conditions
+ * of a form of NVIDIA software license agreement.
+ *
+ * NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE
+ * CODE FOR ANY PURPOSE.  IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR
+ * IMPLIED WARRANTY OF ANY KIND.  NVIDIA DISCLAIMS ALL WARRANTIES WITH
+ * REGARD TO THIS SOURCE CODE, INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL,
+ * OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS
+ * OF USE, DATA OR PROFITS,  WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
+ * OR OTHER TORTIOUS ACTION,  ARISING OUT OF OR IN CONNECTION WITH THE USE
+ * OR PERFORMANCE OF THIS SOURCE CODE.
+ *
+ * U.S. Government End Users.   This source code is a "commercial item" as
+ * that term is defined at  48 C.F.R. 2.101 (OCT 1995), consisting  of
+ * "commercial computer  software"  and "commercial computer software
+ * documentation" as such terms are  used in 48 C.F.R. 12.212 (SEPT 1995)
+ * and is provided to the U.S. Government only as a commercial end item.
+ * Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through
+ * 227.7202-4 (JUNE 1995), all U.S. Government End Users acquire the
+ * source code with only those rights set forth herein.
+ *
+ * Any use of this source code in individual and commercial software must
+ * include, in the user documentation and internal comments to the code,
+ * the above Disclaimer and U.S. Government End Users Notice.
+ */
+
+#include <stddef.h>
+#include <stdint.h>
+
+#if defined(__GNUC__) && defined(NVPA_SHARED_LIB) // default visibility from here until the matching `visibility pop` at the end of this header
+    #pragma GCC visibility push(default)
+    #if !defined(NVPW_LOCAL) // keep an existing definition of NVPW_LOCAL if there is one
+        #define NVPW_LOCAL __attribute__ ((visibility ("hidden")))
+    #endif
+#else // any other configuration: NVPW_LOCAL expands to nothing
+    #if !defined(NVPW_LOCAL)
+        #define NVPW_LOCAL
+    #endif
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/**
+ *  @file   nvperf_common.h
+ */
+
+#ifndef NVPERF_NVPA_STATUS_DEFINED
+#define NVPERF_NVPA_STATUS_DEFINED
+
+    /// Error codes.
+    typedef enum NVPA_Status
+    {
+        /// Success
+        NVPA_STATUS_SUCCESS = 0,
+        /// Generic error.
+        NVPA_STATUS_ERROR = 1,
+        /// Internal error.  Please file a bug!
+        NVPA_STATUS_INTERNAL_ERROR = 2,
+        /// NVPW_InitializeTarget() or NVPW_InitializeHost() has not been called yet.
+        NVPA_STATUS_NOT_INITIALIZED = 3,
+        /// The NvPerf DLL/DSO could not be loaded during NVPW_Initialize*(). Please ensure they are placed in the
+        /// appropriate location that can be found by a dynamic linker. And on Linux systems, confirm that the
+        /// LD_LIBRARY_PATH environment variable is set correctly. Alternatively, you may utilize
+        /// NVPW_SetLibraryLoadPaths() to define additional library search paths.
+        NVPA_STATUS_NOT_LOADED = 4,
+        /// The function was not found in this version of the NvPerf DLL/DSO. Or if you are directly calling
+        /// NVPA_GetProcAddress(), please ensure the function name is spelled correctly.
+        NVPA_STATUS_FUNCTION_NOT_FOUND = 5,
+        /// The request was intentionally not supported.
+        NVPA_STATUS_NOT_SUPPORTED = 6,
+        /// The request was not implemented by this version.
+        NVPA_STATUS_NOT_IMPLEMENTED = 7,
+        /// Invalid argument.
+        NVPA_STATUS_INVALID_ARGUMENT = 8,
+        /// UNUSED
+        NVPA_STATUS_INVALID_METRIC_ID = 9,
+        /// No driver has been loaded via NVPW_*_LoadDriver().
+        NVPA_STATUS_DRIVER_NOT_LOADED = 10,
+        /// Failed memory allocation.
+        NVPA_STATUS_OUT_OF_MEMORY = 11,
+        /// UNUSED
+        NVPA_STATUS_INVALID_THREAD_STATE = 12,
+        /// UNUSED
+        NVPA_STATUS_FAILED_CONTEXT_ALLOC = 13,
+        /// The specified GPU is not supported. It is recommended to call IsGpuSupported() for more information.
+        NVPA_STATUS_UNSUPPORTED_GPU = 14,
+        /// The installed NVIDIA driver is too old.
+        NVPA_STATUS_INSUFFICIENT_DRIVER_VERSION = 15,
+        /// UNUSED
+        NVPA_STATUS_OBJECT_NOT_REGISTERED = 16,
+        /// Profiling permission not granted; see https://developer.nvidia.com/nvidia-development-tools-solutions-
+        /// ERR_NVGPUCTRPERM-permission-issue-performance-counters
+        NVPA_STATUS_INSUFFICIENT_PRIVILEGE = 17,
+        /// UNUSED
+        NVPA_STATUS_INVALID_CONTEXT_STATE = 18,
+        /// UNUSED
+        NVPA_STATUS_INVALID_OBJECT_STATE = 19,
+        /// The request could not be fulfilled because a system resource is already in use.
+        NVPA_STATUS_RESOURCE_UNAVAILABLE = 20,
+        /// UNUSED
+        NVPA_STATUS_DRIVER_LOADED_TOO_LATE = 21,
+        /// The provided buffer is not large enough.
+        NVPA_STATUS_INSUFFICIENT_SPACE = 22,
+        /// UNUSED
+        NVPA_STATUS_OBJECT_MISMATCH = 23,
+        /// Virtualized GPU (vGPU) is not supported.
+        NVPA_STATUS_VIRTUALIZED_DEVICE_NOT_SUPPORTED = 24,
+        /// Profiling permission was not granted or the device was disabled.
+        NVPA_STATUS_PROFILING_NOT_ALLOWED = 25,
+        NVPA_STATUS__COUNT // implicit value 26 = number of status codes above; reported as unrecognized by NVPW_NVPAStatusToString()
+    } NVPA_Status;
+
+
+    static inline void NVPW_NVPAStatusToString(NVPA_Status status, const char** ppStatusStr, const char** ppCommentStr)
+    {   // `static`: in C99 a plain `inline` definition provides no external definition, so a non-inlined call from C could fail to link.
+        switch (status) // Stores the enumerator name in *ppStatusStr and its description in *ppCommentStr; both pointers must be non-NULL.
+        {
+            case NVPA_STATUS_SUCCESS:
+                *ppStatusStr = "NVPA_STATUS_SUCCESS";
+                *ppCommentStr = "Success";
+                return;
+            case NVPA_STATUS_ERROR:
+                *ppStatusStr = "NVPA_STATUS_ERROR";
+                *ppCommentStr = "Generic error.";
+                return;
+            case NVPA_STATUS_INTERNAL_ERROR:
+                *ppStatusStr = "NVPA_STATUS_INTERNAL_ERROR";
+                *ppCommentStr = "Internal error.  Please file a bug!";
+                return;
+            case NVPA_STATUS_NOT_INITIALIZED:
+                *ppStatusStr = "NVPA_STATUS_NOT_INITIALIZED";
+                *ppCommentStr = "NVPW_InitializeTarget() or NVPW_InitializeHost() has not been called yet.";
+                return;
+            case NVPA_STATUS_NOT_LOADED:
+                *ppStatusStr = "NVPA_STATUS_NOT_LOADED";
+                *ppCommentStr = "The NvPerf DLL/DSO could not be loaded during NVPW_Initialize*(). Please ensure they are placed in the appropriate location that can be found by a dynamic linker. And on Linux systems, confirm that the LD_LIBRARY_PATH environment variable is set correctly. Alternatively, you may utilize NVPW_SetLibraryLoadPaths() to define additional library search paths.";
+                return;
+            case NVPA_STATUS_FUNCTION_NOT_FOUND:
+                *ppStatusStr = "NVPA_STATUS_FUNCTION_NOT_FOUND";
+                *ppCommentStr = "The function was not found in this version of the NvPerf DLL/DSO. Or if you are directly calling NVPA_GetProcAddress(), please ensure the function name is spelled correctly.";
+                return;
+            case NVPA_STATUS_NOT_SUPPORTED:
+                *ppStatusStr = "NVPA_STATUS_NOT_SUPPORTED";
+                *ppCommentStr = "The request was intentionally not supported.";
+                return;
+            case NVPA_STATUS_NOT_IMPLEMENTED:
+                *ppStatusStr = "NVPA_STATUS_NOT_IMPLEMENTED";
+                *ppCommentStr = "The request was not implemented by this version.";
+                return;
+            case NVPA_STATUS_INVALID_ARGUMENT:
+                *ppStatusStr = "NVPA_STATUS_INVALID_ARGUMENT";
+                *ppCommentStr = "Invalid argument.";
+                return;
+            case NVPA_STATUS_INVALID_METRIC_ID:
+                *ppStatusStr = "NVPA_STATUS_INVALID_METRIC_ID";
+                *ppCommentStr = "UNUSED";
+                return;
+            case NVPA_STATUS_DRIVER_NOT_LOADED:
+                *ppStatusStr = "NVPA_STATUS_DRIVER_NOT_LOADED";
+                *ppCommentStr = "No driver has been loaded via NVPW_*_LoadDriver().";
+                return;
+            case NVPA_STATUS_OUT_OF_MEMORY:
+                *ppStatusStr = "NVPA_STATUS_OUT_OF_MEMORY";
+                *ppCommentStr = "Failed memory allocation.";
+                return;
+            case NVPA_STATUS_INVALID_THREAD_STATE:
+                *ppStatusStr = "NVPA_STATUS_INVALID_THREAD_STATE";
+                *ppCommentStr = "UNUSED";
+                return;
+            case NVPA_STATUS_FAILED_CONTEXT_ALLOC:
+                *ppStatusStr = "NVPA_STATUS_FAILED_CONTEXT_ALLOC";
+                *ppCommentStr = "UNUSED";
+                return;
+            case NVPA_STATUS_UNSUPPORTED_GPU:
+                *ppStatusStr = "NVPA_STATUS_UNSUPPORTED_GPU";
+                *ppCommentStr = "The specified GPU is not supported. It is recommended to call IsGpuSupported() for more information";
+                return;
+            case NVPA_STATUS_INSUFFICIENT_DRIVER_VERSION:
+                *ppStatusStr = "NVPA_STATUS_INSUFFICIENT_DRIVER_VERSION";
+                *ppCommentStr = "The installed NVIDIA driver is too old.";
+                return;
+            case NVPA_STATUS_OBJECT_NOT_REGISTERED:
+                *ppStatusStr = "NVPA_STATUS_OBJECT_NOT_REGISTERED";
+                *ppCommentStr = "UNUSED";
+                return;
+            case NVPA_STATUS_INSUFFICIENT_PRIVILEGE:
+                *ppStatusStr = "NVPA_STATUS_INSUFFICIENT_PRIVILEGE";
+                *ppCommentStr = "Profiling permission not granted; see https://developer.nvidia.com/nvidia-development-tools-solutions-ERR_NVGPUCTRPERM-permission-issue-performance-counters";
+                return;
+            case NVPA_STATUS_INVALID_CONTEXT_STATE:
+                *ppStatusStr = "NVPA_STATUS_INVALID_CONTEXT_STATE";
+                *ppCommentStr = "UNUSED";
+                return;
+            case NVPA_STATUS_INVALID_OBJECT_STATE:
+                *ppStatusStr = "NVPA_STATUS_INVALID_OBJECT_STATE";
+                *ppCommentStr = "UNUSED";
+                return;
+            case NVPA_STATUS_RESOURCE_UNAVAILABLE:
+                *ppStatusStr = "NVPA_STATUS_RESOURCE_UNAVAILABLE";
+                *ppCommentStr = "The request could not be fulfilled because a system resource is already in use.";
+                return;
+            case NVPA_STATUS_DRIVER_LOADED_TOO_LATE:
+                *ppStatusStr = "NVPA_STATUS_DRIVER_LOADED_TOO_LATE";
+                *ppCommentStr = "UNUSED";
+                return;
+            case NVPA_STATUS_INSUFFICIENT_SPACE:
+                *ppStatusStr = "NVPA_STATUS_INSUFFICIENT_SPACE";
+                *ppCommentStr = "The provided buffer is not large enough.";
+                return;
+            case NVPA_STATUS_OBJECT_MISMATCH:
+                *ppStatusStr = "NVPA_STATUS_OBJECT_MISMATCH";
+                *ppCommentStr = "UNUSED";
+                return;
+            case NVPA_STATUS_VIRTUALIZED_DEVICE_NOT_SUPPORTED:
+                *ppStatusStr = "NVPA_STATUS_VIRTUALIZED_DEVICE_NOT_SUPPORTED";
+                *ppCommentStr = "Virtualized GPU (vGPU) is not supported.";
+                return;
+            case NVPA_STATUS_PROFILING_NOT_ALLOWED:
+                *ppStatusStr = "NVPA_STATUS_PROFILING_NOT_ALLOWED";
+                *ppCommentStr = "Profiling permission was not granted or the device was disabled.";
+                return;
+            default: // also reached for NVPA_STATUS__COUNT and any value outside the enumerators handled above
+                *ppStatusStr = "Unrecognized status";
+                *ppCommentStr = "This status is unrecognized. Is it coming from a newer version of NvPerf library?";
+                return;
+        }
+    }
+
+
+#endif // NVPERF_NVPA_STATUS_DEFINED
+
+
+#ifndef NVPERF_NVPA_ACTIVITY_KIND_DEFINED
+#define NVPERF_NVPA_ACTIVITY_KIND_DEFINED
+
+    /// The configuration's activity-kind dictates which types of data may be collected.
+    typedef enum NVPA_ActivityKind
+    {
+        /// Invalid value.
+        NVPA_ACTIVITY_KIND_INVALID = 0,
+        /// A workload-centric activity for serialized and pipelined collection.
+        ///
+        /// Profiler is capable of collecting both serialized and pipelined metrics.  The library introduces any
+        /// synchronization required to collect serialized metrics.
+        NVPA_ACTIVITY_KIND_PROFILER, // implicit value 1
+        /// A realtime activity for sampling counters from the CPU or GPU.
+        NVPA_ACTIVITY_KIND_REALTIME_SAMPLED, // implicit value 2
+        /// A realtime activity for profiling counters from the CPU or GPU without CPU/GPU synchronizations.
+        NVPA_ACTIVITY_KIND_REALTIME_PROFILER, // implicit value 3
+        NVPA_ACTIVITY_KIND__COUNT // implicit value 4 = number of enumerators above, including INVALID
+    } NVPA_ActivityKind;
+
+
+#endif // NVPERF_NVPA_ACTIVITY_KIND_DEFINED
+
+
+#ifndef NVPERF_NVPA_BOOL_DEFINED
+#define NVPERF_NVPA_BOOL_DEFINED
+    /// The type used for boolean values; an alias of uint8_t.
+    typedef uint8_t NVPA_Bool;
+#endif // NVPERF_NVPA_BOOL_DEFINED
+
+#ifndef NVPA_STRUCT_SIZE // byte count of type_ from its start through the end of lastfield_ (offset of the field + size of the field)
+#define NVPA_STRUCT_SIZE(type_, lastfield_)                     (offsetof(type_, lastfield_) + sizeof(((type_*)0)->lastfield_))
+#endif // NVPA_STRUCT_SIZE (the null-pointer member access above sits inside sizeof, so it is never evaluated)
+
+#ifndef NVPW_FIELD_EXISTS // non-zero when (pParams_)->structSize covers field name_ entirely, i.e. reaches the byte just past the field
+#define NVPW_FIELD_EXISTS(pParams_, name_) \
+    ((pParams_)->structSize >= (size_t)((const uint8_t*)(&(pParams_)->name_) + sizeof(pParams_)->name_ - (const uint8_t*)(pParams_)))
+#endif // NVPW_FIELD_EXISTS (pParams_ is expanded several times, so pass an expression without side effects)
+
+
+#ifndef NVPERF_NVPA_GETPROCADDRESS_DEFINED
+#define NVPERF_NVPA_GETPROCADDRESS_DEFINED
+
+typedef NVPA_Status (*NVPA_GenericFn)(void); // generic function-pointer type returned by NVPA_GetProcAddress()
+
+
+    ///
+    /// Gets the address of an NvPerf API function.
+    ///
+    /// \return A function pointer to the function, or NULL if the function is not available.
+    /// NOTE(review): the result has the generic NVPA_GenericFn type; callers presumably cast it to the real signature - confirm.
+    /// \param pFunctionName [in] Name of the function to retrieve.
+    NVPA_GenericFn NVPA_GetProcAddress(const char* pFunctionName);
+
+#endif // NVPERF_NVPA_GETPROCADDRESS_DEFINED
+
+#ifndef NVPERF_NVPW_SETLIBRARYLOADPATHS_DEFINED
+#define NVPERF_NVPW_SETLIBRARYLOADPATHS_DEFINED
+
+
+    typedef struct NVPW_SetLibraryLoadPaths_Params
+    {
+        /// [in] size of this struct in bytes; initialize with NVPW_SetLibraryLoadPaths_Params_STRUCT_SIZE
+        size_t structSize;
+        /// [in] assign to NULL
+        void* pPriv;
+        /// [in] number of paths in ppPaths
+        size_t numPaths;
+        /// [in] array of null-terminated paths; each must name a directory, not a file
+        const char** ppPaths;
+    } NVPW_SetLibraryLoadPaths_Params;
+#define NVPW_SetLibraryLoadPaths_Params_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_SetLibraryLoadPaths_Params, ppPaths)
+
+    /// Sets library search paths for \ref NVPW_InitializeHost() and \ref NVPW_InitializeTarget().
+    /// \ref NVPW_InitializeHost() and \ref NVPW_InitializeTarget() load the NvPerf DLL/DSO.  This function sets
+    /// ordered paths that will be searched with the LoadLibrary() or dlopen() call.
+    /// If load paths are set by this function, the default set of load paths
+    /// will not be attempted.
+    /// Each path must point at a directory (not a file name).
+    /// This function is not thread-safe.
+    /// Example Usage:
+    /// \code
+    ///     const char* paths[] = {
+    ///         "path1", "path2", etc
+    ///     };
+    ///     NVPW_SetLibraryLoadPaths_Params params{NVPW_SetLibraryLoadPaths_Params_STRUCT_SIZE};
+    ///     params.numPaths = sizeof(paths)/sizeof(paths[0]);
+    ///     params.ppPaths = paths;
+    ///     NVPW_SetLibraryLoadPaths(&params);
+    ///     NVPW_InitializeHost();
+    ///     params.numPaths = 0;
+    ///     params.ppPaths = NULL;
+    ///     NVPW_SetLibraryLoadPaths(&params);
+    /// \endcode
+    NVPA_Status NVPW_SetLibraryLoadPaths(NVPW_SetLibraryLoadPaths_Params* pParams);
+
+    typedef struct NVPW_SetLibraryLoadPathsW_Params
+    {
+        /// [in] size of this struct in bytes; initialize with NVPW_SetLibraryLoadPathsW_Params_STRUCT_SIZE
+        size_t structSize;
+        /// [in] assign to NULL
+        void* pPriv;
+        /// [in] number of paths in ppwPaths
+        size_t numPaths;
+        /// [in] array of null-terminated wide-character paths; each must name a directory, not a file
+        const wchar_t** ppwPaths;
+    } NVPW_SetLibraryLoadPathsW_Params;
+#define NVPW_SetLibraryLoadPathsW_Params_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_SetLibraryLoadPathsW_Params, ppwPaths)
+
+    /// Wide-character variant: sets library search paths for \ref NVPW_InitializeHost() and \ref NVPW_InitializeTarget().
+    /// \ref NVPW_InitializeHost() and \ref NVPW_InitializeTarget() load the NvPerf DLL/DSO.  This function sets
+    /// ordered paths that will be searched with the LoadLibrary() or dlopen() call.
+    /// If load paths are set by this function, the default set of load paths
+    /// will not be attempted.
+    /// Each path must point at a directory (not a file name).
+    /// This function is not thread-safe.
+    /// Example Usage:
+    /// \code
+    ///     const wchar_t* wpaths[] = {
+    ///         L"path1", L"path2", etc
+    ///     };
+    ///     NVPW_SetLibraryLoadPathsW_Params params{NVPW_SetLibraryLoadPathsW_Params_STRUCT_SIZE};
+    ///     params.numPaths = sizeof(wpaths)/sizeof(wpaths[0]);
+    ///     params.ppwPaths = wpaths;
+    ///     NVPW_SetLibraryLoadPathsW(&params);
+    ///     NVPW_InitializeHost();
+    ///     params.numPaths = 0;
+    ///     params.ppwPaths = NULL;
+    ///     NVPW_SetLibraryLoadPathsW(&params);
+    /// \endcode
+    NVPA_Status NVPW_SetLibraryLoadPathsW(NVPW_SetLibraryLoadPathsW_Params* pParams);
+
+#endif
+
+
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#if defined(__GNUC__) && defined(NVPA_SHARED_LIB)
+    #pragma GCC visibility pop
+#endif
+
+#endif // NVPERF_COMMON_H
diff --git a/.venv/lib/python3.11/site-packages/nvidia/cuda_cupti/include/nvperf_cuda_host.h b/.venv/lib/python3.11/site-packages/nvidia/cuda_cupti/include/nvperf_cuda_host.h
new file mode 100644
index 0000000000000000000000000000000000000000..4d6627ebe17073c3a085a75f0faa41cd0ab74daf
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/nvidia/cuda_cupti/include/nvperf_cuda_host.h
@@ -0,0 +1,197 @@
+#ifndef NVPERF_CUDA_HOST_H
+#define NVPERF_CUDA_HOST_H
+
+/*
+ * Copyright 2014-2023 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO USER:
+ *
+ * This source code is subject to NVIDIA ownership rights under U.S. and
+ * international Copyright laws.
+ *
+ * This software and the information contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and conditions
+ * of a form of NVIDIA software license agreement.
+ *
+ * NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE
+ * CODE FOR ANY PURPOSE.  IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR
+ * IMPLIED WARRANTY OF ANY KIND.  NVIDIA DISCLAIMS ALL WARRANTIES WITH
+ * REGARD TO THIS SOURCE CODE, INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL,
+ * OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS
+ * OF USE, DATA OR PROFITS,  WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
+ * OR OTHER TORTIOUS ACTION,  ARISING OUT OF OR IN CONNECTION WITH THE USE
+ * OR PERFORMANCE OF THIS SOURCE CODE.
+ *
+ * U.S. Government End Users.   This source code is a "commercial item" as
+ * that term is defined at  48 C.F.R. 2.101 (OCT 1995), consisting  of
+ * "commercial computer  software"  and "commercial computer software
+ * documentation" as such terms are  used in 48 C.F.R. 12.212 (SEPT 1995)
+ * and is provided to the U.S. Government only as a commercial end item.
+ * Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through
+ * 227.7202-4 (JUNE 1995), all U.S. Government End Users acquire the
+ * source code with only those rights set forth herein.
+ *
+ * Any use of this source code in individual and commercial software must
+ * include, in the user documentation and internal comments to the code,
+ * the above Disclaimer and U.S. Government End Users Notice.
+ */
+
+#include <stddef.h>
+#include <stdint.h>
+#include "nvperf_common.h"
+#include "nvperf_host.h"
+
+#if defined(__GNUC__) && defined(NVPA_SHARED_LIB) // default visibility from here until the matching `visibility pop` at the end of this header
+    #pragma GCC visibility push(default)
+    #if !defined(NVPW_LOCAL) // keep an existing definition of NVPW_LOCAL if there is one
+        #define NVPW_LOCAL __attribute__ ((visibility ("hidden")))
+    #endif
+#else // any other configuration: NVPW_LOCAL expands to nothing
+    #if !defined(NVPW_LOCAL)
+        #define NVPW_LOCAL
+    #endif
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/**
+ *  @file   nvperf_cuda_host.h
+ */
+
+    /// 'NVPA_MetricsContext' and its APIs are deprecated, please use 'NVPW_MetricsEvaluator' and its APIs instead.
+    typedef struct NVPA_MetricsContext NVPA_MetricsContext;
+
+    typedef struct NVPW_CUDA_MetricsContext_Create_Params
+    {
+        /// [in] size of this struct through its last field; see NVPW_CUDA_MetricsContext_Create_Params_STRUCT_SIZE
+        size_t structSize;
+        /// [in] assign to NULL
+        void* pPriv;
+        /// [in] chip name string
+        const char* pChipName;
+        /// [out] NVPA_MetricsContext object (presumably newly created, as in the other *_Create params - confirm)
+        struct NVPA_MetricsContext* pMetricsContext;
+    } NVPW_CUDA_MetricsContext_Create_Params;
+#define NVPW_CUDA_MetricsContext_Create_Params_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_CUDA_MetricsContext_Create_Params, pMetricsContext)
+
+    NVPA_Status NVPW_CUDA_MetricsContext_Create(NVPW_CUDA_MetricsContext_Create_Params* pParams); // NOTE(review): presumably covered by the NVPA_MetricsContext deprecation note - confirm
+
+    typedef struct NVPW_CUDA_RawMetricsConfig_Create_Params
+    {
+        /// [in] size of this struct through its last field; see NVPW_CUDA_RawMetricsConfig_Create_Params_STRUCT_SIZE
+        size_t structSize;
+        /// [in] assign to NULL
+        void* pPriv;
+        /// [in] activity kind of the configuration (dictates which types of data may be collected)
+        NVPA_ActivityKind activityKind;
+        /// [in] chip name string
+        const char* pChipName;
+        /// [out] new NVPA_RawMetricsConfig object
+        struct NVPA_RawMetricsConfig* pRawMetricsConfig;
+    } NVPW_CUDA_RawMetricsConfig_Create_Params;
+#define NVPW_CUDA_RawMetricsConfig_Create_Params_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_CUDA_RawMetricsConfig_Create_Params, pRawMetricsConfig)
+
+    NVPA_Status NVPW_CUDA_RawMetricsConfig_Create(NVPW_CUDA_RawMetricsConfig_Create_Params* pParams); // a _V2 variant that also accepts a counter availability image is declared in this header
+
+    typedef struct NVPW_CUDA_RawMetricsConfig_Create_V2_Params
+    {
+        /// [in] size of this struct through its last field; see NVPW_CUDA_RawMetricsConfig_Create_V2_Params_STRUCT_SIZE
+        size_t structSize;
+        /// [in] assign to NULL
+        void* pPriv;
+        /// [in] activity kind of the configuration (dictates which types of data may be collected)
+        NVPA_ActivityKind activityKind;
+        /// [in] accepted for chips supported at the time-of-release.
+        const char* pChipName;
+        /// [in] buffer with counter availability image - required for future chip support
+        const uint8_t* pCounterAvailabilityImage;
+        /// [out] new NVPA_RawMetricsConfig object
+        struct NVPA_RawMetricsConfig* pRawMetricsConfig;
+    } NVPW_CUDA_RawMetricsConfig_Create_V2_Params;
+#define NVPW_CUDA_RawMetricsConfig_Create_V2_Params_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_CUDA_RawMetricsConfig_Create_V2_Params, pRawMetricsConfig)
+
+    /// Use either 'pChipName' or 'pCounterAvailabilityImage'.
+    NVPA_Status NVPW_CUDA_RawMetricsConfig_Create_V2(NVPW_CUDA_RawMetricsConfig_Create_V2_Params* pParams);
+
+    typedef struct NVPW_CUDA_CounterDataBuilder_Create_Params
+    {
+        /// [in] size of this struct through its last field; see NVPW_CUDA_CounterDataBuilder_Create_Params_STRUCT_SIZE
+        size_t structSize;
+        /// [in] assign to NULL
+        void* pPriv;
+        /// [in] accepted for chips supported at the time-of-release.
+        const char* pChipName;
+        /// [in] buffer with counter availability image - required for future chip support
+        const uint8_t* pCounterAvailabilityImage;
+        /// [out] new NVPA_CounterDataBuilder object
+        struct NVPA_CounterDataBuilder* pCounterDataBuilder;
+    } NVPW_CUDA_CounterDataBuilder_Create_Params;
+#define NVPW_CUDA_CounterDataBuilder_Create_Params_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_CUDA_CounterDataBuilder_Create_Params, pCounterDataBuilder)
+
+    /// Use either 'pChipName' or 'pCounterAvailabilityImage'.
+    NVPA_Status NVPW_CUDA_CounterDataBuilder_Create(NVPW_CUDA_CounterDataBuilder_Create_Params* pParams);
+
+    typedef struct NVPW_MetricsEvaluator NVPW_MetricsEvaluator; // opaque here: only the forward declaration is visible in this header
+
+    typedef struct NVPW_CUDA_MetricsEvaluator_CalculateScratchBufferSize_Params
+    {
+        /// [in] size of this struct through its last field; see NVPW_CUDA_MetricsEvaluator_CalculateScratchBufferSize_Params_STRUCT_SIZE
+        size_t structSize;
+        /// [in] assign to NULL
+        void* pPriv;
+        /// [in] accepted for chips supported at the time-of-release.
+        const char* pChipName;
+        /// [in] buffer with counter availability image - required for future chip support
+        const uint8_t* pCounterAvailabilityImage;
+        /// [out] minimum size of the 'pScratchBuffer' array to pass to 'NVPW_CUDA_MetricsEvaluator_Initialize'
+        size_t scratchBufferSize;
+    } NVPW_CUDA_MetricsEvaluator_CalculateScratchBufferSize_Params;
+#define NVPW_CUDA_MetricsEvaluator_CalculateScratchBufferSize_Params_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_CUDA_MetricsEvaluator_CalculateScratchBufferSize_Params, scratchBufferSize)
+
+    /// Use either 'pChipName' or 'pCounterAvailabilityImage'.
+    NVPA_Status NVPW_CUDA_MetricsEvaluator_CalculateScratchBufferSize(NVPW_CUDA_MetricsEvaluator_CalculateScratchBufferSize_Params* pParams);
+
+    typedef struct NVPW_CUDA_MetricsEvaluator_Initialize_Params
+    {
+        /// [in] size of this struct through its last field; see NVPW_CUDA_MetricsEvaluator_Initialize_Params_STRUCT_SIZE
+        size_t structSize;
+        /// [in] assign to NULL
+        void* pPriv;
+        /// [in] caller-provided scratch array of 'scratchBufferSize' bytes (non-const, so the library may write to it)
+        uint8_t* pScratchBuffer;
+        /// [in] the size of the 'pScratchBuffer' array, should be at least the size of the 'scratchBufferSize' returned
+        /// by 'NVPW_CUDA_MetricsEvaluator_CalculateScratchBufferSize'
+        size_t scratchBufferSize;
+        /// [in] accepted for chips supported at the time-of-release.
+        const char* pChipName;
+        /// [in] buffer with counter availability image - required for future chip support
+        const uint8_t* pCounterAvailabilityImage;
+        /// [in] counter data image; selects an evaluator based on the actual device (see the note on the function)
+        const uint8_t* pCounterDataImage;
+        /// [in] must be provided if 'pCounterDataImage' is not NULL
+        size_t counterDataImageSize;
+        /// [out] the resulting NVPW_MetricsEvaluator object
+        struct NVPW_MetricsEvaluator* pMetricsEvaluator;
+    } NVPW_CUDA_MetricsEvaluator_Initialize_Params;
+#define NVPW_CUDA_MetricsEvaluator_Initialize_Params_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_CUDA_MetricsEvaluator_Initialize_Params, pMetricsEvaluator)
+
+    /// Use one of 'pChipName', 'pCounterAvailabilityImage', or 'pCounterDataImage'. 'pChipName' or
+    /// 'pCounterAvailabilityImage' will create a metrics evaluator based on a virtual device while 'pCounterDataImage'
+    /// will create a metrics evaluator based on the actual device.
+    NVPA_Status NVPW_CUDA_MetricsEvaluator_Initialize(NVPW_CUDA_MetricsEvaluator_Initialize_Params* pParams);
+
+
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#if defined(__GNUC__) && defined(NVPA_SHARED_LIB)
+    #pragma GCC visibility pop
+#endif
+
+#endif // NVPERF_CUDA_HOST_H
diff --git a/.venv/lib/python3.11/site-packages/nvidia/cuda_cupti/include/nvperf_host.h b/.venv/lib/python3.11/site-packages/nvidia/cuda_cupti/include/nvperf_host.h
new file mode 100644
index 0000000000000000000000000000000000000000..db353378419ae1a92a4ed7eef1a4940ea55a90f4
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/nvidia/cuda_cupti/include/nvperf_host.h
@@ -0,0 +1,1578 @@
+#ifndef NVPERF_HOST_H
+#define NVPERF_HOST_H
+
+/*
+ * Copyright 2014-2023 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO USER:
+ *
+ * This source code is subject to NVIDIA ownership rights under U.S. and
+ * international Copyright laws.
+ *
+ * This software and the information contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and conditions
+ * of a form of NVIDIA software license agreement.
+ *
+ * NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE
+ * CODE FOR ANY PURPOSE.  IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR
+ * IMPLIED WARRANTY OF ANY KIND.  NVIDIA DISCLAIMS ALL WARRANTIES WITH
+ * REGARD TO THIS SOURCE CODE, INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL,
+ * OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS
+ * OF USE, DATA OR PROFITS,  WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
+ * OR OTHER TORTIOUS ACTION,  ARISING OUT OF OR IN CONNECTION WITH THE USE
+ * OR PERFORMANCE OF THIS SOURCE CODE.
+ *
+ * U.S. Government End Users.   This source code is a "commercial item" as
+ * that term is defined at  48 C.F.R. 2.101 (OCT 1995), consisting  of
+ * "commercial computer  software"  and "commercial computer software
+ * documentation" as such terms are  used in 48 C.F.R. 12.212 (SEPT 1995)
+ * and is provided to the U.S. Government only as a commercial end item.
+ * Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through
+ * 227.7202-4 (JUNE 1995), all U.S. Government End Users acquire the
+ * source code with only those rights set forth herein.
+ *
+ * Any use of this source code in individual and commercial software must
+ * include, in the user documentation and internal comments to the code,
+ * the above Disclaimer and U.S. Government End Users Notice.
+ */
+
+#include <stddef.h>
+#include <stdint.h>
+#include "nvperf_common.h"
+
+#if defined(__GNUC__) && defined(NVPA_SHARED_LIB)
+    #pragma GCC visibility push(default)
+    #if !defined(NVPW_LOCAL)
+        #define NVPW_LOCAL __attribute__ ((visibility ("hidden")))
+    #endif // !defined(NVPW_LOCAL)
+#else // not (__GNUC__ && NVPA_SHARED_LIB): NVPW_LOCAL expands to nothing
+    #if !defined(NVPW_LOCAL)
+        #define NVPW_LOCAL
+    #endif // !defined(NVPW_LOCAL)
+#endif // defined(__GNUC__) && defined(NVPA_SHARED_LIB)
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/**
+ *  @file   nvperf_host.h
+ */
+
+
+// Guard against multiple definition of NvPerf host types
+#ifndef NVPERF_HOST_API_DEFINED
+#define NVPERF_HOST_API_DEFINED
+
+
+/***************************************************************************//**
+ *  @name   Host Configuration
+ *  @{
+ */
+
+    typedef struct NVPW_InitializeHost_Params // parameter block for NVPW_InitializeHost()
+    {
+        /// [in] presumably initialized from the _STRUCT_SIZE macro defined right after this struct (TODO confirm)
+        size_t structSize;
+        /// [in] assign to NULL
+        void* pPriv;
+    } NVPW_InitializeHost_Params;
+#define NVPW_InitializeHost_Params_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_InitializeHost_Params, pPriv)
+
+    /// Load the host library.
+    NVPA_Status NVPW_InitializeHost(NVPW_InitializeHost_Params* pParams);
+
+    typedef struct NVPW_CounterData_CalculateCounterDataImageCopySize_Params // parameter block for NVPW_CounterData_CalculateCounterDataImageCopySize()
+    {
+        /// [in] presumably initialized from the _STRUCT_SIZE macro defined right after this struct (TODO confirm)
+        size_t structSize;
+        /// [in] assign to NULL
+        void* pPriv;
+        /// The CounterDataPrefix generated from e.g. 'nvperf2 initdata' or
+        /// NVPW_CounterDataBuilder_GetCounterDataPrefix().  Must be align(8).
+        const uint8_t* pCounterDataPrefix;
+        size_t counterDataPrefixSize; ///< presumably the size in bytes of pCounterDataPrefix (TODO confirm)
+        /// max number of ranges that can be profiled
+        uint32_t maxNumRanges;
+        /// max number of RangeTree nodes; must be >= maxNumRanges
+        uint32_t maxNumRangeTreeNodes;
+        /// max string length of each RangeName, including the trailing NUL character
+        uint32_t maxRangeNameLength;
+        const uint8_t* pCounterDataSrc;
+        /// [out] required size of the copy buffer
+        size_t copyDataImageCounterSize;
+    } NVPW_CounterData_CalculateCounterDataImageCopySize_Params;
+#define NVPW_CounterData_CalculateCounterDataImageCopySize_Params_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_CounterData_CalculateCounterDataImageCopySize_Params, copyDataImageCounterSize)
+
+    NVPA_Status NVPW_CounterData_CalculateCounterDataImageCopySize(NVPW_CounterData_CalculateCounterDataImageCopySize_Params* pParams);
+
+    typedef struct NVPW_CounterData_InitializeCounterDataImageCopy_Params // parameter block for NVPW_CounterData_InitializeCounterDataImageCopy()
+    {
+        /// [in] presumably initialized from the _STRUCT_SIZE macro defined right after this struct (TODO confirm)
+        size_t structSize;
+        /// [in] assign to NULL
+        void* pPriv;
+        /// The CounterDataPrefix generated from e.g. 'nvperf2 initdata' or
+        /// NVPW_CounterDataBuilder_GetCounterDataPrefix().  Must be align(8).
+        const uint8_t* pCounterDataPrefix;
+        size_t counterDataPrefixSize; ///< presumably the size in bytes of pCounterDataPrefix (TODO confirm)
+        /// max number of ranges that can be profiled
+        uint32_t maxNumRanges;
+        /// max number of RangeTree nodes; must be >= maxNumRanges
+        uint32_t maxNumRangeTreeNodes;
+        /// max string length of each RangeName, including the trailing NUL character
+        uint32_t maxRangeNameLength;
+        const uint8_t* pCounterDataSrc;
+        uint8_t* pCounterDataDst; ///< the only non-const buffer here; presumably receives the copy (TODO confirm)
+    } NVPW_CounterData_InitializeCounterDataImageCopy_Params;
+#define NVPW_CounterData_InitializeCounterDataImageCopy_Params_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_CounterData_InitializeCounterDataImageCopy_Params, pCounterDataDst)
+
+    NVPA_Status NVPW_CounterData_InitializeCounterDataImageCopy(NVPW_CounterData_InitializeCounterDataImageCopy_Params* pParams);
+
+    typedef struct NVPA_CounterDataCombiner NVPA_CounterDataCombiner; // forward declaration only; used via pointer in the declarations below
+
+    typedef struct NVPW_CounterDataCombiner_Create_Params // parameter block for NVPW_CounterDataCombiner_Create()
+    {
+        /// [in] presumably initialized from the _STRUCT_SIZE macro defined right after this struct (TODO confirm)
+        size_t structSize;
+        /// [in] assign to NULL
+        void* pPriv;
+        /// The destination counter data into which the source counter data will be combined
+        uint8_t* pCounterDataDst;
+        /// [out] The created counter data combiner
+        NVPA_CounterDataCombiner* pCounterDataCombiner;
+    } NVPW_CounterDataCombiner_Create_Params;
+#define NVPW_CounterDataCombiner_Create_Params_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_CounterDataCombiner_Create_Params, pCounterDataCombiner)
+
+    NVPA_Status NVPW_CounterDataCombiner_Create(NVPW_CounterDataCombiner_Create_Params* pParams);
+
+    typedef struct NVPW_CounterDataCombiner_Destroy_Params // parameter block for NVPW_CounterDataCombiner_Destroy()
+    {
+        /// [in] presumably initialized from the _STRUCT_SIZE macro defined right after this struct (TODO confirm)
+        size_t structSize;
+        /// [in] assign to NULL
+        void* pPriv;
+        NVPA_CounterDataCombiner* pCounterDataCombiner; ///< presumably the handle produced by NVPW_CounterDataCombiner_Create() (TODO confirm)
+    } NVPW_CounterDataCombiner_Destroy_Params;
+#define NVPW_CounterDataCombiner_Destroy_Params_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_CounterDataCombiner_Destroy_Params, pCounterDataCombiner)
+
+    NVPA_Status NVPW_CounterDataCombiner_Destroy(NVPW_CounterDataCombiner_Destroy_Params* pParams);
+
+    typedef struct NVPW_CounterDataCombiner_CreateRange_Params // parameter block for NVPW_CounterDataCombiner_CreateRange()
+    {
+        /// [in] presumably initialized from the _STRUCT_SIZE macro defined right after this struct (TODO confirm)
+        size_t structSize;
+        /// [in] assign to NULL
+        void* pPriv;
+        NVPA_CounterDataCombiner* pCounterDataCombiner;
+        size_t numDescriptions; ///< presumably the number of entries in ppDescriptions (TODO confirm)
+        const char* const* ppDescriptions;
+        /// [out]
+        size_t rangeIndexDst;
+    } NVPW_CounterDataCombiner_CreateRange_Params;
+#define NVPW_CounterDataCombiner_CreateRange_Params_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_CounterDataCombiner_CreateRange_Params, rangeIndexDst)
+
+    NVPA_Status NVPW_CounterDataCombiner_CreateRange(NVPW_CounterDataCombiner_CreateRange_Params* pParams);
+
+    typedef struct NVPW_CounterDataCombiner_CopyIntoRange_Params // parameter block for NVPW_CounterDataCombiner_CopyIntoRange(); every member is an input
+    {
+        /// [in] presumably initialized from the _STRUCT_SIZE macro defined right after this struct (TODO confirm)
+        size_t structSize;
+        /// [in] assign to NULL
+        void* pPriv;
+        /// [in]
+        NVPA_CounterDataCombiner* pCounterDataCombiner;
+        /// [in]
+        size_t rangeIndexDst;
+        /// [in]
+        const uint8_t* pCounterDataSrc;
+        /// [in]
+        size_t rangeIndexSrc;
+    } NVPW_CounterDataCombiner_CopyIntoRange_Params;
+#define NVPW_CounterDataCombiner_CopyIntoRange_Params_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_CounterDataCombiner_CopyIntoRange_Params, rangeIndexSrc)
+
+    /// In order to use this API, the source counter data and the destination counter data must have identical counters
+    NVPA_Status NVPW_CounterDataCombiner_CopyIntoRange(NVPW_CounterDataCombiner_CopyIntoRange_Params* pParams);
+
+    typedef struct NVPW_CounterDataCombiner_AccumulateIntoRange_Params // parameter block for NVPW_CounterDataCombiner_AccumulateIntoRange()
+    {
+        /// [in] presumably initialized from the _STRUCT_SIZE macro defined right after this struct (TODO confirm)
+        size_t structSize;
+        /// [in] assign to NULL
+        void* pPriv;
+        NVPA_CounterDataCombiner* pCounterDataCombiner;
+        size_t rangeIndexDst;
+        uint32_t dstMultiplier; ///< integer multiplier; NVPW_CounterDataCombiner_WeightedSumIntoRange_Params uses double instead
+        const uint8_t* pCounterDataSrc;
+        size_t rangeIndexSrc;
+        uint32_t srcMultiplier; ///< integer multiplier; NVPW_CounterDataCombiner_WeightedSumIntoRange_Params uses double instead
+    } NVPW_CounterDataCombiner_AccumulateIntoRange_Params;
+#define NVPW_CounterDataCombiner_AccumulateIntoRange_Params_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_CounterDataCombiner_AccumulateIntoRange_Params, srcMultiplier)
+
+    NVPA_Status NVPW_CounterDataCombiner_AccumulateIntoRange(NVPW_CounterDataCombiner_AccumulateIntoRange_Params* pParams);
+
+    typedef struct NVPW_CounterDataCombiner_SumIntoRange_Params // parameter block for NVPW_CounterDataCombiner_SumIntoRange(); same members as the CopyIntoRange params
+    {
+        /// [in] presumably initialized from the _STRUCT_SIZE macro defined right after this struct (TODO confirm)
+        size_t structSize;
+        /// [in] assign to NULL
+        void* pPriv;
+        NVPA_CounterDataCombiner* pCounterDataCombiner;
+        size_t rangeIndexDst;
+        const uint8_t* pCounterDataSrc;
+        size_t rangeIndexSrc;
+    } NVPW_CounterDataCombiner_SumIntoRange_Params;
+#define NVPW_CounterDataCombiner_SumIntoRange_Params_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_CounterDataCombiner_SumIntoRange_Params, rangeIndexSrc)
+
+    NVPA_Status NVPW_CounterDataCombiner_SumIntoRange(NVPW_CounterDataCombiner_SumIntoRange_Params* pParams);
+
+    typedef struct NVPW_CounterDataCombiner_WeightedSumIntoRange_Params // parameter block for NVPW_CounterDataCombiner_WeightedSumIntoRange()
+    {
+        /// [in] presumably initialized from the _STRUCT_SIZE macro defined right after this struct (TODO confirm)
+        size_t structSize;
+        /// [in] assign to NULL
+        void* pPriv;
+        NVPA_CounterDataCombiner* pCounterDataCombiner;
+        size_t rangeIndexDst;
+        double dstMultiplier; ///< floating-point multiplier; NVPW_CounterDataCombiner_AccumulateIntoRange_Params uses uint32_t instead
+        const uint8_t* pCounterDataSrc;
+        size_t rangeIndexSrc;
+        double srcMultiplier; ///< floating-point multiplier; NVPW_CounterDataCombiner_AccumulateIntoRange_Params uses uint32_t instead
+    } NVPW_CounterDataCombiner_WeightedSumIntoRange_Params;
+#define NVPW_CounterDataCombiner_WeightedSumIntoRange_Params_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_CounterDataCombiner_WeightedSumIntoRange_Params, srcMultiplier)
+
+    NVPA_Status NVPW_CounterDataCombiner_WeightedSumIntoRange(NVPW_CounterDataCombiner_WeightedSumIntoRange_Params* pParams);
+
+/**
+ *  @}
+ ******************************************************************************/
+ 
+/***************************************************************************//**
+ *  @name   Metrics Configuration
+ *  @{
+ */
+
+    typedef struct NVPA_RawMetricsConfig NVPA_RawMetricsConfig; // forward declaration only; used via pointer in the declarations below
+
+    typedef struct NVPA_RawMetricRequest // one metric request; passed as an array via pRawMetricRequests in the AddMetrics-style params
+    {
+        /// [in] presumably NVPA_RAW_METRIC_REQUEST_STRUCT_SIZE (upper-case name, unlike the neighbouring <Struct>_STRUCT_SIZE macros) (TODO confirm)
+        size_t structSize;
+        /// [in] assign to NULL
+        void* pPriv;
+        /// [in]
+        const char* pMetricName;
+        /// [in]
+        NVPA_Bool isolated;
+        /// [in] ignored by AddMetric but observed by CounterData initialization
+        NVPA_Bool keepInstances;
+    } NVPA_RawMetricRequest;
+#define NVPA_RAW_METRIC_REQUEST_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPA_RawMetricRequest, keepInstances)
+
+    typedef struct NVPW_GetSupportedChipNames_Params // parameter block for NVPW_GetSupportedChipNames()
+    {
+        /// [in] presumably initialized from the _STRUCT_SIZE macro defined right after this struct (TODO confirm)
+        size_t structSize;
+        /// [in] assign to NULL
+        void* pPriv;
+        /// [out]
+        const char* const* ppChipNames;
+        /// [out] presumably the number of entries in ppChipNames (TODO confirm)
+        size_t numChipNames;
+    } NVPW_GetSupportedChipNames_Params;
+#define NVPW_GetSupportedChipNames_Params_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_GetSupportedChipNames_Params, numChipNames)
+
+    NVPA_Status NVPW_GetSupportedChipNames(NVPW_GetSupportedChipNames_Params* pParams);
+
+    typedef struct NVPW_RawMetricsConfig_Destroy_Params // parameter block for NVPW_RawMetricsConfig_Destroy()
+    {
+        /// [in] presumably initialized from the _STRUCT_SIZE macro defined right after this struct (TODO confirm)
+        size_t structSize;
+        /// [in] assign to NULL
+        void* pPriv;
+        NVPA_RawMetricsConfig* pRawMetricsConfig;
+    } NVPW_RawMetricsConfig_Destroy_Params;
+#define NVPW_RawMetricsConfig_Destroy_Params_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_RawMetricsConfig_Destroy_Params, pRawMetricsConfig)
+
+    NVPA_Status NVPW_RawMetricsConfig_Destroy(NVPW_RawMetricsConfig_Destroy_Params* pParams);
+
+    typedef struct NVPW_RawMetricsConfig_SetCounterAvailability_Params // parameter block for NVPW_RawMetricsConfig_SetCounterAvailability()
+    {
+        /// [in] presumably initialized from the _STRUCT_SIZE macro defined right after this struct (TODO confirm)
+        size_t structSize;
+        /// [in] assign to NULL
+        void* pPriv;
+        NVPA_RawMetricsConfig* pRawMetricsConfig;
+        /// [in] buffer with counter availability image (no accompanying size member in this struct)
+        const uint8_t* pCounterAvailabilityImage;
+    } NVPW_RawMetricsConfig_SetCounterAvailability_Params;
+#define NVPW_RawMetricsConfig_SetCounterAvailability_Params_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_RawMetricsConfig_SetCounterAvailability_Params, pCounterAvailabilityImage)
+
+    NVPA_Status NVPW_RawMetricsConfig_SetCounterAvailability(NVPW_RawMetricsConfig_SetCounterAvailability_Params* pParams);
+
+    typedef struct NVPW_RawMetricsConfig_BeginPassGroup_Params // parameter block for NVPW_RawMetricsConfig_BeginPassGroup()
+    {
+        /// [in] presumably initialized from the _STRUCT_SIZE macro defined right after this struct (TODO confirm)
+        size_t structSize;
+        /// [in] assign to NULL
+        void* pPriv;
+        NVPA_RawMetricsConfig* pRawMetricsConfig;
+        size_t maxPassCount;
+    } NVPW_RawMetricsConfig_BeginPassGroup_Params;
+#define NVPW_RawMetricsConfig_BeginPassGroup_Params_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_RawMetricsConfig_BeginPassGroup_Params, maxPassCount)
+
+    NVPA_Status NVPW_RawMetricsConfig_BeginPassGroup(NVPW_RawMetricsConfig_BeginPassGroup_Params* pParams);
+
+    typedef struct NVPW_RawMetricsConfig_EndPassGroup_Params // parameter block for NVPW_RawMetricsConfig_EndPassGroup()
+    {
+        /// [in] presumably initialized from the _STRUCT_SIZE macro defined right after this struct (TODO confirm)
+        size_t structSize;
+        /// [in] assign to NULL
+        void* pPriv;
+        NVPA_RawMetricsConfig* pRawMetricsConfig;
+    } NVPW_RawMetricsConfig_EndPassGroup_Params;
+#define NVPW_RawMetricsConfig_EndPassGroup_Params_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_RawMetricsConfig_EndPassGroup_Params, pRawMetricsConfig)
+
+    NVPA_Status NVPW_RawMetricsConfig_EndPassGroup(NVPW_RawMetricsConfig_EndPassGroup_Params* pParams);
+
+    typedef struct NVPW_RawMetricsConfig_GetNumMetrics_Params // parameter block for NVPW_RawMetricsConfig_GetNumMetrics(); result in numMetrics
+    {
+        /// [in] presumably initialized from the _STRUCT_SIZE macro defined right after this struct (TODO confirm)
+        size_t structSize;
+        /// [in] assign to NULL
+        void* pPriv;
+        const NVPA_RawMetricsConfig* pRawMetricsConfig;
+        /// [out]
+        size_t numMetrics;
+    } NVPW_RawMetricsConfig_GetNumMetrics_Params;
+#define NVPW_RawMetricsConfig_GetNumMetrics_Params_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_RawMetricsConfig_GetNumMetrics_Params, numMetrics)
+
+    NVPA_Status NVPW_RawMetricsConfig_GetNumMetrics(NVPW_RawMetricsConfig_GetNumMetrics_Params* pParams);
+
+    typedef struct NVPW_RawMetricsConfig_GetMetricProperties_Params // parameter block for NVPW_RawMetricsConfig_GetMetricProperties()
+    {
+        /// [in] presumably initialized from the _STRUCT_SIZE macro defined right after this struct (TODO confirm)
+        size_t structSize;
+        /// [in] assign to NULL
+        void* pPriv;
+        const NVPA_RawMetricsConfig* pRawMetricsConfig;
+        size_t metricIndex; ///< presumably in [0, numMetrics) as reported by NVPW_RawMetricsConfig_GetNumMetrics() (TODO confirm)
+        /// [out]
+        const char* pMetricName;
+        /// [out] (this member is absent from the _V2 params struct)
+        NVPA_Bool supportsPipelined;
+        /// [out] (this member is absent from the _V2 params struct)
+        NVPA_Bool supportsIsolated;
+    } NVPW_RawMetricsConfig_GetMetricProperties_Params;
+#define NVPW_RawMetricsConfig_GetMetricProperties_Params_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_RawMetricsConfig_GetMetricProperties_Params, supportsIsolated)
+
+    NVPA_Status NVPW_RawMetricsConfig_GetMetricProperties(NVPW_RawMetricsConfig_GetMetricProperties_Params* pParams);
+
+    typedef struct NVPW_RawMetricsConfig_GetMetricProperties_V2_Params // parameter block for NVPW_RawMetricsConfig_GetMetricProperties_V2(); reports only the metric name
+    {
+        /// [in] presumably initialized from the _STRUCT_SIZE macro defined right after this struct (TODO confirm)
+        size_t structSize;
+        /// [in] assign to NULL
+        void* pPriv;
+        const NVPA_RawMetricsConfig* pRawMetricsConfig;
+        size_t metricIndex; ///< presumably in [0, numMetrics) as reported by NVPW_RawMetricsConfig_GetNumMetrics() (TODO confirm)
+        /// [out]
+        const char* pMetricName;
+    } NVPW_RawMetricsConfig_GetMetricProperties_V2_Params;
+#define NVPW_RawMetricsConfig_GetMetricProperties_V2_Params_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_RawMetricsConfig_GetMetricProperties_V2_Params, pMetricName)
+
+    NVPA_Status NVPW_RawMetricsConfig_GetMetricProperties_V2(NVPW_RawMetricsConfig_GetMetricProperties_V2_Params* pParams);
+
+    typedef struct NVPW_RawMetricsConfig_AddMetrics_Params // parameter block for NVPW_RawMetricsConfig_AddMetrics()
+    {
+        /// [in] presumably initialized from the _STRUCT_SIZE macro defined right after this struct (TODO confirm)
+        size_t structSize;
+        /// [in] assign to NULL
+        void* pPriv;
+        NVPA_RawMetricsConfig* pRawMetricsConfig;
+        const NVPA_RawMetricRequest* pRawMetricRequests;
+        size_t numMetricRequests; ///< presumably the number of elements in pRawMetricRequests (TODO confirm)
+    } NVPW_RawMetricsConfig_AddMetrics_Params;
+#define NVPW_RawMetricsConfig_AddMetrics_Params_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_RawMetricsConfig_AddMetrics_Params, numMetricRequests)
+
+    NVPA_Status NVPW_RawMetricsConfig_AddMetrics(NVPW_RawMetricsConfig_AddMetrics_Params* pParams);
+
+    typedef struct NVPW_RawMetricsConfig_IsAddMetricsPossible_Params // parameter block for NVPW_RawMetricsConfig_IsAddMetricsPossible(); takes a const config, answer in isPossible
+    {
+        /// [in] presumably initialized from the _STRUCT_SIZE macro defined right after this struct (TODO confirm)
+        size_t structSize;
+        /// [in] assign to NULL
+        void* pPriv;
+        const NVPA_RawMetricsConfig* pRawMetricsConfig;
+        const NVPA_RawMetricRequest* pRawMetricRequests;
+        size_t numMetricRequests; ///< presumably the number of elements in pRawMetricRequests (TODO confirm)
+        /// [out]
+        NVPA_Bool isPossible;
+    } NVPW_RawMetricsConfig_IsAddMetricsPossible_Params;
+#define NVPW_RawMetricsConfig_IsAddMetricsPossible_Params_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_RawMetricsConfig_IsAddMetricsPossible_Params, isPossible)
+
+    NVPA_Status NVPW_RawMetricsConfig_IsAddMetricsPossible(NVPW_RawMetricsConfig_IsAddMetricsPossible_Params* pParams);
+
+    typedef struct NVPW_RawMetricsConfig_GenerateConfigImage_Params // parameter block for NVPW_RawMetricsConfig_GenerateConfigImage()
+    {
+        /// [in] presumably initialized from the _STRUCT_SIZE macro defined right after this struct (TODO confirm)
+        size_t structSize;
+        /// [in] assign to NULL
+        void* pPriv;
+        NVPA_RawMetricsConfig* pRawMetricsConfig;
+        /// [in] If true, all existing pass groups may be merged to reduce number of passes.
+        /// If merge was successful, distribution of counters in passes may be updated as a side-effect. The effects
+        /// will be persistent in pRawMetricsConfig.
+        NVPA_Bool mergeAllPassGroups;
+    } NVPW_RawMetricsConfig_GenerateConfigImage_Params;
+#define NVPW_RawMetricsConfig_GenerateConfigImage_Params_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_RawMetricsConfig_GenerateConfigImage_Params, mergeAllPassGroups)
+
+    /// This API may fail if called inside a pass group with `mergeAllPassGroups` = true.
+    NVPA_Status NVPW_RawMetricsConfig_GenerateConfigImage(NVPW_RawMetricsConfig_GenerateConfigImage_Params* pParams);
+
+    typedef struct NVPW_RawMetricsConfig_GetConfigImage_Params // parameter block for NVPW_RawMetricsConfig_GetConfigImage()
+    {
+        /// [in] presumably initialized from the _STRUCT_SIZE macro defined right after this struct (TODO confirm)
+        size_t structSize;
+        /// [in] assign to NULL
+        void* pPriv;
+        const NVPA_RawMetricsConfig* pRawMetricsConfig;
+        /// [in] Number of bytes allocated for pBuffer
+        size_t bytesAllocated;
+        /// [out] [optional] Buffer receiving the config image
+        uint8_t* pBuffer;
+        /// [out] Count of bytes that would be copied into pBuffer
+        size_t bytesCopied;
+    } NVPW_RawMetricsConfig_GetConfigImage_Params;
+#define NVPW_RawMetricsConfig_GetConfigImage_Params_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_RawMetricsConfig_GetConfigImage_Params, bytesCopied)
+
+    NVPA_Status NVPW_RawMetricsConfig_GetConfigImage(NVPW_RawMetricsConfig_GetConfigImage_Params* pParams);
+
+    typedef struct NVPW_RawMetricsConfig_GetNumPasses_Params // parameter block for NVPW_RawMetricsConfig_GetNumPasses()
+    {
+        /// [in] presumably initialized from the _STRUCT_SIZE macro defined right after this struct (TODO confirm)
+        size_t structSize;
+        /// [in] assign to NULL
+        void* pPriv;
+        const NVPA_RawMetricsConfig* pRawMetricsConfig;
+        /// [out]
+        size_t numPipelinedPasses;
+        /// [out]
+        size_t numIsolatedPasses;
+    } NVPW_RawMetricsConfig_GetNumPasses_Params;
+#define NVPW_RawMetricsConfig_GetNumPasses_Params_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_RawMetricsConfig_GetNumPasses_Params, numIsolatedPasses)
+
+    /// Total num passes = numPipelinedPasses + numIsolatedPasses * numNestingLevels (numNestingLevels is not a member of the params struct)
+    NVPA_Status NVPW_RawMetricsConfig_GetNumPasses(NVPW_RawMetricsConfig_GetNumPasses_Params* pParams);
+
+    typedef struct NVPW_RawMetricsConfig_GetNumPasses_V2_Params // parameter block for NVPW_RawMetricsConfig_GetNumPasses_V2(); one combined count instead of pipelined/isolated
+    {
+        /// [in] presumably initialized from the _STRUCT_SIZE macro defined right after this struct (TODO confirm)
+        size_t structSize;
+        /// [in] assign to NULL
+        void* pPriv;
+        /// [in]
+        const NVPA_RawMetricsConfig* pRawMetricsConfig;
+        /// [out]
+        size_t numPasses;
+    } NVPW_RawMetricsConfig_GetNumPasses_V2_Params;
+#define NVPW_RawMetricsConfig_GetNumPasses_V2_Params_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_RawMetricsConfig_GetNumPasses_V2_Params, numPasses)
+
+    /// Total num passes = numPasses * numNestingLevels (numNestingLevels is not a member of the params struct)
+    NVPA_Status NVPW_RawMetricsConfig_GetNumPasses_V2(NVPW_RawMetricsConfig_GetNumPasses_V2_Params* pParams);
+
+    typedef struct NVPW_PeriodicSampler_Config_GetSocEstimatedSampleSize_Params // parameter block for NVPW_PeriodicSampler_Config_GetSocEstimatedSampleSize()
+    {
+        /// [in] presumably initialized from the _STRUCT_SIZE macro defined right after this struct (TODO confirm)
+        size_t structSize;
+        /// [in] assign to NULL
+        void* pPriv;
+        /// [in] Typically created by e.g. NVPW_RawMetricsConfig_GetConfigImage(), must be align(8).
+        const uint8_t* pConfig;
+        /// [in] presumably the size in bytes of pConfig (TODO confirm)
+        size_t configSize;
+        /// [out]
+        size_t sampleSize;
+    } NVPW_PeriodicSampler_Config_GetSocEstimatedSampleSize_Params;
+#define NVPW_PeriodicSampler_Config_GetSocEstimatedSampleSize_Params_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_PeriodicSampler_Config_GetSocEstimatedSampleSize_Params, sampleSize)
+
+    /// Estimate per sample records size based on a virtual device
+    NVPA_Status NVPW_PeriodicSampler_Config_GetSocEstimatedSampleSize(NVPW_PeriodicSampler_Config_GetSocEstimatedSampleSize_Params* pParams);
+
+    typedef struct NVPW_PeriodicSampler_Config_GetGpuEstimatedSampleSize_Params // parameter block for NVPW_PeriodicSampler_Config_GetGpuEstimatedSampleSize()
+    {
+        /// [in] presumably initialized from the _STRUCT_SIZE macro defined right after this struct (TODO confirm)
+        size_t structSize;
+        /// [in] assign to NULL
+        void* pPriv;
+        /// [in] Typically created by e.g. NVPW_RawMetricsConfig_GetConfigImage(), must be align(8).
+        const uint8_t* pConfig;
+        /// [in] presumably the size in bytes of pConfig (TODO confirm)
+        size_t configSize;
+        /// [out]
+        size_t sampleSize;
+    } NVPW_PeriodicSampler_Config_GetGpuEstimatedSampleSize_Params;
+#define NVPW_PeriodicSampler_Config_GetGpuEstimatedSampleSize_Params_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_PeriodicSampler_Config_GetGpuEstimatedSampleSize_Params, sampleSize)
+
+    /// Estimate per sample records size based on a virtual device
+    NVPA_Status NVPW_PeriodicSampler_Config_GetGpuEstimatedSampleSize(NVPW_PeriodicSampler_Config_GetGpuEstimatedSampleSize_Params* pParams);
+
+/**
+ *  @}
+ ******************************************************************************/
+ 
+    typedef struct NVPW_Config_GetRawCounterInfo_Params // parameter block for NVPW_Config_GetRawCounterInfo()
+    {
+        /// [in] presumably initialized from the _STRUCT_SIZE macro defined right after this struct (TODO confirm)
+        size_t structSize;
+        /// [in] assign to NULL
+        void* pPriv;
+        /// [in]
+        const uint8_t* pConfig;
+        /// [in] presumably the size in bytes of pConfig (TODO confirm)
+        size_t configSize;
+        /// [in]
+        const char* pRawCounterName;
+        /// [inout] array containing indices of passes the counter resides in. 'pPassIndices' is in, '*pPassIndices' is
+        /// out.
+        size_t* pPassIndices;
+        /// [inout] if 'pPassIndices' is NULL, the count of passes this counter resides in will be returned; otherwise
+        /// it should be set to the capacity of 'pPassIndices' array, and on return, it will be overwritten to reflect
+        /// the actual count filled into 'pPassIndices'
+        size_t numPassIndices;
+    } NVPW_Config_GetRawCounterInfo_Params;
+#define NVPW_Config_GetRawCounterInfo_Params_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_Config_GetRawCounterInfo_Params, numPassIndices)
+
+    NVPA_Status NVPW_Config_GetRawCounterInfo(NVPW_Config_GetRawCounterInfo_Params* pParams);
+
+    typedef struct NVPW_Config_GetRawCounters_Params // parameter block for NVPW_Config_GetRawCounters()
+    {
+        /// [in] presumably initialized from the _STRUCT_SIZE macro defined right after this struct (TODO confirm)
+        size_t structSize;
+        /// [in] assign to NULL
+        void* pPriv;
+        /// [in]
+        const uint8_t* pConfig;
+        /// [in] presumably the size in bytes of pConfig (TODO confirm)
+        size_t configSize;
+        /// [in]
+        size_t passIndex;
+        /// [inout] array containing raw counter names. 'ppRawCounterNames' is in, '*ppRawCounterNames' is out.
+        const char** ppRawCounterNames;
+        /// [inout] if 'ppRawCounterNames' is NULL, the count of raw counters will be returned; otherwise it should be
+        /// set to the capacity of 'ppRawCounterNames' array, and on return, it will be overwritten to reflect the
+        /// actual count filled into 'ppRawCounterNames'
+        size_t numRawCounters;
+    } NVPW_Config_GetRawCounters_Params;
+#define NVPW_Config_GetRawCounters_Params_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_Config_GetRawCounters_Params, numRawCounters)
+
+    NVPA_Status NVPW_Config_GetRawCounters(NVPW_Config_GetRawCounters_Params* pParams);
+
+/***************************************************************************//**
+ *  @name   CounterData Creation
+ *  @{
+ */
+
+    typedef struct NVPA_CounterDataBuilder NVPA_CounterDataBuilder; // forward declaration only; used via pointer in the declarations below
+
+    typedef struct NVPW_CounterDataBuilder_Create_Params // parameter block for NVPW_CounterDataBuilder_Create()
+    {
+        /// [in] presumably initialized from the _STRUCT_SIZE macro defined right after this struct (TODO confirm)
+        size_t structSize;
+        /// [in] assign to NULL
+        void* pPriv;
+        /// [out]
+        NVPA_CounterDataBuilder* pCounterDataBuilder;
+        const char* pChipName; ///< untagged here; presumably [in] (TODO confirm). Note it follows the [out] member and is the last member used by the size macro
+    } NVPW_CounterDataBuilder_Create_Params;
+#define NVPW_CounterDataBuilder_Create_Params_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_CounterDataBuilder_Create_Params, pChipName)
+
+    NVPA_Status NVPW_CounterDataBuilder_Create(NVPW_CounterDataBuilder_Create_Params* pParams);
+
+    typedef struct NVPW_CounterDataBuilder_Destroy_Params // parameter block for NVPW_CounterDataBuilder_Destroy()
+    {
+        /// [in] presumably initialized from the _STRUCT_SIZE macro defined right after this struct (TODO confirm)
+        size_t structSize;
+        /// [in] assign to NULL
+        void* pPriv;
+        NVPA_CounterDataBuilder* pCounterDataBuilder; ///< presumably the handle produced by NVPW_CounterDataBuilder_Create() (TODO confirm)
+    } NVPW_CounterDataBuilder_Destroy_Params;
+#define NVPW_CounterDataBuilder_Destroy_Params_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_CounterDataBuilder_Destroy_Params, pCounterDataBuilder)
+
+    NVPA_Status NVPW_CounterDataBuilder_Destroy(NVPW_CounterDataBuilder_Destroy_Params* pParams);
+
+    typedef struct NVPW_CounterDataBuilder_AddMetrics_Params // parameter block for NVPW_CounterDataBuilder_AddMetrics()
+    {
+        /// [in] presumably initialized from the _STRUCT_SIZE macro defined right after this struct (TODO confirm)
+        size_t structSize;
+        /// [in] assign to NULL
+        void* pPriv;
+        NVPA_CounterDataBuilder* pCounterDataBuilder;
+        const NVPA_RawMetricRequest* pRawMetricRequests;
+        size_t numMetricRequests; ///< presumably the number of elements in pRawMetricRequests (TODO confirm)
+    } NVPW_CounterDataBuilder_AddMetrics_Params;
+#define NVPW_CounterDataBuilder_AddMetrics_Params_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_CounterDataBuilder_AddMetrics_Params, numMetricRequests)
+
+    NVPA_Status NVPW_CounterDataBuilder_AddMetrics(NVPW_CounterDataBuilder_AddMetrics_Params* pParams);
+
+    typedef struct NVPW_CounterDataBuilder_GetCounterDataPrefix_Params // parameter block for NVPW_CounterDataBuilder_GetCounterDataPrefix()
+    {
+        /// [in] presumably initialized from the _STRUCT_SIZE macro defined right after this struct (TODO confirm)
+        size_t structSize;
+        /// [in] assign to NULL
+        void* pPriv;
+        NVPA_CounterDataBuilder* pCounterDataBuilder;
+        /// [in] Number of bytes allocated for pBuffer
+        size_t bytesAllocated;
+        /// [out] [optional] Buffer receiving the counter data prefix
+        uint8_t* pBuffer;
+        /// [out] Count of bytes that would be copied to pBuffer
+        size_t bytesCopied;
+    } NVPW_CounterDataBuilder_GetCounterDataPrefix_Params;
+#define NVPW_CounterDataBuilder_GetCounterDataPrefix_Params_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_CounterDataBuilder_GetCounterDataPrefix_Params, bytesCopied)
+
+    NVPA_Status NVPW_CounterDataBuilder_GetCounterDataPrefix(NVPW_CounterDataBuilder_GetCounterDataPrefix_Params* pParams);
+
+/**
+ *  @}
+ ******************************************************************************/
+ 
+/***************************************************************************//**
+ *  @name   MetricsContext - metric configuration and evaluation
+ *  @{
+ */
+
+    /// 'NVPA_MetricsContext' and its APIs are deprecated, please use 'NVPW_MetricsEvaluator' and its APIs instead.
+    typedef struct NVPA_MetricsContext NVPA_MetricsContext;
+
+    typedef enum NVPA_MetricDetailLevel // enumerators carry no explicit values, so they number 0..4 in declaration order
+    {
+        NVPA_METRIC_DETAIL_LEVEL_INVALID, // = 0
+        NVPA_METRIC_DETAIL_LEVEL_GPU, // = 1
+        NVPA_METRIC_DETAIL_LEVEL_ALL, // = 2
+        NVPA_METRIC_DETAIL_LEVEL_GPU_AND_LEAF_INSTANCES, // = 3
+        NVPA_METRIC_DETAIL_LEVEL__COUNT // = 4, the number of enumerators above; presumably a sentinel rather than a usable level
+    } NVPA_MetricDetailLevel;
+
+    typedef struct NVPW_MetricsContext_Destroy_Params // parameter block for NVPW_MetricsContext_Destroy() (part of the deprecated NVPA_MetricsContext API)
+    {
+        /// [in] presumably initialized from the _STRUCT_SIZE macro defined right after this struct (TODO confirm)
+        size_t structSize;
+        /// [in] assign to NULL
+        void* pPriv;
+        NVPA_MetricsContext* pMetricsContext;
+    } NVPW_MetricsContext_Destroy_Params;
+#define NVPW_MetricsContext_Destroy_Params_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_MetricsContext_Destroy_Params, pMetricsContext)
+
+    NVPA_Status NVPW_MetricsContext_Destroy(NVPW_MetricsContext_Destroy_Params* pParams);
+
+    typedef struct NVPW_MetricsContext_RunScript_Params // parameter block for NVPW_MetricsContext_RunScript() (part of the deprecated NVPA_MetricsContext API)
+    {
+        /// [in] presumably initialized from the _STRUCT_SIZE macro defined right after this struct (TODO confirm)
+        size_t structSize;
+        /// [in] assign to NULL
+        void* pPriv;
+        NVPA_MetricsContext* pMetricsContext;
+        /// [in] if true, upon error, calls PyErr_Print() which causes exceptions to be logged to stderr
+        NVPA_Bool printErrors;
+        /// [in] the script source code
+        const char* pSource;
+        /// [in] the filename reported in stack traces; if NULL, uses an auto-generated name
+        const char* pFileName;
+    } NVPW_MetricsContext_RunScript_Params;
+#define NVPW_MetricsContext_RunScript_Params_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_MetricsContext_RunScript_Params, pFileName)
+
+    /// Runs code in the metrics module.  Additional metrics can be added through this interface.
+    /// If printErrors is true, calls PyErr_Print() which causes exceptions to be logged to stderr.
+    /// Equivalent to:
+    ///      exec(source, metrics.__dict__, metrics.__dict__)
+    NVPA_Status NVPW_MetricsContext_RunScript(NVPW_MetricsContext_RunScript_Params* pParams);
+
+    typedef struct NVPW_MetricsContext_ExecScript_Begin_Params // parameter block for NVPW_MetricsContext_ExecScript_Begin() (part of the deprecated NVPA_MetricsContext API)
+    {
+        /// [in] presumably initialized from the _STRUCT_SIZE macro defined right after this struct (TODO confirm)
+        size_t structSize;
+        /// [in] assign to NULL
+        void* pPriv;
+        NVPA_MetricsContext* pMetricsContext;
+        /// [in] if true, treats pSource as a statement to be eval'd; otherwise, calls exec.
+        NVPA_Bool isStatement;
+        /// [in] if true, upon error, calls PyErr_Print() which causes exceptions to be logged to stderr
+        NVPA_Bool printErrors;
+        /// [in] the script source code
+        const char* pSource;
+        /// [in] the filename reported in stack traces; if NULL, uses an auto-generated name
+        const char* pFileName;
+        /// [out] if isStatement, points at a string form of the evaluation; if !isStatement, points at
+        /// str(locals()['result']); presumably valid only until NVPW_MetricsContext_ExecScript_End() is called (TODO confirm)
+        const char* pResultStr;
+    } NVPW_MetricsContext_ExecScript_Begin_Params;
+#define NVPW_MetricsContext_ExecScript_Begin_Params_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_MetricsContext_ExecScript_Begin_Params, pResultStr)
+
+    /// Executes a script in the metrics module, but does not modify its contents (for ordinary queries).
+    /// Equivalent to one of:
+    ///      eval(source, metrics.__dict__, {})            # isStatement true
+    ///      exec(source, metrics.__dict__, {})            # isStatement false
+    NVPA_Status NVPW_MetricsContext_ExecScript_Begin(NVPW_MetricsContext_ExecScript_Begin_Params* pParams);
+
+    typedef struct NVPW_MetricsContext_ExecScript_End_Params // parameter block for NVPW_MetricsContext_ExecScript_End() (part of the deprecated NVPA_MetricsContext API)
+    {
+        /// [in] presumably initialized from the _STRUCT_SIZE macro defined right after this struct (TODO confirm)
+        size_t structSize;
+        /// [in] assign to NULL
+        void* pPriv;
+        NVPA_MetricsContext* pMetricsContext;
+    } NVPW_MetricsContext_ExecScript_End_Params;
+#define NVPW_MetricsContext_ExecScript_End_Params_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_MetricsContext_ExecScript_End_Params, pMetricsContext)
+
+    /// Cleans up memory internally allocated by NVPW_MetricsContext_ExecScript_Begin.
+    NVPA_Status NVPW_MetricsContext_ExecScript_End(NVPW_MetricsContext_ExecScript_End_Params* pParams);
+
+    typedef struct NVPW_MetricsContext_GetCounterNames_Begin_Params // parameter block for NVPW_MetricsContext_GetCounterNames_Begin() (part of the deprecated NVPA_MetricsContext API)
+    {
+        /// [in] presumably initialized from the _STRUCT_SIZE macro defined right after this struct (TODO confirm)
+        size_t structSize;
+        /// [in] assign to NULL
+        void* pPriv;
+        NVPA_MetricsContext* pMetricsContext;
+        /// [out] the "size" half of the (size, pointer) pair described on the function below
+        size_t numCounters;
+        /// [out] the "pointer" half of the (size, pointer) pair described on the function below
+        const char* const* ppCounterNames;
+    } NVPW_MetricsContext_GetCounterNames_Begin_Params;
+#define NVPW_MetricsContext_GetCounterNames_Begin_Params_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_MetricsContext_GetCounterNames_Begin_Params, ppCounterNames)
+
+    /// Outputs (size, pointer) to an array of "const char* pCounterName".  The lifetime of the array is tied to
+    /// MetricsContext.  The names are sorted.
+    /// Impl: lazily creates list
+    NVPA_Status NVPW_MetricsContext_GetCounterNames_Begin(NVPW_MetricsContext_GetCounterNames_Begin_Params* pParams);
+
+    typedef struct NVPW_MetricsContext_GetCounterNames_End_Params
+    {
+        /// [in] set to NVPW_MetricsContext_GetCounterNames_End_Params_STRUCT_SIZE
+        size_t structSize;
+        /// [in] assign to NULL
+        void* pPriv;
+        NVPA_MetricsContext* pMetricsContext; ///< [in]
+    } NVPW_MetricsContext_GetCounterNames_End_Params;
+#define NVPW_MetricsContext_GetCounterNames_End_Params_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_MetricsContext_GetCounterNames_End_Params, pMetricsContext)
+
+    /// Cleans up memory internally allocated by NVPW_MetricsContext_GetCounterNames_Begin.
+    NVPA_Status NVPW_MetricsContext_GetCounterNames_End(NVPW_MetricsContext_GetCounterNames_End_Params* pParams);
+
+    typedef struct NVPW_MetricsContext_GetThroughputNames_Begin_Params
+    {
+        /// [in] set to NVPW_MetricsContext_GetThroughputNames_Begin_Params_STRUCT_SIZE
+        size_t structSize;
+        /// [in] assign to NULL
+        void* pPriv;
+        NVPA_MetricsContext* pMetricsContext; ///< [in]
+        /// [out] number of elements in array ppThroughputNames
+        size_t numThroughputs;
+        /// [out] pointer to array of 'const char* pThroughputName'
+        const char* const* ppThroughputNames;
+    } NVPW_MetricsContext_GetThroughputNames_Begin_Params;
+#define NVPW_MetricsContext_GetThroughputNames_Begin_Params_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_MetricsContext_GetThroughputNames_Begin_Params, ppThroughputNames)
+
+    /// Outputs (size, pointer) to an array of "const char* pThroughputName".  The lifetime of the array is tied to
+    /// MetricsContext.  The names are sorted.
+    /// Impl: lazily creates list
+    NVPA_Status NVPW_MetricsContext_GetThroughputNames_Begin(NVPW_MetricsContext_GetThroughputNames_Begin_Params* pParams);
+
+    typedef struct NVPW_MetricsContext_GetThroughputNames_End_Params
+    {
+        /// [in] set to NVPW_MetricsContext_GetThroughputNames_End_Params_STRUCT_SIZE
+        size_t structSize;
+        /// [in] assign to NULL
+        void* pPriv;
+        NVPA_MetricsContext* pMetricsContext; ///< [in]
+    } NVPW_MetricsContext_GetThroughputNames_End_Params;
+#define NVPW_MetricsContext_GetThroughputNames_End_Params_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_MetricsContext_GetThroughputNames_End_Params, pMetricsContext)
+
+    /// Cleans up memory internally allocated by NVPW_MetricsContext_GetThroughputNames_Begin.
+    NVPA_Status NVPW_MetricsContext_GetThroughputNames_End(NVPW_MetricsContext_GetThroughputNames_End_Params* pParams);
+
+    typedef struct NVPW_MetricsContext_GetRatioNames_Begin_Params
+    {
+        /// [in] set to NVPW_MetricsContext_GetRatioNames_Begin_Params_STRUCT_SIZE
+        size_t structSize;
+        /// [in] assign to NULL
+        void* pPriv;
+        NVPA_MetricsContext* pMetricsContext; ///< [in]
+        /// [out] number of elements in array ppRatioNames
+        size_t numRatios;
+        /// [out] pointer to array of 'const char* pRatioName'
+        const char* const* ppRatioNames;
+    } NVPW_MetricsContext_GetRatioNames_Begin_Params;
+#define NVPW_MetricsContext_GetRatioNames_Begin_Params_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_MetricsContext_GetRatioNames_Begin_Params, ppRatioNames)
+
+    /// Outputs (size, pointer) to an array of "const char* pRatioName".  The lifetime of the array is tied to
+    /// MetricsContext.  The names are sorted.
+    /// Impl: lazily creates list
+    NVPA_Status NVPW_MetricsContext_GetRatioNames_Begin(NVPW_MetricsContext_GetRatioNames_Begin_Params* pParams);
+
+    typedef struct NVPW_MetricsContext_GetRatioNames_End_Params
+    {
+        /// [in] set to NVPW_MetricsContext_GetRatioNames_End_Params_STRUCT_SIZE
+        size_t structSize;
+        /// [in] assign to NULL
+        void* pPriv;
+        NVPA_MetricsContext* pMetricsContext; ///< [in]
+    } NVPW_MetricsContext_GetRatioNames_End_Params;
+#define NVPW_MetricsContext_GetRatioNames_End_Params_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_MetricsContext_GetRatioNames_End_Params, pMetricsContext)
+
+    /// Cleans up memory internally allocated by NVPW_MetricsContext_GetRatioNames_Begin.
+    NVPA_Status NVPW_MetricsContext_GetRatioNames_End(NVPW_MetricsContext_GetRatioNames_End_Params* pParams);
+
+    typedef struct NVPW_MetricsContext_GetMetricNames_Begin_Params
+    {
+        /// [in] set to NVPW_MetricsContext_GetMetricNames_Begin_Params_STRUCT_SIZE
+        size_t structSize;
+        /// [in] assign to NULL
+        void* pPriv;
+        NVPA_MetricsContext* pMetricsContext; ///< [in]
+        /// out: number of elements in array ppMetricNames
+        size_t numMetrics;
+        /// out: pointer to array of 'const char* pMetricName'
+        const char* const* ppMetricNames;
+        /// in : if true, doesn't enumerate \<metric\>.peak_{burst, sustained}
+        NVPA_Bool hidePeakSubMetrics;
+        /// in : if true, doesn't enumerate \<metric\>.per_{active,elapsed,region,frame}_cycle
+        NVPA_Bool hidePerCycleSubMetrics;
+        /// in : if true, doesn't enumerate \<metric\>.pct_of_peak_{burst,sustained}_{active,elapsed,region,frame}
+        NVPA_Bool hidePctOfPeakSubMetrics;
+        /// in : if false, enumerate \<unit\>__throughput.pct_of_peak_sustained_elapsed even if hidePctOfPeakSubMetrics
+        /// is true
+        NVPA_Bool hidePctOfPeakSubMetricsOnThroughputs;
+    } NVPW_MetricsContext_GetMetricNames_Begin_Params;
+#define NVPW_MetricsContext_GetMetricNames_Begin_Params_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_MetricsContext_GetMetricNames_Begin_Params, hidePctOfPeakSubMetricsOnThroughputs)
+
+    /// Outputs (size, pointer) to an array of "const char* pMetricName".  The lifetime of the array is tied to
+    /// MetricsContext.  The names are sorted.
+    /// Enumerates all metrics at all levels, subject to the hide* flags in pParams.  Includes:
+    ///  *   counter.{sum,avg,min,max}
+    ///  *   throughput.{avg,min,max}
+    ///  *   \<metric\>.peak_{burst, sustained}
+    ///  *   \<metric\>.per_{active,elapsed,region,frame}_cycle
+    ///  *   \<metric\>.pct_of_peak_{burst,sustained}_{active,elapsed,region,frame}
+    ///  *   \<metric\>.per.{other, other_pct}
+    NVPA_Status NVPW_MetricsContext_GetMetricNames_Begin(NVPW_MetricsContext_GetMetricNames_Begin_Params* pParams);
+
+    typedef struct NVPW_MetricsContext_GetMetricNames_End_Params
+    {
+        /// [in] set to NVPW_MetricsContext_GetMetricNames_End_Params_STRUCT_SIZE
+        size_t structSize;
+        /// [in] assign to NULL
+        void* pPriv;
+        NVPA_MetricsContext* pMetricsContext; ///< [in]
+    } NVPW_MetricsContext_GetMetricNames_End_Params;
+#define NVPW_MetricsContext_GetMetricNames_End_Params_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_MetricsContext_GetMetricNames_End_Params, pMetricsContext)
+
+    /// Cleans up memory internally allocated by NVPW_MetricsContext_GetMetricNames_Begin.
+    NVPA_Status NVPW_MetricsContext_GetMetricNames_End(NVPW_MetricsContext_GetMetricNames_End_Params* pParams);
+
+    typedef struct NVPW_MetricsContext_GetThroughputBreakdown_Begin_Params
+    {
+        /// [in] set to NVPW_MetricsContext_GetThroughputBreakdown_Begin_Params_STRUCT_SIZE
+        size_t structSize;
+        /// [in] assign to NULL
+        void* pPriv;
+        NVPA_MetricsContext* pMetricsContext; ///< [in]
+        const char* pThroughputName; ///< [in]
+        const char* const* ppCounterNames; ///< [out] no count field exists; presumably NULL-terminated - confirm
+        const char* const* ppSubThroughputNames; ///< [out] no count field exists; presumably NULL-terminated - confirm
+    } NVPW_MetricsContext_GetThroughputBreakdown_Begin_Params;
+#define NVPW_MetricsContext_GetThroughputBreakdown_Begin_Params_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_MetricsContext_GetThroughputBreakdown_Begin_Params, ppSubThroughputNames)
+
+    /// After this function returns, the lifetimes of strings pointed to by {ppCounterNames, ppSubThroughputNames}
+    /// are guaranteed until NVPW_MetricsContext_GetThroughputBreakdown_End, or until pMetricsContext
+    /// is destroyed
+    NVPA_Status NVPW_MetricsContext_GetThroughputBreakdown_Begin(NVPW_MetricsContext_GetThroughputBreakdown_Begin_Params* pParams);
+
+    typedef struct NVPW_MetricsContext_GetThroughputBreakdown_End_Params
+    {
+        /// [in] set to NVPW_MetricsContext_GetThroughputBreakdown_End_Params_STRUCT_SIZE
+        size_t structSize;
+        /// [in] assign to NULL
+        void* pPriv;
+        NVPA_MetricsContext* pMetricsContext; ///< [in]
+    } NVPW_MetricsContext_GetThroughputBreakdown_End_Params;
+#define NVPW_MetricsContext_GetThroughputBreakdown_End_Params_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_MetricsContext_GetThroughputBreakdown_End_Params, pMetricsContext)
+
+    /// Cleans up memory internally allocated by NVPW_MetricsContext_GetThroughputBreakdown_Begin.
+    NVPA_Status NVPW_MetricsContext_GetThroughputBreakdown_End(NVPW_MetricsContext_GetThroughputBreakdown_End_Params* pParams);
+
+    typedef struct NVPW_MetricsContext_GetMetricProperties_Begin_Params
+    {
+        /// [in] set to NVPW_MetricsContext_GetMetricProperties_Begin_Params_STRUCT_SIZE
+        size_t structSize;
+        /// [in] assign to NULL
+        void* pPriv;
+        NVPA_MetricsContext* pMetricsContext; ///< [in]
+        const char* pMetricName; ///< [in]
+        /// out
+        const char* pDescription;
+        /// out
+        const char* pDimUnits;
+        /// out: a NULL-terminated array of pointers to RawMetric names that can be passed to
+        /// NVPW_RawMetricsConfig_AddMetrics()
+        const char** ppRawMetricDependencies;
+        /// out: metric.peak_burst.value.gpu
+        double gpuBurstRate;
+        /// out: metric.peak_sustained.value.gpu
+        double gpuSustainedRate;
+        /// out: a NULL-terminated array of pointers to RawMetric names that can be passed to
+        /// NVPW_RawMetricsConfig_AddMetrics().
+        const char** ppOptionalRawMetricDependencies;
+    } NVPW_MetricsContext_GetMetricProperties_Begin_Params;
+#define NVPW_MetricsContext_GetMetricProperties_Begin_Params_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_MetricsContext_GetMetricProperties_Begin_Params, ppOptionalRawMetricDependencies)
+
+    /// After this function returns, the lifetimes of strings pointed to by the out fields of pParams (the original
+    /// wording named 'pMetricProperties', which is not a field of this struct) or ppOptionalRawMetricDependencies are
+    /// guaranteed until NVPW_MetricsContext_GetMetricProperties_End, or until pMetricsContext is destroyed.
+    NVPA_Status NVPW_MetricsContext_GetMetricProperties_Begin(NVPW_MetricsContext_GetMetricProperties_Begin_Params* pParams);
+
+    typedef struct NVPW_MetricsContext_GetMetricProperties_End_Params
+    {
+        /// [in] set to NVPW_MetricsContext_GetMetricProperties_End_Params_STRUCT_SIZE
+        size_t structSize;
+        /// [in] assign to NULL
+        void* pPriv;
+        NVPA_MetricsContext* pMetricsContext; ///< [in]
+    } NVPW_MetricsContext_GetMetricProperties_End_Params;
+#define NVPW_MetricsContext_GetMetricProperties_End_Params_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_MetricsContext_GetMetricProperties_End_Params, pMetricsContext)
+
+    /// Cleans up memory internally allocated by NVPW_MetricsContext_GetMetricProperties_Begin.
+    NVPA_Status NVPW_MetricsContext_GetMetricProperties_End(NVPW_MetricsContext_GetMetricProperties_End_Params* pParams);
+
+    typedef struct NVPW_MetricsContext_SetCounterData_Params
+    {
+        /// [in] set to NVPW_MetricsContext_SetCounterData_Params_STRUCT_SIZE
+        size_t structSize;
+        /// [in] assign to NULL
+        void* pPriv;
+        NVPA_MetricsContext* pMetricsContext; ///< [in]
+        const uint8_t* pCounterDataImage; ///< [in]
+        size_t rangeIndex; ///< [in]
+        NVPA_Bool isolated; ///< [in]
+    } NVPW_MetricsContext_SetCounterData_Params;
+#define NVPW_MetricsContext_SetCounterData_Params_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_MetricsContext_SetCounterData_Params, isolated)
+
+    /// Sets data for subsequent evaluation calls.
+    /// Only one (CounterData, range, isolated) set of counters can be active at a time; subsequent calls will overwrite
+    /// previous calls' data.
+    NVPA_Status NVPW_MetricsContext_SetCounterData(NVPW_MetricsContext_SetCounterData_Params* pParams);
+
+    typedef struct NVPW_MetricsContext_SetUserData_Params
+    {
+        /// [in] set to NVPW_MetricsContext_SetUserData_Params_STRUCT_SIZE
+        size_t structSize;
+        /// [in] assign to NULL
+        void* pPriv;
+        NVPA_MetricsContext* pMetricsContext; ///< [in]
+        /// [in] duration in ns of user defined frame
+        double frameDuration;
+        /// [in] duration in ns of user defined region
+        double regionDuration;
+    } NVPW_MetricsContext_SetUserData_Params;
+#define NVPW_MetricsContext_SetUserData_Params_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_MetricsContext_SetUserData_Params, regionDuration)
+
+    /// Sets user data (frame and region durations) for subsequent evaluation calls.
+    NVPA_Status NVPW_MetricsContext_SetUserData(NVPW_MetricsContext_SetUserData_Params* pParams);
+
+    typedef struct NVPW_MetricsContext_EvaluateToGpuValues_Params
+    {
+        /// [in] set to NVPW_MetricsContext_EvaluateToGpuValues_Params_STRUCT_SIZE
+        size_t structSize;
+        /// [in] assign to NULL
+        void* pPriv;
+        NVPA_MetricsContext* pMetricsContext; ///< [in]
+        size_t numMetrics; ///< [in] number of elements in array ppMetricNames
+        const char* const* ppMetricNames; ///< [in]
+        /// [out] presumably a caller-provided array with room for numMetrics values - confirm
+        double* pMetricValues;
+    } NVPW_MetricsContext_EvaluateToGpuValues_Params;
+#define NVPW_MetricsContext_EvaluateToGpuValues_Params_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_MetricsContext_EvaluateToGpuValues_Params, pMetricValues)
+
+    /// Evaluate multiple metrics to retrieve their GPU values.
+    NVPA_Status NVPW_MetricsContext_EvaluateToGpuValues(NVPW_MetricsContext_EvaluateToGpuValues_Params* pParams);
+
+    typedef struct NVPW_MetricsContext_GetMetricSuffix_Begin_Params
+    {
+        /// [in] set to NVPW_MetricsContext_GetMetricSuffix_Begin_Params_STRUCT_SIZE
+        size_t structSize;
+        /// [in] assign to NULL
+        void* pPriv;
+        NVPA_MetricsContext* pMetricsContext; ///< [in]
+        /// in: pointer to the metric name
+        const char* pMetricName;
+        /// out: number of elements in array ppSuffixes
+        size_t numSuffixes;
+        /// out: pointer to array of 'const char* pSuffix'
+        const char* const* ppSuffixes;
+        /// in : if true, doesn't enumerate \<metric\>.peak_{burst, sustained}
+        NVPA_Bool hidePeakSubMetrics;
+        /// in : if true, doesn't enumerate \<metric\>.per_{active,elapsed,region,frame}_cycle
+        NVPA_Bool hidePerCycleSubMetrics;
+        /// in : if true, doesn't enumerate \<metric\>.pct_of_peak_{burst,sustained}_{active,elapsed,region,frame}
+        NVPA_Bool hidePctOfPeakSubMetrics;
+        /// in : if false, enumerate \<unit\>__throughput.pct_of_peak_sustained_elapsed even if hidePctOfPeakSubMetrics
+        /// is true
+        NVPA_Bool hidePctOfPeakSubMetricsOnThroughputs;
+    } NVPW_MetricsContext_GetMetricSuffix_Begin_Params;
+#define NVPW_MetricsContext_GetMetricSuffix_Begin_Params_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_MetricsContext_GetMetricSuffix_Begin_Params, hidePctOfPeakSubMetricsOnThroughputs)
+
+    /// Outputs (size, pointer) to an array of "const char* pSuffix".  The lifetime of the array is tied to
+    /// MetricsContext.
+    /// Returns all the suffixes the metric has.  The possible suffixes include:
+    ///  *   counter.{sum,avg,min,max}
+    ///  *   throughput.{avg,min,max}
+    ///  *   \<metric\>.peak_{burst, sustained}
+    ///  *   \<metric\>.per_{active,elapsed,region,frame}_cycle
+    ///  *   \<metric\>.pct_of_peak_{burst,sustained}_{active,elapsed,region,frame}
+    ///  *   \<metric\>.per.{other, other_pct}
+    NVPA_Status NVPW_MetricsContext_GetMetricSuffix_Begin(NVPW_MetricsContext_GetMetricSuffix_Begin_Params* pParams);
+
+    typedef struct NVPW_MetricsContext_GetMetricSuffix_End_Params
+    {
+        /// [in] set to NVPW_MetricsContext_GetMetricSuffix_End_Params_STRUCT_SIZE
+        size_t structSize;
+        /// [in] assign to NULL
+        void* pPriv;
+        NVPA_MetricsContext* pMetricsContext; ///< [in]
+    } NVPW_MetricsContext_GetMetricSuffix_End_Params;
+#define NVPW_MetricsContext_GetMetricSuffix_End_Params_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_MetricsContext_GetMetricSuffix_End_Params, pMetricsContext)
+
+    /// Cleans up memory internally allocated by NVPW_MetricsContext_GetMetricSuffix_Begin.
+    NVPA_Status NVPW_MetricsContext_GetMetricSuffix_End(NVPW_MetricsContext_GetMetricSuffix_End_Params* pParams);
+
+    typedef struct NVPW_MetricsContext_GetMetricBaseNames_Begin_Params
+    {
+        /// [in] set to NVPW_MetricsContext_GetMetricBaseNames_Begin_Params_STRUCT_SIZE
+        size_t structSize;
+        /// [in] assign to NULL
+        void* pPriv;
+        NVPA_MetricsContext* pMetricsContext; ///< [in]
+        /// out: number of elements in array ppMetricBaseNames
+        size_t numMetricBaseNames;
+        /// out: pointer to array of 'const char* pMetricBaseName'
+        const char* const* ppMetricBaseNames;
+    } NVPW_MetricsContext_GetMetricBaseNames_Begin_Params;
+#define NVPW_MetricsContext_GetMetricBaseNames_Begin_Params_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_MetricsContext_GetMetricBaseNames_Begin_Params, ppMetricBaseNames)
+
+    /// Outputs (size, pointer) to an array of "const char* pMetricBaseName".  The lifetime of the array is tied to
+    /// MetricsContext.
+    /// Returns all the metric base names.
+    NVPA_Status NVPW_MetricsContext_GetMetricBaseNames_Begin(NVPW_MetricsContext_GetMetricBaseNames_Begin_Params* pParams);
+
+    typedef struct NVPW_MetricsContext_GetMetricBaseNames_End_Params
+    {
+        /// [in] set to NVPW_MetricsContext_GetMetricBaseNames_End_Params_STRUCT_SIZE
+        size_t structSize;
+        /// [in] assign to NULL
+        void* pPriv;
+        NVPA_MetricsContext* pMetricsContext; ///< [in]
+    } NVPW_MetricsContext_GetMetricBaseNames_End_Params;
+#define NVPW_MetricsContext_GetMetricBaseNames_End_Params_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_MetricsContext_GetMetricBaseNames_End_Params, pMetricsContext)
+
+    /// Cleans up memory internally allocated by NVPW_MetricsContext_GetMetricBaseNames_Begin.
+    NVPA_Status NVPW_MetricsContext_GetMetricBaseNames_End(NVPW_MetricsContext_GetMetricBaseNames_End_Params* pParams);
+
+/**
+ *  @}
+ ******************************************************************************/
+ 
+/***************************************************************************//**
+ *  @name   Metrics Evaluator
+ *  @{
+ */
+
+    typedef struct NVPW_MetricsEvaluator NVPW_MetricsEvaluator; ///< forward declaration only; the params structs in this section hold it by pointer
+
+#ifndef NVPW_DIM_UNIT_DEFINED
+#define NVPW_DIM_UNIT_DEFINED
+    typedef enum NVPW_DimUnitName // non-sequential 32-bit values, several above INT_MAX; structs carry them as uint32_t 'dimUnit'
+    {
+        NVPW_DIM_UNIT_INVALID = 3518299157,
+        NVPW_DIM_UNIT_UNITLESS = 2126137902,
+        NVPW_DIM_UNIT_ATTRIBUTES = 3776338729,
+        NVPW_DIM_UNIT_BYTES = 3797850191,
+        NVPW_DIM_UNIT_CTAS = 1960564139,
+        NVPW_DIM_UNIT_CTC_CYCLES = 2224883873,
+        NVPW_DIM_UNIT_DRAM_CYCLES = 2650981327,
+        NVPW_DIM_UNIT_FBP_CYCLES = 1785238957,
+        NVPW_DIM_UNIT_FE_OPS = 2919159083,
+        NVPW_DIM_UNIT_GPC_CYCLES = 1222631184,
+        NVPW_DIM_UNIT_IDC_REQUESTS = 2012649669,
+        NVPW_DIM_UNIT_INSTRUCTIONS = 1418625543,
+        NVPW_DIM_UNIT_KILOBYTES = 1335980302,
+        NVPW_DIM_UNIT_L1DATA_BANK_ACCESSES = 1479493682,
+        NVPW_DIM_UNIT_L1DATA_BANK_CONFLICTS = 3433170787,
+        NVPW_DIM_UNIT_L1TEX_REQUESTS = 1306473767,
+        NVPW_DIM_UNIT_L1TEX_TAGS = 26573010,
+        NVPW_DIM_UNIT_L1TEX_WAVEFRONTS = 129373765,
+        NVPW_DIM_UNIT_L2_REQUESTS = 1143695106,
+        NVPW_DIM_UNIT_L2_SECTORS = 3424101564,
+        NVPW_DIM_UNIT_L2_TAGS = 3755612781,
+        NVPW_DIM_UNIT_MCC_CYCLES = 1826685787,
+        NVPW_DIM_UNIT_NANOSECONDS = 3047500672,
+        NVPW_DIM_UNIT_NVLRX_CYCLES = 4059934930,
+        NVPW_DIM_UNIT_NVLTX_CYCLES = 1814350488,
+        NVPW_DIM_UNIT_PCIE_CYCLES = 1230450943,
+        NVPW_DIM_UNIT_PERCENT = 1284354694,
+        NVPW_DIM_UNIT_PIXELS = 4227616663,
+        NVPW_DIM_UNIT_PIXEL_SHADER_BARRIERS = 3705502518,
+        NVPW_DIM_UNIT_PRIMITIVES = 2373084002,
+        NVPW_DIM_UNIT_QUADS = 1539753497,
+        NVPW_DIM_UNIT_REGISTERS = 2837260947,
+        NVPW_DIM_UNIT_SAMPLES = 746046551,
+        NVPW_DIM_UNIT_SECONDS = 1164825258,
+        NVPW_DIM_UNIT_SYS_CYCLES = 3310821688,
+        NVPW_DIM_UNIT_TEXELS = 1293214069,
+        NVPW_DIM_UNIT_THREADS = 164261907,
+        NVPW_DIM_UNIT_VERTICES = 1873662209,
+        NVPW_DIM_UNIT_WARPS = 97951949,
+        NVPW_DIM_UNIT_WORKLOADS = 1728142656
+    } NVPW_DimUnitName;
+#endif //NVPW_DIM_UNIT_DEFINED
+
+#ifndef NVPW_HW_UNIT_DEFINED
+#define NVPW_HW_UNIT_DEFINED
+    typedef enum NVPW_HwUnit // non-sequential 32-bit values, several above INT_MAX; structs carry them in integer 'hwUnit' fields
+    {
+        NVPW_HW_UNIT_INVALID = 3498035701,
+        NVPW_HW_UNIT_CROP = 2872137846,
+        NVPW_HW_UNIT_DRAM = 1662616918,
+        NVPW_HW_UNIT_DRAMC = 1401232876,
+        NVPW_HW_UNIT_FBP = 2947194306,
+        NVPW_HW_UNIT_FBPA = 690045803,
+        NVPW_HW_UNIT_FE = 2204924321,
+        NVPW_HW_UNIT_GPC = 1911735839,
+        NVPW_HW_UNIT_GPU = 1014363534,
+        NVPW_HW_UNIT_GR = 2933618517,
+        NVPW_HW_UNIT_IDC = 842765289,
+        NVPW_HW_UNIT_L1TEX = 893940957,
+        NVPW_HW_UNIT_LTS = 2333266697,
+        NVPW_HW_UNIT_MCC = 3980130194,
+        NVPW_HW_UNIT_NVLRX = 3091684901,
+        NVPW_HW_UNIT_NVLTX = 869679659,
+        NVPW_HW_UNIT_PCIE = 3433264174,
+        NVPW_HW_UNIT_PDA = 345193251,
+        NVPW_HW_UNIT_PES = 804128425,
+        NVPW_HW_UNIT_PROP = 3339255507,
+        NVPW_HW_UNIT_RASTER = 187932504,
+        NVPW_HW_UNIT_SM = 724224710,
+        NVPW_HW_UNIT_SMSP = 2837616917,
+        NVPW_HW_UNIT_SYS = 768990063,
+        NVPW_HW_UNIT_TPC = 1889024613,
+        NVPW_HW_UNIT_VAF = 753670509,
+        NVPW_HW_UNIT_VPC = 275561583,
+        NVPW_HW_UNIT_ZROP = 979500456
+    } NVPW_HwUnit;
+#endif //NVPW_HW_UNIT_DEFINED
+
+    typedef enum NVPW_RollupOp // stored in the uint8_t 'rollupOp' field of NVPW_MetricEvalRequest
+    {
+        NVPW_ROLLUP_OP_AVG = 0,
+        NVPW_ROLLUP_OP_MAX, ///< = 1
+        NVPW_ROLLUP_OP_MIN, ///< = 2
+        NVPW_ROLLUP_OP_SUM, ///< = 3
+        NVPW_ROLLUP_OP__COUNT ///< = 4, i.e. the number of enumerators above
+    } NVPW_RollupOp;
+
+    typedef enum NVPW_MetricType // stored in the uint8_t 'metricType' fields of the params structs
+    {
+        NVPW_METRIC_TYPE_COUNTER = 0,
+        NVPW_METRIC_TYPE_RATIO, ///< = 1
+        NVPW_METRIC_TYPE_THROUGHPUT, ///< = 2
+        NVPW_METRIC_TYPE__COUNT ///< = 3, i.e. the number of enumerators above
+    } NVPW_MetricType;
+
+    typedef enum NVPW_Submetric // stored in uint16_t fields ('submetric', 'pSupportedSubmetrics')
+    {
+        NVPW_SUBMETRIC_NONE = 0,
+        NVPW_SUBMETRIC_PEAK_SUSTAINED = 1,
+        NVPW_SUBMETRIC_PEAK_SUSTAINED_ACTIVE = 2,
+        NVPW_SUBMETRIC_PEAK_SUSTAINED_ACTIVE_PER_SECOND = 3,
+        NVPW_SUBMETRIC_PEAK_SUSTAINED_ELAPSED = 4,
+        NVPW_SUBMETRIC_PEAK_SUSTAINED_ELAPSED_PER_SECOND = 5,
+        NVPW_SUBMETRIC_PEAK_SUSTAINED_FRAME = 6,
+        NVPW_SUBMETRIC_PEAK_SUSTAINED_FRAME_PER_SECOND = 7,
+        NVPW_SUBMETRIC_PEAK_SUSTAINED_REGION = 8,
+        NVPW_SUBMETRIC_PEAK_SUSTAINED_REGION_PER_SECOND = 9,
+        NVPW_SUBMETRIC_PER_CYCLE_ACTIVE = 10,
+        NVPW_SUBMETRIC_PER_CYCLE_ELAPSED = 11,
+        NVPW_SUBMETRIC_PER_CYCLE_IN_FRAME = 12,
+        NVPW_SUBMETRIC_PER_CYCLE_IN_REGION = 13,
+        NVPW_SUBMETRIC_PER_SECOND = 14,
+        NVPW_SUBMETRIC_PCT_OF_PEAK_SUSTAINED_ACTIVE = 15,
+        NVPW_SUBMETRIC_PCT_OF_PEAK_SUSTAINED_ELAPSED = 16,
+        NVPW_SUBMETRIC_PCT_OF_PEAK_SUSTAINED_FRAME = 17,
+        NVPW_SUBMETRIC_PCT_OF_PEAK_SUSTAINED_REGION = 18,
+        NVPW_SUBMETRIC_MAX_RATE = 19,
+        NVPW_SUBMETRIC_PCT = 20,
+        NVPW_SUBMETRIC_RATIO = 21,
+        NVPW_SUBMETRIC__COUNT ///< = 22, i.e. the number of enumerators above
+    } NVPW_Submetric;
+
+    typedef struct NVPW_MetricEvalRequest // has no structSize member; callers pass NVPW_MetricEvalRequest_STRUCT_SIZE separately
+    {
+        /// the metric index as in 'NVPW_MetricsEvaluator_GetMetricNames'
+        size_t metricIndex;
+        /// one of 'NVPW_MetricType'
+        uint8_t metricType;
+        /// one of 'NVPW_RollupOp', required for Counter and Throughput, doesn't apply to Ratio
+        uint8_t rollupOp;
+        /// one of 'NVPW_Submetric', required for Ratio and Throughput, optional for Counter
+        uint16_t submetric;
+    } NVPW_MetricEvalRequest;
+#define NVPW_MetricEvalRequest_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_MetricEvalRequest, submetric)
+
+    typedef struct NVPW_DimUnitFactor
+    {
+        /// one of 'NVPW_DimUnitName'
+        uint32_t dimUnit;
+        int8_t exponent; ///< presumably the signed power applied to dimUnit (negative for denominators) - confirm
+    } NVPW_DimUnitFactor;
+#define NVPW_DimUnitFactor_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_DimUnitFactor, exponent)
+
+    typedef struct NVPW_MetricsEvaluator_Destroy_Params
+    {
+        /// [in] set to NVPW_MetricsEvaluator_Destroy_Params_STRUCT_SIZE
+        size_t structSize;
+        /// [in] assign to NULL
+        void* pPriv;
+        /// [in]
+        struct NVPW_MetricsEvaluator* pMetricsEvaluator;
+    } NVPW_MetricsEvaluator_Destroy_Params;
+#define NVPW_MetricsEvaluator_Destroy_Params_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_MetricsEvaluator_Destroy_Params, pMetricsEvaluator)
+
+    NVPA_Status NVPW_MetricsEvaluator_Destroy(NVPW_MetricsEvaluator_Destroy_Params* pParams);
+
+    typedef struct NVPW_MetricsEvaluator_GetMetricNames_Params
+    {
+        /// [in] set to NVPW_MetricsEvaluator_GetMetricNames_Params_STRUCT_SIZE
+        size_t structSize;
+        /// [in] assign to NULL
+        void* pPriv;
+        /// [in]
+        struct NVPW_MetricsEvaluator* pMetricsEvaluator;
+        /// [in] one of 'NVPW_MetricType'
+        uint8_t metricType;
+        /// [out] presumably one character buffer holding all names back to back - confirm
+        const char* pMetricNames;
+        /// [out] presumably the start offset of each name within pMetricNames, numMetrics entries - confirm
+        const size_t* pMetricNameBeginIndices;
+        /// [out]
+        size_t numMetrics;
+    } NVPW_MetricsEvaluator_GetMetricNames_Params;
+#define NVPW_MetricsEvaluator_GetMetricNames_Params_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_MetricsEvaluator_GetMetricNames_Params, numMetrics)
+
+    NVPA_Status NVPW_MetricsEvaluator_GetMetricNames(NVPW_MetricsEvaluator_GetMetricNames_Params* pParams);
+
+    typedef struct NVPW_MetricsEvaluator_GetMetricTypeAndIndex_Params
+    {
+        /// [in] set to NVPW_MetricsEvaluator_GetMetricTypeAndIndex_Params_STRUCT_SIZE
+        size_t structSize;
+        /// [in] assign to NULL
+        void* pPriv;
+        /// [in]
+        struct NVPW_MetricsEvaluator* pMetricsEvaluator;
+        /// [in] can be either a base metric or a metric
+        const char* pMetricName;
+        /// [out] one of 'NVPW_MetricType'
+        uint8_t metricType;
+        /// [out] the metric index as in 'NVPW_MetricsEvaluator_GetMetricNames'
+        size_t metricIndex;
+    } NVPW_MetricsEvaluator_GetMetricTypeAndIndex_Params;
+#define NVPW_MetricsEvaluator_GetMetricTypeAndIndex_Params_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_MetricsEvaluator_GetMetricTypeAndIndex_Params, metricIndex)
+
+    NVPA_Status NVPW_MetricsEvaluator_GetMetricTypeAndIndex(NVPW_MetricsEvaluator_GetMetricTypeAndIndex_Params* pParams);
+
+    typedef struct NVPW_MetricsEvaluator_ConvertMetricNameToMetricEvalRequest_Params
+    {
+        /// [in] set to NVPW_MetricsEvaluator_ConvertMetricNameToMetricEvalRequest_Params_STRUCT_SIZE
+        size_t structSize;
+        /// [in] assign to NULL
+        void* pPriv;
+        /// [in]
+        struct NVPW_MetricsEvaluator* pMetricsEvaluator;
+        /// [in]
+        const char* pMetricName;
+        /// [inout] 'pMetricEvalRequest' is in, '*pMetricEvalRequest' is out
+        struct NVPW_MetricEvalRequest* pMetricEvalRequest;
+        /// [in] set to 'NVPW_MetricEvalRequest_STRUCT_SIZE'
+        size_t metricEvalRequestStructSize;
+    } NVPW_MetricsEvaluator_ConvertMetricNameToMetricEvalRequest_Params;
+#define NVPW_MetricsEvaluator_ConvertMetricNameToMetricEvalRequest_Params_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_MetricsEvaluator_ConvertMetricNameToMetricEvalRequest_Params, metricEvalRequestStructSize)
+
+    NVPA_Status NVPW_MetricsEvaluator_ConvertMetricNameToMetricEvalRequest(NVPW_MetricsEvaluator_ConvertMetricNameToMetricEvalRequest_Params* pParams);
+
+    typedef struct NVPW_MetricsEvaluator_HwUnitToString_Params
+    {
+        /// [in] set to NVPW_MetricsEvaluator_HwUnitToString_Params_STRUCT_SIZE
+        size_t structSize;
+        /// [in] assign to NULL
+        void* pPriv;
+        /// [in]
+        struct NVPW_MetricsEvaluator* pMetricsEvaluator;
+        /// [in] one of 'NVPW_HwUnit'
+        uint32_t hwUnit;
+        /// [out]
+        const char* pHwUnitName;
+    } NVPW_MetricsEvaluator_HwUnitToString_Params;
+#define NVPW_MetricsEvaluator_HwUnitToString_Params_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_MetricsEvaluator_HwUnitToString_Params, pHwUnitName)
+
+    NVPA_Status NVPW_MetricsEvaluator_HwUnitToString(NVPW_MetricsEvaluator_HwUnitToString_Params* pParams);
+
+    typedef struct NVPW_MetricsEvaluator_GetCounterProperties_Params
+    {
+        /// [in] set to NVPW_MetricsEvaluator_GetCounterProperties_Params_STRUCT_SIZE
+        size_t structSize;
+        /// [in] assign to NULL
+        void* pPriv;
+        /// [in]
+        struct NVPW_MetricsEvaluator* pMetricsEvaluator;
+        /// [in] the metric index as in 'NVPW_MetricsEvaluator_GetMetricNames'
+        size_t counterIndex;
+        /// [out]
+        const char* pDescription;
+        /// [out] one of 'NVPW_HwUnit'
+        uint32_t hwUnit;
+    } NVPW_MetricsEvaluator_GetCounterProperties_Params;
+#define NVPW_MetricsEvaluator_GetCounterProperties_Params_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_MetricsEvaluator_GetCounterProperties_Params, hwUnit)
+
+    NVPA_Status NVPW_MetricsEvaluator_GetCounterProperties(NVPW_MetricsEvaluator_GetCounterProperties_Params* pParams);
+
+    typedef struct NVPW_MetricsEvaluator_GetRatioMetricProperties_Params
+    {
+        /// [in] set to NVPW_MetricsEvaluator_GetRatioMetricProperties_Params_STRUCT_SIZE
+        size_t structSize;
+        /// [in] assign to NULL
+        void* pPriv;
+        /// [in]
+        struct NVPW_MetricsEvaluator* pMetricsEvaluator;
+        /// [in] the metric index as in 'NVPW_MetricsEvaluator_GetMetricNames'
+        size_t ratioMetricIndex;
+        /// [out]
+        const char* pDescription;
+        /// [out] presumably one of 'NVPW_HwUnit'; NOTE: 64-bit here, 32-bit in the Counter/Throughput property structs
+        uint64_t hwUnit;
+    } NVPW_MetricsEvaluator_GetRatioMetricProperties_Params;
+#define NVPW_MetricsEvaluator_GetRatioMetricProperties_Params_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_MetricsEvaluator_GetRatioMetricProperties_Params, hwUnit)
+
+    NVPA_Status NVPW_MetricsEvaluator_GetRatioMetricProperties(NVPW_MetricsEvaluator_GetRatioMetricProperties_Params* pParams);
+
+    typedef struct NVPW_MetricsEvaluator_GetThroughputMetricProperties_Params
+    {
+        /// [in] set to NVPW_MetricsEvaluator_GetThroughputMetricProperties_Params_STRUCT_SIZE
+        size_t structSize;
+        /// [in] assign to NULL
+        void* pPriv;
+        /// [in]
+        struct NVPW_MetricsEvaluator* pMetricsEvaluator;
+        /// [in] the metric index as in 'NVPW_MetricsEvaluator_GetMetricNames'
+        size_t throughputMetricIndex;
+        /// [out]
+        const char* pDescription;
+        /// [out] presumably one of 'NVPW_HwUnit', as documented on the Counter variant - confirm
+        uint32_t hwUnit;
+        /// [out] number of constituent counters for the throughput metric
+        size_t numCounters;
+        /// [out] metric indices as in 'NVPW_MetricsEvaluator_GetMetricNames', valid if 'numCounters' > 0, otherwise
+        /// returned as nullptr
+        const size_t* pCounterIndices;
+        /// [out] number of constituent sub-throughputs for the throughput metric
+        size_t numSubThroughputs;
+        /// [out] metric indices as in 'NVPW_MetricsEvaluator_GetMetricNames', valid if 'numSubThroughputs' > 0,
+        /// otherwise returned as nullptr
+        const size_t* pSubThroughputIndices;
+    } NVPW_MetricsEvaluator_GetThroughputMetricProperties_Params;
+#define NVPW_MetricsEvaluator_GetThroughputMetricProperties_Params_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_MetricsEvaluator_GetThroughputMetricProperties_Params, pSubThroughputIndices)
+
+    NVPA_Status NVPW_MetricsEvaluator_GetThroughputMetricProperties(NVPW_MetricsEvaluator_GetThroughputMetricProperties_Params* pParams);
+
+    typedef struct NVPW_MetricsEvaluator_GetSupportedSubmetrics_Params
+    {
+        /// [in] set to NVPW_MetricsEvaluator_GetSupportedSubmetrics_Params_STRUCT_SIZE
+        size_t structSize;
+        /// [in] assign to NULL
+        void* pPriv;
+        /// [in]
+        struct NVPW_MetricsEvaluator* pMetricsEvaluator;
+        /// [in] one of 'NVPW_MetricType'
+        uint8_t metricType;
+        /// [out] an array of 'NVPW_Submetric'
+        const uint16_t* pSupportedSubmetrics;
+        /// [out] presumably the number of elements in pSupportedSubmetrics - confirm
+        size_t numSupportedSubmetrics;
+    } NVPW_MetricsEvaluator_GetSupportedSubmetrics_Params;
+#define NVPW_MetricsEvaluator_GetSupportedSubmetrics_Params_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_MetricsEvaluator_GetSupportedSubmetrics_Params, numSupportedSubmetrics)
+
+    NVPA_Status NVPW_MetricsEvaluator_GetSupportedSubmetrics(NVPW_MetricsEvaluator_GetSupportedSubmetrics_Params* pParams);
+
+    typedef struct NVPW_MetricsEvaluator_GetMetricRawDependencies_Params
+    {
+        /// [in] set to NVPW_MetricsEvaluator_GetMetricRawDependencies_Params_STRUCT_SIZE
+        size_t structSize;
+        /// [in] assign to NULL
+        void* pPriv;
+        /// [in]
+        struct NVPW_MetricsEvaluator* pMetricsEvaluator;
+        /// [in] presumably an array of numMetricEvalRequests requests spaced metricEvalRequestStrideSize bytes apart - confirm
+        const struct NVPW_MetricEvalRequest* pMetricEvalRequests;
+        /// [in]
+        size_t numMetricEvalRequests;
+        /// [in] set to 'NVPW_MetricEvalRequest_STRUCT_SIZE'
+        size_t metricEvalRequestStructSize;
+        /// [in] set to sizeof('NVPW_MetricEvalRequest')
+        size_t metricEvalRequestStrideSize;
+        /// [inout] 'ppRawDependencies' is in, '*ppRawDependencies' is out
+        const char** ppRawDependencies;
+        /// [inout] if 'ppRawDependencies' is NULL, number of raw dependencies available will be returned; otherwise it
+        /// should be set to the number of elements allocated for 'ppRawDependencies', and on return, it will be
+        /// overwritten by number of elements copied to 'ppRawDependencies'
+        size_t numRawDependencies;
+        /// [inout] 'ppOptionalRawDependencies' is in, '*ppOptionalRawDependencies' is out
+        const char** ppOptionalRawDependencies;
+        /// [inout] if 'ppOptionalRawDependencies' is NULL, number of optional raw dependencies available will be
+        /// returned; otherwise it should be set to the number of elements allocated for 'ppOptionalRawDependencies',
+        /// and on return, it will be overwritten by number of elements copied to 'ppOptionalRawDependencies'
+        size_t numOptionalRawDependencies;
+    } NVPW_MetricsEvaluator_GetMetricRawDependencies_Params;
+#define NVPW_MetricsEvaluator_GetMetricRawDependencies_Params_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_MetricsEvaluator_GetMetricRawDependencies_Params, numOptionalRawDependencies)
+
+    NVPA_Status NVPW_MetricsEvaluator_GetMetricRawDependencies(NVPW_MetricsEvaluator_GetMetricRawDependencies_Params* pParams);
+
+    typedef struct NVPW_MetricsEvaluator_DimUnitToString_Params
+    {
+        /// [in]
+        size_t structSize;
+        /// [in] assign to NULL
+        void* pPriv;
+        /// [in]
+        struct NVPW_MetricsEvaluator* pMetricsEvaluator;
+        /// [in] one of 'NVPW_DimUnitName'
+        uint32_t dimUnit;
+        /// [out]
+        const char* pSingularName;
+        /// [out]
+        const char* pPluralName;
+    } NVPW_MetricsEvaluator_DimUnitToString_Params;
+#define NVPW_MetricsEvaluator_DimUnitToString_Params_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_MetricsEvaluator_DimUnitToString_Params, pPluralName)
+
+    NVPA_Status NVPW_MetricsEvaluator_DimUnitToString(NVPW_MetricsEvaluator_DimUnitToString_Params* pParams);
+
+    typedef struct NVPW_MetricsEvaluator_GetMetricDimUnits_Params
+    {
+        /// [in]
+        size_t structSize;
+        /// [in] assign to NULL
+        void* pPriv;
+        /// [in]
+        struct NVPW_MetricsEvaluator* pMetricsEvaluator;
+        /// [in]
+        const struct NVPW_MetricEvalRequest* pMetricEvalRequest;
+        /// [in] set to 'NVPW_MetricEvalRequest_STRUCT_SIZE'
+        size_t metricEvalRequestStructSize;
+        /// [inout] 'pDimUnits' is in, '*pDimUnits' is out
+        NVPW_DimUnitFactor* pDimUnits;
+        /// [inout] if 'pDimUnits' is NULL, number of dim-units available will be returned; otherwise it should be set
+        /// to the number of elements allocated for 'pDimUnits', and on return, it will be overwritten by number of
+        /// elements copied to 'pDimUnits'
+        size_t numDimUnits;
+        /// [in] set to 'NVPW_DimUnitFactor_STRUCT_SIZE'
+        size_t dimUnitFactorStructSize;
+    } NVPW_MetricsEvaluator_GetMetricDimUnits_Params;
+#define NVPW_MetricsEvaluator_GetMetricDimUnits_Params_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_MetricsEvaluator_GetMetricDimUnits_Params, dimUnitFactorStructSize)
+
+    NVPA_Status NVPW_MetricsEvaluator_GetMetricDimUnits(NVPW_MetricsEvaluator_GetMetricDimUnits_Params* pParams);
+
+    typedef struct NVPW_MetricsEvaluator_SetUserData_Params
+    {
+        /// [in]
+        size_t structSize;
+        /// [in] assign to NULL
+        void* pPriv;
+        /// [in]
+        struct NVPW_MetricsEvaluator* pMetricsEvaluator;
+        /// [in] duration in ns of user defined frame
+        double frameDuration;
+        /// [in] duration in ns of user defined region
+        double regionDuration;
+        /// [in]
+        NVPA_Bool isolated;
+    } NVPW_MetricsEvaluator_SetUserData_Params;
+#define NVPW_MetricsEvaluator_SetUserData_Params_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_MetricsEvaluator_SetUserData_Params, isolated)
+
+    NVPA_Status NVPW_MetricsEvaluator_SetUserData(NVPW_MetricsEvaluator_SetUserData_Params* pParams);
+
+    typedef struct NVPW_MetricsEvaluator_EvaluateToGpuValues_Params
+    {
+        /// [in]
+        size_t structSize;
+        /// [in] assign to NULL
+        void* pPriv;
+        /// [in]
+        struct NVPW_MetricsEvaluator* pMetricsEvaluator;
+        /// [in]
+        const struct NVPW_MetricEvalRequest* pMetricEvalRequests;
+        /// [in]
+        size_t numMetricEvalRequests;
+        /// [in] set to 'NVPW_MetricEvalRequest_STRUCT_SIZE'
+        size_t metricEvalRequestStructSize;
+        /// [in] set to sizeof('NVPW_MetricEvalRequest')
+        size_t metricEvalRequestStrideSize;
+        /// [in]
+        const uint8_t* pCounterDataImage;
+        /// [in]
+        size_t counterDataImageSize;
+        /// [in]
+        size_t rangeIndex;
+        /// [in]
+        NVPA_Bool isolated;
+        /// [inout] 'pMetricValues' is in, '*pMetricValues' is out
+        double* pMetricValues;
+    } NVPW_MetricsEvaluator_EvaluateToGpuValues_Params;
+#define NVPW_MetricsEvaluator_EvaluateToGpuValues_Params_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_MetricsEvaluator_EvaluateToGpuValues_Params, pMetricValues)
+
+    NVPA_Status NVPW_MetricsEvaluator_EvaluateToGpuValues(NVPW_MetricsEvaluator_EvaluateToGpuValues_Params* pParams);
+
+    typedef struct NVPW_MetricsEvaluator_SetDeviceAttributes_Params
+    {
+        /// [in]
+        size_t structSize;
+        /// [in] assign to NULL
+        void* pPriv;
+        /// [in]
+        struct NVPW_MetricsEvaluator* pMetricsEvaluator;
+        /// [in]
+        const uint8_t* pCounterDataImage;
+        /// [in]
+        size_t counterDataImageSize;
+    } NVPW_MetricsEvaluator_SetDeviceAttributes_Params;
+#define NVPW_MetricsEvaluator_SetDeviceAttributes_Params_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_MetricsEvaluator_SetDeviceAttributes_Params, counterDataImageSize)
+
+    NVPA_Status NVPW_MetricsEvaluator_SetDeviceAttributes(NVPW_MetricsEvaluator_SetDeviceAttributes_Params* pParams);
+
+/**
+ *  @}
+ ******************************************************************************/
+ 
+
+#endif // NVPERF_HOST_API_DEFINED
+
+
+
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#if defined(__GNUC__) && defined(NVPA_SHARED_LIB)
+    #pragma GCC visibility pop
+#endif
+
+#endif // NVPERF_HOST_H
diff --git a/.venv/lib/python3.11/site-packages/nvidia/cuda_cupti/include/nvperf_target.h b/.venv/lib/python3.11/site-packages/nvidia/cuda_cupti/include/nvperf_target.h
new file mode 100644
index 0000000000000000000000000000000000000000..fdced20ae6c03b2923d95c9ce42cdbe9cf86db08
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/nvidia/cuda_cupti/include/nvperf_target.h
@@ -0,0 +1,597 @@
+#ifndef NVPERF_TARGET_H
+#define NVPERF_TARGET_H
+
+/*
+ * Copyright 2014-2023 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO USER:
+ *
+ * This source code is subject to NVIDIA ownership rights under U.S. and
+ * international Copyright laws.
+ *
+ * This software and the information contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and conditions
+ * of a form of NVIDIA software license agreement.
+ *
+ * NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE
+ * CODE FOR ANY PURPOSE.  IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR
+ * IMPLIED WARRANTY OF ANY KIND.  NVIDIA DISCLAIMS ALL WARRANTIES WITH
+ * REGARD TO THIS SOURCE CODE, INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL,
+ * OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS
+ * OF USE, DATA OR PROFITS,  WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
+ * OR OTHER TORTIOUS ACTION,  ARISING OUT OF OR IN CONNECTION WITH THE USE
+ * OR PERFORMANCE OF THIS SOURCE CODE.
+ *
+ * U.S. Government End Users.   This source code is a "commercial item" as
+ * that term is defined at  48 C.F.R. 2.101 (OCT 1995), consisting  of
+ * "commercial computer  software"  and "commercial computer software
+ * documentation" as such terms are  used in 48 C.F.R. 12.212 (SEPT 1995)
+ * and is provided to the U.S. Government only as a commercial end item.
+ * Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through
+ * 227.7202-4 (JUNE 1995), all U.S. Government End Users acquire the
+ * source code with only those rights set forth herein.
+ *
+ * Any use of this source code in individual and commercial software must
+ * include, in the user documentation and internal comments to the code,
+ * the above Disclaimer and U.S. Government End Users Notice.
+ */
+
+#include <stddef.h>
+#include <stdint.h>
+#include "nvperf_common.h"
+
+#if defined(__GNUC__) && defined(NVPA_SHARED_LIB)
+    #pragma GCC visibility push(default)
+    #if !defined(NVPW_LOCAL)
+        #define NVPW_LOCAL __attribute__ ((visibility ("hidden")))
+    #endif
+#else
+    #if !defined(NVPW_LOCAL)
+        #define NVPW_LOCAL
+    #endif
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/**
+ *  @file   nvperf_target.h
+ */
+
+#ifndef NVPW_GPU_ARCHITECTURE_SUPPORT_LEVEL_DEFINED
+#define NVPW_GPU_ARCHITECTURE_SUPPORT_LEVEL_DEFINED
+    /// GPU architecture support level
+    typedef enum NVPW_GpuArchitectureSupportLevel
+    {
+        NVPW_GPU_ARCHITECTURE_SUPPORT_LEVEL_UNKNOWN = 0,
+        NVPW_GPU_ARCHITECTURE_SUPPORT_LEVEL_UNSUPPORTED,
+        NVPW_GPU_ARCHITECTURE_SUPPORT_LEVEL_SUPPORTED
+    } NVPW_GpuArchitectureSupportLevel;
+#endif //NVPW_GPU_ARCHITECTURE_SUPPORT_LEVEL_DEFINED
+
+#ifndef NVPW_SLI_SUPPORT_LEVEL_DEFINED
+#define NVPW_SLI_SUPPORT_LEVEL_DEFINED
+    /// SLI configuration support level
+    typedef enum NVPW_SliSupportLevel
+    {
+        NVPW_SLI_SUPPORT_LEVEL_UNKNOWN = 0,
+        NVPW_SLI_SUPPORT_LEVEL_UNSUPPORTED,
+        /// Only Non-SLI configurations are supported.
+        NVPW_SLI_SUPPORT_LEVEL_SUPPORTED_NON_SLI_CONFIGURATION
+    } NVPW_SliSupportLevel;
+#endif //NVPW_SLI_SUPPORT_LEVEL_DEFINED
+
+#ifndef NVPW_VGPU_SUPPORT_LEVEL_DEFINED
+#define NVPW_VGPU_SUPPORT_LEVEL_DEFINED
+    /// Virtualized GPU configuration support level
+    typedef enum NVPW_VGpuSupportLevel
+    {
+        NVPW_VGPU_SUPPORT_LEVEL_UNKNOWN = 0,
+        NVPW_VGPU_SUPPORT_LEVEL_UNSUPPORTED,
+        /// Supported but not allowed by system admin.
+        NVPW_VGPU_SUPPORT_LEVEL_SUPPORTED_DISALLOWED,
+        NVPW_VGPU_SUPPORT_LEVEL_SUPPORTED_ALLOWED,
+        NVPW_VGPU_SUPPORT_LEVEL_SUPPORTED_NON_VGPU_CONFIGURATION
+    } NVPW_VGpuSupportLevel;
+#endif //NVPW_VGPU_SUPPORT_LEVEL_DEFINED
+
+#ifndef NVPW_CONF_COMPUTE_SUPPORT_LEVEL_DEFINED
+#define NVPW_CONF_COMPUTE_SUPPORT_LEVEL_DEFINED
+    /// Confidential Compute mode support level
+    typedef enum NVPW_ConfidentialComputeSupportLevel
+    {
+        NVPW_CONF_COMPUTE_SUPPORT_LEVEL_UNKNOWN = 0,
+        NVPW_CONF_COMPUTE_SUPPORT_LEVEL_UNSUPPORTED,
+        NVPW_CONF_COMPUTE_SUPPORT_LEVEL_SUPPORTED_NON_CONF_COMPUTE_CONFIGURATION,
+        NVPW_CONF_COMPUTE_SUPPORT_LEVEL_SUPPORTED_CONF_COMPUTE_DEVTOOLS_MODE
+    } NVPW_ConfidentialComputeSupportLevel;
+#endif //NVPW_CONF_COMPUTE_SUPPORT_LEVEL_DEFINED
+
+#ifndef NVPW_CMP_SUPPORT_LEVEL_DEFINED
+#define NVPW_CMP_SUPPORT_LEVEL_DEFINED
+    /// CMP support level
+    typedef enum NVPW_CmpSupportLevel
+    {
+        NVPW_CMP_SUPPORT_LEVEL_UNKNOWN = 0,
+        NVPW_CMP_SUPPORT_LEVEL_UNSUPPORTED,
+        NVPW_CMP_SUPPORT_LEVEL_SUPPORTED_NON_CMP_CONFIGURATON ///< NOTE: "CONFIGURATON" (sic) is how this identifier is spelled; use it verbatim
+    } NVPW_CmpSupportLevel;
+#endif //NVPW_CMP_SUPPORT_LEVEL_DEFINED
+
+#ifndef NVPW_WSL_SUPPORT_LEVEL_DEFINED
+#define NVPW_WSL_SUPPORT_LEVEL_DEFINED
+    /// WSL support level
+    typedef enum NVPW_WslSupportLevel
+    {
+        NVPW_WSL_SUPPORT_LEVEL_UNKNOWN = 0,
+        NVPW_WSL_SUPPORT_LEVEL_UNSUPPORTED_INSUFFICIENT_DRIVER_VERSION,
+        NVPW_WSL_SUPPORT_LEVEL_SUPPORTED,
+        NVPW_WSL_SUPPORT_LEVEL_SUPPORTED_NON_WSL_CONFIGURATION
+    } NVPW_WslSupportLevel;
+#endif //NVPW_WSL_SUPPORT_LEVEL_DEFINED
+
+    typedef struct NVPW_InitializeTarget_Params
+    {
+        /// [in]
+        size_t structSize;
+        /// [in] assign to NULL
+        void* pPriv;
+    } NVPW_InitializeTarget_Params;
+#define NVPW_InitializeTarget_Params_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_InitializeTarget_Params, pPriv)
+
+    /// Load the target library.
+    NVPA_Status NVPW_InitializeTarget(NVPW_InitializeTarget_Params* pParams);
+
+    typedef struct NVPW_GetDeviceCount_Params
+    {
+        /// [in]
+        size_t structSize;
+        /// [in] assign to NULL
+        void* pPriv;
+        size_t numDevices; ///< not annotated upstream; presumably [out], since NVPW_Device_GetPciBusIds_Params tells callers to use the result of NVPW_GetDeviceCount - confirm
+    } NVPW_GetDeviceCount_Params;
+#define NVPW_GetDeviceCount_Params_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_GetDeviceCount_Params, numDevices)
+
+    NVPA_Status NVPW_GetDeviceCount(NVPW_GetDeviceCount_Params* pParams);
+
+    typedef struct NVPW_Device_GetNames_Params
+    {
+        /// [in]
+        size_t structSize;
+        /// [in] assign to NULL
+        void* pPriv;
+        size_t deviceIndex; ///< not annotated upstream; presumably [in] - confirm
+        const char* pDeviceName; ///< not annotated upstream; presumably [out] - confirm
+        const char* pChipName; ///< not annotated upstream; presumably [out] - confirm
+    } NVPW_Device_GetNames_Params;
+#define NVPW_Device_GetNames_Params_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_Device_GetNames_Params, pChipName)
+
+    NVPA_Status NVPW_Device_GetNames(NVPW_Device_GetNames_Params* pParams);
+
+    typedef struct NVPW_PciBusId
+    {
+        /// The PCI domain on which the device bus resides.
+        uint32_t domain;
+        /// The bus on which the device resides.
+        uint16_t bus;
+        /// Device ID.
+        uint16_t device;
+    } NVPW_PciBusId;
+#define NVPW_PciBusId_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_PciBusId, device)
+
+    typedef struct NVPW_Device_GetPciBusIds_Params
+    {
+        /// [in]
+        size_t structSize;
+        /// [in] assign to NULL
+        void* pPriv;
+        /// [in] caller-allocated array of NVPW_PciBusId, indexed by NVPW deviceIndex
+        NVPW_PciBusId* pBusIds;
+        /// [in] size of the pBusIds array; use result from NVPW_GetDeviceCount
+        size_t numDevices;
+    } NVPW_Device_GetPciBusIds_Params;
+#define NVPW_Device_GetPciBusIds_Params_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_Device_GetPciBusIds_Params, numDevices)
+
+    NVPA_Status NVPW_Device_GetPciBusIds(NVPW_Device_GetPciBusIds_Params* pParams);
+
+
+#define NVPW_DEVICE_MIG_GPU_INSTANCE_ID_INVALID     0xFFFFFFFFu
+#define NVPW_DEVICE_MIG_GPU_INSTANCE_ID_FULLCHIP    0xFFFFFFFEu
+
+
+    typedef struct NVPW_Device_GetMigAttributes_Params
+    {
+        /// [in]
+        size_t structSize;
+        /// [in] assign to NULL
+        void* pPriv;
+        /// [in]
+        size_t deviceIndex;
+        /// [out]
+        NVPA_Bool isMigPartition;
+        /// [out]
+        uint32_t gpuInstanceId;
+        /// [out]
+        uint32_t computeInstanceId;
+    } NVPW_Device_GetMigAttributes_Params;
+#define NVPW_Device_GetMigAttributes_Params_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_Device_GetMigAttributes_Params, computeInstanceId)
+
+    NVPA_Status NVPW_Device_GetMigAttributes(NVPW_Device_GetMigAttributes_Params* pParams);
+
+    typedef struct NVPW_Adapter_GetDeviceIndex_Params
+    {
+        /// [in]
+        size_t structSize;
+        /// [in] assign to NULL
+        void* pPriv;
+        /// [in]
+        struct IDXGIAdapter* pAdapter;
+        /// [in]
+        size_t sliIndex;
+        /// [out]
+        size_t deviceIndex;
+    } NVPW_Adapter_GetDeviceIndex_Params;
+#define NVPW_Adapter_GetDeviceIndex_Params_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_Adapter_GetDeviceIndex_Params, deviceIndex)
+
+    NVPA_Status NVPW_Adapter_GetDeviceIndex(NVPW_Adapter_GetDeviceIndex_Params* pParams);
+
+    typedef struct NVPW_CounterData_GetNumRanges_Params
+    {
+        /// [in]
+        size_t structSize;
+        /// [in] assign to NULL
+        void* pPriv;
+        const uint8_t* pCounterDataImage; ///< not annotated upstream; presumably [in] - confirm
+        size_t numRanges; ///< not annotated upstream; presumably [out] - confirm
+    } NVPW_CounterData_GetNumRanges_Params;
+#define NVPW_CounterData_GetNumRanges_Params_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_CounterData_GetNumRanges_Params, numRanges)
+
+    NVPA_Status NVPW_CounterData_GetNumRanges(NVPW_CounterData_GetNumRanges_Params* pParams);
+
+    typedef struct NVPW_CounterData_GetChipName_Params
+    {
+        /// [in]
+        size_t structSize;
+        /// [in] assign to NULL
+        void* pPriv;
+        /// [in]
+        const uint8_t* pCounterDataImage;
+        /// [in]
+        size_t counterDataImageSize;
+        /// [out]
+        const char* pChipName;
+    } NVPW_CounterData_GetChipName_Params;
+#define NVPW_CounterData_GetChipName_Params_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_CounterData_GetChipName_Params, pChipName)
+
+    NVPA_Status NVPW_CounterData_GetChipName(NVPW_CounterData_GetChipName_Params* pParams);
+
+    typedef struct NVPW_Config_GetNumPasses_Params
+    {
+        /// [in]
+        size_t structSize;
+        /// [in] assign to NULL
+        void* pPriv;
+        /// [in]
+        const uint8_t* pConfig;
+        /// [out]
+        size_t numPipelinedPasses;
+        /// [out]
+        size_t numIsolatedPasses;
+    } NVPW_Config_GetNumPasses_Params;
+#define NVPW_Config_GetNumPasses_Params_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_Config_GetNumPasses_Params, numIsolatedPasses)
+
+    /// Total num passes = numPipelinedPasses + numIsolatedPasses * numNestingLevels
+    NVPA_Status NVPW_Config_GetNumPasses(NVPW_Config_GetNumPasses_Params* pParams);
+
+    typedef struct NVPW_Config_GetNumPasses_V2_Params
+    {
+        /// [in]
+        size_t structSize;
+        /// [in] assign to NULL
+        void* pPriv;
+        /// [in]
+        const uint8_t* pConfig;
+        /// [out]
+        size_t numPasses;
+    } NVPW_Config_GetNumPasses_V2_Params;
+#define NVPW_Config_GetNumPasses_V2_Params_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_Config_GetNumPasses_V2_Params, numPasses)
+
+    /// Total num passes = numPasses * numNestingLevels
+    NVPA_Status NVPW_Config_GetNumPasses_V2(NVPW_Config_GetNumPasses_V2_Params* pParams);
+
+#define NVPW_API_SET_CUDA_PROFILER             0x18209d0775b2f89dULL
+
+#define NVPW_API_SET_D3D11_PROFILER            0xca55c6738445db2bULL
+
+#define NVPW_API_SET_D3D12_PROFILER            0xc0c2d46dd7c7ad78ULL
+
+#define NVPW_API_SET_EGL_PROFILER              0x3c3747dae1f9565cULL
+
+#define NVPW_API_SET_GPU_PERIODICSAMPLER       0x9f4c2571fc0b2e8aULL
+
+#define NVPW_API_SET_METRICSCONTEXT            0x7c8579f6f2144beaULL
+
+#define NVPW_API_SET_METRICSEVALUATOR          0x0368a8768d811af9ULL
+
+#define NVPW_API_SET_METRICS_AD10X_COMP        0xbe57278e12cb5288ULL
+
+#define NVPW_API_SET_METRICS_AD10X_GRFX        0x5cbf0774f81bf491ULL
+
+#define NVPW_API_SET_METRICS_GA100_COMP        0x16b7d8c20d8b4915ULL
+
+#define NVPW_API_SET_METRICS_GA100_GRFX        0xc94eaabec04a94faULL
+
+#define NVPW_API_SET_METRICS_GA10X_COMP        0xb5d6391c2e299ab5ULL
+
+#define NVPW_API_SET_METRICS_GA10X_GRFX        0x6ebc121178b5ce0bULL
+
+#define NVPW_API_SET_METRICS_GV100_COMP        0x863705cc57919f72ULL
+
+#define NVPW_API_SET_METRICS_GV100_GRFX        0x9900da75d164fecfULL
+
+#define NVPW_API_SET_METRICS_GV11B_COMP        0xd3f79a859235848fULL
+
+#define NVPW_API_SET_METRICS_GV11B_GRFX        0xeb8e26220106e227ULL
+
+#define NVPW_API_SET_METRICS_TU10X_COMP        0x70f40be0afd35da8ULL
+
+#define NVPW_API_SET_METRICS_TU10X_GRFX        0xdf219cb838db6968ULL
+
+#define NVPW_API_SET_METRICS_TU11X_COMP        0xeb0069d7d0956678ULL
+
+#define NVPW_API_SET_METRICS_TU11X_GRFX        0x0977d9342bd62743ULL
+
+#define NVPW_API_SET_OPENGL_PROFILER           0xe4cd9ea40f2ee777ULL
+
+#define NVPW_API_SET_VULKAN_PROFILER           0x8c56b6a03d779689ULL
+
+#define NVPW_SDK_VERSION               0x1e128b6f001423fcULL
+
+    typedef struct NVPW_QueryVersionNumber_Params
+    {
+        /// [in]
+        size_t structSize;
+        /// [in] assign to NULL
+        void* pPriv;
+        /// [in]
+        uint64_t apiSet;
+        /// [out]
+        uint32_t major;
+        /// [out]
+        uint32_t minor;
+        /// [out]
+        uint32_t patch;
+        /// [out]
+        uint32_t relMajor;
+        /// [out]
+        uint32_t relMinor;
+        /// [out]
+        uint32_t relPatch;
+    } NVPW_QueryVersionNumber_Params;
+#define NVPW_QueryVersionNumber_Params_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_QueryVersionNumber_Params, relPatch)
+
+    /// Query version number of an API set
+    NVPA_Status NVPW_QueryVersionNumber(NVPW_QueryVersionNumber_Params* pParams);
+
+    typedef enum NVPW_Device_ClockStatus
+    {
+        /// clock status is unknown
+        NVPW_DEVICE_CLOCK_STATUS_UNKNOWN,
+        /// clocks are locked to rated TDP values
+        NVPW_DEVICE_CLOCK_STATUS_LOCKED_TO_RATED_TDP,
+        /// clocks are not locked and can boost above rated TDP
+        NVPW_DEVICE_CLOCK_STATUS_BOOST_ENABLED,
+        /// clocks are not locked and will not go above rated TDP
+        NVPW_DEVICE_CLOCK_STATUS_BOOST_DISABLED,
+        NVPW_DEVICE_CLOCK_STATUS__COUNT ///< equals the number of enumerators declared above it
+    } NVPW_Device_ClockStatus;
+
+    typedef struct NVPW_Device_GetClockStatus_Params
+    {
+        /// [in]
+        size_t structSize;
+        /// [in] assign to NULL
+        void* pPriv;
+        size_t deviceIndex; ///< not annotated upstream; presumably [in] - confirm
+        /// [in] NOTE(review): annotated [in] upstream, but this looks like the value the Get call reports ([out]) - confirm
+        NVPW_Device_ClockStatus clockStatus;
+    } NVPW_Device_GetClockStatus_Params;
+#define NVPW_Device_GetClockStatus_Params_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_Device_GetClockStatus_Params, clockStatus)
+
+    NVPA_Status NVPW_Device_GetClockStatus(NVPW_Device_GetClockStatus_Params* pParams);
+
+    typedef enum NVPW_Device_ClockSetting
+    {
+        /// invalid op, specify valid clocks operation during profiling
+        NVPW_DEVICE_CLOCK_SETTING_INVALID,
+        /// default to driver/application config (normally unlocked and not boosted, but could be unlocked boosted, or
+        /// locked to rated TDP)
+        NVPW_DEVICE_CLOCK_SETTING_DEFAULT,
+        /// lock clocks at rated TDP base values
+        NVPW_DEVICE_CLOCK_SETTING_LOCK_TO_RATED_TDP,
+        NVPW_DEVICE_CLOCK_SETTING__COUNT ///< equals the number of enumerators declared above it
+    } NVPW_Device_ClockSetting;
+
+    typedef struct NVPW_Device_SetClockSetting_Params
+    {
+        /// [in]
+        size_t structSize;
+        /// [in] assign to NULL
+        void* pPriv;
+        size_t deviceIndex; ///< not annotated upstream; presumably [in] - confirm
+        /// [in]
+        NVPW_Device_ClockSetting clockSetting;
+    } NVPW_Device_SetClockSetting_Params;
+#define NVPW_Device_SetClockSetting_Params_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_Device_SetClockSetting_Params, clockSetting)
+
+    NVPA_Status NVPW_Device_SetClockSetting(NVPW_Device_SetClockSetting_Params* pParams);
+
+    typedef struct NVPW_CounterData_GetRangeDescriptions_Params
+    {
+        /// [in]
+        size_t structSize;
+        /// [in] assign to NULL
+        void* pPriv;
+        const uint8_t* pCounterDataImage; ///< not annotated upstream; presumably [in] - confirm
+        size_t rangeIndex; ///< not annotated upstream; presumably [in] - confirm
+        /// [inout] Number of descriptions allocated in ppDescriptions
+        size_t numDescriptions;
+        const char** ppDescriptions; ///< array whose allocated element count is given by numDescriptions; direction not annotated upstream
+    } NVPW_CounterData_GetRangeDescriptions_Params;
+#define NVPW_CounterData_GetRangeDescriptions_Params_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_CounterData_GetRangeDescriptions_Params, ppDescriptions)
+
+    NVPA_Status NVPW_CounterData_GetRangeDescriptions(NVPW_CounterData_GetRangeDescriptions_Params* pParams);
+
+    typedef struct NVPW_Profiler_CounterData_GetRangeDescriptions_Params
+    {
+        /// [in]
+        size_t structSize;
+        /// [in] assign to NULL
+        void* pPriv;
+        const uint8_t* pCounterDataImage; ///< not annotated upstream; presumably [in] - confirm
+        size_t rangeIndex; ///< not annotated upstream; presumably [in] - confirm
+        /// [inout] Number of descriptions allocated in ppDescriptions
+        size_t numDescriptions;
+        const char** ppDescriptions; ///< array whose allocated element count is given by numDescriptions; direction not annotated upstream
+    } NVPW_Profiler_CounterData_GetRangeDescriptions_Params;
+#define NVPW_Profiler_CounterData_GetRangeDescriptions_Params_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_Profiler_CounterData_GetRangeDescriptions_Params, ppDescriptions)
+
+    NVPA_Status NVPW_Profiler_CounterData_GetRangeDescriptions(NVPW_Profiler_CounterData_GetRangeDescriptions_Params* pParams);
+
+#ifndef NVPW_PERIODIC_SAMPLER_COUNTER_DATA_APPEND_MODE_DEFINED
+#define NVPW_PERIODIC_SAMPLER_COUNTER_DATA_APPEND_MODE_DEFINED
+    typedef enum NVPW_PeriodicSampler_CounterData_AppendMode
+    {
+        NVPW_PERIODIC_SAMPLER_COUNTER_DATA_APPEND_MODE_LINEAR = 0,
+        NVPW_PERIODIC_SAMPLER_COUNTER_DATA_APPEND_MODE_CIRCULAR = 1,
+        NVPW_PERIODIC_SAMPLER_COUNTER_DATA_APPEND_MODE__COUNT ///< equals the number of append modes declared above it (2)
+    } NVPW_PeriodicSampler_CounterData_AppendMode;
+#endif //NVPW_PERIODIC_SAMPLER_COUNTER_DATA_APPEND_MODE_DEFINED
+
+    typedef struct NVPW_PeriodicSampler_CounterData_GetSampleTime_Params
+    {
+        /// [in]
+        size_t structSize;
+        /// [in] assign to NULL
+        void* pPriv;
+        /// [in]
+        const uint8_t* pCounterDataImage;
+        /// [in]
+        size_t rangeIndex;
+        /// [out]
+        uint64_t timestampStart;
+        /// [out]
+        uint64_t timestampEnd;
+    } NVPW_PeriodicSampler_CounterData_GetSampleTime_Params;
+#define NVPW_PeriodicSampler_CounterData_GetSampleTime_Params_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_PeriodicSampler_CounterData_GetSampleTime_Params, timestampEnd)
+
+    NVPA_Status NVPW_PeriodicSampler_CounterData_GetSampleTime(NVPW_PeriodicSampler_CounterData_GetSampleTime_Params* pParams);
+
+    typedef struct NVPW_PeriodicSampler_CounterData_TrimInPlace_Params
+    {
+        /// [in]
+        size_t structSize;
+        /// [in] assign to NULL
+        void* pPriv;
+        /// [in]
+        uint8_t* pCounterDataImage;
+        /// [in]
+        size_t counterDataImageSize;
+        /// [out]
+        size_t counterDataImageTrimmedSize;
+    } NVPW_PeriodicSampler_CounterData_TrimInPlace_Params;
+#define NVPW_PeriodicSampler_CounterData_TrimInPlace_Params_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_PeriodicSampler_CounterData_TrimInPlace_Params, counterDataImageTrimmedSize)
+
+    NVPA_Status NVPW_PeriodicSampler_CounterData_TrimInPlace(NVPW_PeriodicSampler_CounterData_TrimInPlace_Params* pParams);
+
+    typedef struct NVPW_PeriodicSampler_CounterData_GetInfo_Params
+    {
+        /// [in]
+        size_t structSize;
+        /// [in] assign to NULL
+        void* pPriv;
+        /// [in]
+        const uint8_t* pCounterDataImage;
+        /// [in]
+        size_t counterDataImageSize;
+        /// [out] total number of ranges in the counter data
+        size_t numTotalRanges;
+        /// [out] if in "linear" mode, this API returns the number of "populated" ranges; if it's in "circular" mode,
+        /// then it returns the last "populated" range index + 1, when there is no such range, it returns 0.
+        size_t numPopulatedRanges;
+        /// [out] if in "linear" mode, this API returns the number of "completed" ranges; if it's in "circular" mode,
+        /// then it returns the last "completed" range index + 1, when there is no such range, it returns 0.
+        size_t numCompletedRanges;
+    } NVPW_PeriodicSampler_CounterData_GetInfo_Params;
+#define NVPW_PeriodicSampler_CounterData_GetInfo_Params_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_PeriodicSampler_CounterData_GetInfo_Params, numCompletedRanges)
+
+    /// In periodic sampler, a range in counter data stores exactly one sample's data. For better performance, periodic
+    /// sampler may operate in an out-of-order fashion when populating sample data, i.e. it may not fully populate all
+    /// counters of a sample/range before starting to populate the next sample/range. As a result, we have two concepts
+    /// here, "populated" & "completed": a range is considered "populated" even if only partial counters have been
+    /// written; on the other hand, a range is only considered "completed" if all the collecting counters have been
+    /// written.
+    NVPA_Status NVPW_PeriodicSampler_CounterData_GetInfo(NVPW_PeriodicSampler_CounterData_GetInfo_Params* pParams);
+
+    typedef struct NVPW_PeriodicSampler_CounterData_GetTriggerCount_Params
+    {
+        /// [in]
+        size_t structSize;
+        /// [in] assign to NULL
+        void* pPriv;
+        /// [in]
+        const uint8_t* pCounterDataImage;
+        /// [in]
+        size_t counterDataImageSize;
+        /// [in]
+        size_t rangeIndex;
+        /// [out]
+        uint32_t triggerCount;
+    } NVPW_PeriodicSampler_CounterData_GetTriggerCount_Params;
+#define NVPW_PeriodicSampler_CounterData_GetTriggerCount_Params_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_PeriodicSampler_CounterData_GetTriggerCount_Params, triggerCount)
+
+    NVPA_Status NVPW_PeriodicSampler_CounterData_GetTriggerCount(NVPW_PeriodicSampler_CounterData_GetTriggerCount_Params* pParams);
+
+    typedef struct NVPW_PeriodicSampler_CounterData_IsDataComplete_Params
+    {
+        /// [in]
+        size_t structSize;
+        /// [in] assign to NULL
+        void* pPriv;
+        /// [in]
+        const uint8_t* pCounterDataImage;
+        /// [in]
+        size_t counterDataImageSize;
+        /// [in]
+        size_t rangeIndex;
+        /// [out]
+        NVPA_Bool isComplete;
+    } NVPW_PeriodicSampler_CounterData_IsDataComplete_Params;
+#define NVPW_PeriodicSampler_CounterData_IsDataComplete_Params_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_PeriodicSampler_CounterData_IsDataComplete_Params, isComplete)
+
+    /// Checks whether a given sample's data is complete. See also 'NVPW_PeriodicSampler_CounterData_GetInfo'
+    NVPA_Status NVPW_PeriodicSampler_CounterData_IsDataComplete(NVPW_PeriodicSampler_CounterData_IsDataComplete_Params* pParams);
+
+
+    typedef struct NVPW_TimestampReport
+    {
+        uint32_t payload; ///< 32-bit value at byte offset 0; meaning not documented upstream
+        uint8_t reserved0004[4]; ///< 4 reserved bytes at byte offset 4, directly after the 32-bit payload
+        uint64_t timestamp; ///< 64-bit timestamp; units not documented upstream
+    } NVPW_TimestampReport;
+
+
+
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#if defined(__GNUC__) && defined(NVPA_SHARED_LIB)
+    #pragma GCC visibility pop
+#endif
+
+#endif // NVPERF_TARGET_H
diff --git a/.venv/lib/python3.11/site-packages/nvidia/cuda_cupti/lib/__init__.py b/.venv/lib/python3.11/site-packages/nvidia/cuda_cupti/lib/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/.venv/lib/python3.11/site-packages/nvidia/cuda_cupti/lib/__pycache__/__init__.cpython-311.pyc b/.venv/lib/python3.11/site-packages/nvidia/cuda_cupti/lib/__pycache__/__init__.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..21569c40a65a83c565a9cb662561abdf71ef2c84
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/nvidia/cuda_cupti/lib/__pycache__/__init__.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/nvidia/cuda_nvrtc/lib/libnvrtc-builtins.so.12.4 b/.venv/lib/python3.11/site-packages/nvidia/cuda_nvrtc/lib/libnvrtc-builtins.so.12.4
new file mode 100644
index 0000000000000000000000000000000000000000..e3f750de0f456864016a0e41ac24bdcfc9d4a42f
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/nvidia/cuda_nvrtc/lib/libnvrtc-builtins.so.12.4
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4425561a319da07bbde0dc180a90422e351b364c3dc4f6a3ea1e69925d62db72
+size 5343112
diff --git a/.venv/lib/python3.11/site-packages/nvidia/cudnn/__pycache__/__init__.cpython-311.pyc b/.venv/lib/python3.11/site-packages/nvidia/cudnn/__pycache__/__init__.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..4b39befa48f75c0d838dbcc6cf468f18b87abd71
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/nvidia/cudnn/__pycache__/__init__.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/nvidia/cudnn/lib/__init__.py b/.venv/lib/python3.11/site-packages/nvidia/cudnn/lib/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/.venv/lib/python3.11/site-packages/nvidia/cudnn/lib/__pycache__/__init__.cpython-311.pyc b/.venv/lib/python3.11/site-packages/nvidia/cudnn/lib/__pycache__/__init__.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..5defdd4ac472597949baae1811f4de817feca7cb
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/nvidia/cudnn/lib/__pycache__/__init__.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/nvidia/cufft/__init__.py b/.venv/lib/python3.11/site-packages/nvidia/cufft/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/.venv/lib/python3.11/site-packages/nvidia/cufft/__pycache__/__init__.cpython-311.pyc b/.venv/lib/python3.11/site-packages/nvidia/cufft/__pycache__/__init__.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..e8503de9883e5d890792932b9b136daae859ea3f
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/nvidia/cufft/__pycache__/__init__.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/nvidia/cufft/include/__init__.py b/.venv/lib/python3.11/site-packages/nvidia/cufft/include/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/.venv/lib/python3.11/site-packages/nvidia/cufft/include/__pycache__/__init__.cpython-311.pyc b/.venv/lib/python3.11/site-packages/nvidia/cufft/include/__pycache__/__init__.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..9935dc135ba13ed6760fe2e34d5dacb0b2f34a99
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/nvidia/cufft/include/__pycache__/__init__.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/nvidia/cufft/include/cudalibxt.h b/.venv/lib/python3.11/site-packages/nvidia/cufft/include/cudalibxt.h
new file mode 100644
index 0000000000000000000000000000000000000000..94fcf4745fafa04f57678ba5ee64103f8ebd6444
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/nvidia/cufft/include/cudalibxt.h
@@ -0,0 +1,97 @@
+ /* Copyright 2013,2014 NVIDIA Corporation.  All rights reserved.
+  *
+  * NOTICE TO LICENSEE:
+  *
+  * The source code and/or documentation ("Licensed Deliverables") are
+  * subject to NVIDIA intellectual property rights under U.S. and
+  * international Copyright laws.
+  *
+  * The Licensed Deliverables contained herein are PROPRIETARY and
+  * CONFIDENTIAL to NVIDIA and are being provided under the terms and
+  * conditions of a form of NVIDIA software license agreement by and
+  * between NVIDIA and Licensee ("License Agreement") or electronically
+  * accepted by Licensee.  Notwithstanding any terms or conditions to
+  * the contrary in the License Agreement, reproduction or disclosure
+  * of the Licensed Deliverables to any third party without the express
+  * written consent of NVIDIA is prohibited.
+  *
+  * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+  * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+  * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  THEY ARE
+  * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+  * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+  * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+  * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+  * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+  * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+  * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+  * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+  * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+  * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+  * OF THESE LICENSED DELIVERABLES.
+  *
+  * U.S. Government End Users.  These Licensed Deliverables are a
+  * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+  * 1995), consisting of "commercial computer software" and "commercial
+  * computer software documentation" as such terms are used in 48
+  * C.F.R. 12.212 (SEPT 1995) and are provided to the U.S. Government
+  * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+  * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+  * U.S. Government End Users acquire the Licensed Deliverables with
+  * only those rights set forth herein.
+  *
+  * Any use of the Licensed Deliverables in individual and commercial
+  * software must include, in the user documentation and internal
+  * comments to the code, the above Disclaimer and U.S. Government End
+  * Users Notice.
+  */
+
+/*!
+* \file cudalibxt.h  
+* \brief Public header file for the NVIDIA library multi-GPU support structures  
+*/ 
+
+#ifndef _CUDA_LIB_XT_H_
+#define _CUDA_LIB_XT_H_
+#include <cuda_runtime.h>
+
+#define CUDA_XT_DESCRIPTOR_VERSION 0x01000000 // This is added to CUDART_VERSION
+
+enum cudaXtCopyType_t { // copy-direction selector; no explicit values, so the enumerators are 0, 1 and 2 in order
+    LIB_XT_COPY_HOST_TO_DEVICE,
+    LIB_XT_COPY_DEVICE_TO_HOST,
+    LIB_XT_COPY_DEVICE_TO_DEVICE
+} ;
+typedef enum cudaXtCopyType_t cudaLibXtCopyType; // note: the typedef name (cudaLibXt...) differs from the enum tag (cudaXt...)
+
+enum libFormat_t { // tags which library recognizes a descriptor's format (stored in cudaLibXtDesc_t::library)
+    LIB_FORMAT_CUFFT        = 0x0,
+    LIB_FORMAT_UNDEFINED    = 0x1
+};
+// Typedef so the enum can be named without the `enum` keyword.
+typedef enum libFormat_t libFormat;
+
+#define MAX_CUDA_DESCRIPTOR_GPUS 64
+
+struct cudaXtDesc_t{ // multi-GPU memory descriptor; each array below has MAX_CUDA_DESCRIPTOR_GPUS slots
+    int version;                             //descriptor version
+    int nGPUs;                               //number of GPUs (presumably how many leading array slots are in use -- confirm)
+    int GPUs[MAX_CUDA_DESCRIPTOR_GPUS];      //array of device IDs
+    void *data[MAX_CUDA_DESCRIPTOR_GPUS];    //array of pointers to data, one per GPU
+    size_t size[MAX_CUDA_DESCRIPTOR_GPUS];   //array of data sizes, one per GPU (NOTE(review): units not stated here, presumably bytes)
+    void *cudaXtState;                       //opaque CUDA utility structure
+};
+typedef struct cudaXtDesc_t cudaXtDesc;
+
+struct cudaLibXtDesc_t{ // library-level wrapper that points at a cudaXtDesc and records which library/format owns it
+    int version;                //descriptor version
+    cudaXtDesc *descriptor;     //multi-GPU memory descriptor
+    libFormat library;          //which library recognizes the format
+    int subFormat;              //library specific enumerator of sub formats (held as a plain int, not an enum type)
+    void *libDescriptor;        //library specific descriptor e.g. FFT transform plan object
+};
+typedef struct cudaLibXtDesc_t cudaLibXtDesc;
+
+
+#endif
+
diff --git a/.venv/lib/python3.11/site-packages/nvidia/cufft/include/cufft.h b/.venv/lib/python3.11/site-packages/nvidia/cufft/include/cufft.h
new file mode 100644
index 0000000000000000000000000000000000000000..e2e1f3282d2d3ededc9540c169449c1dc7139b24
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/nvidia/cufft/include/cufft.h
@@ -0,0 +1,334 @@
+ /* Copyright 2005-2021 NVIDIA Corporation.  All rights reserved.
+  *
+  * NOTICE TO LICENSEE:
+  *
+  * The source code and/or documentation ("Licensed Deliverables") are
+  * subject to NVIDIA intellectual property rights under U.S. and
+  * international Copyright laws.
+  *
+  * The Licensed Deliverables contained herein are PROPRIETARY and
+  * CONFIDENTIAL to NVIDIA and are being provided under the terms and
+  * conditions of a form of NVIDIA software license agreement by and
+  * between NVIDIA and Licensee ("License Agreement") or electronically
+  * accepted by Licensee.  Notwithstanding any terms or conditions to
+  * the contrary in the License Agreement, reproduction or disclosure
+  * of the Licensed Deliverables to any third party without the express
+  * written consent of NVIDIA is prohibited.
+  *
+  * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+  * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+  * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  THEY ARE
+  * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+  * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+  * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+  * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+  * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+  * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+  * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+  * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+  * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+  * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+  * OF THESE LICENSED DELIVERABLES.
+  *
+  * U.S. Government End Users.  These Licensed Deliverables are a
+  * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+  * 1995), consisting of "commercial computer software" and "commercial
+  * computer software documentation" as such terms are used in 48
+  * C.F.R. 12.212 (SEPT 1995) and are provided to the U.S. Government
+  * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+  * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+  * U.S. Government End Users acquire the Licensed Deliverables with
+  * only those rights set forth herein.
+  *
+  * Any use of the Licensed Deliverables in individual and commercial
+  * software must include, in the user documentation and internal
+  * comments to the code, the above Disclaimer and U.S. Government End
+  * Users Notice.
+  */
+
+/*!
+* \file cufft.h
+* \brief Public header file for the NVIDIA CUDA FFT library (CUFFT)
+*/
+
+#ifndef _CUFFT_H_
+#define _CUFFT_H_
+
+
+#include "cuComplex.h"
+#include "driver_types.h"
+#include "library_types.h"
+
+#ifndef CUFFTAPI // a build may pre-define CUFFTAPI; otherwise choose a per-toolchain default below
+#ifdef _WIN32
+#define CUFFTAPI __stdcall // Windows: API functions use the __stdcall calling convention
+#elif __GNUC__ >= 4
+#define CUFFTAPI __attribute__ ((visibility ("default"))) // GCC-compatible (major >= 4): mark the API with default symbol visibility
+#else
+#define CUFFTAPI // any other toolchain: expands to nothing
+#endif
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define CUFFT_VER_MAJOR 11
+#define CUFFT_VER_MINOR 2
+#define CUFFT_VER_PATCH 1
+#define CUFFT_VER_BUILD 3
+// Single-integer version: the literal below equals MAJOR*1000 + MINOR*100 + PATCH (BUILD is not encoded).
+#define CUFFT_VERSION 11201
+
+// CUFFT API function return values: CUFFT_SUCCESS is 0 and the error codes run contiguously from 0x1 to 0x10
+typedef enum cufftResult_t {
+  CUFFT_SUCCESS        = 0x0,
+  CUFFT_INVALID_PLAN   = 0x1,
+  CUFFT_ALLOC_FAILED   = 0x2,
+  CUFFT_INVALID_TYPE   = 0x3,
+  CUFFT_INVALID_VALUE  = 0x4,
+  CUFFT_INTERNAL_ERROR = 0x5,
+  CUFFT_EXEC_FAILED    = 0x6,
+  CUFFT_SETUP_FAILED   = 0x7,
+  CUFFT_INVALID_SIZE   = 0x8,
+  CUFFT_UNALIGNED_DATA = 0x9,
+  CUFFT_INCOMPLETE_PARAMETER_LIST = 0xA,
+  CUFFT_INVALID_DEVICE = 0xB,
+  CUFFT_PARSE_ERROR = 0xC,
+  CUFFT_NO_WORKSPACE = 0xD,
+  CUFFT_NOT_IMPLEMENTED = 0xE,
+  CUFFT_LICENSE_ERROR = 0x0F,
+  CUFFT_NOT_SUPPORTED = 0x10
+
+} cufftResult;
+// One past the largest cufftResult value above (CUFFT_NOT_SUPPORTED = 0x10); keep in sync if codes are added.
+#define MAX_CUFFT_ERROR 0x11
+
+
+// CUFFT defines and supports the following data types
+
+
+// cufftReal is a single-precision, floating-point real data type.
+// cufftDoubleReal is a double-precision, real data type.
+typedef float cufftReal; // alias of the C `float` type
+typedef double cufftDoubleReal; // alias of the C `double` type
+
+// cufftComplex is a single-precision, floating-point complex data type that
+// consists of interleaved real and imaginary components.
+// cufftDoubleComplex is the double-precision equivalent.
+typedef cuComplex cufftComplex; // cuComplex is not declared in this header; presumably it comes from the "cuComplex.h" include -- confirm
+typedef cuDoubleComplex cufftDoubleComplex; // likewise an alias of a type declared outside this header
+
+// CUFFT transform directions (presumably the values expected by the `direction` argument of cufftExecC2C / cufftExecZ2Z -- confirm)
+#define CUFFT_FORWARD -1 // Forward FFT
+#define CUFFT_INVERSE  1 // Inverse FFT
+
+// CUFFT supports the following transform types; each double-precision code equals its single-precision counterpart with bit 0x40 set
+typedef enum cufftType_t {
+  CUFFT_R2C = 0x2a,     // Real to Complex (interleaved)
+  CUFFT_C2R = 0x2c,     // Complex (interleaved) to Real
+  CUFFT_C2C = 0x29,     // Complex to Complex, interleaved
+  CUFFT_D2Z = 0x6a,     // Double to Double-Complex (0x2a | 0x40)
+  CUFFT_Z2D = 0x6c,     // Double-Complex to Double (0x2c | 0x40)
+  CUFFT_Z2Z = 0x69      // Double-Complex to Double-Complex (0x29 | 0x40)
+} cufftType;
+
+// CUFFT supports the following data layouts
+typedef enum cufftCompatibility_t {
+    CUFFT_COMPATIBILITY_FFTW_PADDING    = 0x01    // The default value
+} cufftCompatibility;
+// Alias for the default mode, which is also the only enumerator defined above.
+#define CUFFT_COMPATIBILITY_DEFAULT   CUFFT_COMPATIBILITY_FFTW_PADDING
+
+//
+// structure definition used by the shim between old and new APIs
+//
+#define MAX_SHIM_RANK 3
+
+// cufftHandle is a handle type used to store and access CUFFT plans.
+typedef int cufftHandle; // a plain int, so most functions take it by value; cufftCreate and cufftPlan* take a cufftHandle* instead
+
+
+cufftResult CUFFTAPI cufftPlan1d(cufftHandle *plan, // handle passed by pointer -- presumably written on success (confirm against cuFFT docs)
+                                 int nx,
+                                 cufftType type,
+                                 int batch);
+// 2-D variant: two extents (nx, ny) and no batch parameter.
+cufftResult CUFFTAPI cufftPlan2d(cufftHandle *plan,
+                                 int nx, int ny,
+                                 cufftType type);
+// 3-D variant: three extents (nx, ny, nz) and no batch parameter.
+cufftResult CUFFTAPI cufftPlan3d(cufftHandle *plan,
+                                 int nx, int ny, int nz,
+                                 cufftType type);
+// General variant: `rank` plus pointer/stride/dist triples for input (i*) and output (o*), with a batch count.
+cufftResult CUFFTAPI cufftPlanMany(cufftHandle *plan,
+                                   int rank,
+                                   int *n, // presumably an array of `rank` extents -- confirm
+                                   int *inembed, int istride, int idist,
+                                   int *onembed, int ostride, int odist,
+                                   cufftType type,
+                                   int batch);
+
+cufftResult CUFFTAPI cufftMakePlan1d(cufftHandle plan, // unlike cufftPlan1d, the handle is taken by value (presumably one obtained from cufftCreate -- confirm)
+                                     int nx,
+                                     cufftType type,
+                                     int batch,
+                                     size_t *workSize); // pointer parameter -- presumably receives the work area size (confirm)
+// 2-D counterpart of cufftMakePlan1d (no batch parameter).
+cufftResult CUFFTAPI cufftMakePlan2d(cufftHandle plan,
+                                     int nx, int ny,
+                                     cufftType type,
+                                     size_t *workSize);
+// 3-D counterpart of cufftMakePlan1d (no batch parameter).
+cufftResult CUFFTAPI cufftMakePlan3d(cufftHandle plan,
+                                     int nx, int ny, int nz,
+                                     cufftType type,
+                                     size_t *workSize);
+// Same layout parameters as cufftPlanMany, plus the by-value handle and a workSize pointer.
+cufftResult CUFFTAPI cufftMakePlanMany(cufftHandle plan,
+                                       int rank,
+                                       int *n,
+                                       int *inembed, int istride, int idist,
+                                       int *onembed, int ostride, int odist,
+                                       cufftType type,
+                                       int batch,
+                                       size_t *workSize);
+// 64-bit variant: extents, strides, distances and batch are `long long int` instead of `int`; rank stays `int`.
+cufftResult CUFFTAPI cufftMakePlanMany64(cufftHandle plan,
+                                         int rank,
+                                         long long int *n,
+                                         long long int *inembed,
+                                         long long int istride,
+                                         long long int idist,
+                                         long long int *onembed,
+                                         long long int ostride, long long int odist,
+                                         cufftType type,
+                                         long long int batch,
+                                         size_t * workSize);
+// Takes the same parameter list as cufftMakePlanMany64.
+cufftResult CUFFTAPI cufftGetSizeMany64(cufftHandle plan,
+                                        int rank,
+                                        long long int *n,
+                                        long long int *inembed,
+                                        long long int istride, long long int idist,
+                                        long long int *onembed,
+                                        long long int ostride, long long int odist,
+                                        cufftType type,
+                                        long long int batch,
+                                        size_t *workSize);
+
+
+
+
+cufftResult CUFFTAPI cufftEstimate1d(int nx, // the Estimate family takes no plan handle, only the transform description
+                                     cufftType type,
+                                     int batch,
+                                     size_t *workSize); // pointer parameter -- presumably receives the estimated work area size (confirm)
+// 2-D counterpart (no batch parameter).
+cufftResult CUFFTAPI cufftEstimate2d(int nx, int ny,
+                                     cufftType type,
+                                     size_t *workSize);
+// 3-D counterpart (no batch parameter).
+cufftResult CUFFTAPI cufftEstimate3d(int nx, int ny, int nz,
+                                     cufftType type,
+                                     size_t *workSize);
+// General counterpart: same layout parameters as cufftPlanMany, without the handle.
+cufftResult CUFFTAPI cufftEstimateMany(int rank,
+                                       int *n,
+                                       int *inembed, int istride, int idist,
+                                       int *onembed, int ostride, int odist,
+                                       cufftType type,
+                                       int batch,
+                                       size_t *workSize);
+
+cufftResult CUFFTAPI cufftCreate(cufftHandle * handle); // handle passed by pointer -- presumably written on success (confirm)
+// GetSize family: same shape as the Estimate family but with a by-value handle as first argument.
+cufftResult CUFFTAPI cufftGetSize1d(cufftHandle handle,
+                                    int nx,
+                                    cufftType type,
+                                    int batch,
+                                    size_t *workSize );
+
+cufftResult CUFFTAPI cufftGetSize2d(cufftHandle handle,
+                                    int nx, int ny,
+                                    cufftType type,
+                                    size_t *workSize);
+
+cufftResult CUFFTAPI cufftGetSize3d(cufftHandle handle,
+                                    int nx, int ny, int nz,
+                                    cufftType type,
+                                    size_t *workSize);
+// NOTE(review): the size_t* parameter is named `workArea` here, whereas the sibling GetSize functions call it `workSize`.
+cufftResult CUFFTAPI cufftGetSizeMany(cufftHandle handle,
+                                      int rank, int *n,
+                                      int *inembed, int istride, int idist,
+                                      int *onembed, int ostride, int odist,
+                                      cufftType type, int batch, size_t *workArea);
+// Handle-only form: takes no transform description.
+cufftResult CUFFTAPI cufftGetSize(cufftHandle handle, size_t *workSize);
+// `workArea` is an untyped pointer; NOTE(review): required size and memory space are not stated in this header.
+cufftResult CUFFTAPI cufftSetWorkArea(cufftHandle plan, void *workArea);
+// `autoAllocate` is an int; presumably treated as a boolean flag -- confirm against cuFFT docs.
+cufftResult CUFFTAPI cufftSetAutoAllocation(cufftHandle plan, int autoAllocate);
+
+cufftResult CUFFTAPI cufftExecC2C(cufftHandle plan,
+                                  cufftComplex *idata,
+                                  cufftComplex *odata,
+                                  int direction); // presumably CUFFT_FORWARD or CUFFT_INVERSE -- confirm
+// Real-to-complex, single precision: no direction argument.
+cufftResult CUFFTAPI cufftExecR2C(cufftHandle plan,
+                                  cufftReal *idata,
+                                  cufftComplex *odata);
+// Complex-to-real, single precision: no direction argument.
+cufftResult CUFFTAPI cufftExecC2R(cufftHandle plan,
+                                  cufftComplex *idata,
+                                  cufftReal *odata);
+// Double-precision counterpart of cufftExecC2C.
+cufftResult CUFFTAPI cufftExecZ2Z(cufftHandle plan,
+                                  cufftDoubleComplex *idata,
+                                  cufftDoubleComplex *odata,
+                                  int direction);
+// Double-precision counterpart of cufftExecR2C.
+cufftResult CUFFTAPI cufftExecD2Z(cufftHandle plan,
+                                  cufftDoubleReal *idata,
+                                  cufftDoubleComplex *odata);
+// Double-precision counterpart of cufftExecC2R.
+cufftResult CUFFTAPI cufftExecZ2D(cufftHandle plan,
+                                  cufftDoubleComplex *idata,
+                                  cufftDoubleReal *odata);
+
+
+// utility functions (stream association, plan teardown, version/property queries)
+cufftResult CUFFTAPI cufftSetStream(cufftHandle plan,
+                                    cudaStream_t stream); // cudaStream_t is declared outside this header
+
+cufftResult CUFFTAPI cufftDestroy(cufftHandle plan);
+// `version` is a pointer parameter -- presumably receives a value in the CUFFT_VERSION encoding (confirm).
+cufftResult CUFFTAPI cufftGetVersion(int *version);
+// libraryPropertyType is declared outside this header (presumably in "library_types.h" -- confirm).
+cufftResult CUFFTAPI cufftGetProperty(libraryPropertyType type,
+                                      int *value);
+
+//
+// The Set/Get PlanProperty APIs configure per-plan behavior
+//
+typedef enum cufftProperty_t {
+    NVFFT_PLAN_PROPERTY_INT64_PATIENT_JIT = 0x1 // the only property defined here; note the NVFFT_ (not CUFFT_) prefix
+} cufftProperty;
+// Setter: the new value is passed by value as a 64-bit integer.
+cufftResult CUFFTAPI cufftSetPlanPropertyInt64(cufftHandle plan, 
+                                               cufftProperty property, 
+                                               const long long int inputValueInt);
+// Getter: the value is returned through `returnPtrValue`.
+cufftResult CUFFTAPI cufftGetPlanPropertyInt64(cufftHandle plan, 
+                                               cufftProperty property, 
+                                               long long int* returnPtrValue);
+// NOTE(review): the value a property is reset to is not stated in this header.
+cufftResult CUFFTAPI cufftResetPlanProperty(cufftHandle plan, cufftProperty property);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _CUFFT_H_ */
diff --git a/.venv/lib/python3.11/site-packages/nvidia/cufft/include/cufftXt.h b/.venv/lib/python3.11/site-packages/nvidia/cufft/include/cufftXt.h
new file mode 100644
index 0000000000000000000000000000000000000000..6a22724261d27ae0e5e894e4e2f60ef484b52bf2
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/nvidia/cufft/include/cufftXt.h
@@ -0,0 +1,259 @@
+
+ /* Copyright 2005-2021 NVIDIA Corporation.  All rights reserved.
+  *
+  * NOTICE TO LICENSEE:
+  *
+  * The source code and/or documentation ("Licensed Deliverables") are
+  * subject to NVIDIA intellectual property rights under U.S. and
+  * international Copyright laws.
+  *
+  * The Licensed Deliverables contained herein are PROPRIETARY and
+  * CONFIDENTIAL to NVIDIA and are being provided under the terms and
+  * conditions of a form of NVIDIA software license agreement by and
+  * between NVIDIA and Licensee ("License Agreement") or electronically
+  * accepted by Licensee.  Notwithstanding any terms or conditions to
+  * the contrary in the License Agreement, reproduction or disclosure
+  * of the Licensed Deliverables to any third party without the express
+  * written consent of NVIDIA is prohibited.
+  *
+  * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+  * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+  * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  THEY ARE
+  * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+  * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+  * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+  * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+  * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+  * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+  * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+  * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+  * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+  * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+  * OF THESE LICENSED DELIVERABLES.
+  *
+  * U.S. Government End Users.  These Licensed Deliverables are a
+  * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+  * 1995), consisting of "commercial computer software" and "commercial
+  * computer software documentation" as such terms are used in 48
+  * C.F.R. 12.212 (SEPT 1995) and are provided to the U.S. Government
+  * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+  * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+  * U.S. Government End Users acquire the Licensed Deliverables with
+  * only those rights set forth herein.
+  *
+  * Any use of the Licensed Deliverables in individual and commercial
+  * software must include, in the user documentation and internal
+  * comments to the code, the above Disclaimer and U.S. Government End
+  * Users Notice.
+  */
+
+/*!
+* \file cufftXt.h
+* \brief Public header file for the NVIDIA CUDA FFT library (CUFFT)
+*/
+
+#ifndef _CUFFTXT_H_
+#define _CUFFTXT_H_
+#include "cudalibxt.h"
+#include "cufft.h"
+
+
+#ifndef CUFFTAPI // fallback only: "cufft.h", included just above, already defines CUFFTAPI when it was not pre-defined
+#ifdef _WIN32
+#define CUFFTAPI __stdcall // Windows: __stdcall calling convention
+#else
+#define CUFFTAPI // NOTE(review): unlike cufft.h, this fallback has no GCC visibility("default") branch
+#endif
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+//
+// cufftXtSubFormat identifies the data layout of
+// a memory descriptor owned by cufft.
+// note that multi GPU cufft does not yet support out-of-place transforms
+//
+
+typedef enum cufftXtSubFormat_t { // values are contiguous from 0x00 to 0x07
+    CUFFT_XT_FORMAT_INPUT = 0x00,              //by default input is in linear order across GPUs
+    CUFFT_XT_FORMAT_OUTPUT = 0x01,             //by default output is in scrambled order depending on transform
+    CUFFT_XT_FORMAT_INPLACE = 0x02,            //by default inplace is input order, which is linear across GPUs
+    CUFFT_XT_FORMAT_INPLACE_SHUFFLED = 0x03,   //shuffled output order after execution of the transform
+    CUFFT_XT_FORMAT_1D_INPUT_SHUFFLED = 0x04,  //shuffled input order prior to execution of 1D transforms
+    CUFFT_XT_FORMAT_DISTRIBUTED_INPUT = 0x05,  // NOTE(review): layout not described in this header
+    CUFFT_XT_FORMAT_DISTRIBUTED_OUTPUT = 0x06, // NOTE(review): layout not described in this header
+    CUFFT_FORMAT_UNDEFINED = 0x07              // note: this name lacks the _XT_ infix used by the other enumerators
+} cufftXtSubFormat;
+
+//
+// cufftXtCopyType specifies the type of copy for cufftXtMemcpy
+//
+typedef enum cufftXtCopyType_t { // passed as the `type` argument of cufftXtMemcpy
+    CUFFT_COPY_HOST_TO_DEVICE = 0x00,
+    CUFFT_COPY_DEVICE_TO_HOST = 0x01,
+    CUFFT_COPY_DEVICE_TO_DEVICE = 0x02,
+    CUFFT_COPY_UNDEFINED = 0x03 // last value; presumably a sentinel rather than a usable copy type -- confirm
+} cufftXtCopyType;
+
+//
+// cufftXtQueryType specifies the type of query for cufftXtQueryPlan
+//
+typedef enum cufftXtQueryType_t { // passed as the `queryType` argument of cufftXtQueryPlan
+    CUFFT_QUERY_1D_FACTORS = 0x00, // presumably pairs with a cufftXt1dFactors query struct -- confirm
+    CUFFT_QUERY_UNDEFINED = 0x01
+} cufftXtQueryType;
+
+typedef struct cufftXt1dFactors_t { // ten 64-bit fields followed by four int shift fields; NOTE(review): field semantics are not documented in this header
+    long long int size;
+    long long int stringCount;
+    long long int stringLength;
+    long long int substringLength;
+    long long int factor1;
+    long long int factor2;
+    long long int stringMask;
+    long long int substringMask;
+    long long int factor1Mask;
+    long long int factor2Mask;
+    int stringShift; // each *Shift field has a same-named *Mask field above
+    int substringShift;
+    int factor1Shift;
+    int factor2Shift;
+} cufftXt1dFactors; // presumably the struct passed to cufftXtQueryPlan with CUFFT_QUERY_1D_FACTORS -- confirm
+
+//
+// cufftXtWorkAreaPolicy specifies policy for cufftXtSetWorkAreaPolicy
+//
+typedef enum cufftXtWorkAreaPolicy_t { // passed as the `policy` argument of cufftXtSetWorkAreaPolicy
+    CUFFT_WORKAREA_MINIMAL = 0, /* maximum reduction */
+    CUFFT_WORKAREA_USER = 1, /* use workSize parameter as limit */
+    CUFFT_WORKAREA_PERFORMANCE = 2, /* default - 1x overhead or more, maximum performance */
+} cufftXtWorkAreaPolicy; // NOTE(review): the trailing comma after the last enumerator needs C99 / C++11 or later
+
+// multi-GPU routines
+cufftResult CUFFTAPI cufftXtSetGPUs(cufftHandle handle, int nGPUs, int *whichGPUs); // presumably `whichGPUs` holds nGPUs device IDs -- confirm
+// `descriptor` is a pointer-to-pointer -- presumably receives a newly allocated descriptor (confirm).
+cufftResult CUFFTAPI cufftXtMalloc(cufftHandle plan,
+                                   cudaLibXtDesc ** descriptor,
+                                   cufftXtSubFormat format);
+// Both pointers are untyped; NOTE(review): what each must point to for a given copy type is not stated here.
+cufftResult CUFFTAPI cufftXtMemcpy(cufftHandle plan,
+                                   void *dstPointer,
+                                   void *srcPointer,
+                                   cufftXtCopyType type);
+// Takes only the descriptor; no plan handle is required.
+cufftResult CUFFTAPI cufftXtFree(cudaLibXtDesc *descriptor);
+// Unlike cufftSetWorkArea (void *), this takes void ** -- presumably one work area pointer per GPU (confirm).
+cufftResult CUFFTAPI cufftXtSetWorkArea(cufftHandle plan, void **workArea);
+// Descriptor-based Exec family: mirrors cufftExec{C2C,R2C,C2R,Z2Z,D2Z,Z2D} with cudaLibXtDesc* in place of raw data pointers.
+cufftResult CUFFTAPI cufftXtExecDescriptorC2C(cufftHandle plan,
+                                              cudaLibXtDesc *input,
+                                              cudaLibXtDesc *output,
+                                              int direction);
+
+cufftResult CUFFTAPI cufftXtExecDescriptorR2C(cufftHandle plan,
+                                              cudaLibXtDesc *input,
+                                              cudaLibXtDesc *output);
+
+cufftResult CUFFTAPI cufftXtExecDescriptorC2R(cufftHandle plan,
+                                              cudaLibXtDesc *input,
+                                              cudaLibXtDesc *output);
+
+cufftResult CUFFTAPI cufftXtExecDescriptorZ2Z(cufftHandle plan,
+                                              cudaLibXtDesc *input,
+                                              cudaLibXtDesc *output,
+                                              int direction);
+
+cufftResult CUFFTAPI cufftXtExecDescriptorD2Z(cufftHandle plan,
+                                              cudaLibXtDesc *input,
+                                              cudaLibXtDesc *output);
+
+cufftResult CUFFTAPI cufftXtExecDescriptorZ2D(cufftHandle plan,
+                                              cudaLibXtDesc *input,
+                                              cudaLibXtDesc *output);
+
+// Utility functions
+
+cufftResult CUFFTAPI cufftXtQueryPlan(cufftHandle plan, void *queryStruct, cufftXtQueryType queryType); // queryStruct is untyped; presumably its real type is chosen by queryType -- confirm
+
+
+// callbacks
+
+
+typedef enum cufftXtCallbackType_t { // LD entries are 0x0-0x3, ST entries are 0x4-0x7 (presumably load/store, matching the cufftCallbackLoad*/Store* typedefs)
+    CUFFT_CB_LD_COMPLEX = 0x0,
+    CUFFT_CB_LD_COMPLEX_DOUBLE = 0x1,
+    CUFFT_CB_LD_REAL = 0x2,
+    CUFFT_CB_LD_REAL_DOUBLE = 0x3,
+    CUFFT_CB_ST_COMPLEX = 0x4, // each ST value is the matching LD value plus 4
+    CUFFT_CB_ST_COMPLEX_DOUBLE = 0x5,
+    CUFFT_CB_ST_REAL = 0x6,
+    CUFFT_CB_ST_REAL_DOUBLE = 0x7,
+    CUFFT_CB_UNDEFINED = 0x8
+
+} cufftXtCallbackType;
+
+typedef cufftComplex (*cufftCallbackLoadC)(void *dataIn, size_t offset, void *callerInfo, void *sharedPointer); // load callbacks return one element of the given type
+typedef cufftDoubleComplex (*cufftCallbackLoadZ)(void *dataIn, size_t offset, void *callerInfo, void *sharedPointer);
+typedef cufftReal (*cufftCallbackLoadR)(void *dataIn, size_t offset, void *callerInfo, void *sharedPointer);
+typedef cufftDoubleReal(*cufftCallbackLoadD)(void *dataIn, size_t offset, void *callerInfo, void *sharedPointer);
+// Store callbacks return void and receive the element by value; NOTE(review): the units of `offset` are not stated in this header.
+typedef void (*cufftCallbackStoreC)(void *dataOut, size_t offset, cufftComplex element, void *callerInfo, void *sharedPointer);
+typedef void (*cufftCallbackStoreZ)(void *dataOut, size_t offset, cufftDoubleComplex element, void *callerInfo, void *sharedPointer);
+typedef void (*cufftCallbackStoreR)(void *dataOut, size_t offset, cufftReal element, void *callerInfo, void *sharedPointer);
+typedef void (*cufftCallbackStoreD)(void *dataOut, size_t offset, cufftDoubleReal element, void *callerInfo, void *sharedPointer);
+
+
+cufftResult CUFFTAPI cufftXtSetCallback(cufftHandle plan, void **callback_routine, cufftXtCallbackType cbType, void **caller_info); // both are void ** -- presumably arrays with one entry per GPU (confirm)
+cufftResult CUFFTAPI cufftXtClearCallback(cufftHandle plan, cufftXtCallbackType cbType); // addressed by (plan, cbType) only
+cufftResult CUFFTAPI cufftXtSetCallbackSharedSize(cufftHandle plan, cufftXtCallbackType cbType, size_t sharedSize); // presumably sizes the memory behind the callbacks' `sharedPointer` -- confirm
+
+cufftResult CUFFTAPI cufftXtMakePlanMany(cufftHandle plan, // like cufftMakePlanMany64, but with cudaDataType arguments in place of a single cufftType
+                                         int rank,
+                                         long long int *n,
+                                         long long int *inembed,
+                                         long long int istride,
+                                         long long int idist,
+                                         cudaDataType inputtype,
+                                         long long int *onembed,
+                                         long long int ostride,
+                                         long long int odist,
+                                         cudaDataType outputtype,
+                                         long long int batch,
+                                         size_t *workSize,
+                                       	 cudaDataType executiontype);
+// Takes the same parameter list as cufftXtMakePlanMany.
+cufftResult CUFFTAPI cufftXtGetSizeMany(cufftHandle plan,
+                                        int rank,
+                                        long long int *n,
+                                        long long int *inembed,
+                                        long long int istride,
+                                        long long int idist,
+                                        cudaDataType inputtype,
+                                        long long int *onembed,
+                                        long long int ostride,
+                                        long long int odist,
+                                        cudaDataType outputtype,
+                                        long long int batch,
+                                        size_t *workSize,
+                                        cudaDataType executiontype);
+
+
+cufftResult CUFFTAPI cufftXtExec(cufftHandle plan, // type-agnostic execute: data pointers are untyped (void *)
+                                 void *input,
+                                 void *output,
+                                 int direction);
+// Descriptor-based counterpart of cufftXtExec.
+cufftResult CUFFTAPI cufftXtExecDescriptor(cufftHandle plan,
+                                           cudaLibXtDesc *input,
+                                           cudaLibXtDesc *output,
+                                           int direction);
+// `workSize` is a pointer; per the enum comment it serves as the limit for CUFFT_WORKAREA_USER.
+cufftResult CUFFTAPI cufftXtSetWorkAreaPolicy(cufftHandle plan, cufftXtWorkAreaPolicy policy, size_t *workSize);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/.venv/lib/python3.11/site-packages/nvidia/cufft/include/cufftw.h b/.venv/lib/python3.11/site-packages/nvidia/cufft/include/cufftw.h
new file mode 100644
index 0000000000000000000000000000000000000000..dcf72370f777bc0aaa76dddae9a34efd667a5922
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/nvidia/cufft/include/cufftw.h
@@ -0,0 +1,465 @@
+
+ /* Copyright 2005-2014 NVIDIA Corporation.  All rights reserved.
+  *
+  * NOTICE TO LICENSEE:
+  *
+  * The source code and/or documentation ("Licensed Deliverables") are
+  * subject to NVIDIA intellectual property rights under U.S. and
+  * international Copyright laws.
+  *
+  * The Licensed Deliverables contained herein are PROPRIETARY and
+  * CONFIDENTIAL to NVIDIA and are being provided under the terms and
+  * conditions of a form of NVIDIA software license agreement by and
+  * between NVIDIA and Licensee ("License Agreement") or electronically
+  * accepted by Licensee.  Notwithstanding any terms or conditions to
+  * the contrary in the License Agreement, reproduction or disclosure
+  * of the Licensed Deliverables to any third party without the express
+  * written consent of NVIDIA is prohibited.
+  *
+  * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+  * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+  * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  THEY ARE
+  * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+  * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+  * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+  * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+  * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+  * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+  * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+  * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+  * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+  * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+  * OF THESE LICENSED DELIVERABLES.
+  *
+  * U.S. Government End Users.  These Licensed Deliverables are a
+  * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+  * 1995), consisting of "commercial computer software" and "commercial
+  * computer software documentation" as such terms are used in 48
+  * C.F.R. 12.212 (SEPT 1995) and are provided to the U.S. Government
+  * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+  * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+  * U.S. Government End Users acquire the Licensed Deliverables with
+  * only those rights set forth herein.
+  *
+  * Any use of the Licensed Deliverables in individual and commercial
+  * software must include, in the user documentation and internal
+  * comments to the code, the above Disclaimer and U.S. Government End
+  * Users Notice.
+  */
+
+/*!
+* \file cufftw.h
+* \brief Public header file for the NVIDIA CUDA FFTW library (CUFFTW)
+*/
+
+#ifndef _CUFFTW_H_
+#define _CUFFTW_H_
+
+
+#include <stdio.h>
+#include "cufft.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// Transform direction (presumably the value passed as the `sign` argument of the plan functions -- confirm)
+#define FFTW_FORWARD -1
+#define FFTW_INVERSE  1
+#define FFTW_BACKWARD 1
+// FFTW_INVERSE and FFTW_BACKWARD expand to the same value (1); only FFTW_FORWARD differs.
+// Planner flags: consecutive integers, not independent bits (e.g. FFTW_PATIENT == FFTW_ESTIMATE | FFTW_MEASURE)
+#define FFTW_ESTIMATE           0x01
+#define FFTW_MEASURE            0x02
+#define FFTW_PATIENT            0x03
+#define FFTW_EXHAUSTIVE         0x04
+#define FFTW_WISDOM_ONLY        0x05
+// NOTE(review): because of that overlap, OR-ing two planner flags can alias a third; how the library decodes them is not visible here.
+// Algorithm restriction flags: FFTW_PRESERVE_INPUT (0x0C) includes the FFTW_DESTROY_INPUT bit (0x08)
+#define FFTW_DESTROY_INPUT      0x08
+#define FFTW_PRESERVE_INPUT     0x0C
+#define FFTW_UNALIGNED          0x10
+    
+// CUFFTW defines and supports the following data types
+// fftw_complex / fftwf_complex are the double- and float-based complex element types used by the declarations below.
+// If complex.h has been included (_Complex_I and complex are defined) and FFTW_NO_Complex is not defined, use the C99 complex types
+#if !defined(FFTW_NO_Complex) && defined(_Complex_I) && defined (complex)
+  typedef double _Complex fftw_complex;
+  typedef float _Complex fftwf_complex;
+#else
+  typedef double fftw_complex[2];  // otherwise a two-element array (presumably {real, imaginary} -- confirm)
+  typedef float fftwf_complex[2];
+#endif
+// Plan handles are opaque void pointers; fftw_plan and fftwf_plan alias the same type, so the compiler cannot tell them apart.
+typedef void *fftw_plan;
+
+typedef void *fftwf_plan;
+// Per-dimension descriptor taken by the *_plan_guru_* functions; all fields are plain int.
+typedef struct {
+  int n;   // presumably the dimension length -- confirm against the FFTW guru interface docs
+  int is;  // presumably the input stride -- confirm
+  int os;  // presumably the output stride -- confirm
+} fftw_iodim;
+
+typedef fftw_iodim fftwf_iodim;  // single-precision name for the same struct
+// ptrdiff_t-sized counterpart taken by the *_plan_guru64_* functions.
+typedef struct {
+  ptrdiff_t n;   // NOTE(review): ptrdiff_t comes from <stddef.h>, which this header does not include directly;
+  ptrdiff_t is;  // presumably it arrives via cufft.h -- confirm
+  ptrdiff_t os;
+} fftw_iodim64;
+
+typedef fftw_iodim64 fftwf_iodim64;  // single-precision name for the same struct
+
+// CUFFTW defines and supports the following double precision APIs
+// Complex-to-complex plans for 1, 2, 3 and `rank` dimensions: fftw_complex in and out, returning an opaque fftw_plan.
+fftw_plan CUFFTAPI fftw_plan_dft_1d(int n, 
+                                    fftw_complex *in,
+                                    fftw_complex *out, 
+                                    int sign, 
+                                    unsigned flags);
+
+fftw_plan CUFFTAPI fftw_plan_dft_2d(int n0,
+                                    int n1, 
+                                    fftw_complex *in,
+                                    fftw_complex *out, 
+                                    int sign, 
+                                    unsigned flags);
+
+fftw_plan CUFFTAPI fftw_plan_dft_3d(int n0,
+                                    int n1,
+                                    int n2, 
+                                    fftw_complex *in,
+                                    fftw_complex *out, 
+                                    int sign, 
+                                    unsigned flags);
+// Arbitrary-rank form: n is a const int pointer (presumably `rank` per-dimension sizes -- confirm).
+fftw_plan CUFFTAPI fftw_plan_dft(int rank,
+                                 const int *n,
+                                 fftw_complex *in,
+                                 fftw_complex *out, 
+                                 int sign, 
+                                 unsigned flags);
+// Real-to-complex plans: double input, fftw_complex output; these take no `sign` argument.
+fftw_plan CUFFTAPI fftw_plan_dft_r2c_1d(int n, 
+                                        double *in,
+                                        fftw_complex *out, 
+                                        unsigned flags);
+
+fftw_plan CUFFTAPI fftw_plan_dft_r2c_2d(int n0,
+                                        int n1, 
+                                        double *in,
+                                        fftw_complex *out, 
+                                        unsigned flags);
+
+fftw_plan CUFFTAPI fftw_plan_dft_r2c_3d(int n0,
+                                        int n1,
+                                        int n2, 
+                                        double *in,
+                                        fftw_complex *out, 
+                                        unsigned flags);
+
+fftw_plan CUFFTAPI fftw_plan_dft_r2c(int rank,
+                                     const int *n,
+                                     double *in,
+                                     fftw_complex *out, 
+                                     unsigned flags);
+// Complex-to-real plans: fftw_complex input, double output; these take no `sign` argument either.
+fftw_plan CUFFTAPI fftw_plan_dft_c2r_1d(int n, 
+                                        fftw_complex *in,
+                                        double *out, 
+                                        unsigned flags);
+
+fftw_plan CUFFTAPI fftw_plan_dft_c2r_2d(int n0,
+                                        int n1, 
+                                        fftw_complex *in,
+                                        double *out, 
+                                        unsigned flags);
+
+fftw_plan CUFFTAPI fftw_plan_dft_c2r_3d(int n0,
+                                        int n1,
+                                        int n2, 
+                                        fftw_complex *in,
+                                        double *out, 
+                                        unsigned flags);
+
+fftw_plan CUFFTAPI fftw_plan_dft_c2r(int rank,
+                                     const int *n,
+                                     fftw_complex *in,
+                                     double *out, 
+                                     unsigned flags);
+
+
+fftw_plan CUFFTAPI fftw_plan_many_dft(int rank,
+                                      const int *n,
+                                      int batch,
+                                      fftw_complex *in,
+                                      const int *inembed, int istride, int idist,   // input-side layout parameters
+                                      fftw_complex *out,
+                                      const int *onembed, int ostride, int odist,   // output-side layout parameters
+                                      int sign, unsigned flags);
+// Real-to-complex form: same layout parameters with double input and fftw_complex output; no `sign` argument.
+fftw_plan CUFFTAPI fftw_plan_many_dft_r2c(int rank,
+                                          const int *n,
+                                          int batch,
+                                          double *in,
+                                          const int *inembed, int istride, int idist,
+                                          fftw_complex *out,
+                                          const int *onembed, int ostride, int odist,
+                                          unsigned flags);
+// Complex-to-real form: fftw_complex input and double output; no `sign` argument.
+fftw_plan CUFFTAPI fftw_plan_many_dft_c2r(int rank,
+                                          const int *n,
+                                          int batch,
+                                          fftw_complex *in,
+                                          const int *inembed, int istride, int idist,
+                                          double *out,
+                                          const int *onembed, int ostride, int odist,
+                                          unsigned flags);
+
+fftw_plan CUFFTAPI fftw_plan_guru_dft(int rank, const fftw_iodim *dims,
+                                      int batch_rank, const fftw_iodim *batch_dims,
+                                      fftw_complex *in, fftw_complex *out,
+                                      int sign, unsigned flags);  // presumably rank / batch_rank give the lengths of dims / batch_dims -- confirm
+// Real-to-complex guru form: double input, fftw_complex output, no `sign` argument.
+fftw_plan CUFFTAPI fftw_plan_guru_dft_r2c(int rank, const fftw_iodim *dims,
+                                          int batch_rank, const fftw_iodim *batch_dims,
+                                          double *in, fftw_complex *out, 
+                                          unsigned flags);
+// Complex-to-real guru form: fftw_complex input, double output, no `sign` argument.
+fftw_plan CUFFTAPI fftw_plan_guru_dft_c2r(int rank, const fftw_iodim *dims,
+                                          int batch_rank, const fftw_iodim *batch_dims,
+                                          fftw_complex *in, double *out, 
+                                          unsigned flags);
+// guru64 variants: the same three parameter lists, with dimensions described by fftw_iodim64 instead of fftw_iodim.
+fftw_plan CUFFTAPI fftw_plan_guru64_dft(int rank, const fftw_iodim64* dims,
+                                        int batch_rank, const fftw_iodim64* batch_dims,
+                                        fftw_complex* in, fftw_complex* out,
+                                        int sign, unsigned flags);
+
+fftw_plan CUFFTAPI fftw_plan_guru64_dft_r2c(int rank, const fftw_iodim64* dims,
+                                            int batch_rank, const fftw_iodim64* batch_dims,
+                                            double* in, fftw_complex* out,
+                                            unsigned flags);
+
+fftw_plan CUFFTAPI fftw_plan_guru64_dft_c2r(int rank, const fftw_iodim64* dims,
+                                            int batch_rank, const fftw_iodim64* batch_dims,
+                                            fftw_complex* in, double* out,
+                                            unsigned flags);
+
+void CUFFTAPI fftw_execute(const fftw_plan plan);  // takes only the plan; no buffers are passed
+// The fftw_execute_dft* variants additionally take explicit buffers (idata/odata). `const fftw_plan` is a top-level const on the handle value only.
+void CUFFTAPI fftw_execute_dft(const fftw_plan plan, 
+                               fftw_complex *idata,
+                               fftw_complex *odata);
+
+void CUFFTAPI fftw_execute_dft_r2c(const fftw_plan plan, 
+                                   double *idata,
+                                   fftw_complex *odata);
+
+void CUFFTAPI fftw_execute_dft_c2r(const fftw_plan plan, 
+                                   fftw_complex *idata,
+                                   double *odata);
+
+// CUFFTW defines and supports the following single precision APIs
+// Complex-to-complex plans for 1, 2, 3 and `rank` dimensions: fftwf_complex in and out, returning an opaque fftwf_plan.
+fftwf_plan CUFFTAPI fftwf_plan_dft_1d(int n, 
+                                      fftwf_complex *in,
+                                      fftwf_complex *out, 
+                                      int sign, 
+                                      unsigned flags);
+                                   
+fftwf_plan CUFFTAPI fftwf_plan_dft_2d(int n0,
+                                      int n1, 
+                                      fftwf_complex *in,
+                                      fftwf_complex *out, 
+                                      int sign, 
+                                      unsigned flags);
+
+fftwf_plan CUFFTAPI fftwf_plan_dft_3d(int n0,
+                                      int n1,
+                                      int n2, 
+                                      fftwf_complex *in,
+                                      fftwf_complex *out, 
+                                      int sign, 
+                                      unsigned flags);
+// Arbitrary-rank form: n is a const int pointer (presumably `rank` per-dimension sizes -- confirm).
+fftwf_plan CUFFTAPI fftwf_plan_dft(int rank,
+                                   const int *n,
+                                   fftwf_complex *in,
+                                   fftwf_complex *out, 
+                                   int sign, 
+                                   unsigned flags);
+// Real-to-complex plans: float input, fftwf_complex output; these take no `sign` argument.
+fftwf_plan CUFFTAPI fftwf_plan_dft_r2c_1d(int n, 
+                                          float *in,
+                                          fftwf_complex *out, 
+                                          unsigned flags);
+
+fftwf_plan CUFFTAPI fftwf_plan_dft_r2c_2d(int n0,
+                                          int n1, 
+                                          float *in,
+                                          fftwf_complex *out, 
+                                          unsigned flags);
+
+fftwf_plan CUFFTAPI fftwf_plan_dft_r2c_3d(int n0,
+                                          int n1,
+                                          int n2, 
+                                          float *in,
+                                          fftwf_complex *out, 
+                                          unsigned flags);
+
+fftwf_plan CUFFTAPI fftwf_plan_dft_r2c(int rank,
+                                       const int *n,
+                                       float *in,
+                                       fftwf_complex *out, 
+                                       unsigned flags);
+// Complex-to-real plans: fftwf_complex input, float output; these take no `sign` argument either.
+fftwf_plan CUFFTAPI fftwf_plan_dft_c2r_1d(int n, 
+                                          fftwf_complex *in,
+                                          float *out, 
+                                          unsigned flags);
+                                      
+fftwf_plan CUFFTAPI fftwf_plan_dft_c2r_2d(int n0,
+                                          int n1, 
+                                          fftwf_complex *in,
+                                          float *out, 
+                                          unsigned flags);
+
+fftwf_plan CUFFTAPI fftwf_plan_dft_c2r_3d(int n0,
+                                        int n1,
+                                        int n2, 
+                                        fftwf_complex *in,
+                                        float *out, 
+                                        unsigned flags);
+
+fftwf_plan CUFFTAPI fftwf_plan_dft_c2r(int rank,
+                                       const int *n,
+                                       fftwf_complex *in,
+                                       float *out, 
+                                       unsigned flags);
+
+fftwf_plan CUFFTAPI fftwf_plan_many_dft(int rank,
+                                        const int *n,
+                                        int batch,
+                                        fftwf_complex *in,
+                                        const int *inembed, int istride, int idist,   // input-side layout parameters
+                                        fftwf_complex *out,
+                                        const int *onembed, int ostride, int odist,   // output-side layout parameters
+                                        int sign, unsigned flags);
+// Real-to-complex form: same layout parameters with float input and fftwf_complex output; no `sign` argument.
+fftwf_plan CUFFTAPI fftwf_plan_many_dft_r2c(int rank,
+                                            const int *n,
+                                            int batch,
+                                            float *in,
+                                            const int *inembed, int istride, int idist,
+                                            fftwf_complex *out,
+                                            const int *onembed, int ostride, int odist,
+                                            unsigned flags);
+// Complex-to-real form: fftwf_complex input and float output; no `sign` argument.
+fftwf_plan CUFFTAPI fftwf_plan_many_dft_c2r(int rank,
+                                            const int *n,
+                                            int batch,
+                                            fftwf_complex *in,
+                                            const int *inembed, int istride, int idist,
+                                            float *out,
+                                            const int *onembed, int ostride, int odist,
+                                            unsigned flags);
+
+fftwf_plan CUFFTAPI fftwf_plan_guru_dft(int rank, const fftwf_iodim *dims,
+                                        int batch_rank, const fftwf_iodim *batch_dims,
+                                        fftwf_complex *in, fftwf_complex *out,
+                                        int sign, unsigned flags);  // presumably rank / batch_rank give the lengths of dims / batch_dims -- confirm
+// Real-to-complex guru form: float input, fftwf_complex output, no `sign` argument.
+fftwf_plan CUFFTAPI fftwf_plan_guru_dft_r2c(int rank, const fftwf_iodim *dims,
+                                            int batch_rank, const fftwf_iodim *batch_dims,
+                                            float *in, fftwf_complex *out, 
+                                            unsigned flags);
+// Complex-to-real guru form: fftwf_complex input, float output, no `sign` argument.
+fftwf_plan CUFFTAPI fftwf_plan_guru_dft_c2r(int rank, const fftwf_iodim *dims,
+                                            int batch_rank, const fftwf_iodim *batch_dims,
+                                            fftwf_complex *in, float *out, 
+                                            unsigned flags);
+// guru64 variants: the same three parameter lists, with dimensions described by fftwf_iodim64 instead of fftwf_iodim.
+fftwf_plan CUFFTAPI fftwf_plan_guru64_dft(int rank, const fftwf_iodim64* dims,
+                                          int batch_rank, const fftwf_iodim64* batch_dims,
+                                          fftwf_complex* in, fftwf_complex* out,
+                                          int sign, unsigned flags);
+
+fftwf_plan CUFFTAPI fftwf_plan_guru64_dft_r2c(int rank, const fftwf_iodim64* dims,
+                                              int batch_rank, const fftwf_iodim64* batch_dims,
+                                              float* in, fftwf_complex* out,
+                                              unsigned flags);
+
+fftwf_plan CUFFTAPI fftwf_plan_guru64_dft_c2r(int rank, const fftwf_iodim64* dims,
+                                              int batch_rank, const fftwf_iodim64* batch_dims,
+                                              fftwf_complex* in, float* out,
+                                              unsigned flags);
+
+void CUFFTAPI fftwf_execute(const fftwf_plan plan);  // single-precision plan handle, consistent with the fftwf_execute_dft* declarations below
+// The fftwf_execute_dft* variants additionally take explicit buffers (idata/odata). `const fftwf_plan` is a top-level const on the handle value only.
+void CUFFTAPI fftwf_execute_dft(const fftwf_plan plan, 
+                                fftwf_complex *idata,
+                                fftwf_complex *odata);
+
+void CUFFTAPI fftwf_execute_dft_r2c(const fftwf_plan plan, 
+                                    float *idata,
+                                    fftwf_complex *odata);
+
+void CUFFTAPI fftwf_execute_dft_c2r(const fftwf_plan plan, 
+                                    fftwf_complex *idata,
+                                    float *odata);
+
+#ifdef _WIN32  /* _CUFFTAPI(T) declares a function returning T; it is used below for the pointer-returning fftw_malloc / fftwf_malloc */
+#define _CUFFTAPI(T) T CUFFTAPI  /* _WIN32: return type first, then CUFFTAPI */
+#else
+#define _CUFFTAPI(T) CUFFTAPI T  /* elsewhere: CUFFTAPI precedes the return type */
+#endif  /* NOTE(review): identifiers starting with underscore + uppercase letter are reserved for the implementation in C and C++ */
+
+// CUFFTW defines and supports the following support APIs
+// Allocation: declared through _CUFFTAPI(void *), which places CUFFTAPI after the return type on _WIN32 and before it elsewhere.
+_CUFFTAPI(void *) fftw_malloc(size_t n);
+
+_CUFFTAPI(void *) fftwf_malloc(size_t n);
+// Presumably these release memory obtained from fftw_malloc / fftwf_malloc -- confirm.
+void CUFFTAPI fftw_free(void *pointer);
+
+void CUFFTAPI fftwf_free(void *pointer);
+// Wisdom import/export take a caller-supplied FILE * (hence the <stdio.h> include); the meaning of the import functions' int result is not visible here.
+void CUFFTAPI fftw_export_wisdom_to_file(FILE * output_file);
+
+void CUFFTAPI fftwf_export_wisdom_to_file(FILE * output_file);
+
+int CUFFTAPI fftw_import_wisdom_from_file(FILE * input_file);
+
+int CUFFTAPI fftwf_import_wisdom_from_file(FILE * input_file);
+// Plan printing takes the handle by value; no output stream parameter is passed.
+void CUFFTAPI fftw_print_plan(const fftw_plan plan);                                
+
+void CUFFTAPI fftwf_print_plan(const fftwf_plan plan);
+// Both precisions take the limit as a double, named `seconds`.
+void CUFFTAPI fftw_set_timelimit(double seconds);
+
+void CUFFTAPI fftwf_set_timelimit(double seconds);
+
+double CUFFTAPI fftw_cost(const fftw_plan plan);
+// Single-precision entry points take fftwf_plan, like fftwf_print_plan and fftwf_destroy_plan (same void * type, so callers are unaffected).
+double CUFFTAPI fftwf_cost(const fftwf_plan plan);
+// add / mul / fma are non-const double pointers (presumably outputs -- confirm); they stay double * in the fftwf_ variant too.
+void CUFFTAPI fftw_flops(const fftw_plan plan, double *add, double *mul, double *fma);
+
+void CUFFTAPI fftwf_flops(const fftwf_plan plan, double *add, double *mul, double *fma);
+
+void CUFFTAPI fftw_destroy_plan(fftw_plan plan);  // handle taken by value, without the const used by the execute/print declarations
+
+void CUFFTAPI fftwf_destroy_plan(fftwf_plan plan);
+// The cleanup functions take no arguments and return nothing; what they release is not visible from this header.
+void CUFFTAPI fftw_cleanup(void);
+
+void CUFFTAPI fftwf_cleanup(void);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _CUFFTW_H_ */
diff --git a/.venv/lib/python3.11/site-packages/nvidia/cufft/lib/__init__.py b/.venv/lib/python3.11/site-packages/nvidia/cufft/lib/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/.venv/lib/python3.11/site-packages/nvidia/cufft/lib/__pycache__/__init__.cpython-311.pyc b/.venv/lib/python3.11/site-packages/nvidia/cufft/lib/__pycache__/__init__.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b6d413cc6bc4ac3086636ae3df9adca0e986a777
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/nvidia/cufft/lib/__pycache__/__init__.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/opencv_python_headless.libs/libavcodec-76c43bf0.so.59.37.100 b/.venv/lib/python3.11/site-packages/opencv_python_headless.libs/libavcodec-76c43bf0.so.59.37.100
new file mode 100644
index 0000000000000000000000000000000000000000..766de7d14113c9344f01afa717839348a505bb8f
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/opencv_python_headless.libs/libavcodec-76c43bf0.so.59.37.100
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0a39cd4fbb6e7261c70fa98e72fa09cce7dfcd6d1fb9f74b17ad58640787b0ed
+size 13452609