BryanW commited on Mar 23

Commit

a3277d7

verified ·

1 Parent(s): 95414f3

Add files using upload-large-folder tool

Browse files

This view is limited to 50 files because it contains too many changes. See raw diff

Files changed (50) hide show

Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/accelerate/__pycache__/__init__.cpython-312.pyc +0 -0
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/accelerate/__pycache__/big_modeling.cpython-312.pyc +0 -0
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/accelerate/__pycache__/checkpointing.cpython-312.pyc +0 -0
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/accelerate/__pycache__/launchers.cpython-312.pyc +0 -0
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/accelerate/__pycache__/optimizer.cpython-312.pyc +0 -0
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml/html/ElementSoup.py +10 -0
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml/html/_difflib.py +2106 -0
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml/html/formfill.py +299 -0
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml/html/html5parser.py +260 -0
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml/includes/__init__.pxd +0 -0
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml/includes/config.pxd +3 -0
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml/includes/relaxng.pxd +64 -0
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml/includes/schematron.pxd +34 -0
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml/includes/xpath.pxd +136 -0
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml/isoschematron/__init__.py +348 -0
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/__pycache__/__init__.cpython-312.pyc +0 -0
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/__pycache__/compiler.cpython-312.pyc +0 -0
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/__pycache__/driver.cpython-312.pyc +0 -0
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/amd/__init__.py +0 -0
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/amd/compiler.py +495 -0
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/amd/driver.c +504 -0
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/amd/driver.py +877 -0
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/__init__.py +0 -0
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/compiler.py +553 -0
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/driver.c +518 -0
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/driver.py +764 -0
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/cudaGL.h +608 -0
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/cupti_pcsampling_util.h +402 -0
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/driver_types.h +0 -0
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/generated_cudaVDPAU_meta.h +46 -0
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/nvperf_target.h +626 -0
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/sm_32_atomic_functions.hpp +151 -0
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/language/__pycache__/__init__.cpython-312.pyc +0 -0
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/language/__pycache__/math.cpython-312.pyc +0 -0
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/language/__pycache__/random.cpython-312.pyc +0 -0
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/language/__pycache__/standard.cpython-312.pyc +0 -0
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/language/__pycache__/target_info.cpython-312.pyc +0 -0
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/language/extra/__init__.py +26 -0
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/language/extra/__pycache__/__init__.cpython-312.pyc +0 -0
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/language/extra/__pycache__/libdevice.cpython-312.pyc +0 -0
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/language/extra/cuda/__init__.py +16 -0
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/language/extra/cuda/__pycache__/__init__.cpython-312.pyc +0 -0
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/language/extra/cuda/__pycache__/gdc.cpython-312.pyc +0 -0
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/language/extra/cuda/__pycache__/libdevice.cpython-312.pyc +0 -0
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/language/extra/cuda/__pycache__/utils.cpython-312.pyc +0 -0
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/language/extra/cuda/gdc.py +42 -0
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/language/extra/cuda/libdevice.py +1629 -0
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/language/extra/cuda/utils.py +109 -0
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/language/extra/hip/__init__.py +5 -0
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/language/extra/hip/__pycache__/__init__.cpython-312.pyc +0 -0

Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/accelerate/__pycache__/__init__.cpython-312.pyc ADDED Viewed

Binary file (1.39 kB). View file

Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/accelerate/__pycache__/big_modeling.cpython-312.pyc ADDED Viewed

Binary file (36.9 kB). View file

Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/accelerate/__pycache__/checkpointing.cpython-312.pyc ADDED Viewed

Binary file (15.4 kB). View file

Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/accelerate/__pycache__/launchers.cpython-312.pyc ADDED Viewed

Binary file (12.9 kB). View file

Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/accelerate/__pycache__/optimizer.cpython-312.pyc ADDED Viewed

Binary file (11.7 kB). View file

Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml/html/ElementSoup.py ADDED Viewed

	@@ -0,0 +1,10 @@

+__doc__ = """Legacy interface to the BeautifulSoup HTML parser.
+"""
+__all__ = ["parse", "convert_tree"]
+from .soupparser import convert_tree, parse as _parse
+def parse(file, beautifulsoup=None, makeelement=None):
+    root = _parse(file, beautifulsoup=beautifulsoup, makeelement=makeelement)
+    return root.getroot()

Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml/html/_difflib.py ADDED Viewed

	@@ -0,0 +1,2106 @@

+# Copied from CPython 3.14b2+.
+# cython: infer_types=True
+"""
+Module difflib -- helpers for computing deltas between objects.
+Function get_close_matches(word, possibilities, n=3, cutoff=0.6):
+    Use SequenceMatcher to return list of the best "good enough" matches.
+Function context_diff(a, b):
+    For two lists of strings, return a delta in context diff format.
+Function ndiff(a, b):
+    Return a delta: the difference between `a` and `b` (lists of strings).
+Function restore(delta, which):
+    Return one of the two sequences that generated an ndiff delta.
+Function unified_diff(a, b):
+    For two lists of strings, return a delta in unified diff format.
+Class SequenceMatcher:
+    A flexible class for comparing pairs of sequences of any type.
+Class Differ:
+    For producing human-readable deltas from sequences of lines of text.
+Class HtmlDiff:
+    For producing HTML side by side comparison with change highlights.
+"""
+try:
+    import cython
+except ImportError:
+    class fake_cython:
+        compiled = False
+        def cfunc(self, func): return func
+        def declare(self, _, value): return value
+        def __getattr__(self, type_name): return "object"
+    cython = fake_cython()
+__all__ = ['get_close_matches', 'ndiff', 'restore', 'SequenceMatcher',
+           'Differ','IS_CHARACTER_JUNK', 'IS_LINE_JUNK', 'context_diff',
+           'unified_diff', 'diff_bytes', 'HtmlDiff', 'Match']
+from heapq import nlargest as _nlargest
+from collections import namedtuple as _namedtuple
+try:
+    from types import GenericAlias
+except ImportError:
+    GenericAlias = None
+Match = _namedtuple('Match', 'a b size')
+def _calculate_ratio(matches, length):
+    if length:
+        return 2.0 * matches / length
+    return 1.0
+class SequenceMatcher:
+    """
+    SequenceMatcher is a flexible class for comparing pairs of sequences of
+    any type, so long as the sequence elements are hashable.  The basic
+    algorithm predates, and is a little fancier than, an algorithm
+    published in the late 1980's by Ratcliff and Obershelp under the
+    hyperbolic name "gestalt pattern matching".  The basic idea is to find
+    the longest contiguous matching subsequence that contains no "junk"
+    elements (R-O doesn't address junk).  The same idea is then applied
+    recursively to the pieces of the sequences to the left and to the right
+    of the matching subsequence.  This does not yield minimal edit
+    sequences, but does tend to yield matches that "look right" to people.
+    SequenceMatcher tries to compute a "human-friendly diff" between two
+    sequences.  Unlike e.g. UNIX(tm) diff, the fundamental notion is the
+    longest *contiguous* & junk-free matching subsequence.  That's what
+    catches peoples' eyes.  The Windows(tm) windiff has another interesting
+    notion, pairing up elements that appear uniquely in each sequence.
+    That, and the method here, appear to yield more intuitive difference
+    reports than does diff.  This method appears to be the least vulnerable
+    to syncing up on blocks of "junk lines", though (like blank lines in
+    ordinary text files, or maybe "<P>" lines in HTML files).  That may be
+    because this is the only method of the 3 that has a *concept* of
+    "junk" <wink>.
+    Example, comparing two strings, and considering blanks to be "junk":
+    >>> s = SequenceMatcher(lambda x: x == " ",
+    ...                     "private Thread currentThread;",
+    ...                     "private volatile Thread currentThread;")
+    >>>
+    .ratio() returns a float in [0, 1], measuring the "similarity" of the
+    sequences.  As a rule of thumb, a .ratio() value over 0.6 means the
+    sequences are close matches:
+    >>> print(round(s.ratio(), 3))
+    0.866
+    >>>
+    If you're only interested in where the sequences match,
+    .get_matching_blocks() is handy:
+    >>> for block in s.get_matching_blocks():
+    ...     print("a[%d] and b[%d] match for %d elements" % block)
+    a[0] and b[0] match for 8 elements
+    a[8] and b[17] match for 21 elements
+    a[29] and b[38] match for 0 elements
+    Note that the last tuple returned by .get_matching_blocks() is always a
+    dummy, (len(a), len(b), 0), and this is the only case in which the last
+    tuple element (number of elements matched) is 0.
+    If you want to know how to change the first sequence into the second,
+    use .get_opcodes():
+    >>> for opcode in s.get_opcodes():
+    ...     print("%6s a[%d:%d] b[%d:%d]" % opcode)
+     equal a[0:8] b[0:8]
+    insert a[8:8] b[8:17]
+     equal a[8:29] b[17:38]
+    See the Differ class for a fancy human-friendly file differencer, which
+    uses SequenceMatcher both to compare sequences of lines, and to compare
+    sequences of characters within similar (near-matching) lines.
+    See also function get_close_matches() in this module, which shows how
+    simple code building on SequenceMatcher can be used to do useful work.
+    Timing:  Basic R-O is cubic time worst case and quadratic time expected
+    case.  SequenceMatcher is quadratic time for the worst case and has
+    expected-case behavior dependent in a complicated way on how many
+    elements the sequences have in common; best case time is linear.
+    """
+    def __init__(self, isjunk=None, a='', b='', autojunk=True):
+        """Construct a SequenceMatcher.
+        Optional arg isjunk is None (the default), or a one-argument
+        function that takes a sequence element and returns true iff the
+        element is junk.  None is equivalent to passing "lambda x: 0", i.e.
+        no elements are considered to be junk.  For example, pass
+            lambda x: x in " \\t"
+        if you're comparing lines as sequences of characters, and don't
+        want to synch up on blanks or hard tabs.
+        Optional arg a is the first of two sequences to be compared.  By
+        default, an empty string.  The elements of a must be hashable.  See
+        also .set_seqs() and .set_seq1().
+        Optional arg b is the second of two sequences to be compared.  By
+        default, an empty string.  The elements of b must be hashable. See
+        also .set_seqs() and .set_seq2().
+        Optional arg autojunk should be set to False to disable the
+        "automatic junk heuristic" that treats popular elements as junk
+        (see module documentation for more information).
+        """
+        # Members:
+        # a
+        #      first sequence
+        # b
+        #      second sequence; differences are computed as "what do
+        #      we need to do to 'a' to change it into 'b'?"
+        # b2j
+        #      for x in b, b2j[x] is a list of the indices (into b)
+        #      at which x appears; junk and popular elements do not appear
+        # fullbcount
+        #      for x in b, fullbcount[x] == the number of times x
+        #      appears in b; only materialized if really needed (used
+        #      only for computing quick_ratio())
+        # matching_blocks
+        #      a list of (i, j, k) triples, where a[i:i+k] == b[j:j+k];
+        #      ascending & non-overlapping in i and in j; terminated by
+        #      a dummy (len(a), len(b), 0) sentinel
+        # opcodes
+        #      a list of (tag, i1, i2, j1, j2) tuples, where tag is
+        #      one of
+        #          'replace'   a[i1:i2] should be replaced by b[j1:j2]
+        #          'delete'    a[i1:i2] should be deleted
+        #          'insert'    b[j1:j2] should be inserted
+        #          'equal'     a[i1:i2] == b[j1:j2]
+        # isjunk
+        #      a user-supplied function taking a sequence element and
+        #      returning true iff the element is "junk" -- this has
+        #      subtle but helpful effects on the algorithm, which I'll
+        #      get around to writing up someday <0.9 wink>.
+        #      DON'T USE!  Only __chain_b uses this.  Use "in self.bjunk".
+        # bjunk
+        #      the items in b for which isjunk is True.
+        # bpopular
+        #      nonjunk items in b treated as junk by the heuristic (if used).
+        self.isjunk = isjunk
+        self.a = self.b = None
+        self.autojunk = autojunk
+        self.set_seqs(a, b)
+    def set_seqs(self, a, b):
+        """Set the two sequences to be compared.
+        >>> s = SequenceMatcher()
+        >>> s.set_seqs("abcd", "bcde")
+        >>> s.ratio()
+        0.75
+        """
+        self.set_seq1(a)
+        self.set_seq2(b)
+    def set_seq1(self, a):
+        """Set the first sequence to be compared.
+        The second sequence to be compared is not changed.
+        >>> s = SequenceMatcher(None, "abcd", "bcde")
+        >>> s.ratio()
+        0.75
+        >>> s.set_seq1("bcde")
+        >>> s.ratio()
+        1.0
+        >>>
+        SequenceMatcher computes and caches detailed information about the
+        second sequence, so if you want to compare one sequence S against
+        many sequences, use .set_seq2(S) once and call .set_seq1(x)
+        repeatedly for each of the other sequences.
+        See also set_seqs() and set_seq2().
+        """
+        if a is self.a:
+            return
+        self.a = a
+        self.matching_blocks = self.opcodes = None
+    def set_seq2(self, b):
+        """Set the second sequence to be compared.
+        The first sequence to be compared is not changed.
+        >>> s = SequenceMatcher(None, "abcd", "bcde")
+        >>> s.ratio()
+        0.75
+        >>> s.set_seq2("abcd")
+        >>> s.ratio()
+        1.0
+        >>>
+        SequenceMatcher computes and caches detailed information about the
+        second sequence, so if you want to compare one sequence S against
+        many sequences, use .set_seq2(S) once and call .set_seq1(x)
+        repeatedly for each of the other sequences.
+        See also set_seqs() and set_seq1().
+        """
+        if b is self.b:
+            return
+        self.b = b
+        self.matching_blocks = self.opcodes = None
+        self.fullbcount = None
+        self.__chain_b()
+    # For each element x in b, set b2j[x] to a list of the indices in
+    # b where x appears; the indices are in increasing order; note that
+    # the number of times x appears in b is len(b2j[x]) ...
+    # when self.isjunk is defined, junk elements don't show up in this
+    # map at all, which stops the central find_longest_match method
+    # from starting any matching block at a junk element ...
+    # b2j also does not contain entries for "popular" elements, meaning
+    # elements that account for more than 1 + 1% of the total elements, and
+    # when the sequence is reasonably large (>= 200 elements); this can
+    # be viewed as an adaptive notion of semi-junk, and yields an enormous
+    # speedup when, e.g., comparing program files with hundreds of
+    # instances of "return NULL;" ...
+    # note that this is only called when b changes; so for cross-product
+    # kinds of matches, it's best to call set_seq2 once, then set_seq1
+    # repeatedly
+    def __chain_b(self):
+        # Because isjunk is a user-defined (not C) function, and we test
+        # for junk a LOT, it's important to minimize the number of calls.
+        # Before the tricks described here, __chain_b was by far the most
+        # time-consuming routine in the whole module!  If anyone sees
+        # Jim Roskind, thank him again for profile.py -- I never would
+        # have guessed that.
+        # The first trick is to build b2j ignoring the possibility
+        # of junk.  I.e., we don't call isjunk at all yet.  Throwing
+        # out the junk later is much cheaper than building b2j "right"
+        # from the start.
+        b = self.b
+        self.b2j = b2j = {}
+        for i, elt in enumerate(b):
+            indices = b2j.setdefault(elt, [])
+            indices.append(i)
+        # Purge junk elements
+        self.bjunk = junk = set()
+        isjunk = self.isjunk
+        if isjunk:
+            for elt in b2j.keys():
+                if isjunk(elt):
+                    junk.add(elt)
+            for elt in junk: # separate loop avoids separate list of keys
+                del b2j[elt]
+        # Purge popular elements that are not junk
+        self.bpopular = popular = set()
+        n = len(b)
+        if self.autojunk and n >= 200:
+            ntest = n // 100 + 1
+            for elt, idxs in b2j.items():
+                if len(idxs) > ntest:
+                    popular.add(elt)
+            for elt in popular: # ditto; as fast for 1% deletion
+                del b2j[elt]
+    def find_longest_match(self, alo=0, ahi_=None, blo=0, bhi_=None):
+        """Find longest matching block in a[alo:ahi] and b[blo:bhi].
+        By default it will find the longest match in the entirety of a and b.
+        If isjunk is not defined:
+        Return (i,j,k) such that a[i:i+k] is equal to b[j:j+k], where
+            alo <= i <= i+k <= ahi
+            blo <= j <= j+k <= bhi
+        and for all (i',j',k') meeting those conditions,
+            k >= k'
+            i <= i'
+            and if i == i', j <= j'
+        In other words, of all maximal matching blocks, return one that
+        starts earliest in a, and of all those maximal matching blocks that
+        start earliest in a, return the one that starts earliest in b.
+        >>> s = SequenceMatcher(None, " abcd", "abcd abcd")
+        >>> s.find_longest_match(0, 5, 0, 9)
+        Match(a=0, b=4, size=5)
+        If isjunk is defined, first the longest matching block is
+        determined as above, but with the additional restriction that no
+        junk element appears in the block.  Then that block is extended as
+        far as possible by matching (only) junk elements on both sides.  So
+        the resulting block never matches on junk except as identical junk
+        happens to be adjacent to an "interesting" match.
+        Here's the same example as before, but considering blanks to be
+        junk.  That prevents " abcd" from matching the " abcd" at the tail
+        end of the second sequence directly.  Instead only the "abcd" can
+        match, and matches the leftmost "abcd" in the second sequence:
+        >>> s = SequenceMatcher(lambda x: x==" ", " abcd", "abcd abcd")
+        >>> s.find_longest_match(0, 5, 0, 9)
+        Match(a=1, b=0, size=4)
+        If no blocks match, return (alo, blo, 0).
+        >>> s = SequenceMatcher(None, "ab", "c")
+        >>> s.find_longest_match(0, 2, 0, 1)
+        Match(a=0, b=0, size=0)
+        """
+        # CAUTION:  stripping common prefix or suffix would be incorrect.
+        # E.g.,
+        #    ab
+        #    acab
+        # Longest matching block is "ab", but if common prefix is
+        # stripped, it's "a" (tied with "b").  UNIX(tm) diff does so
+        # strip, so ends up claiming that ab is changed to acab by
+        # inserting "ca" in the middle.  That's minimal but unintuitive:
+        # "it's obvious" that someone inserted "ac" at the front.
+        # Windiff ends up at the same place as diff, but by pairing up
+        # the unique 'b's and then matching the first two 'a's.
+        bjunk: set = self.bjunk
+        a, b, b2j = self.a, self.b, self.b2j
+        ahi = len(a) if ahi_ is None else ahi_
+        bhi = len(b) if bhi_ is None else bhi_
+        besti, bestj, bestsize = alo, blo, 0
+        # find longest junk-free match
+        # during an iteration of the loop, j2len[j] = length of longest
+        # junk-free match ending with a[i-1] and b[j]
+        j2len = {}
+        nothing = []
+        for i in range(alo, ahi):
+            # look at all instances of a[i] in b; note that because
+            # b2j has no junk keys, the loop is skipped if a[i] is junk
+            newj2len = {}
+            for j in b2j.get(a[i], nothing):
+                # a[i] matches b[j]
+                if j < blo:
+                    continue
+                if j >= bhi:
+                    break
+                k = newj2len[j] = j2len.get(j-1, 0) + 1
+                if k > bestsize:
+                    besti, bestj, bestsize = i-k+1, j-k+1, k
+            j2len = newj2len
+        # Extend the best by non-junk elements on each end.  In particular,
+        # "popular" non-junk elements aren't in b2j, which greatly speeds
+        # the inner loop above, but also means "the best" match so far
+        # doesn't contain any junk *or* popular non-junk elements.
+        while besti > alo and bestj > blo and \
+              b[bestj-1] not in bjunk and \
+              a[besti-1] == b[bestj-1]:
+            besti, bestj, bestsize = besti-1, bestj-1, bestsize+1
+        while besti+bestsize < ahi and bestj+bestsize < bhi and \
+              b[bestj+bestsize] not in bjunk and \
+              a[besti+bestsize] == b[bestj+bestsize]:
+            bestsize += 1
+        # Now that we have a wholly interesting match (albeit possibly
+        # empty!), we may as well suck up the matching junk on each
+        # side of it too.  Can't think of a good reason not to, and it
+        # saves post-processing the (possibly considerable) expense of
+        # figuring out what to do with it.  In the case of an empty
+        # interesting match, this is clearly the right thing to do,
+        # because no other kind of match is possible in the regions.
+        while besti > alo and bestj > blo and \
+              b[bestj-1] in bjunk and \
+              a[besti-1] == b[bestj-1]:
+            besti, bestj, bestsize = besti-1, bestj-1, bestsize+1
+        while besti+bestsize < ahi and bestj+bestsize < bhi and \
+              b[bestj+bestsize] in bjunk and \
+              a[besti+bestsize] == b[bestj+bestsize]:
+            bestsize = bestsize + 1
+        return Match(besti, bestj, bestsize)
+    def get_matching_blocks(self):
+        """Return list of triples describing matching subsequences.
+        Each triple is of the form (i, j, n), and means that
+        a[i:i+n] == b[j:j+n].  The triples are monotonically increasing in
+        i and in j.  New in Python 2.5, it's also guaranteed that if
+        (i, j, n) and (i', j', n') are adjacent triples in the list, and
+        the second is not the last triple in the list, then i+n != i' or
+        j+n != j'.  IOW, adjacent triples never describe adjacent equal
+        blocks.
+        The last triple is a dummy, (len(a), len(b), 0), and is the only
+        triple with n==0.
+        >>> s = SequenceMatcher(None, "abxcd", "abcd")
+        >>> list(s.get_matching_blocks())
+        [Match(a=0, b=0, size=2), Match(a=3, b=2, size=2), Match(a=5, b=4, size=0)]
+        """
+        if self.matching_blocks is not None:
+            return self.matching_blocks
+        la, lb = len(self.a), len(self.b)
+        # This is most naturally expressed as a recursive algorithm, but
+        # at least one user bumped into extreme use cases that exceeded
+        # the recursion limit on their box.  So, now we maintain a list
+        # ('queue`) of blocks we still need to look at, and append partial
+        # results to `matching_blocks` in a loop; the matches are sorted
+        # at the end.
+        queue = [(0, la, 0, lb)]
+        matching_blocks = []
+        while queue:
+            alo, ahi, blo, bhi = queue.pop()
+            i, j, k = x = self.find_longest_match(alo, ahi, blo, bhi)
+            # a[alo:i] vs b[blo:j] unknown
+            # a[i:i+k] same as b[j:j+k]
+            # a[i+k:ahi] vs b[j+k:bhi] unknown
+            if k:   # if k is 0, there was no matching block
+                matching_blocks.append(x)
+                if alo < i and blo < j:
+                    queue.append((alo, i, blo, j))
+                if i+k < ahi and j+k < bhi:
+                    queue.append((i+k, ahi, j+k, bhi))
+        matching_blocks.sort()
+        # It's possible that we have adjacent equal blocks in the
+        # matching_blocks list now.  Starting with 2.5, this code was added
+        # to collapse them.
+        i1 = j1 = k1 = 0
+        non_adjacent = []
+        for i2, j2, k2 in matching_blocks:
+            # Is this block adjacent to i1, j1, k1?
+            if i1 + k1 == i2 and j1 + k1 == j2:
+                # Yes, so collapse them -- this just increases the length of
+                # the first block by the length of the second, and the first
+                # block so lengthened remains the block to compare against.
+                k1 += k2
+            else:
+                # Not adjacent.  Remember the first block (k1==0 means it's
+                # the dummy we started with), and make the second block the
+                # new block to compare against.
+                if k1:
+                    non_adjacent.append((i1, j1, k1))
+                i1, j1, k1 = i2, j2, k2
+        if k1:
+            non_adjacent.append((i1, j1, k1))
+        non_adjacent.append( (la, lb, 0) )
+        self.matching_blocks = list(map(Match._make, non_adjacent))
+        return self.matching_blocks
+    def get_opcodes(self):
+        """Return list of 5-tuples describing how to turn a into b.
+        Each tuple is of the form (tag, i1, i2, j1, j2).  The first tuple
+        has i1 == j1 == 0, and remaining tuples have i1 == the i2 from the
+        tuple preceding it, and likewise for j1 == the previous j2.
+        The tags are strings, with these meanings:
+        'replace':  a[i1:i2] should be replaced by b[j1:j2]
+        'delete':   a[i1:i2] should be deleted.
+                    Note that j1==j2 in this case.
+        'insert':   b[j1:j2] should be inserted at a[i1:i1].
+                    Note that i1==i2 in this case.
+        'equal':    a[i1:i2] == b[j1:j2]
+        >>> a = "qabxcd"
+        >>> b = "abycdf"
+        >>> s = SequenceMatcher(None, a, b)
+        >>> for tag, i1, i2, j1, j2 in s.get_opcodes():
+        ...    print(("%7s a[%d:%d] (%s) b[%d:%d] (%s)" %
+        ...           (tag, i1, i2, a[i1:i2], j1, j2, b[j1:j2])))
+         delete a[0:1] (q) b[0:0] ()
+          equal a[1:3] (ab) b[0:2] (ab)
+        replace a[3:4] (x) b[2:3] (y)
+          equal a[4:6] (cd) b[3:5] (cd)
+         insert a[6:6] () b[5:6] (f)
+        """
+        if self.opcodes is not None:
+            return self.opcodes
+        i = j = 0
+        self.opcodes = answer = []
+        for ai, bj, size in self.get_matching_blocks():
+            # invariant:  we've pumped out correct diffs to change
+            # a[:i] into b[:j], and the next matching block is
+            # a[ai:ai+size] == b[bj:bj+size].  So we need to pump
+            # out a diff to change a[i:ai] into b[j:bj], pump out
+            # the matching block, and move (i,j) beyond the match
+            tag = ''
+            if i < ai and j < bj:
+                tag = 'replace'
+            elif i < ai:
+                tag = 'delete'
+            elif j < bj:
+                tag = 'insert'
+            if tag:
+                answer.append( (tag, i, ai, j, bj) )
+            i, j = ai+size, bj+size
+            # the list of matching blocks is terminated by a
+            # sentinel with size 0
+            if size:
+                answer.append( ('equal', ai, i, bj, j) )
+        return answer
+    def get_grouped_opcodes(self, n=3):
+        """ Isolate change clusters by eliminating ranges with no changes.
+        Return a generator of groups with up to n lines of context.
+        Each group is in the same format as returned by get_opcodes().
+        >>> from pprint import pprint
+        >>> a = list(map(str, range(1,40)))
+        >>> b = a[:]
+        >>> b[8:8] = ['i']     # Make an insertion
+        >>> b[20] += 'x'       # Make a replacement
+        >>> b[23:28] = []      # Make a deletion
+        >>> b[30] += 'y'       # Make another replacement
+        >>> pprint(list(SequenceMatcher(None,a,b).get_grouped_opcodes()))
+        [[('equal', 5, 8, 5, 8), ('insert', 8, 8, 8, 9), ('equal', 8, 11, 9, 12)],
+         [('equal', 16, 19, 17, 20),
+          ('replace', 19, 20, 20, 21),
+          ('equal', 20, 22, 21, 23),
+          ('delete', 22, 27, 23, 23),
+          ('equal', 27, 30, 23, 26)],
+         [('equal', 31, 34, 27, 30),
+          ('replace', 34, 35, 30, 31),
+          ('equal', 35, 38, 31, 34)]]
+        """
+        codes = self.get_opcodes()
+        if not codes:
+            codes = [("equal", 0, 1, 0, 1)]
+        # Fixup leading and trailing groups if they show no changes.
+        if codes[0][0] == 'equal':
+            tag, i1, i2, j1, j2 = codes[0]
+            codes[0] = tag, max(i1, i2-n), i2, max(j1, j2-n), j2
+        if codes[-1][0] == 'equal':
+            tag, i1, i2, j1, j2 = codes[-1]
+            codes[-1] = tag, i1, min(i2, i1+n), j1, min(j2, j1+n)
+        nn = n + n
+        group = []
+        for tag, i1, i2, j1, j2 in codes:
+            # End the current group and start a new one whenever
+            # there is a large range with no changes.
+            if tag == 'equal' and i2-i1 > nn:
+                group.append((tag, i1, min(i2, i1+n), j1, min(j2, j1+n)))
+                yield group
+                group = []
+                i1, j1 = max(i1, i2-n), max(j1, j2-n)
+            group.append((tag, i1, i2, j1 ,j2))
+        if group and not (len(group)==1 and group[0][0] == 'equal'):
+            yield group
+    def ratio(self):
+        """Return a measure of the sequences' similarity (float in [0,1]).
+        Where T is the total number of elements in both sequences, and
+        M is the number of matches, this is 2.0*M / T.
+        Note that this is 1 if the sequences are identical, and 0 if
+        they have nothing in common.
+        .ratio() is expensive to compute if you haven't already computed
+        .get_matching_blocks() or .get_opcodes(), in which case you may
+        want to try .quick_ratio() or .real_quick_ratio() first to get an
+        upper bound.
+        >>> s = SequenceMatcher(None, "abcd", "bcde")
+        >>> s.ratio()
+        0.75
+        >>> s.quick_ratio()
+        0.75
+        >>> s.real_quick_ratio()
+        1.0
+        """
+        matches: cython.Py_ssize_t
+        matches = sum(triple[-1] for triple in self.get_matching_blocks())
+        return _calculate_ratio(matches, len(self.a) + len(self.b))
+    def quick_ratio(self):
+        """Return an upper bound on ratio() relatively quickly.
+        This isn't defined beyond that it is an upper bound on .ratio(), and
+        is faster to compute.
+        """
+        # viewing a and b as multisets, set matches to the cardinality
+        # of their intersection; this counts the number of matches
+        # without regard to order, so is clearly an upper bound
+        if self.fullbcount is None:
+            self.fullbcount = fullbcount = {}
+            for elt in self.b:
+                fullbcount[elt] = fullbcount.get(elt, 0) + 1
+        fullbcount = self.fullbcount
+        # avail[x] is the number of times x appears in 'b' less the
+        # number of times we've seen it in 'a' so far ... kinda
+        avail = {}
+        matches: cython.Py_ssize_t
+        matches = 0
+        for elt in self.a:
+            if elt in avail:
+                numb = avail[elt]
+            else:
+                numb = fullbcount.get(elt, 0)
+            avail[elt] = numb - 1
+            if numb > 0:
+                matches = matches + 1
+        return _calculate_ratio(matches, len(self.a) + len(self.b))
+    def real_quick_ratio(self):
+        """Return an upper bound on ratio() very quickly.
+        This isn't defined beyond that it is an upper bound on .ratio(), and
+        is faster to compute than either .ratio() or .quick_ratio().
+        """
+        la, lb = len(self.a), len(self.b)
+        # can't have more matches than the number of elements in the
+        # shorter sequence
+        return _calculate_ratio(min(la, lb), la + lb)
+    if GenericAlias is not None:
+        __class_getitem__ = classmethod(GenericAlias)
+def get_close_matches(word, possibilities, n=3, cutoff=0.6):
+    """Use SequenceMatcher to return list of the best "good enough" matches.
+    word is a sequence for which close matches are desired (typically a
+    string).
+    possibilities is a list of sequences against which to match word
+    (typically a list of strings).
+    Optional arg n (default 3) is the maximum number of close matches to
+    return.  n must be > 0.
+    Optional arg cutoff (default 0.6) is a float in [0, 1].  Possibilities
+    that don't score at least that similar to word are ignored.
+    The best (no more than n) matches among the possibilities are returned
+    in a list, sorted by similarity score, most similar first.
+    >>> get_close_matches("appel", ["ape", "apple", "peach", "puppy"])
+    ['apple', 'ape']
+    >>> import keyword as _keyword
+    >>> get_close_matches("wheel", _keyword.kwlist)
+    ['while']
+    >>> get_close_matches("Apple", _keyword.kwlist)
+    []
+    >>> get_close_matches("accept", _keyword.kwlist)
+    ['except']
+    """
+    if not n >  0:
+        raise ValueError("n must be > 0: %r" % (n,))
+    if not 0.0 <= cutoff <= 1.0:
+        raise ValueError("cutoff must be in [0.0, 1.0]: %r" % (cutoff,))
+    result = []
+    s = SequenceMatcher()
+    s.set_seq2(word)
+    for x in possibilities:
+        s.set_seq1(x)
+        if s.real_quick_ratio() >= cutoff and \
+           s.quick_ratio() >= cutoff and \
+           s.ratio() >= cutoff:
+            result.append((s.ratio(), x))
+    # Move the best scorers to head of list
+    result = _nlargest(n, result)
+    # Strip scores for the best n matches
+    return [x for score, x in result]
+def _keep_original_ws(s, tag_s):
+    """Replace whitespace with the original whitespace characters in `s`"""
+    return ''.join(
+        c if tag_c == " " and c.isspace() else tag_c
+        for c, tag_c in zip(s, tag_s)
+    )
+class Differ:
+    r"""
+    Differ is a class for comparing sequences of lines of text, and
+    producing human-readable differences or deltas.  Differ uses
+    SequenceMatcher both to compare sequences of lines, and to compare
+    sequences of characters within similar (near-matching) lines.
+    Each line of a Differ delta begins with a two-letter code:
+        '- '    line unique to sequence 1
+        '+ '    line unique to sequence 2
+        '  '    line common to both sequences
+        '? '    line not present in either input sequence
+    Lines beginning with '? ' attempt to guide the eye to intraline
+    differences, and were not present in either input sequence.  These lines
+    can be confusing if the sequences contain tab characters.
+    Note that Differ makes no claim to produce a *minimal* diff.  To the
+    contrary, minimal diffs are often counter-intuitive, because they synch
+    up anywhere possible, sometimes accidental matches 100 pages apart.
+    Restricting synch points to contiguous matches preserves some notion of
+    locality, at the occasional cost of producing a longer diff.
+    Example: Comparing two texts.
+    First we set up the texts, sequences of individual single-line strings
+    ending with newlines (such sequences can also be obtained from the
+    `readlines()` method of file-like objects):
+    >>> text1 = '''  1. Beautiful is better than ugly.
+    ...   2. Explicit is better than implicit.
+    ...   3. Simple is better than complex.
+    ...   4. Complex is better than complicated.
+    ... '''.splitlines(keepends=True)
+    >>> len(text1)
+    4
+    >>> text1[0][-1]
+    '\n'
+    >>> text2 = '''  1. Beautiful is better than ugly.
+    ...   3.   Simple is better than complex.
+    ...   4. Complicated is better than complex.
+    ...   5. Flat is better than nested.
+    ... '''.splitlines(keepends=True)
+    Next we instantiate a Differ object:
+    >>> d = Differ()
+    Note that when instantiating a Differ object we may pass functions to
+    filter out line and character 'junk'.  See Differ.__init__ for details.
+    Finally, we compare the two:
+    >>> result = list(d.compare(text1, text2))
+    'result' is a list of strings, so let's pretty-print it:
+    >>> from pprint import pprint as _pprint
+    >>> _pprint(result)
+    ['    1. Beautiful is better than ugly.\n',
+     '-   2. Explicit is better than implicit.\n',
+     '-   3. Simple is better than complex.\n',
+     '+   3.   Simple is better than complex.\n',
+     '?     ++\n',
+     '-   4. Complex is better than complicated.\n',
+     '?            ^                     ---- ^\n',
+     '+   4. Complicated is better than complex.\n',
+     '?           ++++ ^                      ^\n',
+     '+   5. Flat is better than nested.\n']
+    As a single multi-line string it looks like this:
+    >>> print(''.join(result), end="")
+        1. Beautiful is better than ugly.
+    -   2. Explicit is better than implicit.
+    -   3. Simple is better than complex.
+    +   3.   Simple is better than complex.
+    ?     ++
+    -   4. Complex is better than complicated.
+    ?            ^                     ---- ^
+    +   4. Complicated is better than complex.
+    ?           ++++ ^                      ^
+    +   5. Flat is better than nested.
+    """
+    def __init__(self, linejunk=None, charjunk=None):
+        """
+        Construct a text differencer, with optional filters.
+        The two optional keyword parameters are for filter functions:
+        - `linejunk`: A function that should accept a single string argument,
+          and return true iff the string is junk. The module-level function
+          `IS_LINE_JUNK` may be used to filter out lines without visible
+          characters, except for at most one splat ('#').  It is recommended
+          to leave linejunk None; the underlying SequenceMatcher class has
+          an adaptive notion of "noise" lines that's better than any static
+          definition the author has ever been able to craft.
+        - `charjunk`: A function that should accept a string of length 1. The
+          module-level function `IS_CHARACTER_JUNK` may be used to filter out
+          whitespace characters (a blank or tab; **note**: bad idea to include
+          newline in this!).  Use of IS_CHARACTER_JUNK is recommended.
+        """
+        self.linejunk = linejunk
+        self.charjunk = charjunk
+    def compare(self, a, b):
+        r"""
+        Compare two sequences of lines; generate the resulting delta.
+        Each sequence must contain individual single-line strings ending with
+        newlines. Such sequences can be obtained from the `readlines()` method
+        of file-like objects.  The delta generated also consists of newline-
+        terminated strings, ready to be printed as-is via the writelines()
+        method of a file-like object.
+        Example:
+        >>> print(''.join(Differ().compare('one\ntwo\nthree\n'.splitlines(True),
+        ...                                'ore\ntree\nemu\n'.splitlines(True))),
+        ...       end="")
+        - one
+        ?  ^
+        + ore
+        ?  ^
+        - two
+        - three
+        ?  -
+        + tree
+        + emu
+        """
+        cruncher = SequenceMatcher(self.linejunk, a, b)
+        for tag, alo, ahi, blo, bhi in cruncher.get_opcodes():
+            if tag == 'replace':
+                g = self._fancy_replace(a, alo, ahi, b, blo, bhi)
+            elif tag == 'delete':
+                g = self._dump('-', a, alo, ahi)
+            elif tag == 'insert':
+                g = self._dump('+', b, blo, bhi)
+            elif tag == 'equal':
+                g = self._dump(' ', a, alo, ahi)
+            else:
+                raise ValueError('unknown tag %r' % (tag,))
+            yield from g
+    def _dump(self, tag, x, lo, hi):
+        """Generate comparison results for a same-tagged range."""
+        for i in range(lo, hi):
+            yield '%s %s' % (tag, x[i])
+    def _plain_replace(self, a, alo, ahi, b, blo, bhi):
+        assert alo < ahi and blo < bhi
+        # dump the shorter block first -- reduces the burden on short-term
+        # memory if the blocks are of very different sizes
+        if bhi - blo < ahi - alo:
+            first  = self._dump('+', b, blo, bhi)
+            second = self._dump('-', a, alo, ahi)
+        else:
+            first  = self._dump('-', a, alo, ahi)
+            second = self._dump('+', b, blo, bhi)
+        for g in first, second:
+            yield from g
+    def _fancy_replace(self, a, alo, ahi, b, blo, bhi):
+        r"""
+        When replacing one block of lines with another, search the blocks
+        for *similar* lines; the best-matching pair (if any) is used as a
+        synch point, and intraline difference marking is done on the
+        similar pair. Lots of work, but often worth it.
+        Example:
+        >>> d = Differ()
+        >>> results = d._fancy_replace(['abcDefghiJkl\n'], 0, 1,
+        ...                            ['abcdefGhijkl\n'], 0, 1)
+        >>> print(''.join(results), end="")
+        - abcDefghiJkl
+        ?    ^  ^  ^
+        + abcdefGhijkl
+        ?    ^  ^  ^
+        """
+        # Don't synch up unless the lines have a similarity score above
+        # cutoff. Previously only the smallest pair was handled here,
+        # and if there are many pairs with the best ratio, recursion
+        # could grow very deep, and runtime cubic. See:
+        # https://github.com/python/cpython/issues/119105
+        #
+        # Later, more pathological cases prompted removing recursion
+        # entirely.
+        cutoff = 0.74999
+        cruncher = SequenceMatcher(self.charjunk)
+        crqr = cruncher.real_quick_ratio
+        cqr = cruncher.quick_ratio
+        cr = cruncher.ratio
+        WINDOW = 10
+        best_i = best_j = None
+        dump_i, dump_j = alo, blo # smallest indices not yet resolved
+        for j in range(blo, bhi):
+            cruncher.set_seq2(b[j])
+            # Search the corresponding i's within WINDOW for rhe highest
+            # ratio greater than `cutoff`.
+            aequiv = alo + (j - blo)
+            arange = range(max(aequiv - WINDOW, dump_i),
+                           min(aequiv + WINDOW + 1, ahi))
+            if not arange: # likely exit if `a` is shorter than `b`
+                break
+            best_ratio = cutoff
+            for i in arange:
+                cruncher.set_seq1(a[i])
+                # Ordering by cheapest to most expensive ratio is very
+                # valuable, most often getting out early.
+                if (crqr() > best_ratio
+                      and cqr() > best_ratio
+                      and cr() > best_ratio):
+                    best_i, best_j, best_ratio = i, j, cr()
+            if best_i is None:
+                # found nothing to synch on yet - move to next j
+                continue
+            # pump out straight replace from before this synch pair
+            yield from self._fancy_helper(a, dump_i, best_i,
+                                          b, dump_j, best_j)
+            # do intraline marking on the synch pair
+            aelt, belt = a[best_i], b[best_j]
+            if aelt != belt:
+                # pump out a '-', '?', '+', '?' quad for the synched lines
+                atags = btags = ""
+                cruncher.set_seqs(aelt, belt)
+                for tag, ai1, ai2, bj1, bj2 in cruncher.get_opcodes():
+                    la, lb = ai2 - ai1, bj2 - bj1
+                    if tag == 'replace':
+                        atags += '^' * la
+                        btags += '^' * lb
+                    elif tag == 'delete':
+                        atags += '-' * la
+                    elif tag == 'insert':
+                        btags += '+' * lb
+                    elif tag == 'equal':
+                        atags += ' ' * la
+                        btags += ' ' * lb
+                    else:
+                        raise ValueError('unknown tag %r' % (tag,))
+                yield from self._qformat(aelt, belt, atags, btags)
+            else:
+                # the synch pair is identical
+                yield '  ' + aelt
+            dump_i, dump_j = best_i + 1, best_j + 1
+            best_i = best_j = None
+        # pump out straight replace from after the last synch pair
+        yield from self._fancy_helper(a, dump_i, ahi,
+                                      b, dump_j, bhi)
+    def _fancy_helper(self, a, alo, ahi, b, blo, bhi):
+        g = []
+        if alo < ahi:
+            if blo < bhi:
+                g = self._plain_replace(a, alo, ahi, b, blo, bhi)
+            else:
+                g = self._dump('-', a, alo, ahi)
+        elif blo < bhi:
+            g = self._dump('+', b, blo, bhi)
+        yield from g
+    def _qformat(self, aline, bline, atags, btags):
+        r"""
+        Format "?" output and deal with tabs.
+        Example:
+        >>> d = Differ()
+        >>> results = d._qformat('\tabcDefghiJkl\n', '\tabcdefGhijkl\n',
+        ...                      '  ^ ^  ^      ', '  ^ ^  ^      ')
+        >>> for line in results: print(repr(line))
+        ...
+        '- \tabcDefghiJkl\n'
+        '? \t ^ ^  ^\n'
+        '+ \tabcdefGhijkl\n'
+        '? \t ^ ^  ^\n'
+        """
+        atags = _keep_original_ws(aline, atags).rstrip()
+        btags = _keep_original_ws(bline, btags).rstrip()
+        yield "- " + aline
+        if atags:
+            yield f"? {atags}\n"
+        yield "+ " + bline
+        if btags:
+            yield f"? {btags}\n"
+# With respect to junk, an earlier version of ndiff simply refused to
+# *start* a match with a junk element.  The result was cases like this:
+#     before: private Thread currentThread;
+#     after:  private volatile Thread currentThread;
+# If you consider whitespace to be junk, the longest contiguous match
+# not starting with junk is "e Thread currentThread".  So ndiff reported
+# that "e volatil" was inserted between the 't' and the 'e' in "private".
+# While an accurate view, to people that's absurd.  The current version
+# looks for matching blocks that are entirely junk-free, then extends the
+# longest one of those as far as possible but only with matching junk.
+# So now "currentThread" is matched, then extended to suck up the
+# preceding blank; then "private" is matched, and extended to suck up the
+# following blank; then "Thread" is matched; and finally ndiff reports
+# that "volatile " was inserted before "Thread".  The only quibble
+# remaining is that perhaps it was really the case that " volatile"
+# was inserted after "private".  I can live with that <wink>.
+def IS_LINE_JUNK(line, pat=None):
+    r"""
+    Return True for ignorable line: if `line` is blank or contains a single '#'.
+    Examples:
+    >>> IS_LINE_JUNK('\n')
+    True
+    >>> IS_LINE_JUNK('  #   \n')
+    True
+    >>> IS_LINE_JUNK('hello\n')
+    False
+    """
+    if pat is None:
+        # Default: match '#' or the empty string
+        return line.strip() in '#'
+   # Previous versions used the undocumented parameter 'pat' as a
+   # match function. Retain this behaviour for compatibility.
+    return pat(line) is not None
+def IS_CHARACTER_JUNK(ch, ws=" \t"):
+    r"""
+    Return True for ignorable character: iff `ch` is a space or tab.
+    Examples:
+    >>> IS_CHARACTER_JUNK(' ')
+    True
+    >>> IS_CHARACTER_JUNK('\t')
+    True
+    >>> IS_CHARACTER_JUNK('\n')
+    False
+    >>> IS_CHARACTER_JUNK('x')
+    False
+    """
+    return ch in ws
+########################################################################
+###  Unified Diff
+########################################################################
+def _format_range_unified(start, stop):
+    'Convert range to the "ed" format'
+    # Per the diff spec at http://www.unix.org/single_unix_specification/
+    beginning = start + 1     # lines start numbering with one
+    length = stop - start
+    if length == 1:
+        return '{}'.format(beginning)
+    if not length:
+        beginning -= 1        # empty ranges begin at line just before the range
+    return '{},{}'.format(beginning, length)
+def unified_diff(a, b, fromfile='', tofile='', fromfiledate='',
+                 tofiledate='', n=3, lineterm='\n'):
+    r"""
+    Compare two sequences of lines; generate the delta as a unified diff.
+    Unified diffs are a compact way of showing line changes and a few
+    lines of context.  The number of context lines is set by 'n' which
+    defaults to three.
+    By default, the diff control lines (those with ---, +++, or @@) are
+    created with a trailing newline.  This is helpful so that inputs
+    created from file.readlines() result in diffs that are suitable for
+    file.writelines() since both the inputs and outputs have trailing
+    newlines.
+    For inputs that do not have trailing newlines, set the lineterm
+    argument to "" so that the output will be uniformly newline free.
+    The unidiff format normally has a header for filenames and modification
+    times.  Any or all of these may be specified using strings for
+    'fromfile', 'tofile', 'fromfiledate', and 'tofiledate'.
+    The modification times are normally expressed in the ISO 8601 format.
+    Example:
+    >>> for line in unified_diff('one two three four'.split(),
+    ...             'zero one tree four'.split(), 'Original', 'Current',
+    ...             '2005-01-26 23:30:50', '2010-04-02 10:20:52',
+    ...             lineterm=''):
+    ...     print(line)                 # doctest: +NORMALIZE_WHITESPACE
+    --- Original        2005-01-26 23:30:50
+    +++ Current         2010-04-02 10:20:52
+    @@ -1,4 +1,4 @@
+    +zero
+     one
+    -two
+    -three
+    +tree
+     four
+    """
+    _check_types(a, b, fromfile, tofile, fromfiledate, tofiledate, lineterm)
+    started = False
+    for group in SequenceMatcher(None,a,b).get_grouped_opcodes(n):
+        if not started:
+            started = True
+            fromdate = '\t{}'.format(fromfiledate) if fromfiledate else ''
+            todate = '\t{}'.format(tofiledate) if tofiledate else ''
+            yield '--- {}{}{}'.format(fromfile, fromdate, lineterm)
+            yield '+++ {}{}{}'.format(tofile, todate, lineterm)
+        first, last = group[0], group[-1]
+        file1_range = _format_range_unified(first[1], last[2])
+        file2_range = _format_range_unified(first[3], last[4])
+        yield '@@ -{} +{} @@{}'.format(file1_range, file2_range, lineterm)
+        for tag, i1, i2, j1, j2 in group:
+            if tag == 'equal':
+                for line in a[i1:i2]:
+                    yield ' ' + line
+                continue
+            if tag in {'replace', 'delete'}:
+                for line in a[i1:i2]:
+                    yield '-' + line
+            if tag in {'replace', 'insert'}:
+                for line in b[j1:j2]:
+                    yield '+' + line
+########################################################################
+###  Context Diff
+########################################################################
+def _format_range_context(start, stop):
+    'Convert range to the "ed" format'
+    # Per the diff spec at http://www.unix.org/single_unix_specification/
+    beginning = start + 1     # lines start numbering with one
+    length = stop - start
+    if not length:
+        beginning -= 1        # empty ranges begin at line just before the range
+    if length <= 1:
+        return '{}'.format(beginning)
+    return '{},{}'.format(beginning, beginning + length - 1)
+# See http://www.unix.org/single_unix_specification/
+def context_diff(a, b, fromfile='', tofile='',
+                 fromfiledate='', tofiledate='', n=3, lineterm='\n'):
+    r"""
+    Compare two sequences of lines; generate the delta as a context diff.
+    Context diffs are a compact way of showing line changes and a few
+    lines of context.  The number of context lines is set by 'n' which
+    defaults to three.
+    By default, the diff control lines (those with *** or ---) are
+    created with a trailing newline.  This is helpful so that inputs
+    created from file.readlines() result in diffs that are suitable for
+    file.writelines() since both the inputs and outputs have trailing
+    newlines.
+    For inputs that do not have trailing newlines, set the lineterm
+    argument to "" so that the output will be uniformly newline free.
+    The context diff format normally has a header for filenames and
+    modification times.  Any or all of these may be specified using
+    strings for 'fromfile', 'tofile', 'fromfiledate', and 'tofiledate'.
+    The modification times are normally expressed in the ISO 8601 format.
+    If not specified, the strings default to blanks.
+    Example:
+    >>> print(''.join(context_diff('one\ntwo\nthree\nfour\n'.splitlines(True),
+    ...       'zero\none\ntree\nfour\n'.splitlines(True), 'Original', 'Current')),
+    ...       end="")
+    *** Original
+    --- Current
+    ***************
+    *** 1,4 ****
+      one
+    ! two
+    ! three
+      four
+    --- 1,4 ----
+    + zero
+      one
+    ! tree
+      four
+    """
+    _check_types(a, b, fromfile, tofile, fromfiledate, tofiledate, lineterm)
+    prefix = dict(insert='+ ', delete='- ', replace='! ', equal='  ')
+    started = False
+    for group in SequenceMatcher(None,a,b).get_grouped_opcodes(n):
+        if not started:
+            started = True
+            fromdate = '\t{}'.format(fromfiledate) if fromfiledate else ''
+            todate = '\t{}'.format(tofiledate) if tofiledate else ''
+            yield '*** {}{}{}'.format(fromfile, fromdate, lineterm)
+            yield '--- {}{}{}'.format(tofile, todate, lineterm)
+        first, last = group[0], group[-1]
+        yield '***************' + lineterm
+        file1_range = _format_range_context(first[1], last[2])
+        yield '*** {} ****{}'.format(file1_range, lineterm)
+        if any(tag in {'replace', 'delete'} for tag, _, _, _, _ in group):
+            for tag, i1, i2, _, _ in group:
+                if tag != 'insert':
+                    for line in a[i1:i2]:
+                        yield prefix[tag] + line
+        file2_range = _format_range_context(first[3], last[4])
+        yield '--- {} ----{}'.format(file2_range, lineterm)
+        if any(tag in {'replace', 'insert'} for tag, _, _, _, _ in group):
+            for tag, _, _, j1, j2 in group:
+                if tag != 'delete':
+                    for line in b[j1:j2]:
+                        yield prefix[tag] + line
+def _check_types(a, b, *args):
+    # Checking types is weird, but the alternative is garbled output when
+    # someone passes mixed bytes and str to {unified,context}_diff(). E.g.
+    # without this check, passing filenames as bytes results in output like
+    #   --- b'oldfile.txt'
+    #   +++ b'newfile.txt'
+    # because of how str.format() incorporates bytes objects.
+    if a and not isinstance(a[0], str):
+        raise TypeError('lines to compare must be str, not %s (%r)' %
+                        (type(a[0]).__name__, a[0]))
+    if b and not isinstance(b[0], str):
+        raise TypeError('lines to compare must be str, not %s (%r)' %
+                        (type(b[0]).__name__, b[0]))
+    if isinstance(a, str):
+        raise TypeError('input must be a sequence of strings, not %s' %
+                        type(a).__name__)
+    if isinstance(b, str):
+        raise TypeError('input must be a sequence of strings, not %s' %
+                        type(b).__name__)
+    for arg in args:
+        if not isinstance(arg, str):
+            raise TypeError('all arguments must be str, not: %r' % (arg,))
+def diff_bytes(dfunc, a, b, fromfile=b'', tofile=b'',
+               fromfiledate=b'', tofiledate=b'', n=3, lineterm=b'\n'):
+    r"""
+    Compare `a` and `b`, two sequences of lines represented as bytes rather
+    than str. This is a wrapper for `dfunc`, which is typically either
+    unified_diff() or context_diff(). Inputs are losslessly converted to
+    strings so that `dfunc` only has to worry about strings, and encoded
+    back to bytes on return. This is necessary to compare files with
+    unknown or inconsistent encoding. All other inputs (except `n`) must be
+    bytes rather than str.
+    """
+    def decode(s):
+        try:
+            return s.decode('ascii', 'surrogateescape')
+        except AttributeError as err:
+            msg = ('all arguments must be bytes, not %s (%r)' %
+                   (type(s).__name__, s))
+            raise TypeError(msg) from err
+    a = list(map(decode, a))
+    b = list(map(decode, b))
+    fromfile = decode(fromfile)
+    tofile = decode(tofile)
+    fromfiledate = decode(fromfiledate)
+    tofiledate = decode(tofiledate)
+    lineterm = decode(lineterm)
+    lines = dfunc(a, b, fromfile, tofile, fromfiledate, tofiledate, n, lineterm)
+    for line in lines:
+        yield line.encode('ascii', 'surrogateescape')
+def ndiff(a, b, linejunk=None, charjunk=IS_CHARACTER_JUNK):
+    r"""
+    Compare `a` and `b` (lists of strings); return a `Differ`-style delta.
+    Optional keyword parameters `linejunk` and `charjunk` are for filter
+    functions, or can be None:
+    - linejunk: A function that should accept a single string argument and
+      return true iff the string is junk.  The default is None, and is
+      recommended; the underlying SequenceMatcher class has an adaptive
+      notion of "noise" lines.
+    - charjunk: A function that accepts a character (string of length
+      1), and returns true iff the character is junk. The default is
+      the module-level function IS_CHARACTER_JUNK, which filters out
+      whitespace characters (a blank or tab; note: it's a bad idea to
+      include newline in this!).
+    Tools/scripts/ndiff.py is a command-line front-end to this function.
+    Example:
+    >>> diff = ndiff('one\ntwo\nthree\n'.splitlines(keepends=True),
+    ...              'ore\ntree\nemu\n'.splitlines(keepends=True))
+    >>> print(''.join(diff), end="")
+    - one
+    ?  ^
+    + ore
+    ?  ^
+    - two
+    - three
+    ?  -
+    + tree
+    + emu
+    """
+    return Differ(linejunk, charjunk).compare(a, b)
+def _mdiff(fromlines, tolines, context=None, linejunk=None,
+           charjunk=IS_CHARACTER_JUNK):
+    r"""Returns generator yielding marked up from/to side by side differences.
+    Arguments:
+    fromlines -- list of text lines to compared to tolines
+    tolines -- list of text lines to be compared to fromlines
+    context -- number of context lines to display on each side of difference,
+               if None, all from/to text lines will be generated.
+    linejunk -- passed on to ndiff (see ndiff documentation)
+    charjunk -- passed on to ndiff (see ndiff documentation)
+    This function returns an iterator which returns a tuple:
+    (from line tuple, to line tuple, boolean flag)
+    from/to line tuple -- (line num, line text)
+        line num -- integer or None (to indicate a context separation)
+        line text -- original line text with following markers inserted:
+            '\0+' -- marks start of added text
+            '\0-' -- marks start of deleted text
+            '\0^' -- marks start of changed text
+            '\1' -- marks end of added/deleted/changed text
+    boolean flag -- None indicates context separation, True indicates
+        either "from" or "to" line contains a change, otherwise False.
+    This function/iterator was originally developed to generate side by side
+    file difference for making HTML pages (see HtmlDiff class for example
+    usage).
+    Note, this function utilizes the ndiff function to generate the side by
+    side difference markup.  Optional ndiff arguments may be passed to this
+    function and they in turn will be passed to ndiff.
+    """
+    import re
+    # regular expression for finding intraline change indices
+    change_re = re.compile(r'(\++|\-+|\^+)')
+    # create the difference iterator to generate the differences
+    diff_lines_iterator = ndiff(fromlines,tolines,linejunk,charjunk)
+    def _make_line(lines, format_key, side, num_lines=[0,0]):
+        """Returns line of text with user's change markup and line formatting.
+        lines -- list of lines from the ndiff generator to produce a line of
+                 text from.  When producing the line of text to return, the
+                 lines used are removed from this list.
+        format_key -- '+' return first line in list with "add" markup around
+                          the entire line.
+                      '-' return first line in list with "delete" markup around
+                          the entire line.
+                      '?' return first line in list with add/delete/change
+                          intraline markup (indices obtained from second line)
+                      None return first line in list with no markup
+        side -- indice into the num_lines list (0=from,1=to)
+        num_lines -- from/to current line number.  This is NOT intended to be a
+                     passed parameter.  It is present as a keyword argument to
+                     maintain memory of the current line numbers between calls
+                     of this function.
+        Note, this function is purposefully not defined at the module scope so
+        that data it needs from its parent function (within whose context it
+        is defined) does not need to be of module scope.
+        """
+        num_lines[side] += 1
+        # Handle case where no user markup is to be added, just return line of
+        # text with user's line format to allow for usage of the line number.
+        if format_key is None:
+            return (num_lines[side],lines.pop(0)[2:])
+        # Handle case of intraline changes
+        if format_key == '?':
+            text, markers = lines.pop(0), lines.pop(0)
+            # find intraline changes (store change type and indices in tuples)
+            sub_info = []
+            def record_sub_info(match_object,sub_info=sub_info):
+                sub_info.append([match_object.group(1)[0],match_object.span()])
+                return match_object.group(1)
+            change_re.sub(record_sub_info,markers)
+            # process each tuple inserting our special marks that won't be
+            # noticed by an xml/html escaper.
+            for key,(begin,end) in reversed(sub_info):
+                text = text[0:begin]+'\0'+key+text[begin:end]+'\1'+text[end:]
+            text = text[2:]
+        # Handle case of add/delete entire line
+        else:
+            text = lines.pop(0)[2:]
+            # if line of text is just a newline, insert a space so there is
+            # something for the user to highlight and see.
+            if not text:
+                text = ' '
+            # insert marks that won't be noticed by an xml/html escaper.
+            text = '\0' + format_key + text + '\1'
+        # Return line of text, first allow user's line formatter to do its
+        # thing (such as adding the line number) then replace the special
+        # marks with what the user's change markup.
+        return (num_lines[side],text)
+    def _line_iterator():
+        """Yields from/to lines of text with a change indication.
+        This function is an iterator.  It itself pulls lines from a
+        differencing iterator, processes them and yields them.  When it can
+        it yields both a "from" and a "to" line, otherwise it will yield one
+        or the other.  In addition to yielding the lines of from/to text, a
+        boolean flag is yielded to indicate if the text line(s) have
+        differences in them.
+        Note, this function is purposefully not defined at the module scope so
+        that data it needs from its parent function (within whose context it
+        is defined) does not need to be of module scope.
+        """
+        lines = []
+        num_blanks_pending, num_blanks_to_yield = 0, 0
+        while True:
+            # Load up next 4 lines so we can look ahead, create strings which
+            # are a concatenation of the first character of each of the 4 lines
+            # so we can do some very readable comparisons.
+            while len(lines) < 4:
+                lines.append(next(diff_lines_iterator, 'X'))
+            s = ''.join([line[0] for line in lines])
+            if s.startswith('X'):
+                # When no more lines, pump out any remaining blank lines so the
+                # corresponding add/delete lines get a matching blank line so
+                # all line pairs get yielded at the next level.
+                num_blanks_to_yield = num_blanks_pending
+            elif s.startswith('-?+?'):
+                # simple intraline change
+                yield _make_line(lines,'?',0), _make_line(lines,'?',1), True
+                continue
+            elif s.startswith('--++'):
+                # in delete block, add block coming: we do NOT want to get
+                # caught up on blank lines yet, just process the delete line
+                num_blanks_pending -= 1
+                yield _make_line(lines,'-',0), None, True
+                continue
+            elif s.startswith(('--?+', '--+', '- ')):
+                # in delete block and see an intraline change or unchanged line
+                # coming: yield the delete line and then blanks
+                from_line,to_line = _make_line(lines,'-',0), None
+                num_blanks_to_yield,num_blanks_pending = num_blanks_pending-1,0
+            elif s.startswith('-+?'):
+                # intraline change
+                yield _make_line(lines,None,0), _make_line(lines,'?',1), True
+                continue
+            elif s.startswith('-?+'):
+                # intraline change
+                yield _make_line(lines,'?',0), _make_line(lines,None,1), True
+                continue
+            elif s.startswith('-'):
+                # delete FROM line
+                num_blanks_pending -= 1
+                yield _make_line(lines,'-',0), None, True
+                continue
+            elif s.startswith('+--'):
+                # in add block, delete block coming: we do NOT want to get
+                # caught up on blank lines yet, just process the add line
+                num_blanks_pending += 1
+                yield None, _make_line(lines,'+',1), True
+                continue
+            elif s.startswith(('+ ', '+-')):
+                # will be leaving an add block: yield blanks then add line
+                from_line, to_line = None, _make_line(lines,'+',1)
+                num_blanks_to_yield,num_blanks_pending = num_blanks_pending+1,0
+            elif s.startswith('+'):
+                # inside an add block, yield the add line
+                num_blanks_pending += 1
+                yield None, _make_line(lines,'+',1), True
+                continue
+            elif s.startswith(' '):
+                # unchanged text, yield it to both sides
+                yield _make_line(lines[:],None,0),_make_line(lines,None,1),False
+                continue
+            # Catch up on the blank lines so when we yield the next from/to
+            # pair, they are lined up.
+            while(num_blanks_to_yield < 0):
+                num_blanks_to_yield += 1
+                yield None,('','\n'),True
+            while(num_blanks_to_yield > 0):
+                num_blanks_to_yield -= 1
+                yield ('','\n'),None,True
+            if s.startswith('X'):
+                return
+            else:
+                yield from_line,to_line,True
+    def _line_pair_iterator():
+        """Yields from/to lines of text with a change indication.
+        This function is an iterator.  It itself pulls lines from the line
+        iterator.  Its difference from that iterator is that this function
+        always yields a pair of from/to text lines (with the change
+        indication).  If necessary it will collect single from/to lines
+        until it has a matching pair from/to pair to yield.
+        Note, this function is purposefully not defined at the module scope so
+        that data it needs from its parent function (within whose context it
+        is defined) does not need to be of module scope.
+        """
+        line_iterator = _line_iterator()
+        fromlines,tolines=[],[]
+        while True:
+            # Collecting lines of text until we have a from/to pair
+            while (len(fromlines)==0 or len(tolines)==0):
+                try:
+                    from_line, to_line, found_diff = next(line_iterator)
+                except StopIteration:
+                    return
+                if from_line is not None:
+                    fromlines.append((from_line,found_diff))
+                if to_line is not None:
+                    tolines.append((to_line,found_diff))
+            # Once we have a pair, remove them from the collection and yield it
+            from_line, fromDiff = fromlines.pop(0)
+            to_line, to_diff = tolines.pop(0)
+            yield (from_line,to_line,fromDiff or to_diff)
+    # Handle case where user does not want context differencing, just yield
+    # them up without doing anything else with them.
+    line_pair_iterator = _line_pair_iterator()
+    if context is None:
+        yield from line_pair_iterator
+    # Handle case where user wants context differencing.  We must do some
+    # storage of lines until we know for sure that they are to be yielded.
+    else:
+        context += 1
+        lines_to_write = 0
+        while True:
+            # Store lines up until we find a difference, note use of a
+            # circular queue because we only need to keep around what
+            # we need for context.
+            index, contextLines = 0, [None]*(context)
+            found_diff = False
+            while(found_diff is False):
+                try:
+                    from_line, to_line, found_diff = next(line_pair_iterator)
+                except StopIteration:
+                    return
+                i = index % context
+                contextLines[i] = (from_line, to_line, found_diff)
+                index += 1
+            # Yield lines that we have collected so far, but first yield
+            # the user's separator.
+            if index > context:
+                yield None, None, None
+                lines_to_write = context
+            else:
+                lines_to_write = index
+                index = 0
+            while(lines_to_write):
+                i = index % context
+                index += 1
+                yield contextLines[i]
+                lines_to_write -= 1
+            # Now yield the context lines after the change
+            lines_to_write = context-1
+            try:
+                while(lines_to_write):
+                    from_line, to_line, found_diff = next(line_pair_iterator)
+                    # If another change within the context, extend the context
+                    if found_diff:
+                        lines_to_write = context-1
+                    else:
+                        lines_to_write -= 1
+                    yield from_line, to_line, found_diff
+            except StopIteration:
+                # Catch exception from next() and return normally
+                return
+_file_template = """
+<!DOCTYPE html>
+<html lang="en">
+<head>
+    <meta charset="%(charset)s">
+    <meta name="viewport" content="width=device-width, initial-scale=1">
+    <title>Diff comparison</title>
+    <style>%(styles)s
+    </style>
+</head>
+<body>
+    %(table)s%(legend)s
+</body>
+</html>"""
+_styles = """
+        :root {color-scheme: light dark}
+        table.diff {
+            font-family: Menlo, Consolas, Monaco, Liberation Mono, Lucida Console, monospace;
+            border: medium;
+        }
+        .diff_header {
+            background-color: #e0e0e0;
+            font-weight: bold;
+        }
+        td.diff_header {
+            text-align: right;
+            padding: 0 8px;
+        }
+        .diff_next {
+            background-color: #c0c0c0;
+            padding: 4px 0;
+        }
+        .diff_add {background-color:palegreen}
+        .diff_chg {background-color:#ffff77}
+        .diff_sub {background-color:#ffaaaa}
+        table.diff[summary="Legends"] {
+            margin-top: 20px;
+            border: 1px solid #ccc;
+        }
+        table.diff[summary="Legends"] th {
+            background-color: #e0e0e0;
+            padding: 4px 8px;
+        }
+        table.diff[summary="Legends"] td {
+            padding: 4px 8px;
+        }
+        @media (prefers-color-scheme: dark) {
+            .diff_header {background-color:#666}
+            .diff_next {background-color:#393939}
+            .diff_add {background-color:darkgreen}
+            .diff_chg {background-color:#847415}
+            .diff_sub {background-color:darkred}
+            table.diff[summary="Legends"] {border-color:#555}
+            table.diff[summary="Legends"] th{background-color:#666}
+        }"""
+_table_template = """
+    <table class="diff" id="difflib_chg_%(prefix)s_top"
+           cellspacing="0" cellpadding="0" rules="groups" >
+        <colgroup></colgroup> <colgroup></colgroup> <colgroup></colgroup>
+        <colgroup></colgroup> <colgroup></colgroup> <colgroup></colgroup>
+        %(header_row)s
+        <tbody>
+%(data_rows)s        </tbody>
+    </table>"""
+_legend = """
+    <table class="diff" summary="Legends">
+        <tr> <th colspan="2"> Legends </th> </tr>
+        <tr> <td> <table border="" summary="Colors">
+                      <tr><th> Colors </th> </tr>
+                      <tr><td class="diff_add">&nbsp;Added&nbsp;</td></tr>
+                      <tr><td class="diff_chg">Changed</td> </tr>
+                      <tr><td class="diff_sub">Deleted</td> </tr>
+                  </table></td>
+             <td> <table border="" summary="Links">
+                      <tr><th colspan="2"> Links </th> </tr>
+                      <tr><td>(f)irst change</td> </tr>
+                      <tr><td>(n)ext change</td> </tr>
+                      <tr><td>(t)op</td> </tr>
+                  </table></td> </tr>
+    </table>"""
+class HtmlDiff(object):
+    """For producing HTML side by side comparison with change highlights.
+    This class can be used to create an HTML table (or a complete HTML file
+    containing the table) showing a side by side, line by line comparison
+    of text with inter-line and intra-line change highlights.  The table can
+    be generated in either full or contextual difference mode.
+    The following methods are provided for HTML generation:
+    make_table -- generates HTML for a single side by side table
+    make_file -- generates complete HTML file with a single side by side table
+    See Doc/includes/diff.py for an example usage of this class.
+    """
+    _file_template = _file_template
+    _styles = _styles
+    _table_template = _table_template
+    _legend = _legend
+    _default_prefix = 0
+    def __init__(self,tabsize=8,wrapcolumn=None,linejunk=None,
+                 charjunk=IS_CHARACTER_JUNK):
+        """HtmlDiff instance initializer
+        Arguments:
+        tabsize -- tab stop spacing, defaults to 8.
+        wrapcolumn -- column number where lines are broken and wrapped,
+            defaults to None where lines are not wrapped.
+        linejunk,charjunk -- keyword arguments passed into ndiff() (used by
+            HtmlDiff() to generate the side by side HTML differences).  See
+            ndiff() documentation for argument default values and descriptions.
+        """
+        self._tabsize = tabsize
+        self._wrapcolumn = wrapcolumn
+        self._linejunk = linejunk
+        self._charjunk = charjunk
+    def make_file(self, fromlines, tolines, fromdesc='', todesc='',
+                  context=False, numlines=5, *, charset='utf-8'):
+        """Returns HTML file of side by side comparison with change highlights
+        Arguments:
+        fromlines -- list of "from" lines
+        tolines -- list of "to" lines
+        fromdesc -- "from" file column header string
+        todesc -- "to" file column header string
+        context -- set to True for contextual differences (defaults to False
+            which shows full differences).
+        numlines -- number of context lines.  When context is set True,
+            controls number of lines displayed before and after the change.
+            When context is False, controls the number of lines to place
+            the "next" link anchors before the next change (so click of
+            "next" link jumps to just before the change).
+        charset -- charset of the HTML document
+        """
+        return (self._file_template % dict(
+            styles=self._styles,
+            legend=self._legend,
+            table=self.make_table(fromlines, tolines, fromdesc, todesc,
+                                  context=context, numlines=numlines),
+            charset=charset
+        )).encode(charset, 'xmlcharrefreplace').decode(charset)
+    def _tab_newline_replace(self,fromlines,tolines):
+        """Returns from/to line lists with tabs expanded and newlines removed.
+        Instead of tab characters being replaced by the number of spaces
+        needed to fill in to the next tab stop, this function will fill
+        the space with tab characters.  This is done so that the difference
+        algorithms can identify changes in a file when tabs are replaced by
+        spaces and vice versa.  At the end of the HTML generation, the tab
+        characters will be replaced with a nonbreakable space.
+        """
+        def expand_tabs(line):
+            # hide real spaces
+            line = line.replace(' ','\0')
+            # expand tabs into spaces
+            line = line.expandtabs(self._tabsize)
+            # replace spaces from expanded tabs back into tab characters
+            # (we'll replace them with markup after we do differencing)
+            line = line.replace(' ','\t')
+            return line.replace('\0',' ').rstrip('\n')
+        fromlines = [expand_tabs(line) for line in fromlines]
+        tolines = [expand_tabs(line) for line in tolines]
+        return fromlines,tolines
+    def _split_line(self,data_list,line_num,text):
+        """Builds list of text lines by splitting text lines at wrap point
+        This function will determine if the input text line needs to be
+        wrapped (split) into separate lines.  If so, the first wrap point
+        will be determined and the first line appended to the output
+        text line list.  This function is used recursively to handle
+        the second part of the split line to further split it.
+        """
+        # if blank line or context separator, just add it to the output list
+        if not line_num:
+            data_list.append((line_num,text))
+            return
+        # if line text doesn't need wrapping, just add it to the output list
+        size = len(text)
+        max = self._wrapcolumn
+        if (size <= max) or ((size -(text.count('\0')*3)) <= max):
+            data_list.append((line_num,text))
+            return
+        # scan text looking for the wrap point, keeping track if the wrap
+        # point is inside markers
+        i = 0
+        n = 0
+        mark = ''
+        while n < max and i < size:
+            if text[i] == '\0':
+                i += 1
+                mark = text[i]
+                i += 1
+            elif text[i] == '\1':
+                i += 1
+                mark = ''
+            else:
+                i += 1
+                n += 1
+        # wrap point is inside text, break it up into separate lines
+        line1 = text[:i]
+        line2 = text[i:]
+        # if wrap point is inside markers, place end marker at end of first
+        # line and start marker at beginning of second line because each
+        # line will have its own table tag markup around it.
+        if mark:
+            line1 = line1 + '\1'
+            line2 = '\0' + mark + line2
+        # tack on first line onto the output list
+        data_list.append((line_num,line1))
+        # use this routine again to wrap the remaining text
+        self._split_line(data_list,'>',line2)
+    def _line_wrapper(self,diffs):
+        """Returns iterator that splits (wraps) mdiff text lines"""
+        # pull from/to data and flags from mdiff iterator
+        for fromdata,todata,flag in diffs:
+            # check for context separators and pass them through
+            if flag is None:
+                yield fromdata,todata,flag
+                continue
+            (fromline,fromtext),(toline,totext) = fromdata,todata
+            # for each from/to line split it at the wrap column to form
+            # list of text lines.
+            fromlist,tolist = [],[]
+            self._split_line(fromlist,fromline,fromtext)
+            self._split_line(tolist,toline,totext)
+            # yield from/to line in pairs inserting blank lines as
+            # necessary when one side has more wrapped lines
+            while fromlist or tolist:
+                if fromlist:
+                    fromdata = fromlist.pop(0)
+                else:
+                    fromdata = ('',' ')
+                if tolist:
+                    todata = tolist.pop(0)
+                else:
+                    todata = ('',' ')
+                yield fromdata,todata,flag
+    def _collect_lines(self,diffs):
+        """Collects mdiff output into separate lists
+        Before storing the mdiff from/to data into a list, it is converted
+        into a single line of text with HTML markup.
+        """
+        fromlist,tolist,flaglist = [],[],[]
+        # pull from/to data and flags from mdiff style iterator
+        for fromdata,todata,flag in diffs:
+            try:
+                # store HTML markup of the lines into the lists
+                fromlist.append(self._format_line(0,flag,*fromdata))
+                tolist.append(self._format_line(1,flag,*todata))
+            except TypeError:
+                # exceptions occur for lines where context separators go
+                fromlist.append(None)
+                tolist.append(None)
+            flaglist.append(flag)
+        return fromlist,tolist,flaglist
+    def _format_line(self,side,flag,linenum,text):
+        """Returns HTML markup of "from" / "to" text lines
+        side -- 0 or 1 indicating "from" or "to" text
+        flag -- indicates if difference on line
+        linenum -- line number (used for line number column)
+        text -- line text to be marked up
+        """
+        try:
+            linenum = '%d' % linenum
+            id = ' id="%s%s"' % (self._prefix[side],linenum)
+        except TypeError:
+            # handle blank lines where linenum is '>' or ''
+            id = ''
+        # replace those things that would get confused with HTML symbols
+        text=text.replace("&","&amp;").replace(">","&gt;").replace("<","&lt;")
+        # make space non-breakable so they don't get compressed or line wrapped
+        text = text.replace(' ','&nbsp;').rstrip()
+        return '<td class="diff_header"%s>%s</td><td nowrap="nowrap">%s</td>' \
+               % (id,linenum,text)
+    def _make_prefix(self):
+        """Create unique anchor prefixes"""
+        # Generate a unique anchor prefix so multiple tables
+        # can exist on the same HTML page without conflicts.
+        fromprefix = "from%d_" % HtmlDiff._default_prefix
+        toprefix = "to%d_" % HtmlDiff._default_prefix
+        HtmlDiff._default_prefix += 1
+        # store prefixes so line format method has access
+        self._prefix = [fromprefix,toprefix]
+    def _convert_flags(self,fromlist,tolist,flaglist,context,numlines):
+        """Makes list of "next" links"""
+        # all anchor names will be generated using the unique "to" prefix
+        toprefix = self._prefix[1]
+        # process change flags, generating middle column of next anchors/links
+        next_id = ['']*len(flaglist)
+        next_href = ['']*len(flaglist)
+        num_chg, in_change = 0, False
+        last = 0
+        for i,flag in enumerate(flaglist):
+            if flag:
+                if not in_change:
+                    in_change = True
+                    last = i
+                    # at the beginning of a change, drop an anchor a few lines
+                    # (the context lines) before the change for the previous
+                    # link
+                    i = max([0,i-numlines])
+                    next_id[i] = ' id="difflib_chg_%s_%d"' % (toprefix,num_chg)
+                    # at the beginning of a change, drop a link to the next
+                    # change
+                    num_chg += 1
+                    next_href[last] = '<a href="#difflib_chg_%s_%d">n</a>' % (
+                         toprefix,num_chg)
+            else:
+                in_change = False
+        # check for cases where there is no content to avoid exceptions
+        if not flaglist:
+            flaglist = [False]
+            next_id = ['']
+            next_href = ['']
+            last = 0
+            if context:
+                fromlist = ['<td></td><td>&nbsp;No Differences Found&nbsp;</td>']
+                tolist = fromlist
+            else:
+                fromlist = tolist = ['<td></td><td>&nbsp;Empty File&nbsp;</td>']
+        # if not a change on first line, drop a link
+        if not flaglist[0]:
+            next_href[0] = '<a href="#difflib_chg_%s_0">f</a>' % toprefix
+        # redo the last link to link to the top
+        next_href[last] = '<a href="#difflib_chg_%s_top">t</a>' % (toprefix)
+        return fromlist,tolist,flaglist,next_href,next_id
+    def make_table(self,fromlines,tolines,fromdesc='',todesc='',context=False,
+                   numlines=5):
+        """Returns HTML table of side by side comparison with change highlights
+        Arguments:
+        fromlines -- list of "from" lines
+        tolines -- list of "to" lines
+        fromdesc -- "from" file column header string
+        todesc -- "to" file column header string
+        context -- set to True for contextual differences (defaults to False
+            which shows full differences).
+        numlines -- number of context lines.  When context is set True,
+            controls number of lines displayed before and after the change.
+            When context is False, controls the number of lines to place
+            the "next" link anchors before the next change (so click of
+            "next" link jumps to just before the change).
+        """
+        # make unique anchor prefixes so that multiple tables may exist
+        # on the same page without conflict.
+        self._make_prefix()
+        # change tabs to spaces before it gets more difficult after we insert
+        # markup
+        fromlines,tolines = self._tab_newline_replace(fromlines,tolines)
+        # create diffs iterator which generates side by side from/to data
+        if context:
+            context_lines = numlines
+        else:
+            context_lines = None
+        diffs = _mdiff(fromlines,tolines,context_lines,linejunk=self._linejunk,
+                      charjunk=self._charjunk)
+        # set up iterator to wrap lines that exceed desired width
+        if self._wrapcolumn:
+            diffs = self._line_wrapper(diffs)
+        # collect up from/to lines and flags into lists (also format the lines)
+        fromlist,tolist,flaglist = self._collect_lines(diffs)
+        # process change flags, generating middle column of next anchors/links
+        fromlist,tolist,flaglist,next_href,next_id = self._convert_flags(
+            fromlist,tolist,flaglist,context,numlines)
+        s = []
+        fmt = '            <tr><td class="diff_next"%s>%s</td>%s' + \
+              '<td class="diff_next">%s</td>%s</tr>\n'
+        for i in range(len(flaglist)):
+            if flaglist[i] is None:
+                # mdiff yields None on separator lines skip the bogus ones
+                # generated for the first line
+                if i > 0:
+                    s.append('        </tbody>        \n        <tbody>\n')
+            else:
+                s.append( fmt % (next_id[i],next_href[i],fromlist[i],
+                                           next_href[i],tolist[i]))
+        if fromdesc or todesc:
+            header_row = '<thead><tr>%s%s%s%s</tr></thead>' % (
+                '<th class="diff_next"><br /></th>',
+                '<th colspan="2" class="diff_header">%s</th>' % fromdesc,
+                '<th class="diff_next"><br /></th>',
+                '<th colspan="2" class="diff_header">%s</th>' % todesc)
+        else:
+            header_row = ''
+        table = self._table_template % dict(
+            data_rows=''.join(s),
+            header_row=header_row,
+            prefix=self._prefix[1])
+        return table.replace('\0+','<span class="diff_add">'). \
+                     replace('\0-','<span class="diff_sub">'). \
+                     replace('\0^','<span class="diff_chg">'). \
+                     replace('\1','</span>'). \
+                     replace('\t','&nbsp;')
+def restore(delta, which):
+    r"""
+    Generate one of the two sequences that generated a delta.
+    Given a `delta` produced by `Differ.compare()` or `ndiff()`, extract
+    lines originating from file 1 or 2 (parameter `which`), stripping off line
+    prefixes.
+    Examples:
+    >>> diff = ndiff('one\ntwo\nthree\n'.splitlines(keepends=True),
+    ...              'ore\ntree\nemu\n'.splitlines(keepends=True))
+    >>> diff = list(diff)
+    >>> print(''.join(restore(diff, 1)), end="")
+    one
+    two
+    three
+    >>> print(''.join(restore(diff, 2)), end="")
+    ore
+    tree
+    emu
+    """
+    try:
+        tag = {1: "- ", 2: "+ "}[int(which)]
+    except KeyError:
+        raise ValueError('unknown delta choice (must be 1 or 2): %r'
+                           % which) from None
+    prefixes = ("  ", tag)
+    for line in delta:
+        if line[:2] in prefixes:
+            yield line[2:]

Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml/html/formfill.py ADDED Viewed

	@@ -0,0 +1,299 @@

+from lxml.etree import XPath, ElementBase
+from lxml.html import fromstring, XHTML_NAMESPACE
+from lxml.html import _forms_xpath, _options_xpath, _nons, _transform_result
+from lxml.html import defs
+import copy
+try:
+    basestring
+except NameError:
+    # Python 3
+    basestring = str
+__all__ = ['FormNotFound', 'fill_form', 'fill_form_html',
+           'insert_errors', 'insert_errors_html',
+           'DefaultErrorCreator']
+class FormNotFound(LookupError):
+    """
+    Raised when no form can be found
+    """
+_form_name_xpath = XPath('descendant-or-self::form[name=$name]|descendant-or-self::x:form[name=$name]', namespaces={'x':XHTML_NAMESPACE})
+_input_xpath = XPath('|'.join(['descendant-or-self::'+_tag for _tag in ('input','select','textarea','x:input','x:select','x:textarea')]),
+                               namespaces={'x':XHTML_NAMESPACE})
+_label_for_xpath = XPath('//label[@for=$for_id]|//x:label[@for=$for_id]',
+                               namespaces={'x':XHTML_NAMESPACE})
+_name_xpath = XPath('descendant-or-self::*[@name=$name]')
+def fill_form(
+    el,
+    values,
+    form_id=None,
+    form_index=None,
+    ):
+    el = _find_form(el, form_id=form_id, form_index=form_index)
+    _fill_form(el, values)
+def fill_form_html(html, values, form_id=None, form_index=None):
+    result_type = type(html)
+    if isinstance(html, basestring):
+        doc = fromstring(html)
+    else:
+        doc = copy.deepcopy(html)
+    fill_form(doc, values, form_id=form_id, form_index=form_index)
+    return _transform_result(result_type, doc)
+def _fill_form(el, values):
+    counts = {}
+    if hasattr(values, 'mixed'):
+        # For Paste request parameters
+        values = values.mixed()
+    inputs = _input_xpath(el)
+    for input in inputs:
+        name = input.get('name')
+        if not name:
+            continue
+        if _takes_multiple(input):
+            value = values.get(name, [])
+            if not isinstance(value, (list, tuple)):
+                value = [value]
+            _fill_multiple(input, value)
+        elif name not in values:
+            continue
+        else:
+            index = counts.get(name, 0)
+            counts[name] = index + 1
+            value = values[name]
+            if isinstance(value, (list, tuple)):
+                try:
+                    value = value[index]
+                except IndexError:
+                    continue
+            elif index > 0:
+                continue
+            _fill_single(input, value)
+def _takes_multiple(input):
+    if _nons(input.tag) == 'select' and input.get('multiple'):
+        # FIXME: multiple="0"?
+        return True
+    type = input.get('type', '').lower()
+    if type in ('radio', 'checkbox'):
+        return True
+    return False
+def _fill_multiple(input, value):
+    type = input.get('type', '').lower()
+    if type == 'checkbox':
+        v = input.get('value')
+        if v is None:
+            if not value:
+                result = False
+            else:
+                result = value[0]
+                if isinstance(value, basestring):
+                    # The only valid "on" value for an unnamed checkbox is 'on'
+                    result = result == 'on'
+            _check(input, result)
+        else:
+            _check(input, v in value)
+    elif type == 'radio':
+        v = input.get('value')
+        _check(input, v in value)
+    else:
+        assert _nons(input.tag) == 'select'
+        for option in _options_xpath(input):
+            v = option.get('value')
+            if v is None:
+                # This seems to be the default, at least on IE
+                # FIXME: but I'm not sure
+                v = option.text_content()
+            _select(option, v in value)
+def _check(el, check):
+    if check:
+        el.set('checked', '')
+    else:
+        if 'checked' in el.attrib:
+            del el.attrib['checked']
+def _select(el, select):
+    if select:
+        el.set('selected', '')
+    else:
+        if 'selected' in el.attrib:
+            del el.attrib['selected']
+def _fill_single(input, value):
+    if _nons(input.tag) == 'textarea':
+        input.text = value
+    else:
+        input.set('value', value)
+def _find_form(el, form_id=None, form_index=None):
+    if form_id is None and form_index is None:
+        forms = _forms_xpath(el)
+        for form in forms:
+            return form
+        raise FormNotFound(
+            "No forms in page")
+    if form_id is not None:
+        form = el.get_element_by_id(form_id)
+        if form is not None:
+            return form
+        forms = _form_name_xpath(el, name=form_id)
+        if forms:
+            return forms[0]
+        else:
+            raise FormNotFound(
+                "No form with the name or id of %r (forms: %s)"
+                % (id, ', '.join(_find_form_ids(el))))
+    if form_index is not None:
+        forms = _forms_xpath(el)
+        try:
+            return forms[form_index]
+        except IndexError:
+            raise FormNotFound(
+                "There is no form with the index %r (%i forms found)"
+                % (form_index, len(forms)))
+def _find_form_ids(el):
+    forms = _forms_xpath(el)
+    if not forms:
+        yield '(no forms)'
+        return
+    for index, form in enumerate(forms):
+        if form.get('id'):
+            if form.get('name'):
+                yield '%s or %s' % (form.get('id'),
+                                     form.get('name'))
+            else:
+                yield form.get('id')
+        elif form.get('name'):
+            yield form.get('name')
+        else:
+            yield '(unnamed form %s)' % index
+############################################################
+## Error filling
+############################################################
+class DefaultErrorCreator:
+    insert_before = True
+    block_inside = True
+    error_container_tag = 'div'
+    error_message_class = 'error-message'
+    error_block_class = 'error-block'
+    default_message = "Invalid"
+    def __init__(self, **kw):
+        for name, value in kw.items():
+            if not hasattr(self, name):
+                raise TypeError(
+                    "Unexpected keyword argument: %s" % name)
+            setattr(self, name, value)
+    def __call__(self, el, is_block, message):
+        error_el = el.makeelement(self.error_container_tag)
+        if self.error_message_class:
+            error_el.set('class', self.error_message_class)
+        if is_block and self.error_block_class:
+            error_el.set('class', error_el.get('class', '')+' '+self.error_block_class)
+        if message is None or message == '':
+            message = self.default_message
+        if isinstance(message, ElementBase):
+            error_el.append(message)
+        else:
+            assert isinstance(message, basestring), (
+                "Bad message; should be a string or element: %r" % message)
+            error_el.text = message or self.default_message
+        if is_block and self.block_inside:
+            if self.insert_before:
+                error_el.tail = el.text
+                el.text = None
+                el.insert(0, error_el)
+            else:
+                el.append(error_el)
+        else:
+            parent = el.getparent()
+            pos = parent.index(el)
+            if self.insert_before:
+                parent.insert(pos, error_el)
+            else:
+                error_el.tail = el.tail
+                el.tail = None
+                parent.insert(pos+1, error_el)
+default_error_creator = DefaultErrorCreator()
+def insert_errors(
+    el,
+    errors,
+    form_id=None,
+    form_index=None,
+    error_class="error",
+    error_creator=default_error_creator,
+    ):
+    el = _find_form(el, form_id=form_id, form_index=form_index)
+    for name, error in errors.items():
+        if error is None:
+            continue
+        for error_el, message in _find_elements_for_name(el, name, error):
+            assert isinstance(message, (basestring, type(None), ElementBase)), (
+                "Bad message: %r" % message)
+            _insert_error(error_el, message, error_class, error_creator)
+def insert_errors_html(html, values, **kw):
+    result_type = type(html)
+    if isinstance(html, basestring):
+        doc = fromstring(html)
+    else:
+        doc = copy.deepcopy(html)
+    insert_errors(doc, values, **kw)
+    return _transform_result(result_type, doc)
+def _insert_error(el, error, error_class, error_creator):
+    if _nons(el.tag) in defs.empty_tags or _nons(el.tag) == 'textarea':
+        is_block = False
+    else:
+        is_block = True
+    if _nons(el.tag) != 'form' and error_class:
+        _add_class(el, error_class)
+    if el.get('id'):
+        labels = _label_for_xpath(el, for_id=el.get('id'))
+        if labels:
+            for label in labels:
+                _add_class(label, error_class)
+    error_creator(el, is_block, error)
+def _add_class(el, class_name):
+    if el.get('class'):
+        el.set('class', el.get('class')+' '+class_name)
+    else:
+        el.set('class', class_name)
+def _find_elements_for_name(form, name, error):
+    if name is None:
+        # An error for the entire form
+        yield form, error
+        return
+    if name.startswith('#'):
+        # By id
+        el = form.get_element_by_id(name[1:])
+        if el is not None:
+            yield el, error
+        return
+    els = _name_xpath(form, name=name)
+    if not els:
+        # FIXME: should this raise an exception?
+        return
+    if not isinstance(error, (list, tuple)):
+        yield els[0], error
+        return
+    # FIXME: if error is longer than els, should it raise an error?
+    for el, err in zip(els, error):
+        if err is None:
+            continue
+        yield el, err

Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml/html/html5parser.py ADDED Viewed

	@@ -0,0 +1,260 @@

+"""
+An interface to html5lib that mimics the lxml.html interface.
+"""
+import sys
+import string
+from html5lib import HTMLParser as _HTMLParser
+from html5lib.treebuilders.etree_lxml import TreeBuilder
+from lxml import etree
+from lxml.html import Element, XHTML_NAMESPACE, _contains_block_level_tag
+# python3 compatibility
+try:
+    _strings = basestring
+except NameError:
+    _strings = (bytes, str)
+try:
+    from urllib2 import urlopen
+except ImportError:
+    from urllib.request import urlopen
+try:
+    from urlparse import urlparse
+except ImportError:
+    from urllib.parse import urlparse
+class HTMLParser(_HTMLParser):
+    """An html5lib HTML parser with lxml as tree."""
+    def __init__(self, strict=False, **kwargs):
+        _HTMLParser.__init__(self, strict=strict, tree=TreeBuilder, **kwargs)
+try:
+    from html5lib import XHTMLParser as _XHTMLParser
+except ImportError:
+    pass
+else:
+    class XHTMLParser(_XHTMLParser):
+        """An html5lib XHTML Parser with lxml as tree."""
+        def __init__(self, strict=False, **kwargs):
+            _XHTMLParser.__init__(self, strict=strict, tree=TreeBuilder, **kwargs)
+    xhtml_parser = XHTMLParser()
+def _find_tag(tree, tag):
+    elem = tree.find(tag)
+    if elem is not None:
+        return elem
+    return tree.find('{%s}%s' % (XHTML_NAMESPACE, tag))
+def document_fromstring(html, guess_charset=None, parser=None):
+    """
+    Parse a whole document into a string.
+    If `guess_charset` is true, or if the input is not Unicode but a
+    byte string, the `chardet` library will perform charset guessing
+    on the string.
+    """
+    if not isinstance(html, _strings):
+        raise TypeError('string required')
+    if parser is None:
+        parser = html_parser
+    options = {}
+    if guess_charset is None and isinstance(html, bytes):
+        # html5lib does not accept useChardet as an argument, if it
+        # detected the html argument would produce unicode objects.
+        guess_charset = True
+    if guess_charset is not None:
+        options['useChardet'] = guess_charset
+    return parser.parse(html, **options).getroot()
+def fragments_fromstring(html, no_leading_text=False,
+                         guess_charset=None, parser=None):
+    """Parses several HTML elements, returning a list of elements.
+    The first item in the list may be a string.  If no_leading_text is true,
+    then it will be an error if there is leading text, and it will always be
+    a list of only elements.
+    If `guess_charset` is true, the `chardet` library will perform charset
+    guessing on the string.
+    """
+    if not isinstance(html, _strings):
+        raise TypeError('string required')
+    if parser is None:
+        parser = html_parser
+    options = {}
+    if guess_charset is None and isinstance(html, bytes):
+        # html5lib does not accept useChardet as an argument, if it
+        # detected the html argument would produce unicode objects.
+        guess_charset = False
+    if guess_charset is not None:
+        options['useChardet'] = guess_charset
+    children = parser.parseFragment(html, 'div', **options)
+    if children and isinstance(children[0], _strings):
+        if no_leading_text:
+            if children[0].strip():
+                raise etree.ParserError('There is leading text: %r' %
+                                        children[0])
+            del children[0]
+    return children
+def fragment_fromstring(html, create_parent=False,
+                        guess_charset=None, parser=None):
+    """Parses a single HTML element; it is an error if there is more than
+    one element, or if anything but whitespace precedes or follows the
+    element.
+    If 'create_parent' is true (or is a tag name) then a parent node
+    will be created to encapsulate the HTML in a single element.  In
+    this case, leading or trailing text is allowed.
+    If `guess_charset` is true, the `chardet` library will perform charset
+    guessing on the string.
+    """
+    if not isinstance(html, _strings):
+        raise TypeError('string required')
+    accept_leading_text = bool(create_parent)
+    elements = fragments_fromstring(
+        html, guess_charset=guess_charset, parser=parser,
+        no_leading_text=not accept_leading_text)
+    if create_parent:
+        if not isinstance(create_parent, _strings):
+            create_parent = 'div'
+        new_root = Element(create_parent)
+        if elements:
+            if isinstance(elements[0], _strings):
+                new_root.text = elements[0]
+                del elements[0]
+            new_root.extend(elements)
+        return new_root
+    if not elements:
+        raise etree.ParserError('No elements found')
+    if len(elements) > 1:
+        raise etree.ParserError('Multiple elements found')
+    result = elements[0]
+    if result.tail and result.tail.strip():
+        raise etree.ParserError('Element followed by text: %r' % result.tail)
+    result.tail = None
+    return result
+def fromstring(html, guess_charset=None, parser=None):
+    """Parse the html, returning a single element/document.
+    This tries to minimally parse the chunk of text, without knowing if it
+    is a fragment or a document.
+    'base_url' will set the document's base_url attribute (and the tree's
+    docinfo.URL)
+    If `guess_charset` is true, or if the input is not Unicode but a
+    byte string, the `chardet` library will perform charset guessing
+    on the string.
+    """
+    if not isinstance(html, _strings):
+        raise TypeError('string required')
+    doc = document_fromstring(html, parser=parser,
+                              guess_charset=guess_charset)
+    # document starts with doctype or <html>, full document!
+    start = html[:50]
+    if isinstance(start, bytes):
+        # Allow text comparison in python3.
+        # Decode as ascii, that also covers latin-1 and utf-8 for the
+        # characters we need.
+        start = start.decode('ascii', 'replace')
+    start = start.lstrip().lower()
+    if start.startswith('<html') or start.startswith('<!doctype'):
+        return doc
+    head = _find_tag(doc, 'head')
+    # if the head is not empty we have a full document
+    if len(head):
+        return doc
+    body = _find_tag(doc, 'body')
+    # The body has just one element, so it was probably a single
+    # element passed in
+    if (len(body) == 1 and (not body.text or not body.text.strip())
+        and (not body[-1].tail or not body[-1].tail.strip())):
+        return body[0]
+    # Now we have a body which represents a bunch of tags which have the
+    # content that was passed in.  We will create a fake container, which
+    # is the body tag, except <body> implies too much structure.
+    if _contains_block_level_tag(body):
+        body.tag = 'div'
+    else:
+        body.tag = 'span'
+    return body
+def parse(filename_url_or_file, guess_charset=None, parser=None):
+    """Parse a filename, URL, or file-like object into an HTML document
+    tree.  Note: this returns a tree, not an element.  Use
+    ``parse(...).getroot()`` to get the document root.
+    If ``guess_charset`` is true, the ``useChardet`` option is passed into
+    html5lib to enable character detection.  This option is on by default
+    when parsing from URLs, off by default when parsing from file(-like)
+    objects (which tend to return Unicode more often than not), and on by
+    default when parsing from a file path (which is read in binary mode).
+    """
+    if parser is None:
+        parser = html_parser
+    if not isinstance(filename_url_or_file, _strings):
+        fp = filename_url_or_file
+        if guess_charset is None:
+            # assume that file-like objects return Unicode more often than bytes
+            guess_charset = False
+    elif _looks_like_url(filename_url_or_file):
+        fp = urlopen(filename_url_or_file)
+        if guess_charset is None:
+            # assume that URLs return bytes
+            guess_charset = True
+    else:
+        fp = open(filename_url_or_file, 'rb')
+        if guess_charset is None:
+            guess_charset = True
+    options = {}
+    # html5lib does not accept useChardet as an argument, if it
+    # detected the html argument would produce unicode objects.
+    if guess_charset:
+        options['useChardet'] = guess_charset
+    return parser.parse(fp, **options)
+def _looks_like_url(str):
+    scheme = urlparse(str)[0]
+    if not scheme:
+        return False
+    elif (sys.platform == 'win32' and
+            scheme in string.ascii_letters
+            and len(scheme) == 1):
+        # looks like a 'normal' absolute path
+        return False
+    else:
+        return True
+html_parser = HTMLParser()

Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml/includes/__init__.pxd ADDED Viewed

File without changes

Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml/includes/config.pxd ADDED Viewed

	@@ -0,0 +1,3 @@

+cdef extern from "etree_defs.h":
+    cdef bint ENABLE_THREADING
+    cdef bint ENABLE_SCHEMATRON

Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml/includes/relaxng.pxd ADDED Viewed

	@@ -0,0 +1,64 @@

+from lxml.includes.tree cimport xmlDoc
+from lxml.includes.xmlerror cimport xmlStructuredErrorFunc
+cdef extern from "libxml/relaxng.h" nogil:
+    ctypedef struct xmlRelaxNG
+    ctypedef struct xmlRelaxNGParserCtxt
+    ctypedef struct xmlRelaxNGValidCtxt
+    ctypedef enum xmlRelaxNGValidErr:
+        XML_RELAXNG_OK = 0
+        XML_RELAXNG_ERR_MEMORY = 1
+        XML_RELAXNG_ERR_TYPE = 2
+        XML_RELAXNG_ERR_TYPEVAL = 3
+        XML_RELAXNG_ERR_DUPID = 4
+        XML_RELAXNG_ERR_TYPECMP = 5
+        XML_RELAXNG_ERR_NOSTATE = 6
+        XML_RELAXNG_ERR_NODEFINE = 7
+        XML_RELAXNG_ERR_LISTEXTRA = 8
+        XML_RELAXNG_ERR_LISTEMPTY = 9
+        XML_RELAXNG_ERR_INTERNODATA = 10
+        XML_RELAXNG_ERR_INTERSEQ = 11
+        XML_RELAXNG_ERR_INTEREXTRA = 12
+        XML_RELAXNG_ERR_ELEMNAME = 13
+        XML_RELAXNG_ERR_ATTRNAME = 14
+        XML_RELAXNG_ERR_ELEMNONS = 15
+        XML_RELAXNG_ERR_ATTRNONS = 16
+        XML_RELAXNG_ERR_ELEMWRONGNS = 17
+        XML_RELAXNG_ERR_ATTRWRONGNS = 18
+        XML_RELAXNG_ERR_ELEMEXTRANS = 19
+        XML_RELAXNG_ERR_ATTREXTRANS = 20
+        XML_RELAXNG_ERR_ELEMNOTEMPTY = 21
+        XML_RELAXNG_ERR_NOELEM = 22
+        XML_RELAXNG_ERR_NOTELEM = 23
+        XML_RELAXNG_ERR_ATTRVALID = 24
+        XML_RELAXNG_ERR_CONTENTVALID = 25
+        XML_RELAXNG_ERR_EXTRACONTENT = 26
+        XML_RELAXNG_ERR_INVALIDATTR = 27
+        XML_RELAXNG_ERR_DATAELEM = 28
+        XML_RELAXNG_ERR_VALELEM = 29
+        XML_RELAXNG_ERR_LISTELEM = 30
+        XML_RELAXNG_ERR_DATATYPE = 31
+        XML_RELAXNG_ERR_VALUE = 32
+        XML_RELAXNG_ERR_LIST = 33
+        XML_RELAXNG_ERR_NOGRAMMAR = 34
+        XML_RELAXNG_ERR_EXTRADATA = 35
+        XML_RELAXNG_ERR_LACKDATA = 36
+        XML_RELAXNG_ERR_INTERNAL = 37
+        XML_RELAXNG_ERR_ELEMWRONG = 38
+        XML_RELAXNG_ERR_TEXTWRONG = 39
+    cdef xmlRelaxNGValidCtxt* xmlRelaxNGNewValidCtxt(xmlRelaxNG* schema)
+    cdef int xmlRelaxNGValidateDoc(xmlRelaxNGValidCtxt* ctxt, xmlDoc* doc)
+    cdef xmlRelaxNG* xmlRelaxNGParse(xmlRelaxNGParserCtxt* ctxt)
+    cdef xmlRelaxNGParserCtxt* xmlRelaxNGNewParserCtxt(char* URL)
+    cdef xmlRelaxNGParserCtxt* xmlRelaxNGNewDocParserCtxt(xmlDoc* doc)
+    cdef void xmlRelaxNGFree(xmlRelaxNG* schema)
+    cdef void xmlRelaxNGFreeParserCtxt(xmlRelaxNGParserCtxt* ctxt)
+    cdef void xmlRelaxNGFreeValidCtxt(xmlRelaxNGValidCtxt* ctxt)
+    cdef void xmlRelaxNGSetValidStructuredErrors(
+        xmlRelaxNGValidCtxt* ctxt, xmlStructuredErrorFunc serror, void *ctx)
+    cdef void xmlRelaxNGSetParserStructuredErrors(
+        xmlRelaxNGParserCtxt* ctxt, xmlStructuredErrorFunc serror, void *ctx)

Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml/includes/schematron.pxd ADDED Viewed

	@@ -0,0 +1,34 @@

+from lxml.includes cimport xmlerror
+from lxml.includes.tree cimport xmlDoc
+cdef extern from "libxml/schematron.h" nogil:
+    ctypedef struct xmlSchematron
+    ctypedef struct xmlSchematronParserCtxt
+    ctypedef struct xmlSchematronValidCtxt
+    ctypedef enum xmlSchematronValidOptions:
+        XML_SCHEMATRON_OUT_QUIET     =    1 # quiet no report
+        XML_SCHEMATRON_OUT_TEXT      =    2 # build a textual report
+        XML_SCHEMATRON_OUT_XML       =    4 # output SVRL
+        XML_SCHEMATRON_OUT_ERROR     =    8 # output via xmlStructuredErrorFunc
+        XML_SCHEMATRON_OUT_FILE      =  256 # output to a file descriptor
+        XML_SCHEMATRON_OUT_BUFFER    =  512 # output to a buffer
+        XML_SCHEMATRON_OUT_IO        = 1024 # output to I/O mechanism
+    cdef xmlSchematronParserCtxt* xmlSchematronNewDocParserCtxt(
+        xmlDoc* doc)
+    cdef xmlSchematronParserCtxt* xmlSchematronNewParserCtxt(
+        char* filename) nogil
+    cdef xmlSchematronValidCtxt* xmlSchematronNewValidCtxt(
+        xmlSchematron* schema, int options)
+    cdef xmlSchematron* xmlSchematronParse(xmlSchematronParserCtxt* ctxt)
+    cdef int xmlSchematronValidateDoc(xmlSchematronValidCtxt* ctxt,
+                                      xmlDoc* instance)
+    cdef void xmlSchematronFreeParserCtxt(xmlSchematronParserCtxt* ctxt)
+    cdef void xmlSchematronFreeValidCtxt(xmlSchematronValidCtxt* ctxt)
+    cdef void xmlSchematronFree(xmlSchematron* schema)
+    cdef void xmlSchematronSetValidStructuredErrors(
+        xmlSchematronValidCtxt* ctxt,
+        xmlerror.xmlStructuredErrorFunc error_func, void *data)

Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml/includes/xpath.pxd ADDED Viewed

	@@ -0,0 +1,136 @@

+from lxml.includes cimport tree
+from lxml.includes cimport xmlerror
+from libc.string cimport const_char
+from lxml.includes.tree cimport xmlChar, const_xmlChar
+cdef extern from "libxml/xpath.h" nogil:
+    ctypedef enum xmlXPathObjectType:
+        XPATH_UNDEFINED = 0
+        XPATH_NODESET = 1
+        XPATH_BOOLEAN = 2
+        XPATH_NUMBER = 3
+        XPATH_STRING = 4
+        XPATH_POINT = 5
+        XPATH_RANGE = 6
+        XPATH_LOCATIONSET = 7
+        XPATH_USERS = 8
+        XPATH_XSLT_TREE = 9
+    ctypedef enum xmlXPathError:
+        XPATH_EXPRESSION_OK = 0
+        XPATH_NUMBER_ERROR = 1
+        XPATH_UNFINISHED_LITERAL_ERROR = 2
+        XPATH_START_LITERAL_ERROR = 3
+        XPATH_VARIABLE_REF_ERROR = 4
+        XPATH_UNDEF_VARIABLE_ERROR = 5
+        XPATH_INVALID_PREDICATE_ERROR = 6
+        XPATH_EXPR_ERROR = 7
+        XPATH_UNCLOSED_ERROR = 8
+        XPATH_UNKNOWN_FUNC_ERROR = 9
+        XPATH_INVALID_OPERAND = 10
+        XPATH_INVALID_TYPE = 11
+        XPATH_INVALID_ARITY = 12
+        XPATH_INVALID_CTXT_SIZE = 13
+        XPATH_INVALID_CTXT_POSITION = 14
+        XPATH_MEMORY_ERROR = 15
+        XPTR_SYNTAX_ERROR = 16
+        XPTR_RESOURCE_ERROR = 17
+        XPTR_SUB_RESOURCE_ERROR = 18
+        XPATH_UNDEF_PREFIX_ERROR = 19
+        XPATH_ENCODING_ERROR = 20
+        XPATH_INVALID_CHAR_ERROR = 21
+        XPATH_INVALID_CTXT = 22
+    ctypedef struct xmlNodeSet:
+        int nodeNr
+        int nodeMax
+        tree.xmlNode** nodeTab
+    ctypedef struct xmlXPathObject:
+        xmlXPathObjectType type
+        xmlNodeSet* nodesetval
+        bint boolval
+        double floatval
+        xmlChar* stringval
+    ctypedef struct xmlXPathContext:
+        tree.xmlDoc* doc
+        tree.xmlNode* node
+        tree.xmlDict* dict
+        tree.xmlHashTable* nsHash
+        const_xmlChar* function
+        const_xmlChar* functionURI
+        xmlerror.xmlStructuredErrorFunc error
+        xmlerror.xmlError lastError
+        void* userData
+    ctypedef struct xmlXPathParserContext:
+        xmlXPathContext* context
+        xmlXPathObject* value
+        tree.xmlNode* ancestor
+        int error
+    ctypedef struct xmlXPathCompExpr
+    ctypedef void (*xmlXPathFunction)(xmlXPathParserContext* ctxt, int nargs)
+    ctypedef xmlXPathFunction (*xmlXPathFuncLookupFunc)(void* ctxt,
+                                                        const_xmlChar* name,
+                                                        const_xmlChar* ns_uri)
+    cdef xmlXPathContext* xmlXPathNewContext(tree.xmlDoc* doc)
+    cdef xmlXPathObject* xmlXPathEvalExpression(const_xmlChar* str,
+                                                xmlXPathContext* ctxt)
+    cdef xmlXPathObject* xmlXPathCompiledEval(xmlXPathCompExpr* comp,
+                                              xmlXPathContext* ctxt)
+    cdef xmlXPathCompExpr* xmlXPathCompile(const_xmlChar* str)
+    cdef xmlXPathCompExpr* xmlXPathCtxtCompile(xmlXPathContext* ctxt,
+                                               const_xmlChar* str)
+    cdef void xmlXPathFreeContext(xmlXPathContext* ctxt)
+    cdef void xmlXPathFreeCompExpr(xmlXPathCompExpr* comp)
+    cdef void xmlXPathFreeObject(xmlXPathObject* obj)
+    cdef int xmlXPathRegisterNs(xmlXPathContext* ctxt,
+                                const_xmlChar* prefix, const_xmlChar* ns_uri)
+    cdef xmlNodeSet* xmlXPathNodeSetCreate(tree.xmlNode* val)
+    cdef void xmlXPathFreeNodeSet(xmlNodeSet* val)
+cdef extern from "libxml/xpathInternals.h" nogil:
+    cdef int xmlXPathRegisterFunc(xmlXPathContext* ctxt,
+                                  const_xmlChar* name,
+                                  xmlXPathFunction f)
+    cdef int xmlXPathRegisterFuncNS(xmlXPathContext* ctxt,
+                                    const_xmlChar* name,
+                                    const_xmlChar* ns_uri,
+                                    xmlXPathFunction f)
+    cdef void xmlXPathRegisterFuncLookup(xmlXPathContext *ctxt,
+                                         xmlXPathFuncLookupFunc f,
+                                         void *funcCtxt)
+    cdef int xmlXPathRegisterVariable(xmlXPathContext *ctxt,
+                                      const_xmlChar* name,
+                                      xmlXPathObject* value)
+    cdef int xmlXPathRegisterVariableNS(xmlXPathContext *ctxt,
+                                        const_xmlChar* name,
+                                        const_xmlChar* ns_uri,
+                                        xmlXPathObject* value)
+    cdef void xmlXPathRegisteredVariablesCleanup(xmlXPathContext *ctxt)
+    cdef void xmlXPathRegisteredNsCleanup(xmlXPathContext *ctxt)
+    cdef xmlXPathObject* valuePop (xmlXPathParserContext *ctxt)
+    cdef int valuePush(xmlXPathParserContext* ctxt, xmlXPathObject *value)
+    cdef xmlXPathObject* xmlXPathNewCString(const_char *val)
+    cdef xmlXPathObject* xmlXPathWrapCString(const_char * val)
+    cdef xmlXPathObject* xmlXPathNewString(const_xmlChar *val)
+    cdef xmlXPathObject* xmlXPathWrapString(const_xmlChar * val)
+    cdef xmlXPathObject* xmlXPathNewFloat(double val)
+    cdef xmlXPathObject* xmlXPathNewBoolean(int val)
+    cdef xmlXPathObject* xmlXPathNewNodeSet(tree.xmlNode* val)
+    cdef xmlXPathObject* xmlXPathNewValueTree(tree.xmlNode* val)
+    cdef void xmlXPathNodeSetAdd(xmlNodeSet* cur,
+                                  tree.xmlNode* val)
+    cdef void xmlXPathNodeSetAddUnique(xmlNodeSet* cur,
+                                        tree.xmlNode* val)
+    cdef xmlXPathObject* xmlXPathWrapNodeSet(xmlNodeSet* val)
+    cdef void xmlXPathErr(xmlXPathParserContext* ctxt, int error)

Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/lxml/isoschematron/__init__.py ADDED Viewed

	@@ -0,0 +1,348 @@

+"""The ``lxml.isoschematron`` package implements ISO Schematron support on top
+of the pure-xslt 'skeleton' implementation.
+"""
+import sys
+import os.path
+from lxml import etree as _etree # due to validator __init__ signature
+# some compat stuff, borrowed from lxml.html
+try:
+    unicode
+except NameError:
+    # Python 3
+    unicode = str
+try:
+    basestring
+except NameError:
+    # Python 3
+    basestring = str
+__all__ = ['extract_xsd', 'extract_rng', 'iso_dsdl_include',
+           'iso_abstract_expand', 'iso_svrl_for_xslt1',
+           'svrl_validation_errors', 'schematron_schema_valid',
+           'stylesheet_params', 'Schematron']
+# some namespaces
+#FIXME: Maybe lxml should provide a dedicated place for common namespace
+#FIXME: definitions?
+XML_SCHEMA_NS = "http://www.w3.org/2001/XMLSchema"
+RELAXNG_NS = "http://relaxng.org/ns/structure/1.0"
+SCHEMATRON_NS = "http://purl.oclc.org/dsdl/schematron"
+SVRL_NS = "http://purl.oclc.org/dsdl/svrl"
+# some helpers
+_schematron_root = '{%s}schema' % SCHEMATRON_NS
+_xml_schema_root = '{%s}schema' % XML_SCHEMA_NS
+_resources_dir = os.path.join(os.path.dirname(__file__), 'resources')
+# the iso-schematron skeleton implementation steps aka xsl transformations
+extract_xsd = _etree.XSLT(_etree.parse(
+    os.path.join(_resources_dir, 'xsl', 'XSD2Schtrn.xsl')))
+extract_rng = _etree.XSLT(_etree.parse(
+    os.path.join(_resources_dir, 'xsl', 'RNG2Schtrn.xsl')))
+iso_dsdl_include = _etree.XSLT(_etree.parse(
+    os.path.join(_resources_dir, 'xsl', 'iso-schematron-xslt1',
+                 'iso_dsdl_include.xsl')))
+iso_abstract_expand = _etree.XSLT(_etree.parse(
+    os.path.join(_resources_dir, 'xsl', 'iso-schematron-xslt1',
+                 'iso_abstract_expand.xsl')))
+iso_svrl_for_xslt1 = _etree.XSLT(_etree.parse(
+    os.path.join(_resources_dir,
+                 'xsl', 'iso-schematron-xslt1', 'iso_svrl_for_xslt1.xsl')))
+# svrl result accessors
+svrl_validation_errors = _etree.XPath(
+    '//svrl:failed-assert', namespaces={'svrl': SVRL_NS})
+# RelaxNG validator for schematron schemas
+schematron_schema_valid_supported = False
+try:
+    schematron_schema_valid = _etree.RelaxNG(
+        file=os.path.join(_resources_dir, 'rng', 'iso-schematron.rng'))
+    schematron_schema_valid_supported = True
+except _etree.RelaxNGParseError:
+    # Some distributions delete the file due to licensing issues.
+    def schematron_schema_valid(arg):
+        raise NotImplementedError("Validating the ISO schematron requires iso-schematron.rng")
+def stylesheet_params(**kwargs):
+    """Convert keyword args to a dictionary of stylesheet parameters.
+    XSL stylesheet parameters must be XPath expressions, i.e.:
+    * string expressions, like "'5'"
+    * simple (number) expressions, like "5"
+    * valid XPath expressions, like "/a/b/text()"
+    This function converts native Python keyword arguments to stylesheet
+    parameters following these rules:
+    If an arg is a string wrap it with XSLT.strparam().
+    If an arg is an XPath object use its path string.
+    If arg is None raise TypeError.
+    Else convert arg to string.
+    """
+    result = {}
+    for key, val in kwargs.items():
+        if isinstance(val, basestring):
+            val = _etree.XSLT.strparam(val)
+        elif val is None:
+            raise TypeError('None not allowed as a stylesheet parameter')
+        elif not isinstance(val, _etree.XPath):
+            val = unicode(val)
+        result[key] = val
+    return result
+# helper function for use in Schematron __init__
+def _stylesheet_param_dict(paramsDict, kwargsDict):
+    """Return a copy of paramsDict, updated with kwargsDict entries, wrapped as
+    stylesheet arguments.
+    kwargsDict entries with a value of None are ignored.
+    """
+    # beware of changing mutable default arg
+    paramsDict = dict(paramsDict)
+    for k, v in kwargsDict.items():
+        if v is not None: # None values do not override
+            paramsDict[k] = v
+    paramsDict = stylesheet_params(**paramsDict)
+    return paramsDict
+class Schematron(_etree._Validator):
+    """An ISO Schematron validator.
+    Pass a root Element or an ElementTree to turn it into a validator.
+    Alternatively, pass a filename as keyword argument 'file' to parse from
+    the file system.
+    Schematron is a less well known, but very powerful schema language.
+    The main idea is to use the capabilities of XPath to put restrictions on
+    the structure and the content of XML documents.
+    The standard behaviour is to fail on ``failed-assert`` findings only
+    (``ASSERTS_ONLY``).  To change this, you can either pass a report filter
+    function to the ``error_finder`` parameter (e.g. ``ASSERTS_AND_REPORTS``
+    or a custom ``XPath`` object), or subclass isoschematron.Schematron for
+    complete control of the validation process.
+    Built on the Schematron language 'reference' skeleton pure-xslt
+    implementation, the validator is created as an XSLT 1.0 stylesheet using
+    these steps:
+     0) (Extract from XML Schema or RelaxNG schema)
+     1) Process inclusions
+     2) Process abstract patterns
+     3) Compile the schematron schema to XSLT
+    The ``include`` and ``expand`` keyword arguments can be used to switch off
+    steps 1) and 2).
+    To set parameters for steps 1), 2) and 3) hand parameter dictionaries to the
+    keyword arguments ``include_params``, ``expand_params`` or
+    ``compile_params``.
+    For convenience, the compile-step parameter ``phase`` is also exposed as a
+    keyword argument ``phase``. This takes precedence if the parameter is also
+    given in the parameter dictionary.
+    If ``store_schematron`` is set to True, the (included-and-expanded)
+    schematron document tree is stored and available through the ``schematron``
+    property.
+    If ``store_xslt`` is set to True, the validation XSLT document tree will be
+    stored and can be retrieved through the ``validator_xslt`` property.
+    With ``store_report`` set to True (default: False), the resulting validation
+    report document gets stored and can be accessed as the ``validation_report``
+    property.
+    If ``validate_schema`` is set to False, the validation of the schema file
+    itself is disabled.  Validation happens by default after building the full
+    schema, unless the schema validation file cannot be found at import time,
+    in which case the validation gets disabled.  Some lxml distributions exclude
+    this file due to licensing issues.  ISO-Schematron validation can then still
+    be used normally, but the schemas themselves cannot be validated.
+    Here is a usage example::
+      >>> from lxml import etree
+      >>> from lxml.isoschematron import Schematron
+      >>> schematron = Schematron(etree.XML('''
+      ... <schema xmlns="http://purl.oclc.org/dsdl/schematron" >
+      ...   <pattern id="id_only_attribute">
+      ...     <title>id is the only permitted attribute name</title>
+      ...     <rule context="*">
+      ...       <report test="@*[not(name()='id')]">Attribute
+      ...         <name path="@*[not(name()='id')]"/> is forbidden<name/>
+      ...       </report>
+      ...     </rule>
+      ...   </pattern>
+      ... </schema>'''),
+      ... error_finder=Schematron.ASSERTS_AND_REPORTS)
+      >>> xml = etree.XML('''
+      ... <AAA name="aaa">
+      ...   <BBB id="bbb"/>
+      ...   <CCC color="ccc"/>
+      ... </AAA>
+      ... ''')
+      >>> schematron.validate(xml)
+      False
+      >>> xml = etree.XML('''
+      ... <AAA id="aaa">
+      ...   <BBB id="bbb"/>
+      ...   <CCC/>
+      ... </AAA>
+      ... ''')
+      >>> schematron.validate(xml)
+      True
+    """
+    # libxml2 error categorization for validation errors
+    _domain = _etree.ErrorDomains.SCHEMATRONV
+    _level = _etree.ErrorLevels.ERROR
+    _error_type = _etree.ErrorTypes.SCHEMATRONV_ASSERT
+    # convenience definitions for common behaviours
+    ASSERTS_ONLY = svrl_validation_errors  # Default
+    ASSERTS_AND_REPORTS = _etree.XPath(
+        '//svrl:failed-assert | //svrl:successful-report',
+        namespaces={'svrl': SVRL_NS})
+    def _extract(self, element):
+        """Extract embedded schematron schema from non-schematron host schema.
+        This method will only be called by __init__ if the given schema document
+        is not a schematron schema by itself.
+        Must return a schematron schema document tree or None.
+        """
+        schematron = None
+        if element.tag == _xml_schema_root:
+            schematron = self._extract_xsd(element)
+        elif element.nsmap.get(element.prefix) == RELAXNG_NS:
+            # RelaxNG does not have a single unique root element
+            schematron = self._extract_rng(element)
+        return schematron
+    # customization points
+    # etree.XSLT objects that provide the extract, include, expand, compile
+    # steps
+    _extract_xsd = extract_xsd
+    _extract_rng = extract_rng
+    _include = iso_dsdl_include
+    _expand = iso_abstract_expand
+    _compile = iso_svrl_for_xslt1
+    # etree.xpath object that determines input document validity when applied to
+    # the svrl result report; must return a list of result elements (empty if
+    # valid)
+    _validation_errors = ASSERTS_ONLY
+    def __init__(self, etree=None, file=None, include=True, expand=True,
+                 include_params={}, expand_params={}, compile_params={},
+                 store_schematron=False, store_xslt=False, store_report=False,
+                 phase=None, error_finder=ASSERTS_ONLY,
+                 validate_schema=schematron_schema_valid_supported):
+        super().__init__()
+        self._store_report = store_report
+        self._schematron = None
+        self._validator_xslt = None
+        self._validation_report = None
+        if error_finder is not self.ASSERTS_ONLY:
+            self._validation_errors = error_finder
+        # parse schema document, may be a schematron schema or an XML Schema or
+        # a RelaxNG schema with embedded schematron rules
+        root = None
+        try:
+            if etree is not None:
+                if _etree.iselement(etree):
+                    root = etree
+                else:
+                    root = etree.getroot()
+            elif file is not None:
+                root = _etree.parse(file).getroot()
+        except Exception:
+            raise _etree.SchematronParseError(
+                "No tree or file given: %s" % sys.exc_info()[1])
+        if root is None:
+            raise ValueError("Empty tree")
+        if root.tag == _schematron_root:
+            schematron = root
+        else:
+            schematron = self._extract(root)
+        if schematron is None:
+            raise _etree.SchematronParseError(
+                "Document is not a schematron schema or schematron-extractable")
+        # perform the iso-schematron skeleton implementation steps to get a
+        # validating xslt
+        if include:
+            schematron = self._include(schematron, **include_params)
+        if expand:
+            schematron = self._expand(schematron, **expand_params)
+        if validate_schema and not schematron_schema_valid(schematron):
+            raise _etree.SchematronParseError(
+                "invalid schematron schema: %s" %
+                schematron_schema_valid.error_log)
+        if store_schematron:
+            self._schematron = schematron
+        # add new compile keyword args here if exposing them
+        compile_kwargs = {'phase': phase}
+        compile_params = _stylesheet_param_dict(compile_params, compile_kwargs)
+        validator_xslt = self._compile(schematron, **compile_params)
+        if store_xslt:
+            self._validator_xslt = validator_xslt
+        self._validator = _etree.XSLT(validator_xslt)
+    def __call__(self, etree):
+        """Validate doc using Schematron.
+        Returns true if document is valid, false if not.
+        """
+        self._clear_error_log()
+        result = self._validator(etree)
+        if self._store_report:
+            self._validation_report = result
+        errors = self._validation_errors(result)
+        if errors:
+            if _etree.iselement(etree):
+                fname = etree.getroottree().docinfo.URL or '<file>'
+            else:
+                fname = etree.docinfo.URL or '<file>'
+            for error in errors:
+                # Does svrl report the line number, anywhere? Don't think so.
+                self._append_log_message(
+                    domain=self._domain, type=self._error_type,
+                    level=self._level, line=0,
+                    message=_etree.tostring(error, encoding='unicode'),
+                    filename=fname)
+            return False
+        return True
+    @property
+    def schematron(self):
+        """ISO-schematron schema document (None if object has been initialized
+        with store_schematron=False).
+        """
+        return self._schematron
+    @property
+    def validator_xslt(self):
+        """ISO-schematron skeleton implementation XSLT validator document (None
+        if object has been initialized with store_xslt=False).
+        """
+        return self._validator_xslt
+    @property
+    def validation_report(self):
+        """ISO-schematron validation result report (None if result-storing has
+        been turned off).
+        """
+        return self._validation_report

Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/__pycache__/__init__.cpython-312.pyc ADDED Viewed

Binary file (3.8 kB). View file

Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/__pycache__/compiler.cpython-312.pyc ADDED Viewed

Binary file (4.62 kB). View file

Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/__pycache__/driver.cpython-312.pyc ADDED Viewed

Binary file (3.69 kB). View file

Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/amd/__init__.py ADDED Viewed

File without changes

Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/amd/compiler.py ADDED Viewed

	@@ -0,0 +1,495 @@

+from triton.backends.compiler import BaseBackend, GPUTarget, Language
+from triton._C.libtriton import ir, passes, llvm, amd
+from triton import knobs
+from dataclasses import dataclass
+from typing import Any, Dict, Tuple
+from types import ModuleType
+import hashlib
+import tempfile
+import re
+import functools
+import warnings
+from pathlib import Path
+def get_min_dot_size(target: GPUTarget):
+    # We fallback to use FMA and cast arguments if certain configurations is
+    # not supported natively by matrix core units.
+    return lambda lhs_type, rhs_type: (1, 1, 1)
+def is_pingpong_schedule_enabled(arch, use_async_copy):
+    return (arch == "gfx942" or (arch == "gfx950" and use_async_copy is True)
+            ) if knobs.amd.use_block_pingpong is None else knobs.amd.use_block_pingpong
+def is_in_thread_transpose_enabled(arch):
+    return (arch == "gfx942") if knobs.amd.use_in_thread_transpose is None else knobs.amd.use_in_thread_transpose
+@dataclass(frozen=True)
+class HIPOptions:
+    num_warps: int = 4
+    waves_per_eu: int = 0
+    num_stages: int = 2
+    num_ctas: int = 1
+    extern_libs: dict = None
+    debug: bool = False
+    sanitize_overflow: bool = True
+    arch: str = None
+    # We have native support for OCP fp8 variants since CDNA4/RDNA4. For earlier generations,
+    # we software emulate the support for them.
+    # UZ fp8 variants (fp8e4b8 and fp8e5b16) are natively supported for CDNA3. For other
+    # architectures they are software emulated.
+    supported_fp8_dtypes: Tuple[str] = ("fp8e4nv", "fp8e5", "fp8e5b16", "fp8e4b8")
+    deprecated_fp8_dot_operand_dtypes: Tuple[str] = ()
+    default_dot_input_precision: str = "ieee"
+    allowed_dot_input_precisions: Tuple[str] = ("ieee", 'bf16x3', 'bf16x6')
+    enable_fp_fusion: bool = True
+    launch_cooperative_grid: bool = False
+    matrix_instr_nonkdim: int = 0
+    kpack: int = 1
+    allow_flush_denorm: bool = False
+    max_num_imprecise_acc_default: int = 0
+    backend_name: str = 'hip'
+    instrumentation_mode: str = ""
+    # The following option provides hints to the AMDGPU backend regarding instruction scheduling
+    # for all `tt.dot` operations in a kernel. The "none" variant preserves the default
+    # instruction scheduling of the AMDGPU backend which aims at maximizing occupancy.
+    # The option is experimental and may change at any time regarding its semantics and/or may
+    # be gone entirely anytime.
+    #
+    # Current experimental scheduling variants:
+    #
+    # attention: enables a bunch of optimizations for attention kernels, including:
+    #            - iglp 2 and sched.barrier around it
+    #            - sink-insts-to-avoid-spills flag to avoid register spills
+    # memory-bound-attention: enables custom scheduling strategy in llvm backend,
+    #            This option targets special FA variant, which is memory bound and
+    #            has a lot of elementwise operations from fused operand dequantizations.
+    #            Note that this option is highly experimental,
+    #            and will be removed as soon as default sceduler algorithm is fixed.
+    #
+    # Option allows to set multiple variants divided by commas:
+    # schedule_hint="attention,memory-bound-attention"
+    schedule_hint: str = 'none'
+    def __post_init__(self):
+        gfx_major = int(self.arch[3:-2])  # Drop "gfx" prefix and minor/patch number
+        warp_size = 32 if gfx_major >= 10 else 64
+        object.__setattr__(self, 'warp_size', warp_size)
+        assert self.num_warps > 0 and (self.num_warps & (self.num_warps - 1)) == 0, \
+            "num_warps must be a power of 2"
+        if (self.arch == 'gfx950') and (self.kpack != 1):
+            warnings.warn(
+                f"kpack is deprecated starting from gfx950 and will be removed in later releases. So for now kpack = {self.kpack} will be overwritten to 1 to make transitioning easier."
+            )
+            object.__setattr__(self, 'kpack', 1)
+        default_libdir = Path(__file__).parent / 'lib'
+        extern_libs = {} if self.extern_libs is None else dict(self.extern_libs)
+        for lib in ["ocml", "ockl"]:
+            extern_libs[lib] = str(default_libdir / f'{lib}.bc')
+        object.__setattr__(self, 'extern_libs', tuple(extern_libs.items()))
+    def hash(self):
+        key = '_'.join([f'{name}-{val}' for name, val in self.__dict__.items()])
+        return hashlib.sha256(key.encode("utf-8")).hexdigest()
+class HIPBackend(BaseBackend):
+    instrumentation = None
+    supports_native_tensor_specialization = False
+    @staticmethod
+    def supports_target(target: GPUTarget):
+        return target.backend == 'hip'
+    def __init__(self, target: GPUTarget) -> None:
+        super().__init__(target)
+        assert isinstance(target.arch, str)
+        self.binary_ext = "hsaco"
+    def get_target_name(self, options) -> str:
+        return f"hip:{options.arch}"
+    def parse_options(self, opts) -> Any:
+        args = {'arch': knobs.runtime.override_arch or self.target.arch}
+        if opts.get("num_ctas", 1) > 1 and not amd.supports_multi_cta_launch(self.target.arch):
+            raise ValueError(f"num_ctas > 1 not supported on {self.target.arch}")
+        # Enable XF32 (TF32) for CDNA3 GPUs
+        if self.target.arch == 'gfx942':
+            allowed_dot_input_precisions = set(HIPOptions.allowed_dot_input_precisions)
+            allowed_dot_input_precisions.update({'tf32'})
+            args["allowed_dot_input_precisions"] = tuple(sorted(allowed_dot_input_precisions))
+        if "supported_fp8_dtypes" not in opts:
+            args["supported_fp8_dtypes"] = tuple(sorted(HIPOptions.supported_fp8_dtypes))
+        if self.target.arch == 'gfx950':
+            deprecated_fp8_dot_operand_dtypes = set(HIPOptions.deprecated_fp8_dot_operand_dtypes)
+            deprecated_fp8_dot_operand_dtypes.update({"fp8e5b16", "fp8e4b8"})
+            args["deprecated_fp8_dot_operand_dtypes"] = tuple(sorted(deprecated_fp8_dot_operand_dtypes))
+        if "enable_fp_fusion" not in opts:
+            args["enable_fp_fusion"] = knobs.language.default_fp_fusion
+        args.update({k: opts[k] for k in HIPOptions.__dataclass_fields__.keys() if k in opts and opts[k] is not None})
+        return HIPOptions(**args)
+    def pack_metadata(self, metadata):
+        return (
+            metadata.num_warps,
+            metadata.num_ctas,
+            metadata.shared,
+        )
+    def get_codegen_implementation(self, options):
+        return {"min_dot_size": get_min_dot_size(self.target)}
+    def get_module_map(self) -> Dict[str, ModuleType]:
+        from triton.language.extra.hip import libdevice
+        return {"triton.language.extra.libdevice": libdevice}
+    def load_dialects(self, ctx):
+        amd.load_dialects(ctx)
+        if HIPBackend.instrumentation:
+            HIPBackend.instrumentation.load_dialects(ctx)
+    @staticmethod
+    def is_within_2gb(arg):
+        import torch
+        MAX_INT_32 = 2**31 - 1
+        if hasattr(arg, "ptr_range"):
+            return arg.ptr_range() <= MAX_INT_32
+        if isinstance(arg, torch.Tensor) and hasattr(arg, "untyped_storage"):
+            return arg.untyped_storage().size() <= MAX_INT_32
+        return False
+    @staticmethod
+    def parse_attr(desc):
+        ret = BaseBackend.parse_attr(desc)
+        if "S" in desc:
+            ret += [["tt.pointer_range", 32]]
+        return ret
+    @staticmethod
+    def get_tensor_specialization(arg, **kwargs):
+        ret = BaseBackend.get_tensor_specialization(arg, **kwargs)
+        if knobs.amd.use_buffer_ops and HIPBackend.is_within_2gb(arg):
+            ret += "S"
+        return ret
+    @staticmethod
+    def make_ttir(mod, metadata, options):
+        pm = ir.pass_manager(mod.context)
+        pm.enable_debug()
+        passes.common.add_inliner(pm)
+        passes.ttir.add_rewrite_tensor_pointer(pm)
+        passes.ttir.add_rewrite_tensor_descriptor_to_pointer(pm)
+        passes.common.add_canonicalizer(pm)
+        passes.ttir.add_combine(pm)
+        passes.ttir.add_reorder_broadcast(pm)
+        passes.common.add_cse(pm)
+        passes.ttir.add_triton_licm(pm)
+        passes.common.add_symbol_dce(pm)
+        passes.ttir.add_loop_unroll(pm)
+        pm.run(mod, 'make_ttir')
+        return mod
+    @staticmethod
+    def make_ttgir(mod, metadata, options):
+        pm = ir.pass_manager(mod.context)
+        pm.enable_debug()
+        passes.ttir.add_convert_to_ttgpuir(pm, f"hip:{options.arch}", options.num_warps, options.warp_size,
+                                           options.num_ctas)
+        pm.run(mod, 'make_ttgir_early')
+        pm = ir.pass_manager(mod.context)
+        pm.enable_debug()
+        emuTF32 = False
+        passes.ttgpuir.add_coalesce(pm)
+        passes.ttgpuir.add_f32_dot_tc(pm, emuTF32)
+        passes.ttgpuir.add_remove_layout_conversions(pm)
+        passes.ttgpuir.add_optimize_thread_locality(pm)
+        amd.passes.ttgpuir.add_accelerate_matmul(pm, options.arch, options.matrix_instr_nonkdim, options.kpack)
+        passes.ttgpuir.add_remove_layout_conversions(pm)
+        amd.passes.ttgpuir.add_optimize_epilogue(pm)
+        amd.passes.ttgpuir.add_optimize_dot_operands(pm, options.arch)
+        amd.passes.ttgpuir.add_hoist_layout_conversions(pm)
+        passes.ttgpuir.add_fuse_nested_loops(pm)
+        passes.common.add_canonicalizer(pm)
+        passes.ttir.add_triton_licm(pm)
+        passes.common.add_canonicalizer(pm)
+        use_async_copy = knobs.amd.use_async_copy
+        use_block_pingpong = is_pingpong_schedule_enabled(options.arch, use_async_copy)
+        amd.passes.ttgpuir.add_schedule_loops(pm, options.num_stages)
+        amd.passes.ttgpuir.add_pipeline(pm, use_async_copy, use_block_pingpong)
+        if use_async_copy:
+            amd.passes.ttgpuir.add_coalesce_async_copy(pm, options.arch)
+        passes.common.add_canonicalizer(pm)
+        if options.schedule_hint.lower() != "none":
+            for hint in options.schedule_hint.split(","):
+                amd.passes.ttgpuir.insert_instruction_sched_hints(pm, hint)
+        passes.ttgpuir.add_remove_layout_conversions(pm)
+        passes.ttgpuir.add_reduce_data_duplication(pm)
+        if is_in_thread_transpose_enabled(options.arch):
+            amd.passes.ttgpuir.add_in_thread_transpose(pm)
+            passes.ttgpuir.add_remove_layout_conversions(pm)
+        amd.passes.ttgpuir.add_reorder_instructions(pm)
+        if use_block_pingpong and options.num_stages > 1:
+            amd.passes.ttgpuir.add_block_pingpong(pm, options.num_stages)
+        if knobs.amd.use_buffer_ops:
+            amd.passes.ttgpuir.add_canonicalize_pointers(pm)
+            passes.common.add_canonicalizer(pm)
+            amd.passes.ttgpuir.add_convert_to_buffer_ops(
+                pm,
+                options.arch,
+                knobs.amd.use_buffer_atomics,
+                knobs.amd.buffer_ops_analyze_small_tensor_range,
+            )
+        amd.passes.ttgpuir.add_fold_true_cmpi(pm)
+        passes.common.add_canonicalizer(pm)
+        passes.common.add_cse(pm)
+        passes.common.add_symbol_dce(pm)
+        pm.run(mod, 'make_ttgir')
+        metadata["tensordesc_meta"] = mod.get_tensordesc_metadata()
+        return mod
+    @staticmethod
+    def gluon_to_ttgir(src, metadata, options):
+        mod = src
+        pm = ir.pass_manager(mod.context)
+        pm.enable_debug()
+        passes.gluon.add_inliner(pm)
+        passes.gluon.add_resolve_auto_encodings(pm)
+        passes.common.add_sccp(pm)
+        passes.ttir.add_loop_aware_cse(pm)
+        passes.gluon.add_canonicalizer(pm)
+        passes.ttgpuir.add_combine_tensor_select_and_if(pm)
+        pm.run(mod, 'gluon_to_ttgir')
+        metadata["tensordesc_meta"] = mod.get_tensordesc_metadata()
+        return mod
+    @staticmethod
+    def make_llir(src, metadata, options):
+        mod = src
+        # TritonGPU -> LLVM-IR (MLIR)
+        pm = ir.pass_manager(mod.context)
+        pm.enable_debug()
+        amd.passes.ttgpuir.add_update_async_wait_count(pm, options.arch)
+        # custom_lds_size is an experimental parameter that defines amount of LDS available
+        # for one thread block. Measured in bytes.
+        #
+        # If custom_lds_size = 0, pass will consider all LDS is available for one threads block,
+        # LDS size is determined by provided arch name.
+        custom_lds_size = 0
+        amd.passes.ttgpuir.add_optimize_lds_usage(pm, options.arch, custom_lds_size)
+        passes.convert.add_scf_to_cf(pm)
+        passes.gluon.add_inliner(pm)
+        passes.convert.add_index_to_llvmir(pm)
+        amd.passes.ttgpuir.add_allocate_shared_memory(pm)
+        # instrumentation point here so we can override IRs above (e.g., ttir and ttgir)
+        if HIPBackend.instrumentation:
+            HIPBackend.instrumentation.patch("ttgpuir_to_llvmir", pm, mod.context)
+        ## __HIP_FTZ is used to control the denorm flushing behavior of exp2 op as follows:
+        ## 1. If __HIP_FTZ = 1, exp2 flushes denorms in input and output regardless
+        ##    of the value of kernel arg `allow_flush_denorm`.
+        ## 2. If __HIP_FTZ = 0, whether exp2 flushes denorms in input and output
+        ##    depends on the value of kernel arg `allow_flush_denorm`.
+        ## 3. __HIP_FTZ is default to 1 and not exposed as a kernel argument.
+        ##    For now it is used as a controller for developers only.
+        __HIP_FTZ = True
+        amd.passes.ttgpuir.add_to_llvmir(pm, options.arch, __HIP_FTZ)
+        passes.common.add_canonicalizer(pm)
+        passes.common.add_cse(pm)
+        passes.convert.add_cf_to_llvmir(pm)
+        passes.convert.add_arith_to_llvmir(pm)
+        passes.common.add_canonicalizer(pm)
+        passes.common.add_cse(pm)
+        passes.common.add_symbol_dce(pm)
+        if options.schedule_hint.lower() != "none":
+            amd.passes.ttgpuir.lower_instruction_sched_hints(pm, options.arch, options.num_stages)
+        # This can not be moved below the di_scope pass
+        if HIPBackend.instrumentation:
+            HIPBackend.instrumentation.patch("llvmir_to_llvm", pm, mod.context)
+        if not knobs.compilation.disable_line_info and not knobs.compilation.dump_ir_extract_di_local_variables:
+            passes.llvmir.add_di_scope(pm)
+        amd.passes.ttgpuir.add_builtin_func_to_llvmir(pm, __HIP_FTZ)
+        pm.run(mod, 'make_llir')
+        if knobs.compilation.dump_ir_extract_di_local_variables:
+            # comments below on why separate it
+            if not knobs.compilation.disable_line_info:
+                pm = ir.pass_manager(mod.context)
+                pm.enable_debug()
+                passes.llvmir.add_di_scope(pm)
+                pm.run(mod, 'make_llir.disable_line_info')
+            # insert dbg intrinsic with several DI Attribute including source
+            # var name and type info note: unknown reason for now, but this
+            # pass and add_di_scope has to be run separately, otherwise if we
+            # put them into previous pipline, it trigger a segmentfault without
+            # any error message; could be due to a bug in mlir or pybind11
+            pm = ir.pass_manager(mod.context)
+            pm.enable_debug()
+            passes.llvmir.add_di_local_variable(pm)
+            pm.run(mod, 'make_llir.dump_ir_extract_di_local_variables')
+        # LLVM-IR (MLIR) -> LLVM-IR (LLVM)
+        llvm.init_targets()
+        context = llvm.context()
+        llvm_mod = llvm.to_module(mod, context)
+        amd.attach_target_triple(llvm_mod)
+        target_features = ''
+        if knobs.compilation.enable_asan:
+            target_features = '+xnack'
+        llvm.attach_datalayout(llvm_mod, amd.TARGET_TRIPLE, options.arch, target_features)
+        # Set various control constants on the LLVM module so that device
+        # libraries can resolve references to them.
+        amd.set_isa_version(llvm_mod, options.arch)
+        amd.set_abi_version(llvm_mod, 500)
+        amd.set_bool_control_constant(llvm_mod, "__oclc_finite_only_opt", False)
+        amd.set_bool_control_constant(llvm_mod, "__oclc_correctly_rounded_sqrt32", True)
+        amd.set_bool_control_constant(llvm_mod, "__oclc_unsafe_math_opt", False)
+        amd.set_bool_control_constant(llvm_mod, "__oclc_wavefrontsize64", options.warp_size == 64)
+        # Set kernel attributes first given this may affect later optimizations.
+        fns = [fn for fn in llvm_mod.get_functions() if not fn.is_declaration()]
+        # The public kernel should be kernel 0.
+        fns[0].set_calling_conv(amd.CALLING_CONV_AMDGPU_KERNEL)
+        fns[0].add_fn_attr("amdgpu-flat-work-group-size", f"1,{options.num_warps*options.warp_size}")
+        if "memory-bound-attention" in options.schedule_hint.split(','):
+            fns[0].add_fn_attr("amdgpu-sched-strategy", "iterative-ilp")
+        fns[0].add_fn_attr("uniform-work-group-size", "true")
+        # LLVM AMDGPU backend supports the attribute "amdgpu-waves-per-eu"="<min>[, <max>]".
+        # This attribute may be attached to a kernel function definition and is an optimization hint.
+        # <min> parameter specifies the requested minimum number of waves per EU, and optional <max> parameter
+        # specifies the requested maximum number of waves per EU (must be >= <min> if specified).
+        # If <max> is omitted, then there is no restriction on the maximum number of waves per EU other than
+        # the one dictated by the hardware for which the kernel is compiled. Passing 0, 0 as <min>, <max>
+        # implies the default behavior (no limits).
+        # Specifying N, N forces LLVM to focus on a single register count, simplifies some heuristics
+        # and may improve scheduling.
+        fns[0].add_fn_attr("amdgpu-waves-per-eu", f"{options.waves_per_eu}, {options.waves_per_eu}")
+        denormal_mode = "preserve-sign" if options.allow_flush_denorm else "ieee"
+        fns[0].add_fn_attr("denormal-fp-math-f32", denormal_mode)
+        if knobs.compilation.enable_asan:
+            fns[0].add_fn_target_feature("+xnack")
+            fns[0].add_fn_asan_attr()
+        # Hint the compiler that we'd like the firmware to set the kernel arguments
+        # to user SGPRs so that the kernel does not need to s_load its arguments
+        # from memory.
+        amd.set_all_fn_arg_inreg(fns[0])
+        if knobs.compilation.enable_asan:
+            default_libdir = Path(__file__).parent / 'lib'
+            paths = [
+                str(default_libdir / 'asanrtl.bc'),
+                str(default_libdir / "ocml.bc"),
+                str(default_libdir / "ockl.bc")
+            ]
+            llvm.link_extern_libs(llvm_mod, paths)
+        elif options.extern_libs:
+            paths = [path for (name, path) in options.extern_libs if amd.need_extern_lib(llvm_mod, name)]
+            if len(paths) > 0:
+                llvm.link_extern_libs(llvm_mod, paths)
+        llvm.optimize_module(llvm_mod, llvm.OPTIMIZE_O3, options.arch, '', [], options.enable_fp_fusion)
+        # Architectures with architected SGPRs store the workgroup id in ttmp9 (X) and ttmp7 (Y[15:0], Z[31:16]).
+        # These attributes are used to determine if Z should be masked out when loading Y. They are inferred during
+        # optimize_module from calls to @llvm.amdgcn.workgroup.id.x/y/z(). We cannot rely on this because a
+        # dispatch dimensions might be used even if there is no program_id() call for it.
+        if amd.has_architected_sgprs(options.arch):
+            fns[0].remove_fn_attr("amdgpu-no-workgroup-id-x")
+            fns[0].remove_fn_attr("amdgpu-no-workgroup-id-y")
+            fns[0].remove_fn_attr("amdgpu-no-workgroup-id-z")
+        if knobs.amd.scalarize_packed_fops:
+            amd.add_scalarize_packed_fops_llvm_pass(fns[0])
+        # Get some metadata
+        metadata["shared"] = src.get_int_attr("ttg.shared")
+        metadata["profile_scratch_size"] = src.get_int_attr("ttg.profile_scratch_memory_size") or 0
+        metadata["profile_scratch_align"] = src.get_int_attr("ttg.profile_scratch_memory_alignment") or 1
+        amd.cleanup_bitcode_metadata(llvm_mod)
+        # Disable inlining of print related functions,
+        # because inlining of these function could slow down compilation significantly
+        amd.disable_print_inline(llvm_mod)
+        return str(llvm_mod)
+    @staticmethod
+    def make_amdgcn(src, metadata, options):
+        # Find kernel names (there should only be one)
+        # We get the name at the last possible step to accommodate `triton.compile`
+        # on user-provided LLVM
+        names = re.findall(r"define amdgpu_kernel void @([a-zA-Z_][a-zA-Z0-9_]*)", src)
+        assert len(names) == 1
+        metadata["name"] = names[0]
+        # llvm -> hsaco
+        flags = []
+        features = '-real-true16' if 'gfx11' in options.arch else ''
+        ir_hash = hashlib.sha256(src.encode("utf-8")).hexdigest()
+        dump_file_id = names[0] + '_' + ir_hash
+        _ = llvm.translate_to_mir(src, amd.TARGET_TRIPLE, options.arch, features, flags, options.enable_fp_fusion,
+                                  dump_file_id)
+        llvm.dump_sched_dag(src, amd.TARGET_TRIPLE, options.arch, features, flags, options.enable_fp_fusion,
+                            dump_file_id)
+        amdgcn = llvm.translate_to_asm(src, amd.TARGET_TRIPLE, options.arch, features, flags, options.enable_fp_fusion,
+                                       False)
+        if knobs.amd.dump_amdgcn:
+            print("// -----// AMDGCN Dump //----- //")
+            print(amdgcn)
+        return amdgcn
+    @staticmethod
+    def make_hsaco(src, metadata, options):
+        target_features = ''
+        if knobs.compilation.enable_asan:
+            target_features = '+xnack'
+        hsaco = amd.assemble_amdgcn(src, options.arch, target_features)
+        with tempfile.NamedTemporaryFile() as tmp_out:
+            with tempfile.NamedTemporaryFile() as tmp_in:
+                with open(tmp_in.name, "wb") as fd_in:
+                    fd_in.write(hsaco)
+                amd.link_hsaco(tmp_in.name, tmp_out.name)
+            with open(tmp_out.name, "rb") as fd_out:
+                ret = fd_out.read()
+        return ret
+    def add_stages(self, stages, options, language):
+        if language == Language.TRITON:
+            stages["ttir"] = lambda src, metadata: self.make_ttir(src, metadata, options)
+            stages["ttgir"] = lambda src, metadata: self.make_ttgir(src, metadata, options)
+        elif language == Language.GLUON:
+            stages["ttgir"] = lambda src, metadata: self.gluon_to_ttgir(src, metadata, options)
+        stages["llir"] = lambda src, metadata: self.make_llir(src, metadata, options)
+        stages["amdgcn"] = lambda src, metadata: self.make_amdgcn(src, metadata, options)
+        stages["hsaco"] = lambda src, metadata: self.make_hsaco(src, metadata, options)
+        if knobs.runtime.add_stages_inspection_hook is not None:
+            knobs.runtime.add_stages_inspection_hook(self, stages, options, language, None)
+    @functools.lru_cache()
+    def hash(self):
+        return f'{self.target}'

Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/amd/driver.c ADDED Viewed

	@@ -0,0 +1,504 @@

+#define __HIP_PLATFORM_AMD__
+#include <hip/hip_runtime.h>
+#include <hip/hip_runtime_api.h>
+#define PY_SSIZE_T_CLEAN
+#include <Python.h>
+#include <dlfcn.h>
+#include <stdbool.h>
+#include <stdio.h>
+#include <stdlib.h>
+typedef struct {
+  uint32_t group0_0;
+  uint32_t group0_1;
+  uint32_t group0_2;
+  uint32_t group0_3;
+  uint32_t group1_0;
+  uint32_t group1_1;
+  uint32_t group1_2;
+  uint32_t group1_3;
+  uint32_t group1_4;
+  uint32_t group1_5;
+  uint32_t group1_6;
+  uint32_t group1_7;
+} TDMDescriptor;
+typedef struct {
+  PyObject_HEAD;
+  TDMDescriptor desc;
+} PyTDMDescriptorObject;
+static PyObject *PyTDMDescriptor_new(PyTypeObject *type, PyObject *args,
+                                     PyObject *kw) {
+  PyTDMDescriptorObject *self =
+      (PyTDMDescriptorObject *)type->tp_alloc(type, 0);
+  if (!self)
+    return NULL;
+  memset(&self->desc, 0, sizeof(self->desc));
+  return (PyObject *)self;
+}
+static void PyTDMDescriptor_dealloc(PyTDMDescriptorObject *self) {
+  Py_TYPE(self)->tp_free((PyObject *)self);
+}
+static PyTypeObject PyTDMDescriptorType = {
+    PyVarObject_HEAD_INIT(NULL, 0).tp_name =
+        "triton.backends.amd.PyTDMDescriptor",
+    .tp_basicsize = sizeof(PyTDMDescriptorObject),
+    .tp_itemsize = 0,
+    .tp_flags = Py_TPFLAGS_DEFAULT,
+    .tp_doc = "PyObject for TDMDescriptor",
+    .tp_new = PyTDMDescriptor_new,
+    .tp_dealloc = (destructor)PyTDMDescriptor_dealloc,
+};
+// TODO: Both host-side and device-side TDM descriptor follow the same encoding
+// format. Consider to add a common utility to remove duplicate code.
+static bool encodeTDMDescriptor(TDMDescriptor *desc, int elementBitWidth,
+                                uint32_t *blockSize, int numWarps,
+                                int padInterval, int padAmount, uint32_t *shape,
+                                uint32_t *strides, uint64_t globalAddress,
+                                int rank) {
+  // NYI: TDM > 2D cases
+  if (rank != 2)
+    return false;
+  // Get warp distribution
+  uint32_t numWarpsDim0 = numWarps;
+  for (; numWarpsDim0 > blockSize[0]; numWarpsDim0 /= 2)
+    ;
+  uint32_t numWarpsDim1 = numWarps / numWarpsDim0;
+  if (!(numWarpsDim0 > 0 && blockSize[1] % numWarpsDim1 == 0))
+    return false;
+  uint32_t blockSize0 = (blockSize[0] + numWarpsDim0 - 1) / numWarpsDim0;
+  uint32_t blockSize1 = (blockSize[1] + numWarpsDim1 - 1) / numWarpsDim1;
+  // group0 (128 bits / 4 dwords) effective bit encoding:
+  // [120:64]:  global address
+  // [127:126]: type - currently always set to 0x2
+  desc->group0_2 = (uint32_t)(globalAddress & 0xFFFFFFFF);
+  desc->group0_3 = (uint32_t)((globalAddress >> 32) & 0x01FFFFFF);
+  desc->group0_3 |= (0x1 << 31);
+  // group1 (256 bits / 8 dwords) effective bit encoding:
+  // [17:16]:   data size - log2(element size in bytes)
+  // [20]:      enable padding
+  // [24:22]:   pad interval - log2(pad interval in dwords) - 1
+  // [31:25]:   pad amount - pad amount in dwords - 1
+  // [79:48]:   tensor shape dim inner
+  // [111:80]:  tensor shape dim outer
+  // [127:112]: block shape dim inner
+  // [143:128]: block shape dim outer
+  // [207:160]: tensor stride dim outer (we only use 32 bits)
+  int elementSizeInBytes = elementBitWidth / 8;
+  int dataSize = log2(elementSizeInBytes);
+  desc->group1_0 = (dataSize << 16);
+  int dwordSize = 32;
+  int padIntervalInDwords = padInterval * elementBitWidth / dwordSize;
+  int padAmountInDwords = padAmount * elementBitWidth / dwordSize;
+  if (padIntervalInDwords > 0 && padAmountInDwords > 0) {
+    int log2PadInterval = log2(padIntervalInDwords);
+    desc->group1_0 |= (1 << 20);
+    desc->group1_0 |= ((log2PadInterval - 1) << 22);
+    desc->group1_0 |= ((padAmountInDwords - 1) << 25);
+  }
+  desc->group1_1 = (shape[1] << 16);
+  desc->group1_2 = (shape[1] >> 16);
+  desc->group1_2 |= (shape[0] << 16);
+  desc->group1_3 = (shape[0] >> 16);
+  desc->group1_3 |= (blockSize1 << 16);
+  desc->group1_4 = (blockSize0 & 0xFFFF);
+  desc->group1_5 = strides[0];
+  return true;
+}
+// The list of paths to search for the HIP runtime library. The caller Python
+// code should substitute the search path placeholder.
+static const char *hipLibSearchPaths[] = {"/*py_libhip_search_path*/"};
+// The list of HIP dynamic library symbols and their signature we are interested
+// in this file.
+// |FOR_EACH_ERR_FN| is a macro to process APIs that return hipError_t;
+// |FOR_EACH_STR_FN| is a macro to process APIs that return const char *.
+#define HIP_SYMBOL_LIST(FOR_EACH_ERR_FN, FOR_EACH_STR_FN)                      \
+  FOR_EACH_STR_FN(hipGetErrorString, hipError_t hipError)                      \
+  FOR_EACH_ERR_FN(hipGetDeviceProperties, hipDeviceProp_t *prop, int deviceId) \
+  FOR_EACH_ERR_FN(hipModuleLoadDataEx, hipModule_t *module, const void *image, \
+                  unsigned int numOptions, hipJitOption *options,              \
+                  void **optionValues)                                         \
+  FOR_EACH_ERR_FN(hipModuleGetFunction, hipFunction_t *function,               \
+                  hipModule_t module, const char *kname)                       \
+  FOR_EACH_ERR_FN(hipFuncGetAttribute, int *, hipFunction_attribute attr,      \
+                  hipFunction_t function)
+// HIP driver version format: HIP_VERSION_MAJOR * 10000000 + HIP_VERSION_MINOR *
+// 100000 + HIP_VERSION_PATCH.
+#define TRITON_HIP_DRIVER_EXTRACT_MAJOR_VERSION(version) ((version) / 10000000)
+#define TRITON_HIP_DRIVER_EXTRACT_MINOR_VERSION(version)                       \
+  (((version) % 10000000) / 100000)
+#define TRITON_HIP_DRIVER_EXTRACT_PATCH_VERSION(version) ((version) % 100000)
+#define TRITON_HIP_DRIVER_REQ_MAJOR_VERSION (6)
+// #define TRITON_HIP_DRIVER_DBG_VERSION
+#ifdef TRITON_HIP_DRIVER_DBG_VERSION
+#define TRITON_HIP_DRIVER_LOG_VERSION(version, msgBuff)                        \
+  do {                                                                         \
+    snprintf(msgBuff, sizeof(msgBuff), "libamdhip64 version is: %d.%d.%d",     \
+             TRITON_HIP_DRIVER_EXTRACT_MAJOR_VERSION(version),                 \
+             TRITON_HIP_DRIVER_EXTRACT_MINOR_VERSION(version),                 \
+             TRITON_HIP_DRIVER_EXTRACT_PATCH_VERSION(version));                \
+    printf("%s\n", msgBuff);                                                   \
+  } while (0);
+#else
+#define TRITON_HIP_DRIVER_LOG_VERSION(version, msgBuff)                        \
+  do {                                                                         \
+    (void)msgBuff;                                                             \
+    (void)(version);                                                           \
+  } while (0);
+#endif
+#define TRITON_HIP_MSG_BUFF_SIZE (1024U)
+// The HIP symbol table for holding resolved dynamic library symbols.
+struct HIPSymbolTable {
+#define DEFINE_EACH_ERR_FIELD(hipSymbolName, ...)                              \
+  hipError_t (*hipSymbolName)(__VA_ARGS__);
+#define DEFINE_EACH_STR_FIELD(hipSymbolName, ...)                              \
+  const char *(*hipSymbolName)(__VA_ARGS__);
+  HIP_SYMBOL_LIST(DEFINE_EACH_ERR_FIELD, DEFINE_EACH_STR_FIELD)
+};
+static struct HIPSymbolTable hipSymbolTable;
+static int checkDriverVersion(void *lib) {
+  int hipVersion = -1;
+  const char *error = NULL;
+  typedef hipError_t (*hipDriverGetVersion_fn)(int *driverVersion);
+  hipDriverGetVersion_fn hipDriverGetVersion;
+  dlerror(); // Clear existing errors
+  hipDriverGetVersion =
+      (hipDriverGetVersion_fn)dlsym(lib, "hipDriverGetVersion");
+  error = dlerror();
+  if (error) {
+    PyErr_SetString(PyExc_RuntimeError,
+                    "cannot query 'hipDriverGetVersion' from libamdhip64.so");
+    dlclose(lib);
+    return -1;
+  }
+  (void)hipDriverGetVersion(&hipVersion);
+  char msgBuff[TRITON_HIP_MSG_BUFF_SIZE] = {0};
+  const int hipMajVersion = TRITON_HIP_DRIVER_EXTRACT_MAJOR_VERSION(hipVersion);
+  if (hipMajVersion < TRITON_HIP_DRIVER_REQ_MAJOR_VERSION) {
+    const int hipMinVersion =
+        TRITON_HIP_DRIVER_EXTRACT_MINOR_VERSION(hipVersion);
+    const int hipPatchVersion =
+        TRITON_HIP_DRIVER_EXTRACT_PATCH_VERSION(hipVersion);
+    snprintf(msgBuff, sizeof(msgBuff),
+             "libamdhip64 version %d.%d.%d is not supported! Required major "
+             "version is >=%d.",
+             hipMajVersion, hipMinVersion, hipPatchVersion,
+             TRITON_HIP_DRIVER_REQ_MAJOR_VERSION);
+    PyErr_SetString(PyExc_RuntimeError, msgBuff);
+    dlclose(lib);
+    return -1;
+  }
+  TRITON_HIP_DRIVER_LOG_VERSION(hipVersion, msgBuff);
+  return hipVersion;
+}
+bool initSymbolTable() {
+  void *lib;
+  // Go through the list of search paths to dlopen the first HIP driver library.
+  int n = sizeof(hipLibSearchPaths) / sizeof(hipLibSearchPaths[0]);
+  for (int i = 0; i < n; ++i) {
+    void *handle = dlopen(hipLibSearchPaths[i], RTLD_LAZY | RTLD_LOCAL);
+    if (handle) {
+      lib = handle;
+      // printf("[triton] chosen %s\n", hipLibSearchPaths[i]);
+    }
+  }
+  if (!lib) {
+    PyErr_SetString(PyExc_RuntimeError, "cannot open libamdhip64.so");
+    return false;
+  }
+  int hipVersion = checkDriverVersion(lib);
+  if (hipVersion == -1)
+    return false;
+  const char *error = NULL;
+  typedef hipError_t (*hipGetProcAddress_fn)(
+      const char *symbol, void **pfn, int hipVersion, uint64_t hipFlags,
+      hipDriverProcAddressQueryResult *symbolStatus);
+  hipGetProcAddress_fn hipGetProcAddress;
+  dlerror(); // Clear existing errors
+  *(void **)&hipGetProcAddress = dlsym(lib, "hipGetProcAddress");
+  error = dlerror();
+  if (error) {
+    PyErr_SetString(PyExc_RuntimeError,
+                    "cannot query 'hipGetProcAddress' from libamdhip64.so");
+    dlclose(lib);
+    return false;
+  }
+  // Resolve all symbols we are interested in.
+  uint64_t hipFlags = 0;
+  hipDriverProcAddressQueryResult symbolStatus;
+  hipError_t status = hipSuccess;
+#define QUERY_EACH_FN(hipSymbolName, ...)                                      \
+  status = hipGetProcAddress(#hipSymbolName,                                   \
+                             (void **)&hipSymbolTable.hipSymbolName,           \
+                             hipVersion, hipFlags, &symbolStatus);             \
+  if (status != hipSuccess) {                                                  \
+    PyErr_SetString(PyExc_RuntimeError,                                        \
+                    "cannot get address for '" #hipSymbolName                  \
+                    "' from libamdhip64.so");                                  \
+    dlclose(lib);                                                              \
+    return false;                                                              \
+  }
+  HIP_SYMBOL_LIST(QUERY_EACH_FN, QUERY_EACH_FN)
+  return true;
+}
+static inline void gpuAssert(hipError_t code, const char *file, int line) {
+  {
+    if (code != HIP_SUCCESS) {
+      {
+        const char *prefix = "Triton Error [HIP]: ";
+        const char *str = hipSymbolTable.hipGetErrorString(code);
+        char err[TRITON_HIP_MSG_BUFF_SIZE] = {0};
+        snprintf(err, sizeof(err), "%s Code: %d, Messsage: %s", prefix, code,
+                 str);
+        PyGILState_STATE gil_state;
+        gil_state = PyGILState_Ensure();
+        PyErr_SetString(PyExc_RuntimeError, err);
+        PyGILState_Release(gil_state);
+      }
+    }
+  }
+}
+#define HIP_CHECK(ans)                                                         \
+  {                                                                            \
+    gpuAssert((ans), __FILE__, __LINE__);                                      \
+    if (PyErr_Occurred())                                                      \
+      return NULL;                                                             \
+  }
+static PyObject *getDeviceProperties(PyObject *self, PyObject *args) {
+  int device_id;
+  if (!PyArg_ParseTuple(args, "i", &device_id))
+    return NULL;
+  hipDeviceProp_t props;
+  HIP_CHECK(hipSymbolTable.hipGetDeviceProperties(&props, device_id));
+  // create a struct to hold device properties
+  return Py_BuildValue(
+      "{s:i, s:i, s:i, s:i, s:i, s:i, s:s, s:i, s:i}", "max_shared_mem",
+      props.sharedMemPerBlock, "max_num_regs", props.regsPerBlock,
+      "multiprocessor_count", props.multiProcessorCount, "sm_clock_rate",
+      props.clockRate, "mem_clock_rate", props.memoryClockRate, "mem_bus_width",
+      props.memoryBusWidth, "arch", props.gcnArchName, "warpSize",
+      props.warpSize, "max_threads_per_sm", props.maxThreadsPerMultiProcessor);
+}
+static PyObject *loadBinary(PyObject *self, PyObject *args) {
+  const char *name;
+  const char *data;
+  Py_ssize_t data_size;
+  int shared;
+  int device;
+  if (!PyArg_ParseTuple(args, "ss#ii", &name, &data, &data_size, &shared,
+                        &device)) {
+    return NULL;
+  }
+  // set HIP options
+  hipJitOption opt[] = {hipJitOptionErrorLogBufferSizeBytes,
+                        hipJitOptionErrorLogBuffer,
+                        hipJitOptionInfoLogBufferSizeBytes,
+                        hipJitOptionInfoLogBuffer, hipJitOptionLogVerbose};
+  const unsigned int errbufsize = 8192;
+  const unsigned int logbufsize = 8192;
+  char _err[errbufsize];
+  char _log[logbufsize];
+  void *optval[] = {(void *)(uintptr_t)errbufsize, (void *)_err,
+                    (void *)(uintptr_t)logbufsize, (void *)_log, (void *)1};
+  // launch HIP Binary
+  hipModule_t mod;
+  hipFunction_t fun;
+  HIP_CHECK(hipSymbolTable.hipModuleLoadDataEx(&mod, data, 5, opt, optval))
+  HIP_CHECK(hipSymbolTable.hipModuleGetFunction(&fun, mod, name));
+  // get allocated registers and spilled registers from the function
+  int n_regs = 0;
+  int n_spills = 0;
+  int32_t n_max_threads = 0;
+  hipSymbolTable.hipFuncGetAttribute(&n_regs, HIP_FUNC_ATTRIBUTE_NUM_REGS, fun);
+  hipSymbolTable.hipFuncGetAttribute(&n_spills,
+                                     HIP_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES, fun);
+  hipSymbolTable.hipFuncGetAttribute(
+      &n_max_threads, HIP_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, fun);
+  n_spills /= 4;
+  if (PyErr_Occurred()) {
+    return NULL;
+  }
+  return Py_BuildValue("(KKiii)", (uint64_t)mod, (uint64_t)fun, n_regs,
+                       n_spills, n_max_threads);
+}
+static PyObject *createTDMDescriptor(PyObject *self, PyObject *args) {
+  int elementBitWidth;
+  PyObject *blockSize;
+  int numWarps;
+  int padInterval;
+  int padAmount;
+  PyObject *shape;
+  PyObject *strides;
+  unsigned long long globalAddress;
+  if (!PyArg_ParseTuple(args, "iOiiiOOK", &elementBitWidth, &blockSize,
+                        &numWarps, &padInterval, &padAmount, &shape, &strides,
+                        &globalAddress)) {
+    return NULL;
+  }
+  PyTDMDescriptorObject *descObj = (PyTDMDescriptorObject *)PyObject_CallObject(
+      (PyObject *)&PyTDMDescriptorType, NULL);
+  if (!descObj)
+    return NULL;
+  PyObject *blockSizeFast = NULL;
+  PyObject *shapeFast = NULL;
+  PyObject *stridesFast = NULL;
+  uint32_t blockSizeInt[2];
+  uint32_t shapeInt[2];
+  uint32_t stridesInt[2];
+  blockSizeFast = PySequence_Fast(blockSize, "blockSize must be a sequence");
+  if (!blockSizeFast)
+    goto cleanup;
+  int rank = PySequence_Fast_GET_SIZE(blockSizeFast);
+  if (rank != 2) {
+    PyErr_SetString(PyExc_RuntimeError, "rank must be 2");
+    goto cleanup;
+  }
+  for (int i = 0; i < rank; ++i) {
+    PyObject *item = PySequence_Fast_GET_ITEM(blockSizeFast, i);
+    if (!PyLong_Check(item)) {
+      PyErr_SetString(PyExc_TypeError, "block size must be an int");
+      goto cleanup;
+    }
+    blockSizeInt[i] = PyLong_AsLong(item);
+  }
+  shapeFast = PySequence_Fast(shape, "shape must be a sequence");
+  if (!shapeFast)
+    goto cleanup;
+  if (rank != PySequence_Fast_GET_SIZE(shapeFast)) {
+    PyErr_SetString(PyExc_RuntimeError, "rank mismatch");
+    goto cleanup;
+  }
+  for (int i = 0; i < rank; ++i) {
+    PyObject *item = PySequence_Fast_GET_ITEM(shapeFast, i);
+    if (!PyLong_Check(item)) {
+      PyErr_SetString(PyExc_TypeError, "shape must be an int");
+      goto cleanup;
+    }
+    shapeInt[i] = PyLong_AsLong(item);
+  }
+  stridesFast = PySequence_Fast(strides, "strides must be a sequence");
+  if (!stridesFast)
+    goto cleanup;
+  if (rank != PySequence_Fast_GET_SIZE(stridesFast)) {
+    PyErr_SetString(PyExc_RuntimeError, "rank mismatch");
+    goto cleanup;
+  }
+  for (int i = 0; i < rank; ++i) {
+    PyObject *item = PySequence_Fast_GET_ITEM(stridesFast, i);
+    if (!PyLong_Check(item)) {
+      PyErr_SetString(PyExc_TypeError, "shape must be an int");
+      goto cleanup;
+    }
+    stridesInt[i] = PyLong_AsLong(item);
+  }
+  Py_DECREF(blockSizeFast);
+  blockSizeFast = NULL;
+  Py_DECREF(shapeFast);
+  shapeFast = NULL;
+  Py_DECREF(stridesFast);
+  stridesFast = NULL;
+  bool success = encodeTDMDescriptor(
+      &descObj->desc, elementBitWidth, blockSizeInt, numWarps, padInterval,
+      padAmount, shapeInt, stridesInt, globalAddress, rank);
+  if (!success) {
+    PyErr_SetString(PyExc_RuntimeError, "Failed to encode TDM descriptor");
+    goto cleanup;
+  }
+  return (PyObject *)descObj;
+cleanup:
+  Py_XDECREF(blockSizeFast);
+  Py_XDECREF(shapeFast);
+  Py_XDECREF(stridesFast);
+  Py_XDECREF(descObj);
+  return NULL;
+}
+static PyMethodDef ModuleMethods[] = {
+    {"load_binary", loadBinary, METH_VARARGS,
+     "Load provided hsaco into HIP driver"},
+    {"get_device_properties", getDeviceProperties, METH_VARARGS,
+     "Get the properties for a given device"},
+    {"create_tdm_descriptor", createTDMDescriptor, METH_VARARGS,
+     "create a host-side TDM descriptor"},
+    {NULL, NULL, 0, NULL} // sentinel
+};
+static struct PyModuleDef ModuleDef = {PyModuleDef_HEAD_INIT, "hip_utils",
+                                       NULL, // documentation
+                                       -1,   // size
+                                       ModuleMethods};
+PyMODINIT_FUNC PyInit_hip_utils(void) {
+  if (!initSymbolTable()) {
+    return NULL;
+  }
+  PyObject *m = PyModule_Create(&ModuleDef);
+  if (m == NULL) {
+    return NULL;
+  }
+  PyModule_AddFunctions(m, ModuleMethods);
+  if (PyType_Ready(&PyTDMDescriptorType) < 0)
+    return NULL;
+  Py_INCREF(&PyTDMDescriptorType);
+  PyModule_AddObject(m, "PyTDMDescriptor", (PyObject *)&PyTDMDescriptorType);
+  return m;
+}

Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/amd/driver.py ADDED Viewed

	@@ -0,0 +1,877 @@

+import functools
+import os
+import subprocess
+import re
+import triton
+from pathlib import Path
+from triton import knobs
+from triton.backends.compiler import GPUTarget
+from triton.backends.driver import GPUDriver
+from triton.runtime import _allocation
+from triton.runtime.build import compile_module_from_src
+dirname = os.path.dirname(os.path.realpath(__file__))
+include_dirs = [os.path.join(dirname, "include")]
+PyTDMDescriptor = None
+def _find_already_mmapped_dylib_on_linux(lib_name):
+    import platform
+    if platform.system() != 'Linux':
+        return None
+    # Use dl_iterate_phdr to walk through the list of shared libraries at runtime.
+    # See https://www.man7.org/linux/man-pages/man3/dl_iterate_phdr.3.html for details.
+    import ctypes
+    from ctypes import c_char, c_int, c_size_t, c_void_p, c_char_p, POINTER
+    class DlPhdrInfo(ctypes.Structure):
+        _fields_ = [
+            ('dlpi_addr', c_void_p),
+            ('dlpi_name', c_char_p),
+            # We don't care about the remaining fields.
+        ]
+    # callback_t must use POINTER(c_char) to avoid copying.
+    callback_t = ctypes.CFUNCTYPE(c_int, POINTER(DlPhdrInfo), POINTER(c_size_t), POINTER(c_char))
+    # Load libc and get the dl_iterate_phdr symbol.
+    try:
+        dl_iterate_phdr = ctypes.CDLL('libc.so.6').dl_iterate_phdr
+    except Exception:
+        return None
+    # argtypes must use c_char_p to accept create_string_buffer.
+    dl_iterate_phdr.argtypes = [callback_t, c_char_p]
+    dl_iterate_phdr.restype = c_int
+    max_path_length = 4096
+    path = ctypes.create_string_buffer(max_path_length + 1)
+    # Define callback to get the loaded dylib path.
+    def callback(info, size, data):
+        dlpi_name = info.contents.dlpi_name
+        p = Path(os.fsdecode(dlpi_name))
+        if lib_name in p.name:
+            # Found the dylib; get its path.
+            ctypes.memmove(data, dlpi_name, min(max_path_length, len(dlpi_name)))
+            return 1
+        return 0
+    if dl_iterate_phdr(callback_t(callback), path):
+        return os.fsdecode(ctypes.string_at(path))
+    return None
+@functools.lru_cache()
+def _get_path_to_hip_runtime_dylib():
+    lib_name = "libamdhip64.so"
+    # If we are told explicitly what HIP runtime dynamic library to use, obey that.
+    if env_libhip_path := knobs.amd.libhip_path:
+        if env_libhip_path.endswith(lib_name) and os.path.exists(env_libhip_path):
+            return env_libhip_path
+        raise RuntimeError(f"TRITON_LIBHIP_PATH '{env_libhip_path}' does not point to a valid {lib_name}")
+    # If the shared object is already mmapped to address space, use it.
+    mmapped_path = _find_already_mmapped_dylib_on_linux(lib_name)
+    if mmapped_path:
+        if os.path.exists(mmapped_path):
+            return mmapped_path
+        raise RuntimeError(f"memory mapped '{mmapped_path}' in process does not point to a valid {lib_name}")
+    paths = []
+    # Check backend
+    local_lib = os.path.join(os.path.dirname(__file__), "lib", lib_name)
+    if os.path.exists(local_lib):
+        return local_lib
+    paths.append(local_lib)
+    import site
+    # First search the HIP runtime dynamic library packaged with PyTorch. It's very likely
+    # that we run Triton together with PyTorch. This makes sure we use the same dynamic
+    # library to avoid version mismatch.
+    site_packages = site.getsitepackages()
+    user_site = site.getusersitepackages()
+    if site.ENABLE_USER_SITE:  # ENABLE_USER_SITE is initialized in getusersitepackages()
+        site_packages = [user_site] + site_packages
+    for path in site_packages:
+        path = os.path.join(path, "torch", "lib", lib_name)
+        if os.path.exists(path):
+            return path
+        paths.append(path)
+    # Then try to see if developer provides a HIP runtime dynamic library using LD_LIBARAY_PATH.
+    env_ld_library_path = os.getenv("LD_LIBRARY_PATH")
+    if env_ld_library_path:
+        for d in env_ld_library_path.split(":"):
+            f = os.path.join(d, lib_name)
+            if os.path.exists(f):
+                return f
+            paths.append(f)
+    # HIP_PATH should point to HIP SDK root if set
+    env_hip_path = os.getenv("HIP_PATH")
+    if env_hip_path:
+        hip_lib_path = os.path.join(env_hip_path, "lib", lib_name)
+        if os.path.exists(hip_lib_path):
+            return hip_lib_path
+        paths.append(hip_lib_path)
+    # if available, `hipconfig --path` prints the HIP SDK root
+    try:
+        hip_root = subprocess.check_output(["hipconfig", "--path"]).decode().strip()
+        if hip_root:
+            hip_lib_path = os.path.join(hip_root, "lib", lib_name)
+            if os.path.exists(hip_lib_path):
+                return hip_lib_path
+            paths.append(hip_lib_path)
+    except (subprocess.CalledProcessError, FileNotFoundError):
+        # hipconfig may not be available
+        pass
+    # ROCm lib dir based on env var
+    env_rocm_path = os.getenv("ROCM_PATH")
+    if env_rocm_path:
+        rocm_lib_path = os.path.join(env_rocm_path, "lib", lib_name)
+        if os.path.exists(rocm_lib_path):
+            return rocm_lib_path
+        paths.append(rocm_lib_path)
+    # Afterwards try to search the loader dynamic library resolution paths.
+    libs = subprocess.check_output(["/sbin/ldconfig", "-p"]).decode(errors="ignore")
+    # each line looks like the following:
+    # libamdhip64.so.6 (libc6,x86-64) => /opt/rocm-6.0.2/lib/libamdhip64.so.6
+    # libamdhip64.so (libc6,x86-64) => /opt/rocm-6.0.2/lib/libamdhip64.so
+    locs = [line.split()[-1] for line in libs.splitlines() if line.strip().endswith(lib_name)]
+    for loc in locs:
+        if os.path.exists(loc):
+            return loc
+        paths.append(loc)
+    # As a last resort, guess if we have it in some common installation path.
+    common_install_path = os.path.join('/opt/rocm/lib/', lib_name)
+    if os.path.exists(common_install_path):
+        return common_install_path
+    paths.append(common_install_path)
+    raise RuntimeError(f"cannot locate {lib_name} after attempted paths {paths}")
+class HIPUtils(object):
+    def __new__(cls):
+        if not hasattr(cls, "instance"):
+            cls.instance = super(HIPUtils, cls).__new__(cls)
+        return cls.instance
+    def __init__(self):
+        libhip_path = _get_path_to_hip_runtime_dylib()
+        src = Path(os.path.join(dirname, "driver.c")).read_text()
+        # Just do a simple search and replace here instead of templates or format strings.
+        # This way we don't need to escape-quote C code curly brackets and we can replace
+        # exactly once.
+        src = src.replace('/*py_libhip_search_path*/', libhip_path, 1)
+        mod = compile_module_from_src(src=src, name="hip_utils", include_dirs=include_dirs)
+        self.load_binary = mod.load_binary
+        self.get_device_properties = mod.get_device_properties
+        self.create_tdm_descriptor = mod.create_tdm_descriptor
+        global PyTDMDescriptor
+        PyTDMDescriptor = mod.PyTDMDescriptor
+# -------------------- Launcher ----------------------------
+def ty_to_cpp(ty):
+    if ty.startswith('*'):
+        return "hipDeviceptr_t"
+    if ty == "tensordesc":
+        return "TDMDescriptor"
+    return {
+        "i1": "int8_t",
+        "i8": "int8_t",
+        "i16": "int16_t",
+        "i32": "int32_t",
+        "i64": "int64_t",
+        "u1": "uint8_t",
+        "u8": "uint8_t",
+        "u16": "uint16_t",
+        "u32": "uint32_t",
+        "u64": "uint64_t",
+        "fp16": "double",
+        "bf16": "double",
+        "fp32": "double",
+        "f32": "double",
+        "fp64": "double",
+    }[ty]
+FLOAT_STORAGE_TYPE = {
+    "fp16": "uint16_t",
+    "bf16": "uint16_t",
+    "fp32": "uint32_t",
+    "f32": "uint32_t",
+    "fp64": "uint64_t",
+}
+FLOAT_PACK_FUNCTION = {
+    "fp16": "pack_fp16",
+    "bf16": "pack_bf16",
+    "fp32": "pack_fp32",
+    "f32": "pack_fp32",
+    "fp64": "pack_fp64",
+}
+_BASE_ARGS_FORMAT = "piiiKKOOOOO"
+def make_launcher(constants, signature, warp_size, tensordesc_meta):
+    def _expand_signature(signature):
+        output = []
+        tensordesc_idx = 0
+        for sig in signature:
+            if isinstance(sig, str) and sig.startswith("tensordesc"):
+                meta = tensordesc_meta[tensordesc_idx] if tensordesc_meta else None
+                tensordesc_idx += 1
+                match = re.match("tensordesc<([^[>]*)\\[([^]]*)\\]", sig)
+                dtype = match.group(1)
+                shape = match.group(2)
+                ndim = shape.count(",") + 1
+                # If there is no descriptor's metadata, the descriptor has been decomposed to base pointer, shape and strides
+                if meta is None:
+                    output.append("*" + dtype)
+                    for _ in range(2 * ndim):
+                        output.append("i64")
+                    output.append("i1")
+                else:
+                    output.append("tensordesc")
+                for _ in range(ndim):
+                    output.append("i32")
+                for _ in range(ndim):
+                    output.append("i64")
+            else:
+                output.append(sig)
+        return output
+    def _serialize_signature(sig):
+        if isinstance(sig, tuple):
+            return ','.join(map(_serialize_signature, sig))
+        return sig
+    def _extracted_type(ty):
+        if isinstance(ty, tuple):
+            val = ','.join(map(_extracted_type, ty))
+            return f"[{val}]"
+        if ty.startswith("*") or ty.startswith("tensordesc"):
+            return "PyObject*"
+        if ty == "constexpr":
+            return "PyObject*"
+        return ty_to_cpp(ty)
+    def format_of(ty):
+        if isinstance(ty, tuple):
+            val = ''.join(map(format_of, ty))
+            return f"({val})"
+        if ty.startswith("*") or ty.startswith("tensordesc"):
+            return "O"
+        if ty == "constexpr":
+            return "O"
+        return {
+            "double": "d",
+            "long": "l",
+            "int8_t": "b",
+            "int16_t": "h",
+            "int32_t": "i",
+            "int64_t": "L",
+            "uint8_t": "B",
+            "uint16_t": "H",
+            "uint32_t": "I",
+            "uint64_t": "K",
+        }[ty_to_cpp(ty)]
+    signature = {idx: s for idx, s in enumerate(_expand_signature(signature.values()))}
+    args_format = ''.join([format_of(ty) for ty in signature.values()])
+    format = _BASE_ARGS_FORMAT + args_format
+    signature = ','.join(map(_serialize_signature, signature.values()))
+    signature = list(filter(bool, signature.split(',')))
+    signature = {i: s for i, s in enumerate(signature)}
+    args_list = ', ' + ', '.join(f"&_arg{i}" for i, ty in signature.items()) if len(signature) > 0 else ''
+    # Record the end of regular arguments;
+    # subsequent arguments are architecture-specific descriptors, such as tensor descriptors for CUDA.
+    arg_decl_list = []
+    for i, ty in signature.items():
+        if ty == "constexpr":
+            continue
+        if ty in FLOAT_STORAGE_TYPE:
+            arg_decl_list.append(f"{FLOAT_STORAGE_TYPE[ty]} arg{i}")
+        else:
+            arg_decl_list.append(f"{ty_to_cpp(ty)} arg{i}")
+    arg_decls = ', '.join(arg_decl_list)
+    internal_args_list = []
+    for i, ty in signature.items():
+        if ty.startswith("*"):
+            internal_args_list.append(f"ptr_info{i}.dev_ptr")
+        elif ty.startswith("tensordesc"):
+            internal_args_list.append(f"*desc{i}")
+        elif ty in FLOAT_STORAGE_TYPE:
+            internal_args_list.append(f"_arg{i}_storage")
+        elif ty != "constexpr":
+            internal_args_list.append(f"_arg{i}")
+    newline = '\n  '
+    ptr_decls = [
+        f"DevicePtrInfo ptr_info{i} = getPointer(_arg{i}, {i}); if (!ptr_info{i}.valid) return NULL;"
+        for i, ty in signature.items()
+        if ty.startswith("*")
+    ]
+    tensor_desc_decls = [
+        f"TDMDescriptor* desc{i} = getTDMDescriptor(_arg{i}, {i});" for i, ty in signature.items()
+        if ty.startswith("tensordesc")
+    ]
+    float_storage_decls = [
+        f"{FLOAT_STORAGE_TYPE[ty]} _arg{i}_storage = {FLOAT_PACK_FUNCTION[ty]}(_arg{i});"
+        for i, ty in signature.items()
+        if ty in FLOAT_STORAGE_TYPE
+    ]
+    libhip_path = _get_path_to_hip_runtime_dylib()
+    # generate glue code
+    params = list(range(len(signature)))
+    params = [f"&arg{i}" for i, ty in signature.items() if ty != "constexpr"]
+    params.append("&global_scratch")
+    params.append("&profile_scratch")
+    src = f"""
+#define __HIP_PLATFORM_AMD__
+#include <hip/hip_runtime.h>
+#include <hip/hip_runtime_api.h>
+#include <Python.h>
+#include <dlfcn.h>
+#include <stdbool.h>
+#include <dlfcn.h>
+typedef struct {{
+  uint32_t group0_0;
+  uint32_t group0_1;
+  uint32_t group0_2;
+  uint32_t group0_3;
+  uint32_t group1_0;
+  uint32_t group1_1;
+  uint32_t group1_2;
+  uint32_t group1_3;
+  uint32_t group1_4;
+  uint32_t group1_5;
+  uint32_t group1_6;
+  uint32_t group1_7;
+}} TDMDescriptor;
+typedef struct {{
+  PyObject_HEAD;
+  TDMDescriptor desc;
+}} PyTDMDescriptorObject;
+// The list of paths to search for the HIP runtime library. The caller Python
+// code should substitute the search path placeholder.
+static const char *hipLibSearchPaths[] = {{"{libhip_path}"}};
+// The list of HIP dynamic library symbols and their signature we are interested
+// in this file.
+#define HIP_SYMBOL_LIST(FOR_EACH_ERR_FN, FOR_EACH_STR_FN)                     \\
+  FOR_EACH_STR_FN(hipGetLastError, true)                                      \\
+  FOR_EACH_STR_FN(hipGetErrorString, true, hipError_t hipError)               \\
+  FOR_EACH_ERR_FN(hipDrvLaunchKernelEx, false,                                \\
+                  const HIP_LAUNCH_CONFIG *config,                            \\
+                  hipFunction_t f,                                            \\
+                  void **kernelParams,                                        \\
+                  void **extra)                                               \\
+  FOR_EACH_ERR_FN(hipModuleLaunchKernel, true, hipFunction_t f,               \\
+                  unsigned int gridDimX, unsigned int gridDimY,               \\
+                  unsigned int gridDimZ, unsigned int blockDimX,              \\
+                  unsigned int blockDimY, unsigned int blockDimZ,             \\
+                  unsigned int sharedMemBytes, hipStream_t stream,            \\
+                  void **kernelParams, void **extra)                          \\
+  FOR_EACH_ERR_FN(hipModuleLaunchCooperativeKernel, true, hipFunction_t f,    \\
+                  unsigned int gridDimX, unsigned int gridDimY,               \\
+                  unsigned int gridDimZ, unsigned int blockDimX,              \\
+                  unsigned int blockDimY, unsigned int blockDimZ,             \\
+                  unsigned int sharedMemBytes, hipStream_t stream,            \\
+                  void **kernelParams, void **extra)                          \\
+  FOR_EACH_ERR_FN(hipPointerGetAttribute, true, void *data,                   \\
+                  hipPointer_attribute attribute, hipDeviceptr_t ptr)
+// The HIP symbol table for holding resolved dynamic library symbols.
+struct HIPSymbolTable {{
+#define DEFINE_EACH_ERR_FIELD(hipSymbolName, required, ...)                   \\
+  hipError_t (*hipSymbolName)(__VA_ARGS__);
+#define DEFINE_EACH_STR_FIELD(hipSymbolName, required, ...)                   \\
+  const char *(*hipSymbolName)(__VA_ARGS__);
+  HIP_SYMBOL_LIST(DEFINE_EACH_ERR_FIELD, DEFINE_EACH_STR_FIELD)
+}};
+static struct HIPSymbolTable hipSymbolTable;
+bool initSymbolTable() {{
+  // Use the HIP runtime library loaded into the existing process if it exits.
+  void *lib = dlopen("libamdhip64.so", RTLD_NOLOAD);
+  // Otherwise, go through the list of search paths to dlopen the first HIP
+  // driver library.
+  if (!lib) {{
+    int n = sizeof(hipLibSearchPaths) / sizeof(hipLibSearchPaths[0]);
+    for (int i = 0; i < n; ++i) {{
+      void *handle = dlopen(hipLibSearchPaths[i], RTLD_LAZY | RTLD_LOCAL);
+      if (handle) {{
+        lib = handle;
+      }}
+    }}
+  }}
+  if (!lib) {{
+    PyErr_SetString(PyExc_RuntimeError, "cannot open libamdhip64.so");
+    return false;
+  }}
+  typedef hipError_t (*hipGetProcAddress_fn)(
+      const char *symbol, void **pfn, int hipVersion, uint64_t hipFlags,
+      hipDriverProcAddressQueryResult *symbolStatus);
+  hipGetProcAddress_fn hipGetProcAddress;
+  dlerror(); // Clear existing errors
+  const char *error = NULL;
+  *(void **)&hipGetProcAddress = dlsym(lib, "hipGetProcAddress");
+  error = dlerror();
+  if (error) {{
+    PyErr_SetString(PyExc_RuntimeError,
+                    "cannot query 'hipGetProcAddress' from libamdhip64.so");
+    dlclose(lib);
+    return false;
+  }}
+  // Resolve all symbols we are interested in.
+  int hipVersion = HIP_VERSION;
+  uint64_t hipFlags = 0;
+  hipDriverProcAddressQueryResult symbolStatus;
+  hipError_t status = hipSuccess;
+#define QUERY_EACH_FN(hipSymbolName, required, ...)                            \
+  status = hipGetProcAddress(#hipSymbolName,                                   \
+                             (void **)&hipSymbolTable.hipSymbolName,           \
+                             hipVersion, hipFlags, &symbolStatus);             \
+  if (required && status != hipSuccess) {{                                     \
+    PyErr_SetString(PyExc_RuntimeError,                                        \
+                    "cannot get address for '" #hipSymbolName                  \
+                    "' from libamdhip64.so");                                  \
+    dlclose(lib);                                                              \
+    return false;                                                              \
+  }}
+  HIP_SYMBOL_LIST(QUERY_EACH_FN, QUERY_EACH_FN)
+  return true;
+}}
+static inline void gpuAssert(hipError_t code, const char *file, int line)
+{{
+   if (code != HIP_SUCCESS)
+   {{
+      const char* prefix = "Triton Error [HIP]: ";
+      const char* str = hipSymbolTable.hipGetErrorString(code);
+      char err[1024] = {{0}};
+      snprintf(err, 1024, "%s Code: %d, Messsage: %s", prefix, code, str );
+      PyErr_SetString(PyExc_RuntimeError, err);
+   }}
+}}
+#define HIP_CHECK(ans) {{ gpuAssert((ans), __FILE__, __LINE__); }}
+static void _launch(int gridX, int gridY, int gridZ, int num_warps, int num_ctas, int launch_cooperative_grid, int shared_memory, hipStream_t stream, hipFunction_t function, hipDeviceptr_t profile_scratch{', ' + arg_decls if len(arg_decls) > 0 else ''}) {{
+  if (gridX * gridY * gridZ == 0)
+    return;
+  hipDeviceptr_t global_scratch = 0;
+  void *params[] = {{ {', '.join(params)} }};
+  if(num_ctas > 1) {{
+    if (!hipSymbolTable.hipDrvLaunchKernelEx) {{
+        PyErr_SetString(PyExc_RuntimeError, "missing hipDrvLaunchKernelEx symbol; please update HIP runtime");
+        return;
+    }}
+    hipLaunchAttribute attributes[2];
+    // Attribute0: Cluster dimensions
+    attributes[0].id = 4;
+    int *cluster_dims = (int*)attributes[0].val.pad;
+    cluster_dims[0] = num_ctas;
+    cluster_dims[1] = 1;
+    cluster_dims[2] = 1;
+    // Attribute1: Cooperative launch
+    attributes[1].id = hipLaunchAttributeCooperative;
+    attributes[1].val.cooperative = launch_cooperative_grid;
+    HIP_LAUNCH_CONFIG config = {{
+        gridX * num_ctas, gridY, gridZ, // Grid size
+        {warp_size} * num_warps, 1, 1, // Block size
+        shared_memory, stream,
+        attributes, 2 // Number of attributes
+    }};
+    HIP_CHECK(hipSymbolTable.hipDrvLaunchKernelEx(&config, function, params, 0));
+    return;
+  }}
+  else if (launch_cooperative_grid) {{
+    HIP_CHECK(hipSymbolTable.hipModuleLaunchCooperativeKernel(function, gridX, gridY, gridZ, {warp_size}*num_warps, 1, 1, shared_memory, stream, params, 0));
+    return;
+  }}
+  else {{
+    HIP_CHECK(hipSymbolTable.hipModuleLaunchKernel(function, gridX, gridY, gridZ, {warp_size}*num_warps, 1, 1, shared_memory, stream, params, 0));
+  }}
+}}
+typedef struct _DevicePtrInfo {{
+    hipDeviceptr_t dev_ptr;
+    bool valid;
+}} DevicePtrInfo;
+static PyObject* data_ptr_str = NULL;
+static PyObject* py_tdm_descriptor_type = NULL;
+static inline DevicePtrInfo getPointer(PyObject *obj, int idx) {{
+  DevicePtrInfo ptr_info;
+  hipError_t status = hipSuccess;
+  ptr_info.dev_ptr = 0;
+  ptr_info.valid = true;
+  if (PyLong_Check(obj)) {{
+    ptr_info.dev_ptr = (hipDeviceptr_t)PyLong_AsUnsignedLongLong(obj);
+    return ptr_info;
+  }}
+  if (obj == Py_None) {{
+    // valid nullptr
+    return ptr_info;
+  }}
+  PyObject *ret = PyObject_CallMethodNoArgs(obj, data_ptr_str);
+  if (!ret) {{
+    PyErr_SetString(PyExc_TypeError, "Pointer argument must be either uint64 or have data_ptr method");
+    ptr_info.valid = false;
+    goto cleanup;
+  }}
+  if (!PyLong_Check(ret)) {{
+    PyErr_SetString(PyExc_TypeError, "data_ptr method of Pointer object must return 64-bit int");
+    ptr_info.valid = false;
+    goto cleanup;
+  }}
+  ptr_info.dev_ptr = (hipDeviceptr_t)PyLong_AsUnsignedLongLong(ret);
+  if (!ptr_info.dev_ptr)
+    goto cleanup;
+  uint64_t dev_ptr;
+  status = hipSymbolTable.hipPointerGetAttribute(&dev_ptr, HIP_POINTER_ATTRIBUTE_DEVICE_POINTER, ptr_info.dev_ptr);
+  if (status == hipErrorInvalidValue) {{
+      PyErr_Format(PyExc_ValueError,
+                   "Pointer argument (at %d) cannot be accessed from Triton (cpu tensor?)", idx);
+      ptr_info.valid = false;
+      // Clear and ignore HIP error
+      (void)hipSymbolTable.hipGetLastError();
+  }}
+  ptr_info.dev_ptr = (hipDeviceptr_t)dev_ptr;
+cleanup:
+  Py_DECREF(ret);
+  return ptr_info;
+}}
+static inline TDMDescriptor* getTDMDescriptor(PyObject* obj, int idx) {{
+  if (Py_TYPE(obj) != (PyTypeObject*)py_tdm_descriptor_type) {{
+    PyErr_Format(PyExc_TypeError, "object must be of type PyTDMDescriptor, got %s", Py_TYPE(obj)->tp_name);
+    return NULL;
+  }}
+  TDMDescriptor* desc = &((PyTDMDescriptorObject*)obj)->desc;
+  return desc;
+}}
+static uint16_t pack_fp16(double f) {{
+    uint16_t result;
+    // from https://github.com/python/pythoncapi-compat/blob/5e317108f872c904eb726cb8d560dcadbdf88a72/pythoncapi_compat.h#L482-L492
+#if 0x030600B1 <= PY_VERSION_HEX && PY_VERSION_HEX <= 0x030B00A1 && !defined(PYPY_VERSION)
+    _PyFloat_Pack2(f, (unsigned char*)&result, 1);
+#else
+    PyFloat_Pack2(f, (char*)&result, 1);
+#endif
+    return result;
+}}
+static uint16_t pack_bf16(double f) {{
+    float f32 = (float)f;
+    uint32_t u32 = *(uint32_t*)&f32;
+    return (uint16_t)(u32 >> 16);
+}}
+static uint32_t pack_fp32(double f) {{
+    float f32 = (float)f;
+    return *(uint32_t*)&f32;
+}}
+static uint64_t pack_fp64(double f) {{
+    return *(uint64_t*)&f;
+}}
+static PyObject* launch(PyObject* self, PyObject* args) {{
+  int gridX, gridY, gridZ;
+  uint64_t _stream;
+  uint64_t _function;
+  int launch_cooperative_grid;
+  PyObject *profile_scratch_obj = NULL;
+  PyObject *launch_enter_hook = NULL;
+  PyObject *launch_exit_hook = NULL;
+  PyObject *kernel_metadata = NULL;
+  PyObject *launch_metadata = NULL;
+  {' '.join([f"{_extracted_type(ty)} _arg{i}; " for i, ty in signature.items()])}
+  if(!PyArg_ParseTuple(args, \"{format}\", &launch_cooperative_grid,
+                                           &gridX, &gridY, &gridZ, &_stream, &_function, &profile_scratch_obj,
+                                           &kernel_metadata, &launch_metadata,
+                                           &launch_enter_hook, &launch_exit_hook {args_list})) {{
+    return NULL;
+  }}
+  // extract kernel metadata
+  int num_warps, num_ctas, shared_memory;
+  if (!PyArg_ParseTuple(kernel_metadata, \"iii\", &num_warps, &num_ctas, &shared_memory)) {{
+    return NULL;
+  }}
+  // extract launch metadata
+  if (launch_enter_hook != Py_None){{
+    PyObject* ret = PyObject_CallOneArg(launch_enter_hook, launch_metadata);
+    if (!ret)
+      return NULL;
+    Py_DECREF(ret);
+  }}
+  hipDeviceptr_t profile_scratch = 0;
+  if (profile_scratch_obj != Py_None) {{
+    DevicePtrInfo profile_scratch_info = getPointer(profile_scratch_obj, -1);
+    if (!profile_scratch_info.valid) {{
+      return NULL;
+    }}
+    profile_scratch = profile_scratch_info.dev_ptr;
+  }}
+  // raise exception asap
+  {newline.join(tensor_desc_decls)}
+  {newline.join(ptr_decls)}
+  {newline.join(float_storage_decls)}
+  _launch(gridX, gridY, gridZ, num_warps, num_ctas, launch_cooperative_grid, shared_memory, (hipStream_t)_stream, (hipFunction_t)_function, (hipDeviceptr_t)profile_scratch{', ' + ', '.join(internal_args_list) if len(internal_args_list) > 0 else ''});
+  if(launch_exit_hook != Py_None){{
+    PyObject* ret = PyObject_CallOneArg(launch_exit_hook, launch_metadata);
+    if (!ret)
+      return NULL;
+    Py_DECREF(ret);
+  }}
+  if(PyErr_Occurred()) {{
+    return NULL;
+  }}
+  Py_RETURN_NONE;
+}}
+static PyMethodDef ModuleMethods[] = {{
+  {{"launch", launch, METH_VARARGS, "Entry point for all kernels with this signature"}},
+  {{NULL, NULL, 0, NULL}} // sentinel
+}};
+static struct PyModuleDef ModuleDef = {{
+  PyModuleDef_HEAD_INIT,
+  \"__triton_launcher\",
+  NULL, //documentation
+  -1, //size
+  ModuleMethods
+}};
+PyMODINIT_FUNC PyInit___triton_launcher(void) {{
+  if (!initSymbolTable()) {{
+    return NULL;
+  }}
+  PyObject *m = PyModule_Create(&ModuleDef);
+  if(m == NULL) {{
+    return NULL;
+  }}
+  data_ptr_str = PyUnicode_InternFromString("data_ptr");
+  if(data_ptr_str == NULL) {{
+    return NULL;
+  }}
+  PyObject* driver_mod = PyImport_ImportModule("triton.backends.amd.driver");
+  if (driver_mod == NULL) {{
+    return NULL;
+  }}
+  py_tdm_descriptor_type = PyObject_GetAttrString(driver_mod, "PyTDMDescriptor");
+  if (py_tdm_descriptor_type == NULL) {{
+    return NULL;
+  }}
+  PyModule_AddFunctions(m, ModuleMethods);
+  return m;
+}}
+"""
+    return src
+def make_tensordesc_arg(arg, kernel_metadata, tensordesc_metadata):
+    """
+    Translate a tensor descriptor argument into the appropriate list of kernel
+    arguments. If `tensordesc_metadata` is provided, we will create a
+    TDMDescriptor object. Otherwise, we decompose the tensor descriptor into
+    base pointer, shape, strides, and padding flag. In both cases, we append the
+    shape and strides at the end to match the expected kernel signature.
+    """
+    if tensordesc_metadata is None:
+        # Currently the host side tensor descriptors get decomposed in
+        # the frontend to tensor desc, shape, and strides. We have no
+        # way to use these shape and strides when processing tensor
+        # descriptors which is why we provide our own decomposition
+        # above. Sadly this means we have to pass the shape and strides
+        # twice.
+        return [arg.base, *arg.shape, *arg.strides, arg.padding == "nan", *arg.shape, *arg.strides]
+    shape = arg.shape
+    strides = arg.strides
+    base = arg.base.data_ptr()
+    assert "elem_bits" in tensordesc_metadata and "block_size" in tensordesc_metadata
+    elem_bits = tensordesc_metadata["elem_bits"]
+    block_size = tensordesc_metadata["block_size"]
+    pad_interval, pad_amount = 0, 0
+    interval_padding_pairs = tensordesc_metadata.get("interval_padding_pairs", [])
+    if interval_padding_pairs:
+        assert len(interval_padding_pairs) == 1 and len(interval_padding_pairs[0]) == 2
+        pad_interval, pad_amount = interval_padding_pairs[0]
+    num_warps = kernel_metadata[0]
+    driver = triton.runtime.driver.active
+    assert isinstance(driver, HIPDriver)
+    desc = driver.utils.create_tdm_descriptor(elem_bits, block_size, num_warps, pad_interval, pad_amount, shape,
+                                              strides, base)
+    return [desc, *shape, *strides]
+def wrap_handle_tensordesc(launcher, signature, tensordesc_metadata):
+    """
+    Wrap a kernel launcher function to handle tensor descriptor arguments.
+    Use the provided `tensordesc_metadata` to determine whether to create
+    TDMDescriptor objects or decompose the tensor descriptors.
+    Args:
+        launcher (callable): The original kernel launcher function.
+        signature (Dict[int, str]): The kernel signature mapping argument indices to types.
+        tensordesc_metadata (List[Dict] or None): The list of tensor descriptor metadata, following the order
+                                                  of tensor descriptor arguments. If None, decompose tensor descriptors.
+    Returns:
+        launcher (callable): The wrapped kernel launcher function.
+    """
+    has_tensor_desc_arg = any(isinstance(sig, str) and sig.startswith("tensordesc") for sig in signature.values())
+    if not has_tensor_desc_arg:
+        return launcher
+    tensordesc_indices = set(
+        [i for i, sig in enumerate(signature.values()) if isinstance(sig, str) and sig.startswith("tensordesc")])
+    assert not tensordesc_metadata or len(tensordesc_metadata) == len(tensordesc_indices)
+    if not tensordesc_metadata:
+        tensordesc_metadata = [None] * len(tensordesc_indices)
+    def inner(*args):
+        meta_args = args[:len(_BASE_ARGS_FORMAT)]
+        raw_kernel_args = args[len(_BASE_ARGS_FORMAT):]
+        final_args = []
+        tensordesc_idx = 0
+        for i, arg in enumerate(raw_kernel_args):
+            if i in tensordesc_indices:
+                tensordesc_args = make_tensordesc_arg(arg, meta_args[7],  # kernel_metadata
+                                                      tensordesc_metadata[tensordesc_idx])
+                final_args.extend(tensordesc_args)
+                tensordesc_idx += 1
+            else:
+                final_args.append(arg)
+        return launcher(*meta_args, *final_args)
+    return inner
+class HIPLauncher(object):
+    def __init__(self, src, metadata):
+        constants = src.constants if hasattr(src, "constants") else dict()
+        arg_idx = lambda x: (src.fn.arg_names.index(x), ) if isinstance(x, str) else x
+        constants = {arg_idx(idx): value for idx, value in constants.items()}
+        signature = {idx: value for idx, value in src.signature.items()}
+        tensordesc_meta = getattr(metadata, "tensordesc_meta", None)
+        src = make_launcher(constants, signature, metadata.warp_size, tensordesc_meta)
+        mod = compile_module_from_src(src=src, name="__triton_launcher", include_dirs=include_dirs)
+        self.launch = wrap_handle_tensordesc(mod.launch, signature, tensordesc_meta)
+        self.launch_cooperative_grid = metadata.launch_cooperative_grid
+        self.profile_scratch_size = metadata.profile_scratch_size
+        self.profile_scratch_align = metadata.profile_scratch_align
+    def __call__(self, gridX, gridY, gridZ, stream, function, *args):
+        def allocate_scratch(size, align, allocator):
+            if size > 0:
+                grid_size = gridX * gridY * gridZ
+                alloc_size = grid_size * size
+                alloc_fn = allocator.get()
+                return alloc_fn(alloc_size, align, stream)
+            return None
+        profile_scratch = allocate_scratch(self.profile_scratch_size, self.profile_scratch_align,
+                                           _allocation._profile_allocator)
+        self.launch(self.launch_cooperative_grid, gridX, gridY, gridZ, stream, function, profile_scratch, *args)
+class HIPDriver(GPUDriver):
+    def __init__(self):
+        super().__init__()
+        self.utils = HIPUtils()
+        self.launcher_cls = HIPLauncher
+    def get_device_interface(self):
+        import torch
+        return torch.cuda
+    @staticmethod
+    def is_active():
+        try:
+            import torch
+            return torch.cuda.is_available() and (torch.version.hip is not None)
+        except ImportError:
+            return False
+    def map_python_to_cpp_type(self, ty: str) -> str:
+        return ty_to_cpp(ty)
+    def get_current_target(self):
+        device = self.get_current_device()
+        device_properties = self.utils.get_device_properties(device)
+        arch = knobs.runtime.override_arch or device_properties['arch']
+        warp_size = device_properties['warpSize']
+        return GPUTarget("hip", arch.split(':')[0], warp_size)
+    def get_active_torch_device(self):
+        import torch
+        # when using hip devices, the device string in pytorch is "cuda"
+        return torch.device("cuda", self.get_current_device())
+    def get_benchmarker(self):
+        from triton.testing import do_bench
+        return do_bench
+    def get_empty_cache_for_benchmark(self):
+        import torch
+        # It's the same as the Nvidia backend.
+        cache_size = 256 * 1024 * 1024
+        return torch.empty(int(cache_size // 4), dtype=torch.int, device='cuda')
+    def clear_cache(self, cache):
+        cache.zero_()

Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/__init__.py ADDED Viewed

File without changes

Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/compiler.py ADDED Viewed

	@@ -0,0 +1,553 @@

+from triton.backends.compiler import BaseBackend, GPUTarget, Language
+from triton._C.libtriton import ir, passes, llvm, nvidia
+from triton import knobs
+from triton.runtime.errors import PTXASError
+from dataclasses import dataclass
+import functools
+from typing import Any, Dict, Tuple, Optional
+from types import ModuleType
+import hashlib
+import re
+import tempfile
+import signal
+import os
+import subprocess
+from pathlib import Path
+def min_dot_size(target: GPUTarget):
+    def check_dot_compatibility(lhs_type, rhs_type) -> Tuple[int, int, int]:  # [m, n, k]
+        lhs_bitwidth = lhs_type.scalar.primitive_bitwidth
+        rhs_bitwidth = rhs_type.scalar.primitive_bitwidth
+        assert lhs_bitwidth == rhs_bitwidth, "lhs and rhs bitwidth must be the same"
+        # For small M/N the input we can still use tensorcores with padding.
+        if lhs_bitwidth == 8:
+            return (1, 1, 32)
+        else:
+            return (1, 1, 16)
+    return check_dot_compatibility
+def get_ptxas(arch: int) -> knobs.NvidiaTool:
+    return knobs.nvidia.ptxas_blackwell if arch >= 100 else knobs.nvidia.ptxas
+@functools.lru_cache()
+def get_ptxas_version(arch: int = 80):
+    mock_ver = knobs.nvidia.mock_ptx_version
+    if mock_ver is not None:
+        return mock_ver  # This is not really a version of ptxas, but it is good enough for testing
+    version = subprocess.check_output([get_ptxas(arch).path, "--version"]).decode("utf-8")
+    return version
+@functools.lru_cache()
+def ptx_get_version(cuda_version) -> int:
+    '''
+    Get the highest PTX version supported by the current CUDA driver.
+    '''
+    assert isinstance(cuda_version, str)
+    major, minor = map(int, cuda_version.split('.'))
+    if major == 12:
+        if minor < 6:
+            return 80 + minor
+        else:
+            return 80 + minor - 1
+    if major == 11:
+        return 70 + minor
+    if major == 10:
+        return 63 + minor
+    if major >= 13:
+        base_ptx = 90
+        return base_ptx + (major - 13) * 10 + minor
+    raise RuntimeError("Triton only support CUDA 10.0 or higher, but got CUDA version: " + cuda_version)
+def get_ptx_version_from_options(options, arch: int):
+    ptx_version = options.ptx_version
+    if ptx_version is None:
+        cuda_version = get_ptxas(arch).version
+        ptx_version = ptx_get_version(cuda_version)
+    return ptx_version
+@functools.lru_cache()
+def get_features(options, arch: int):
+    ptx_version = get_ptx_version_from_options(options, arch)
+    # PTX 8.6 is the max version supported by llvm c1188642.
+    #
+    # To check if a newer PTX version is supported, increase this value
+    # and run a test.  If it's not supported, LLVM will print a warning
+    # like "+ptx8.4 is not a recognized feature for this target".
+    llvm_ptx_version = min(86, ptx_version)
+    features = f'+ptx{llvm_ptx_version}'
+    return features
+@functools.lru_cache(None)
+def file_hash(path):
+    with open(path, "rb") as f:
+        return hashlib.sha256(f.read()).hexdigest()
+def sm_arch_from_capability(capability: int):
+    # TODO: Handle non-"a" sms
+    suffix = "a" if capability >= 90 else ""
+    return f"sm_{capability}{suffix}"
+@dataclass(frozen=True)
+class CUDAOptions:
+    num_warps: int = 4
+    num_ctas: int = 1
+    num_stages: int = 3
+    warp_size: int = 32
+    # maxnreg corresponds to the ptx parameter .maxnreg, which controls the
+    # maximum number of 32-bit registers used by one thread.
+    maxnreg: Optional[int] = None
+    ptx_version: int = None
+    ptx_options: Optional[str] = knobs.nvidia.ptxas_options
+    ir_override: Optional[str] = None  # filename of a user-defined IR (*.{ttir|ttgir|llir|ptx})
+    enable_fp_fusion: bool = True
+    enable_reflect_ftz: bool = True  # ftz in libdevice
+    launch_cooperative_grid: bool = False
+    launch_pdl: bool = False
+    supported_fp8_dtypes: Tuple[str] = ("fp8e5", "fp8e4b15")
+    deprecated_fp8_dot_operand_dtypes: Tuple[str] = ()
+    default_dot_input_precision: str = "tf32"
+    allowed_dot_input_precisions: Tuple[str] = ("tf32", "tf32x3", "ieee", 'bf16x3', 'bf16x6')
+    max_num_imprecise_acc_default: bool = None
+    extern_libs: dict = None
+    debug: bool = False
+    backend_name: str = 'cuda'
+    sanitize_overflow: bool = True
+    arch: str = None
+    instrumentation_mode: str = ""
+    def __post_init__(self):
+        default_libdir = Path(__file__).parent / 'lib'
+        extern_libs = {} if self.extern_libs is None else dict(self.extern_libs)
+        if not extern_libs.get('libdevice', None):
+            extern_libs['libdevice'] = knobs.nvidia.libdevice_path or str(default_libdir / 'libdevice.10.bc')
+        object.__setattr__(self, 'extern_libs', tuple(extern_libs.items()))
+        assert self.num_warps > 0 and (self.num_warps & (self.num_warps - 1)) == 0, \
+               "num_warps must be a power of 2"
+    def hash(self):
+        hash_dict = dict(self.__dict__)
+        hash_dict["extern_libs"] = tuple((k, file_hash(v)) for k, v in sorted(hash_dict["extern_libs"]))
+        key = "_".join([f"{name}-{val}" for name, val in sorted(hash_dict.items())])
+        return hashlib.sha256(key.encode("utf-8")).hexdigest()
+class CUDABackend(BaseBackend):
+    instrumentation = None
+    @staticmethod
+    def supports_target(target: GPUTarget):
+        return target.backend == 'cuda'
+    def _parse_arch(self, arch):
+        pattern = r"^sm(\d+)$"
+        match = re.fullmatch(pattern, arch)
+        if not match:
+            raise ValueError(f"TRITON_OVERRIDE_ARCH must have the form {pattern}")
+        return int(match.group(1))
+    def get_target_name(self, options) -> str:
+        capability = self._parse_arch(options.arch)
+        return f"cuda:{capability}"
+    def __init__(self, target: GPUTarget) -> None:
+        super().__init__(target)
+        self.binary_ext = "cubin"
+    def parse_options(self, opts) -> Any:
+        # Enable debug mode for ConSan, so device-side assertions are not optimized out
+        if "instrumentation_mode" in opts and opts["instrumentation_mode"] == "consan":
+            opts["debug"] = True
+        args = {'arch': knobs.runtime.override_arch or f"sm{self.target.arch}"}
+        args.update({k: opts[k] for k in CUDAOptions.__dataclass_fields__.keys() if k in opts if opts[k] is not None})
+        capability = int(self._parse_arch(args["arch"]))
+        if args.get("num_ctas", 1) > 1 and capability < 90:
+            raise ValueError((f"num_ctas > 1 requires NVIDIA SM90+ (Hopper). "
+                              f"Current target is sm_{capability}. This configuration will fail. "
+                              f"Please set num_ctas=1 or target an SM90+ GPU."))
+        if "supported_fp8_dtypes" not in args:
+            supported_fp8_dtypes = set(CUDAOptions.supported_fp8_dtypes)
+            if capability >= 89:
+                supported_fp8_dtypes.add("fp8e4nv")
+            args["supported_fp8_dtypes"] = tuple(sorted(supported_fp8_dtypes))
+        if "deprecated_fp8_dot_operand_dtypes" not in args:
+            if capability >= 90:
+                args["deprecated_fp8_dot_operand_dtypes"] = ("fp8e4b15", )
+        if "enable_fp_fusion" not in args:
+            args["enable_fp_fusion"] = knobs.language.default_fp_fusion
+        args["max_num_imprecise_acc_default"] = 2**30 if capability == 90 else 0
+        return CUDAOptions(**args)
+    def pack_metadata(self, metadata):
+        return (
+            metadata.num_warps,
+            metadata.num_ctas,
+            metadata.shared,
+        )
+    def get_codegen_implementation(self, options):
+        import triton.language.extra.cuda as cuda
+        capability = int(self._parse_arch(options.arch))
+        codegen_fns = {
+            "convert_custom_types":
+            cuda.convert_custom_float8_sm80 if capability >= 80 else cuda.convert_custom_float8_sm70, "min_dot_size":
+            min_dot_size(self.target)
+        }
+        return codegen_fns
+    def get_module_map(self) -> Dict[str, ModuleType]:
+        from triton.language.extra.cuda import libdevice
+        return {"triton.language.extra.libdevice": libdevice}
+    def load_dialects(self, ctx):
+        nvidia.load_dialects(ctx)
+        if CUDABackend.instrumentation:
+            CUDABackend.instrumentation.load_dialects(ctx)
+    @staticmethod
+    def make_ttir(mod, metadata, opt, capability):
+        pm = ir.pass_manager(mod.context)
+        pm.enable_debug()
+        passes.common.add_inliner(pm)
+        passes.ttir.add_rewrite_tensor_pointer(pm)
+        if capability // 10 < 9:
+            passes.ttir.add_rewrite_tensor_descriptor_to_pointer(pm)
+        passes.common.add_canonicalizer(pm)
+        passes.ttir.add_combine(pm)
+        passes.ttir.add_reorder_broadcast(pm)
+        passes.common.add_cse(pm)
+        passes.common.add_symbol_dce(pm)
+        passes.ttir.add_loop_unroll(pm)
+        pm.run(mod, 'make_ttir')
+        return mod
+    @staticmethod
+    def make_ttgir(mod, metadata, opt, capability):
+        # Set maxnreg on all kernels, if it was provided.
+        if opt.maxnreg is not None:
+            mod.set_attr("ttg.maxnreg", ir.builder(mod.context).get_int32_attr(opt.maxnreg))
+        pm = ir.pass_manager(mod.context)
+        dump_enabled = pm.enable_debug()
+        emuTF32 = (capability // 10 >= 8)
+        passes.ttir.add_convert_to_ttgpuir(pm, f"cuda:{capability}", opt.num_warps, 32, opt.num_ctas)
+        # optimize TTGIR
+        passes.ttgpuir.add_coalesce(pm)
+        passes.ttgpuir.add_f32_dot_tc(pm, emuTF32)
+        # TODO(Qingyi): Move PlanCTAPass to the front of CoalescePass
+        nvidia.passes.ttnvgpuir.add_plan_cta(pm)
+        passes.ttgpuir.add_remove_layout_conversions(pm)
+        passes.ttgpuir.add_optimize_thread_locality(pm)
+        passes.ttgpuir.add_accelerate_matmul(pm)
+        passes.ttgpuir.add_remove_layout_conversions(pm)
+        passes.ttgpuir.add_optimize_dot_operands(pm, capability >= 80)
+        nvidia.passes.ttnvgpuir.add_optimize_descriptor_encoding(pm)
+        passes.ttir.add_loop_aware_cse(pm)
+        if capability // 10 in [8, 9]:
+            passes.ttgpuir.add_fuse_nested_loops(pm)
+            passes.common.add_canonicalizer(pm)
+            passes.ttir.add_triton_licm(pm)
+            passes.common.add_canonicalizer(pm)
+            passes.ttgpuir.add_combine_tensor_select_and_if(pm)
+            nvidia.passes.hopper.add_hopper_warpspec(pm, opt.num_stages, dump_enabled)
+            passes.ttgpuir.add_assign_latencies(pm, opt.num_stages)
+            passes.ttgpuir.add_schedule_loops(pm)
+            passes.ttgpuir.add_pipeline(pm, opt.num_stages, dump_enabled)
+        elif capability // 10 >= 10:
+            passes.ttgpuir.add_fuse_nested_loops(pm)
+            passes.common.add_canonicalizer(pm)
+            passes.ttir.add_triton_licm(pm)
+            passes.ttgpuir.add_optimize_accumulator_init(pm)
+            passes.ttgpuir.add_hoist_tmem_alloc(pm, False)
+            nvidia.passes.ttnvgpuir.add_promote_lhs_to_tmem(pm)
+            passes.ttgpuir.add_assign_latencies(pm, opt.num_stages)
+            passes.ttgpuir.add_schedule_loops(pm)
+            passes.ttgpuir.add_warp_specialize(pm, opt.num_stages)
+            passes.ttgpuir.add_pipeline(pm, opt.num_stages, dump_enabled)
+            passes.ttgpuir.add_optimize_partition_warps(pm)
+            passes.ttgpuir.add_combine_tensor_select_and_if(pm)
+            # hoist again and allow hoisting out of if statements
+            passes.ttgpuir.add_hoist_tmem_alloc(pm, True)
+            nvidia.passes.ttnvgpuir.add_remove_tmem_tokens(pm)
+        else:
+            passes.ttir.add_triton_licm(pm)
+        passes.common.add_canonicalizer(pm)
+        passes.ttir.add_loop_aware_cse(pm)
+        passes.ttgpuir.add_prefetch(pm)
+        passes.ttgpuir.add_optimize_dot_operands(pm, capability >= 80)
+        passes.ttgpuir.add_coalesce_async_copy(pm)
+        nvidia.passes.ttnvgpuir.add_optimize_tmem_layouts(pm)
+        if capability // 10 >= 9:
+            nvidia.passes.ttnvgpuir.add_tma_lowering(pm)
+        passes.ttgpuir.add_remove_layout_conversions(pm)
+        nvidia.passes.ttnvgpuir.add_interleave_tmem(pm)
+        passes.ttgpuir.add_reduce_data_duplication(pm)
+        passes.ttgpuir.add_reorder_instructions(pm)
+        passes.ttir.add_loop_aware_cse(pm)
+        passes.common.add_symbol_dce(pm)
+        nvidia.passes.ttnvgpuir.add_fence_insertion(pm, capability)
+        nvidia.passes.ttnvgpuir.add_lower_mma(pm)
+        passes.common.add_sccp(pm)
+        passes.common.add_cse(pm)
+        passes.common.add_canonicalizer(pm)
+        pm.run(mod, 'make_ttgir')
+        metadata["tensordesc_meta"] = mod.get_tensordesc_metadata()
+        return mod
+    def gluon_to_ttgir(self, src, metadata, options, capability):
+        mod = src
+        pm = ir.pass_manager(mod.context)
+        pm.enable_debug()
+        passes.gluon.add_inliner(pm)
+        passes.gluon.add_infer_coalesced_encodings(pm)
+        passes.gluon.add_resolve_auto_encodings(pm)
+        nvidia.passes.ttnvgpuir.add_tma_lowering(pm)
+        passes.gluon.add_canonicalizer(pm)
+        passes.common.add_sccp(pm)
+        passes.ttir.add_loop_aware_cse(pm)
+        passes.gluon.add_canonicalizer(pm)
+        passes.ttgpuir.add_combine_tensor_select_and_if(pm)
+        pm.run(mod, 'gluon_to_ttgir')
+        metadata["tensordesc_meta"] = mod.get_tensordesc_metadata()
+        return mod
+    def make_llir(self, src, metadata, options, capability):
+        ptx_version = get_ptx_version_from_options(options, self.target.arch)
+        mod = src
+        # TritonGPU -> LLVM-IR (MLIR)
+        pm = ir.pass_manager(mod.context)
+        pm.enable_debug()
+        passes.ttgpuir.add_combine_tensor_select_and_if(pm)
+        passes.ttgpuir.add_allocate_warp_groups(pm)
+        passes.convert.add_scf_to_cf(pm)
+        passes.gluon.add_inliner(pm)
+        nvidia.passes.ttgpuir.add_allocate_shared_memory_nv(pm, capability, ptx_version)
+        nvidia.passes.ttnvgpuir.add_allocate_tensor_memory(pm)
+        nvidia.passes.ttnvgpuir.add_check_matmul_two_cta(pm)
+        if knobs.compilation.instrumentation_mode == "consan":
+            # Call ConcurrencySanitizerPass here, before allocating global scratch memory but after allocating tensor and shared
+            passes.ttgpuir.add_concurrency_sanitizer(pm)
+        passes.ttgpuir.add_allocate_global_scratch_memory(pm)
+        nvidia.passes.ttnvgpuir.add_proxy_fence_insertion(pm, capability)
+        # instrumentation point here so we can override IRs above (e.g., ttir and ttgir)
+        if CUDABackend.instrumentation:
+            CUDABackend.instrumentation.patch("ttgpuir_to_llvmir", pm, mod.context)
+        nvidia.passes.ttgpuir.add_to_llvmir(pm, capability, ptx_version)
+        passes.common.add_canonicalizer(pm)
+        passes.common.add_cse(pm)
+        nvidia.passes.ttnvgpuir.add_nvgpu_to_llvm(pm)
+        nvidia.passes.ttnvgpuir.add_warp_specialize_to_llvm(pm)
+        passes.common.add_canonicalizer(pm)
+        passes.common.add_cse(pm)
+        passes.common.add_symbol_dce(pm)
+        passes.convert.add_nvvm_to_llvm(pm)
+        if not knobs.compilation.disable_line_info and not knobs.compilation.dump_ir_extract_di_local_variables:
+            passes.llvmir.add_di_scope(pm)
+        if CUDABackend.instrumentation:
+            CUDABackend.instrumentation.patch("llvmir_to_llvm", pm, mod.context)
+        pm.run(mod, 'make_llir')
+        if knobs.compilation.dump_ir_extract_di_local_variables:
+            # comments below on why separate it
+            if not knobs.compilation.disable_line_info:
+                pm = ir.pass_manager(mod.context)
+                pm.enable_debug()
+                passes.llvmir.add_di_scope(pm)
+                pm.run(mod, 'make_llir.disable_line_info')
+            # insert dbg intrinsic with several DI Attribute including source
+            # var name and type info note: unknown reason for now, but this
+            # pass and add_di_scope has to be run separately, otherwise if we
+            # put them into previous pipline, it trigger a segmentfault without
+            # any error message; could be due to a bug in mlir or pybind11
+            pm = ir.pass_manager(mod.context)
+            pm.enable_debug()
+            passes.llvmir.add_di_local_variable(pm)
+            pm.run(mod, 'make_llir.dump_ir_extract_di_local_variables')
+        # LLVM-IR (MLIR) -> LLVM-IR (LLVM)
+        llvm.init_targets()
+        context = llvm.context()
+        if knobs.compilation.enable_asan:
+            raise RuntimeError(
+                "Address Sanitizer Error: Address sanitizer is currently only supported on the AMD backend")
+        llvm_mod = llvm.to_module(mod, context)
+        proc = sm_arch_from_capability(capability)
+        features = get_features(options, self.target.arch)
+        triple = 'nvptx64-nvidia-cuda'
+        nvidia.set_short_ptr()
+        llvm.attach_datalayout(llvm_mod, triple, proc, features)
+        if options.enable_reflect_ftz:
+            nvidia.set_nvvm_reflect_ftz(llvm_mod)
+        if options.extern_libs and nvidia.has_extern_deps(llvm_mod):
+            paths = [path for (name, path) in options.extern_libs]
+            llvm.link_extern_libs(llvm_mod, paths)
+        llvm.optimize_module(llvm_mod, llvm.OPTIMIZE_O3)
+        # Get some metadata
+        # warp-specialization mutates num_warps
+        total_num_warps = src.get_int_attr("ttg.total-num-warps")
+        if total_num_warps is not None:
+            metadata["num_warps"] = total_num_warps
+        metadata["shared"] = src.get_int_attr("ttg.shared")
+        metadata["tmem_size"] = src.get_int_attr("ttg.tensor_memory_size")
+        metadata["global_scratch_size"] = src.get_int_attr("ttg.global_scratch_memory_size")
+        metadata["global_scratch_align"] = src.get_int_attr("ttg.global_scratch_memory_alignment")
+        metadata["profile_scratch_size"] = src.get_int_attr("ttg.profile_scratch_memory_size") or 0
+        metadata["profile_scratch_align"] = src.get_int_attr("ttg.profile_scratch_memory_alignment") or 1
+        ret = str(llvm_mod)
+        del llvm_mod
+        del context
+        return ret
+    def make_ptx(self, src, metadata, opt, capability):
+        ptx_version = get_ptx_version_from_options(opt, self.target.arch)
+        triple = 'nvptx64-nvidia-cuda'
+        proc = sm_arch_from_capability(capability)
+        features = get_features(opt, self.target.arch)
+        flags = ["nvptx-mad-wide-opt"]
+        ret = llvm.translate_to_asm(src, triple, proc, features, flags, opt.enable_fp_fusion, False)
+        # Find kernel names (there should only be one)
+        names = re.findall(r".visible .entry ([a-zA-Z_][a-zA-Z0-9_]*)", ret)
+        assert len(names) == 1
+        metadata["name"] = names[0]
+        # post-process
+        ptx_version = f'{ptx_version//10}.{ptx_version%10}'
+        ret = re.sub(r'\.version \d+\.\d+', f'.version {ptx_version}', ret, flags=re.MULTILINE)
+        ret = re.sub(r'\.target sm_\d+', f'.target sm_{capability}', ret, flags=re.MULTILINE)
+        if not knobs.compilation.dump_ir_extract_di_local_variables:
+            # Remove the debug flag that prevents ptxas from optimizing the code
+            # Note: if this flag is removed, the source var name and type info will be lost when ptx was compiled into cubin
+            #           and we may not be able to see them in cuda-gdb
+            ret = re.sub(r",\s*debug|debug,\s*", "", ret)
+        if knobs.nvidia.dump_nvptx:
+            print("// -----// NVPTX Dump //----- //")
+            print(ret)
+        return ret
+    def make_cubin(self, src, metadata, opt, capability):
+        ptxas = get_ptxas(self.target.arch).path
+        with tempfile.NamedTemporaryFile(delete=False, mode='w', suffix='.ptx') as fsrc, \
+            tempfile.NamedTemporaryFile(delete=False, mode='r', suffix='.log') as flog:
+            fsrc.write(src)
+            fsrc.flush()
+            fbin = fsrc.name + '.o'
+            debug_info = []
+            if knobs.compilation.disable_line_info:
+                # This option is ignored if used without -lineinfo
+                debug_info += ["-lineinfo", "-suppress-debug-info"]
+            elif knobs.nvidia.disable_ptxas_opt:
+                # Synthesize complete debug info
+                debug_info += ["-g"]
+            else:
+                # Only emit line info
+                debug_info += ["-lineinfo"]
+            fmad = [] if opt.enable_fp_fusion else ["--fmad=false"]
+            arch = sm_arch_from_capability(capability)
+            # Disable ptxas optimizations if requested
+            disable_opt = ['--opt-level', '0'] if knobs.nvidia.disable_ptxas_opt else []
+            # Accept more ptxas options if provided
+            ptx_extra_options = opt.ptx_options.split(" ") if opt.ptx_options else []
+            ptxas_cmd = [
+                ptxas, *debug_info, *fmad, '-v', *disable_opt, *ptx_extra_options, f'--gpu-name={arch}', fsrc.name,
+                '-o', fbin
+            ]
+            try:
+                subprocess.run(ptxas_cmd, check=True, close_fds=False, stderr=flog)
+                if knobs.nvidia.dump_ptxas_log:
+                    with open(flog.name) as log_file:
+                        print(log_file.read())
+                if os.path.exists(fsrc.name):
+                    os.remove(fsrc.name)
+                if os.path.exists(flog.name):
+                    os.remove(flog.name)
+            except subprocess.CalledProcessError as e:
+                with open(flog.name) as log_file:
+                    log = log_file.read()
+                if os.path.exists(flog.name):
+                    os.remove(flog.name)
+                if e.returncode == 255:
+                    error = 'Internal Triton PTX codegen error'
+                elif e.returncode == 128 + signal.SIGSEGV:
+                    error = '`ptxas` raised SIGSEGV'
+                else:
+                    error = f'`ptxas` failed with error code {e.returncode}'
+                error = (f"{error}\n"
+                         f"`ptxas` stderr:\n{log}\n"
+                         f'Repro command: {" ".join(ptxas_cmd)}\n')
+                print(f"""
+================================================================
+{error}
+{src}
+================================================================
+please share the reproducer above with Triton project.
+""")
+                raise PTXASError(error)
+            with open(fbin, 'rb') as f:
+                cubin = f.read()
+            if os.path.exists(fbin):
+                os.remove(fbin)
+        return cubin
+    def add_stages(self, stages, options, language):
+        capability = self._parse_arch(options.arch)
+        if language == Language.TRITON:
+            stages["ttir"] = lambda src, metadata: self.make_ttir(src, metadata, options, capability)
+            stages["ttgir"] = lambda src, metadata: self.make_ttgir(src, metadata, options, capability)
+        elif language == Language.GLUON:
+            stages["ttgir"] = lambda src, metadata: self.gluon_to_ttgir(src, metadata, options, capability)
+        stages["llir"] = lambda src, metadata: self.make_llir(src, metadata, options, capability)
+        stages["ptx"] = lambda src, metadata: self.make_ptx(src, metadata, options, self.target.arch)
+        stages["cubin"] = lambda src, metadata: self.make_cubin(src, metadata, options, self.target.arch)
+        if knobs.runtime.add_stages_inspection_hook is not None:
+            knobs.runtime.add_stages_inspection_hook(self, stages, options, language, capability)
+    @functools.lru_cache()
+    def hash(self):
+        version = get_ptxas_version(self.target.arch)
+        return f'{version}-{self.target.arch}'

Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/driver.c ADDED Viewed

	@@ -0,0 +1,518 @@

+#include "cuda.h"
+#include <dlfcn.h>
+#include <stdbool.h>
+#include <stdio.h>
+#include <stdlib.h>
+#define PY_SSIZE_T_CLEAN
+#include <Python.h>
+typedef struct {
+  PyObject_HEAD;
+  _Alignas(128) CUtensorMap tensorMap;
+} PyCUtensorMapObject;
+// Raises a Python exception and returns false if code is not CUDA_SUCCESS.
+static bool gpuAssert(CUresult code, const char *file, int line) {
+  if (code == CUDA_SUCCESS)
+    return true;
+  const char *prefix = "Triton Error [CUDA]: ";
+  const char *str;
+  cuGetErrorString(code, &str);
+  char err[1024] = {0};
+  strcat(err, prefix);
+  strcat(err, str);
+  PyGILState_STATE gil_state;
+  gil_state = PyGILState_Ensure();
+  PyErr_SetString(PyExc_RuntimeError, err);
+  PyGILState_Release(gil_state);
+  return false;
+}
+// To be used only *outside* a Py_{BEGIN,END}_ALLOW_THREADS block.
+#define CUDA_CHECK_AND_RETURN_NULL(ans)                                        \
+  do {                                                                         \
+    if (!gpuAssert((ans), __FILE__, __LINE__))                                 \
+      goto cleanup;                                                            \
+  } while (0)
+// To be used inside a Py_{BEGIN,END}_ALLOW_THREADS block.
+#define CUDA_CHECK_AND_RETURN_NULL_ALLOW_THREADS(ans)                          \
+  do {                                                                         \
+    if (!gpuAssert((ans), __FILE__, __LINE__)) {                               \
+      PyEval_RestoreThread(_save);                                             \
+      return NULL;                                                             \
+    }                                                                          \
+  } while (0)
+// Used to check if functions exist in old CUDA driver versions.
+#define INITIALIZE_FUNCTION_POINTER_IF_NULL(funcPointer, initializerFunction)  \
+  do {                                                                         \
+    if ((funcPointer) == NULL) {                                               \
+      (funcPointer) = (initializerFunction)();                                 \
+      if ((funcPointer) == NULL) {                                             \
+        goto cleanup;                                                          \
+      }                                                                        \
+    }                                                                          \
+  } while (0)
+static PyObject *getDeviceProperties(PyObject *self, PyObject *args) {
+  int device_id;
+  if (!PyArg_ParseTuple(args, "i", &device_id))
+    return NULL;
+  // Get device handle
+  CUdevice device;
+  cuDeviceGet(&device, device_id);
+  // create a struct to hold device properties
+  int max_shared_mem;
+  int max_num_regs;
+  int multiprocessor_count;
+  int warp_size;
+  int sm_clock_rate;
+  int mem_clock_rate;
+  int mem_bus_width;
+  CUDA_CHECK_AND_RETURN_NULL(cuDeviceGetAttribute(
+      &max_shared_mem, CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK_OPTIN,
+      device));
+  CUDA_CHECK_AND_RETURN_NULL(cuDeviceGetAttribute(
+      &max_num_regs, CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK, device));
+  CUDA_CHECK_AND_RETURN_NULL(cuDeviceGetAttribute(
+      &multiprocessor_count, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, device));
+  CUDA_CHECK_AND_RETURN_NULL(
+      cuDeviceGetAttribute(&warp_size, CU_DEVICE_ATTRIBUTE_WARP_SIZE, device));
+  CUDA_CHECK_AND_RETURN_NULL(cuDeviceGetAttribute(
+      &sm_clock_rate, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, device));
+  CUDA_CHECK_AND_RETURN_NULL(cuDeviceGetAttribute(
+      &mem_clock_rate, CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE, device));
+  CUDA_CHECK_AND_RETURN_NULL(cuDeviceGetAttribute(
+      &mem_bus_width, CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH, device));
+  return Py_BuildValue("{s:i, s:i, s:i, s:i, s:i, s:i, s:i}", "max_shared_mem",
+                       max_shared_mem, "max_num_regs", max_num_regs,
+                       "multiprocessor_count", multiprocessor_count, "warpSize",
+                       warp_size, "sm_clock_rate", sm_clock_rate,
+                       "mem_clock_rate", mem_clock_rate, "mem_bus_width",
+                       mem_bus_width);
+cleanup:
+  return NULL;
+}
+static PyObject *loadBinary(PyObject *self, PyObject *args) {
+  const char *name;
+  const char *data;
+  Py_ssize_t data_size;
+  int shared;
+  int device;
+  if (!PyArg_ParseTuple(args, "ss#ii", &name, &data, &data_size, &shared,
+                        &device)) {
+    return NULL;
+  }
+  CUfunction fun;
+  CUmodule mod;
+  int32_t n_regs = 0;
+  int32_t n_spills = 0;
+  int32_t n_max_threads = 0;
+  // create driver handles
+  CUcontext pctx = 0;
+  Py_BEGIN_ALLOW_THREADS;
+  CUDA_CHECK_AND_RETURN_NULL_ALLOW_THREADS(cuCtxGetCurrent(&pctx));
+  if (!pctx) {
+    CUDA_CHECK_AND_RETURN_NULL_ALLOW_THREADS(
+        cuDevicePrimaryCtxRetain(&pctx, device));
+    CUDA_CHECK_AND_RETURN_NULL_ALLOW_THREADS(cuCtxSetCurrent(pctx));
+  }
+  CUDA_CHECK_AND_RETURN_NULL_ALLOW_THREADS(cuModuleLoadData(&mod, data));
+  CUDA_CHECK_AND_RETURN_NULL_ALLOW_THREADS(
+      cuModuleGetFunction(&fun, mod, name));
+  // get allocated registers and spilled registers from the function
+  CUDA_CHECK_AND_RETURN_NULL_ALLOW_THREADS(
+      cuFuncGetAttribute(&n_regs, CU_FUNC_ATTRIBUTE_NUM_REGS, fun));
+  CUDA_CHECK_AND_RETURN_NULL_ALLOW_THREADS(
+      cuFuncGetAttribute(&n_spills, CU_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES, fun));
+  n_spills /= 4;
+  CUDA_CHECK_AND_RETURN_NULL_ALLOW_THREADS(cuFuncGetAttribute(
+      &n_max_threads, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, fun));
+  // set dynamic shared memory if necessary
+  int shared_optin;
+  CUDA_CHECK_AND_RETURN_NULL_ALLOW_THREADS(cuDeviceGetAttribute(
+      &shared_optin, CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK_OPTIN,
+      device));
+  if (shared > 49152 && shared_optin > 49152) {
+    CUDA_CHECK_AND_RETURN_NULL_ALLOW_THREADS(
+        cuFuncSetCacheConfig(fun, CU_FUNC_CACHE_PREFER_SHARED));
+    int shared_total, shared_static;
+    CUDA_CHECK_AND_RETURN_NULL_ALLOW_THREADS(cuDeviceGetAttribute(
+        &shared_total, CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_MULTIPROCESSOR,
+        device));
+    CUDA_CHECK_AND_RETURN_NULL_ALLOW_THREADS(cuFuncGetAttribute(
+        &shared_static, CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES, fun));
+    CUDA_CHECK_AND_RETURN_NULL_ALLOW_THREADS(
+        cuFuncSetAttribute(fun, CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES,
+                           shared_optin - shared_static));
+  }
+  Py_END_ALLOW_THREADS;
+  if (PyErr_Occurred()) {
+    return NULL;
+  }
+  return Py_BuildValue("(KKiii)", (uint64_t)mod, (uint64_t)fun, n_regs,
+                       n_spills, n_max_threads);
+}
+typedef CUresult (*cuOccupancyMaxActiveClusters_t)(
+    int *numClusters, CUfunction func, const CUlaunchConfig *config);
+typedef CUresult (*cuTensorMapEncodeTiled_t)(
+    CUtensorMap *tensorMap, CUtensorMapDataType tensorDataType,
+    cuuint32_t tensorRank, void *globalAddress, const cuuint64_t *globalDim,
+    const cuuint64_t *globalStrides, const cuuint32_t *boxDim,
+    const cuuint32_t *elementStrides, CUtensorMapInterleave interleave,
+    CUtensorMapSwizzle swizzle, CUtensorMapL2promotion l2Promotion,
+    CUtensorMapFloatOOBfill oobFill);
+#define defineGetFunctionHandle(name, symbolName)                              \
+  static symbolName##_t name() {                                               \
+    /* Open the shared library */                                              \
+    void *libHandle = dlopen("libcuda.so.1", RTLD_LAZY);                       \
+    if (!libHandle) {                                                          \
+      PyErr_SetString(PyExc_RuntimeError, "Failed to open libcuda.so.1");      \
+      return NULL;                                                             \
+    }                                                                          \
+    /* Clear any existing error */                                             \
+    dlerror();                                                                 \
+    symbolName##_t funcHandle = (symbolName##_t)dlsym(libHandle, #symbolName); \
+    /* Check for errors */                                                     \
+    const char *err = dlerror();                                               \
+    if (err) {                                                                 \
+      PyErr_SetString(PyExc_RuntimeError,                                      \
+                      "Failed to retrieve " #symbolName " from libcuda.so.1"); \
+      dlclose(libHandle);                                                      \
+      return NULL;                                                             \
+    }                                                                          \
+    return funcHandle;                                                         \
+  }
+defineGetFunctionHandle(getCuOccupancyMaxActiveClustersHandle,
+                        cuOccupancyMaxActiveClusters);
+defineGetFunctionHandle(getCuTensorMapEncodeTiledHandle,
+                        cuTensorMapEncodeTiled);
+static PyObject *occupancyMaxActiveClusters(PyObject *self, PyObject *args) {
+  int clusterDim = -1, maxActiveClusters = -1;
+  int shared = 0;
+  CUfunction func;
+  if (!PyArg_ParseTuple(args, "Kii", &func, &shared, &clusterDim)) {
+    return NULL;
+  }
+  // Let each SM have one block
+  int maxActiveBlocks = 1;
+  Py_BEGIN_ALLOW_THREADS;
+  CUDA_CHECK_AND_RETURN_NULL_ALLOW_THREADS(cuFuncSetAttribute(
+      func, CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, shared));
+  Py_END_ALLOW_THREADS;
+  CUlaunchAttribute launchAttr[1];
+  launchAttr[0].id = CU_LAUNCH_ATTRIBUTE_CLUSTER_DIMENSION;
+  launchAttr[0].value.clusterDim.x = clusterDim;
+  launchAttr[0].value.clusterDim.y = 1;
+  launchAttr[0].value.clusterDim.z = 1;
+  CUlaunchConfig config;
+  config.gridDimX = clusterDim * maxActiveBlocks;
+  config.gridDimY = 1;
+  config.gridDimZ = 1;
+  config.blockDimX = 128;
+  config.blockDimY = 1;
+  config.blockDimZ = 1;
+  config.sharedMemBytes = shared;
+  config.hStream = 0;
+  config.numAttrs = 1;
+  config.attrs = launchAttr;
+  static cuOccupancyMaxActiveClusters_t cuOccupancyMaxActiveClusters = NULL;
+  INITIALIZE_FUNCTION_POINTER_IF_NULL(cuOccupancyMaxActiveClusters,
+                                      getCuOccupancyMaxActiveClustersHandle);
+  Py_BEGIN_ALLOW_THREADS;
+  CUDA_CHECK_AND_RETURN_NULL_ALLOW_THREADS(cuFuncSetAttribute(
+      func, CU_FUNC_ATTRIBUTE_NON_PORTABLE_CLUSTER_SIZE_ALLOWED, 1));
+  CUDA_CHECK_AND_RETURN_NULL_ALLOW_THREADS(
+      cuOccupancyMaxActiveClusters(&maxActiveClusters, func, &config));
+  Py_END_ALLOW_THREADS;
+  return PyLong_FromLong(maxActiveClusters);
+cleanup:
+  return NULL;
+}
+static PyObject *setPrintfFifoSize(PyObject *self, PyObject *args) {
+  long size;
+  if (!PyArg_ParseTuple(args, "l", &size)) {
+    return NULL;
+  }
+  if (size < 0) {
+    PyErr_SetString(PyExc_ValueError, "fifo size must be non-negative");
+    return NULL;
+  }
+  Py_BEGIN_ALLOW_THREADS;
+  // Ensure we have an active context.
+  CUcontext ctx = NULL;
+  CUDA_CHECK_AND_RETURN_NULL_ALLOW_THREADS(cuCtxGetCurrent(&ctx));
+  if (!ctx) {
+    CUDA_CHECK_AND_RETURN_NULL_ALLOW_THREADS(
+        cuDevicePrimaryCtxRetain(&ctx, /*device=*/0));
+    CUDA_CHECK_AND_RETURN_NULL_ALLOW_THREADS(cuCtxSetCurrent(ctx));
+  }
+  // We can't set the fifo size after running a kernel that calls printf.  This
+  // is true even if the set() call is a nop and the new size is the same as the
+  // old size.
+  //
+  // This is unfriendly, so check if the old size matches the new size, and skip
+  // the set() call if so.
+  size_t oldSize = 0;
+  CUDA_CHECK_AND_RETURN_NULL_ALLOW_THREADS(
+      cuCtxGetLimit(&oldSize, CU_LIMIT_PRINTF_FIFO_SIZE));
+  if (oldSize != size) {
+    CUDA_CHECK_AND_RETURN_NULL_ALLOW_THREADS(
+        cuCtxSetLimit(CU_LIMIT_PRINTF_FIFO_SIZE, size));
+  }
+  Py_END_ALLOW_THREADS;
+  Py_RETURN_NONE;
+}
+static PyObject *PyCUtensorMap_alloc(PyTypeObject *type, Py_ssize_t n_items) {
+  PyCUtensorMapObject *self = NULL;
+  void *mem = NULL;
+  size_t size = type->tp_basicsize;
+  if (posix_memalign(&mem, 128, size) != 0) {
+    PyErr_NoMemory();
+    return NULL;
+  }
+  self = (PyCUtensorMapObject *)mem;
+  PyObject_INIT(self, type);
+  return (PyObject *)self;
+}
+static void PyCUtensorMap_dealloc(PyObject *self) {
+  Py_TYPE(self)->tp_free(self);
+}
+static void PyCUtensorMap_free(void *ptr) { free(ptr); }
+// clang-format off
+static PyTypeObject PyCUtensorMapType = {
+    PyVarObject_HEAD_INIT(NULL, 0)
+    .tp_name = "triton.backends.nvidia.PyCUtensorMap",
+    .tp_basicsize = sizeof(PyCUtensorMapObject),
+    .tp_itemsize = 0,
+    .tp_flags = Py_TPFLAGS_DEFAULT,
+    .tp_doc = "<PyCUtensorMap object>",
+    .tp_new = PyType_GenericNew,
+    .tp_alloc = PyCUtensorMap_alloc,
+    .tp_dealloc = (destructor)PyCUtensorMap_dealloc,
+    .tp_free = PyCUtensorMap_free,
+};
+// clang-format on
+static PyObject *fillTMADescriptor(PyObject *self, PyObject *args) {
+  unsigned long long global_address;
+  int swizzle;
+  int elemSize;
+  int elemType;
+  PyObject *blockSize;
+  PyObject *shape;
+  PyObject *strides;
+  int padding;
+  if (!PyArg_ParseTuple(args, "KiiiOOOi", &global_address, &swizzle, &elemSize,
+                        &elemType, &blockSize, &shape, &strides, &padding)) {
+    return NULL;
+  }
+  PyCUtensorMapObject *desc = (PyCUtensorMapObject *)PyObject_CallObject(
+      (PyObject *)&PyCUtensorMapType, NULL);
+  if (!desc) {
+    return NULL;
+  }
+  PyObject *blockSizeFast = NULL;
+  PyObject *shapeFast = NULL;
+  PyObject *stridesFast = NULL;
+  uint32_t blockSizeInt[5];
+  uint64_t shapeInt[5];
+  uint64_t stridesLL[5];
+  blockSizeFast = PySequence_Fast(blockSize, "blockSize must be a sequence");
+  if (!blockSizeFast)
+    goto cleanup;
+  int rank = PySequence_Fast_GET_SIZE(blockSizeFast);
+  for (int i = 0; i < rank; ++i) {
+    PyObject *item = PySequence_Fast_GET_ITEM(blockSizeFast, i);
+    if (!PyLong_Check(item)) {
+      PyErr_SetString(PyExc_TypeError, "block size must be an int");
+      goto cleanup;
+    }
+    blockSizeInt[rank - i - 1] = PyLong_AsLongLong(item);
+  }
+  shapeFast = PySequence_Fast(shape, "shape must be a sequence");
+  if (!shapeFast)
+    goto cleanup;
+  if (rank != PySequence_Fast_GET_SIZE(shapeFast)) {
+    PyErr_SetString(PyExc_RuntimeError, "Rank mismatch");
+    goto cleanup;
+  }
+  for (int i = 0; i < rank; ++i) {
+    PyObject *item = PySequence_Fast_GET_ITEM(shapeFast, i);
+    if (!PyLong_Check(item)) {
+      PyErr_SetString(PyExc_TypeError, "shape must be an int");
+      goto cleanup;
+    }
+    shapeInt[rank - i - 1] = PyLong_AsLong(item);
+  }
+  stridesFast = PySequence_Fast(strides, "strides must be a sequence");
+  if (!stridesFast)
+    goto cleanup;
+  if (rank != PySequence_Fast_GET_SIZE(stridesFast)) {
+    PyErr_SetString(PyExc_RuntimeError, "Rank mismatch");
+    goto cleanup;
+  }
+  for (int i = 0; i + 1 < rank; ++i) {
+    PyObject *item = PySequence_Fast_GET_ITEM(stridesFast, i);
+    if (!PyLong_Check(item)) {
+      PyErr_SetString(PyExc_TypeError, "shape must be an int");
+      goto cleanup;
+    }
+    stridesLL[rank - i - 2] = elemSize * PyLong_AsLongLong(item);
+  }
+  stridesLL[rank - 1] =
+      shapeInt[rank - 1] * (rank == 1 ? elemSize : stridesLL[rank - 2]);
+  Py_DECREF(blockSizeFast);
+  blockSizeFast = NULL;
+  Py_DECREF(shapeFast);
+  shapeFast = NULL;
+  Py_DECREF(stridesFast);
+  stridesFast = NULL;
+  CUtensorMapFloatOOBfill fill =
+      (padding == 1) ? CU_TENSOR_MAP_FLOAT_OOB_FILL_NAN_REQUEST_ZERO_FMA
+                     : CU_TENSOR_MAP_FLOAT_OOB_FILL_NONE;
+  uint32_t elementStrides[5] = {1, 1, 1, 1, 1};
+  static cuTensorMapEncodeTiled_t cuTensorMapEncodeTiled = NULL;
+  INITIALIZE_FUNCTION_POINTER_IF_NULL(cuTensorMapEncodeTiled,
+                                      getCuTensorMapEncodeTiledHandle);
+  CUresult res = cuTensorMapEncodeTiled(
+      &desc->tensorMap, elemType, rank, (void *)global_address, shapeInt,
+      stridesLL, blockSizeInt, elementStrides, CU_TENSOR_MAP_INTERLEAVE_NONE,
+      swizzle, CU_TENSOR_MAP_L2_PROMOTION_L2_128B, fill);
+  if (res != CUDA_SUCCESS) {
+    const char *str;
+    cuGetErrorString(res, &str);
+    char err[4096] = {0};
+    size_t off = 0;
+    off += snprintf(
+        err + off, sizeof(err) - off,
+        "Triton Error [CUDA]: Failed to create tensor map descriptor: %s\n",
+        str ? str : "Unknown error");
+    off += snprintf(err + off, sizeof(err) - off,
+                    "elemType=%d rank=%d global_address=0x%llx elemSize=%d "
+                    "swizzle=%d padding=%d\n",
+                    elemType, rank, (unsigned long long)global_address,
+                    elemSize, swizzle, padding);
+    off += snprintf(err + off, sizeof(err) - off, "shape=[");
+    for (int i = 0; i < rank; ++i) {
+      off +=
+          snprintf(err + off, sizeof(err) - off, "%llu%s",
+                   (unsigned long long)shapeInt[i], (i + 1 < rank) ? ", " : "");
+    }
+    off += snprintf(err + off, sizeof(err) - off, "]\n");
+    off += snprintf(err + off, sizeof(err) - off, "strides=[");
+    for (int i = 0; i < rank; ++i) {
+      off += snprintf(err + off, sizeof(err) - off, "%llu%s",
+                      (unsigned long long)stridesLL[i],
+                      (i + 1 < rank) ? ", " : "");
+    }
+    off += snprintf(err + off, sizeof(err) - off, "]\n");
+    off += snprintf(err + off, sizeof(err) - off, "blockSize=[");
+    for (int i = 0; i < rank; ++i) {
+      off += snprintf(err + off, sizeof(err) - off, "%u%s",
+                      (unsigned)blockSizeInt[i], (i + 1 < rank) ? ", " : "");
+    }
+    off += snprintf(err + off, sizeof(err) - off, "] elementStrides=[");
+    for (int i = 0; i < rank; ++i) {
+      off += snprintf(err + off, sizeof(err) - off, "%u%s",
+                      (unsigned)elementStrides[i], (i + 1 < rank) ? ", " : "");
+    }
+    off += snprintf(err + off, sizeof(err) - off, "]\n");
+    PyErr_SetString(PyExc_RuntimeError, err);
+    goto cleanup;
+  }
+  return (PyObject *)desc;
+cleanup:
+  Py_XDECREF(blockSizeFast);
+  Py_XDECREF(shapeFast);
+  Py_XDECREF(stridesFast);
+  Py_XDECREF(desc);
+  return NULL;
+}
+static PyMethodDef ModuleMethods[] = {
+    {"load_binary", loadBinary, METH_VARARGS,
+     "Load provided cubin into CUDA driver"},
+    {"get_device_properties", getDeviceProperties, METH_VARARGS,
+     "Get the properties for a given device"},
+    {"cuOccupancyMaxActiveClusters", occupancyMaxActiveClusters, METH_VARARGS,
+     "Python interface for cuOccupancyMaxActiveClusters function"},
+    {"set_printf_fifo_size", setPrintfFifoSize, METH_VARARGS,
+     "Python interface for cuCtxSetLimit(CU_LIMIT_PRINTF_FIFO_SIZE, x), which "
+     "controls how many bytes can be streamed from kernels before data starts "
+     "being dropped.  This inherits all the limitations of this call; in "
+     "particular it's an error to change this value after launching any kernel "
+     "that calls printf()."},
+    {"fill_tma_descriptor", fillTMADescriptor, METH_VARARGS, "doc"},
+    {NULL, NULL, 0, NULL} // sentinel
+};
+static struct PyModuleDef ModuleDef = {PyModuleDef_HEAD_INIT, "cuda_utils",
+                                       NULL, // documentation
+                                       -1,   // size
+                                       ModuleMethods};
+PyMODINIT_FUNC PyInit_cuda_utils(void) {
+  if (PyType_Ready(&PyCUtensorMapType) < 0) {
+    return NULL;
+  }
+  PyObject *m = PyModule_Create(&ModuleDef);
+  if (m == NULL) {
+    return NULL;
+  }
+  PyModule_AddFunctions(m, ModuleMethods);
+  Py_INCREF(&PyCUtensorMapType);
+  PyModule_AddObject(m, "PyCUtensorMap", (PyObject *)&PyCUtensorMapType);
+  return m;
+}

Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/driver.py ADDED Viewed

	@@ -0,0 +1,764 @@

+import functools
+import os
+import subprocess
+import triton
+import re
+from pathlib import Path
+from triton import knobs
+from triton.runtime.build import compile_module_from_src
+from triton.runtime import _allocation
+from triton.backends.compiler import GPUTarget
+from triton.backends.driver import GPUDriver
+dirname = os.path.dirname(os.path.realpath(__file__))
+include_dirs = [os.path.join(dirname, "include")]
+libdevice_dir = os.path.join(dirname, "lib")
+libraries = ['libcuda.so.1']
+PyCUtensorMap = None
+@functools.lru_cache()
+def libcuda_dirs():
+    if env_libcuda_path := knobs.nvidia.libcuda_path:
+        return [env_libcuda_path]
+    libs = subprocess.check_output(["/sbin/ldconfig", "-p"]).decode(errors="ignore")
+    # each line looks like the following:
+    # libcuda.so.1 (libc6,x86-64) => /lib/x86_64-linux-gnu/libcuda.so.1
+    locs = [line.split()[-1] for line in libs.splitlines() if "libcuda.so.1" in line]
+    dirs = [os.path.dirname(loc) for loc in locs]
+    env_ld_library_path = os.getenv("LD_LIBRARY_PATH")
+    if env_ld_library_path and not dirs:
+        dirs = [dir for dir in env_ld_library_path.split(":") if os.path.exists(os.path.join(dir, "libcuda.so.1"))]
+    msg = 'libcuda.so cannot found!\n'
+    if locs:
+        msg += 'Possible files are located at %s.' % str(locs)
+        msg += 'Please create a symlink of libcuda.so to any of the files.'
+    else:
+        msg += 'Please make sure GPU is set up and then run "/sbin/ldconfig"'
+        msg += ' (requires sudo) to refresh the linker cache.'
+    assert any(os.path.exists(os.path.join(path, 'libcuda.so.1')) for path in dirs), msg
+    return dirs
+@functools.lru_cache()
+def library_dirs():
+    return [libdevice_dir, *libcuda_dirs()]
+# ------------------------
+# Utils
+# ------------------------
+class CudaUtils(object):
+    def __new__(cls):
+        if not hasattr(cls, "instance"):
+            cls.instance = super(CudaUtils, cls).__new__(cls)
+        return cls.instance
+    def __init__(self):
+        mod = compile_module_from_src(
+            src=Path(os.path.join(dirname, "driver.c")).read_text(),
+            name="cuda_utils",
+            library_dirs=library_dirs(),
+            include_dirs=include_dirs,
+            libraries=libraries,
+        )
+        global PyCUtensorMap
+        PyCUtensorMap = mod.PyCUtensorMap
+        self.load_binary = mod.load_binary
+        self.get_device_properties = mod.get_device_properties
+        self.cuOccupancyMaxActiveClusters = mod.cuOccupancyMaxActiveClusters
+        self.set_printf_fifo_size = mod.set_printf_fifo_size
+        self.fill_tma_descriptor = mod.fill_tma_descriptor
+# ------------------------
+# Launcher
+# ------------------------
+def ty_to_cpp(ty):
+    if ty[0] == '*':
+        return "CUdeviceptr"
+    if ty.startswith("tensordesc"):
+        return "CUtensorMap"
+    return {
+        "i1": "int8_t",
+        "i8": "int8_t",
+        "i16": "int16_t",
+        "i32": "int32_t",
+        "i64": "int64_t",
+        "u1": "uint8_t",
+        "u8": "uint8_t",
+        "u16": "uint16_t",
+        "u32": "uint32_t",
+        "u64": "uint64_t",
+        "fp16": "double",
+        "bf16": "double",
+        "fp32": "double",
+        "f32": "double",
+        "fp64": "double",
+        "nvTmaDesc": "CUtensorMap",
+    }[ty]
+FLOAT_STORAGE_TYPE = {
+    "fp16": "uint16_t",
+    "bf16": "uint16_t",
+    "fp32": "uint32_t",
+    "f32": "uint32_t",
+    "fp64": "uint64_t",
+}
+FLOAT_PACK_FUNCTION = {
+    "fp16": "pack_fp16",
+    "bf16": "pack_bf16",
+    "fp32": "pack_fp32",
+    "f32": "pack_fp32",
+    "fp64": "pack_fp64",
+}
+_BASE_ARGS_FORMAT = "iiiKKppOOOOOO"
+_BASE_ARGS_FORMAT_LEN = len(_BASE_ARGS_FORMAT)
+def make_launcher(constants, signature, tensordesc_meta):
+    def _expand_signature(signature):
+        output = []
+        tensordesc_idx = 0
+        # Expand tensor descriptor arguments into either nvTmaDesc, shape and
+        # strides, or base pointer, shape and strides depending on whether the
+        # kernel was lowered to use the nvTmaDesc or not.
+        for sig in signature:
+            if isinstance(sig, str) and sig.startswith("tensordesc"):
+                meta = tensordesc_meta[tensordesc_idx] if tensordesc_meta else None
+                tensordesc_idx += 1
+                match = re.match("tensordesc<([^[>]*)\\[([^]]*)\\]", sig)
+                dtype = match.group(1)
+                shape = match.group(2)
+                ndim = shape.count(",") + 1
+                if meta is None:
+                    output.append("*" + dtype)
+                    # Currently the host side tensor descriptors get passed in as a
+                    # tensor desc, shape, and strides. We have no way to use these
+                    # shape and strides when processing tensor descriptors which is
+                    # why we provide our own decomposition above. Sadly this means
+                    # we have to pass the shape and strides twice.
+                    for _ in range(2 * ndim):
+                        output.append("i64")
+                    output.append("i1")
+                else:
+                    output.append("nvTmaDesc")
+                for _ in range(ndim):
+                    output.append("i32")
+                for _ in range(ndim):
+                    output.append("i64")
+            else:
+                output.append(sig)
+        assert not tensordesc_meta or tensordesc_idx == len(tensordesc_meta)
+        return output
+    def _flatten_signature(sig, output):
+        # Flatten tuples
+        if isinstance(sig, tuple):
+            for x in sig:
+                _flatten_signature(x, output)
+        else:
+            output.append(sig)
+    def _extracted_type(ty):
+        if isinstance(ty, tuple):
+            val = ','.join(map(_extracted_type, ty))
+            return f"[{val}]"
+        if ty[0] == '*':
+            return "PyObject*"
+        if ty in ("constexpr", "nvTmaDesc"):
+            return "PyObject*"
+        return ty_to_cpp(ty)
+    def format_of(ty):
+        if isinstance(ty, tuple):
+            val = ''.join(map(format_of, ty))
+            return f"({val})"
+        if ty[0] == '*':
+            return "O"
+        if ty in ("constexpr", "nvTmaDesc"):
+            return "O"
+        if ty.startswith("tensordesc"):
+            return "O"
+        return {
+            "double": "d",
+            "long": "l",
+            "int8_t": "b",
+            "int16_t": "h",
+            "int32_t": "i",
+            "int64_t": "L",
+            "uint8_t": "B",
+            "uint16_t": "H",
+            "uint32_t": "I",
+            "uint64_t": "K",
+        }[ty_to_cpp(ty)]
+    expand_signature = _expand_signature(signature.values())
+    signature = {i: s for i, s in enumerate(expand_signature)}
+    args_format = ''.join([format_of(ty) for ty in signature.values()])
+    format = _BASE_ARGS_FORMAT + args_format
+    flat_signature = []
+    for sig in signature.values():
+        _flatten_signature(sig, flat_signature)
+    signature = {i: s for i, s in enumerate(flat_signature)}
+    args_list = ', ' + ', '.join(f"&_arg{i}" for i, ty in signature.items()) if len(signature) > 0 else ''
+    # Record the end of regular arguments;
+    # subsequent arguments are architecture-specific descriptors, such as tensor descriptors for CUDA.
+    arg_decl_list = []
+    for i, ty in signature.items():
+        if ty == "constexpr":
+            continue
+        if ty in FLOAT_STORAGE_TYPE:
+            arg_decl_list.append(f"{FLOAT_STORAGE_TYPE[ty]} arg{i}")
+        else:
+            arg_decl_list.append(f"{ty_to_cpp(ty)} arg{i}")
+    arg_decls = ', '.join(arg_decl_list)
+    internal_args_list = []
+    for i, ty in signature.items():
+        if ty[0] == "*":
+            internal_args_list.append(f"ptr_info{i}.dev_ptr")
+        elif ty in FLOAT_STORAGE_TYPE:
+            internal_args_list.append(f"_arg{i}_storage")
+        elif ty == "nvTmaDesc":
+            # Note: we have to dereference the pointer
+            internal_args_list.append(f"*tma_ptr{i}")
+        elif ty != "constexpr":
+            internal_args_list.append(f"_arg{i}")
+    params = range(len(signature))
+    # generate glue code
+    newline = '\n  '
+    ptr_decls = [
+        f"DevicePtrInfo ptr_info{i} = getPointer(_arg{i}, {i}); if (!ptr_info{i}.valid) return NULL;"
+        for i, ty in signature.items()
+        if ty[0] == "*"
+    ]
+    tma_decls = [
+        f"CUtensorMap* tma_ptr{i} = getTmaDesc(_arg{i}); if (!tma_ptr{i}) return NULL;" for i, ty in signature.items()
+        if ty == "nvTmaDesc"
+    ]
+    float_storage_decls = [
+        f"{FLOAT_STORAGE_TYPE[ty]} _arg{i}_storage = {FLOAT_PACK_FUNCTION[ty]}(_arg{i});"
+        for i, ty in signature.items()
+        if ty in FLOAT_STORAGE_TYPE
+    ]
+    params = [f"&arg{i}" for i, ty in signature.items() if ty != "constexpr"]
+    params.append("&global_scratch")
+    params.append("&profile_scratch")
+    src = f"""
+#include \"cuda.h\"
+#include <dlfcn.h>
+#include <stdbool.h>
+#include <stdlib.h>
+#define PY_SSIZE_T_CLEAN
+#include <Python.h>
+typedef struct {{
+  PyObject_HEAD;
+  _Alignas(128) CUtensorMap tensorMap;
+}} PyCUtensorMapObject;
+static inline void gpuAssert(CUresult code, const char *file, int line)
+{{
+   if (code != CUDA_SUCCESS)
+   {{
+      const char* prefix = "Triton Error [CUDA]: ";
+      const char* str;
+      cuGetErrorString(code, &str);
+      char err[1024] = {{0}};
+      strcat(err, prefix);
+      strcat(err, str);
+      PyGILState_STATE gil_state;
+      gil_state = PyGILState_Ensure();
+      PyErr_SetString(PyExc_RuntimeError, err);
+      PyGILState_Release(gil_state);
+   }}
+}}
+#define CUDA_CHECK(ans) {{ gpuAssert((ans), __FILE__, __LINE__); }}
+typedef CUresult (*cuLaunchKernelEx_t)(const CUlaunchConfig* config, CUfunction f, void** kernelParams, void** extra);
+static cuLaunchKernelEx_t getLaunchKernelExHandle() {{
+  // Open the shared library
+  void* handle = dlopen("libcuda.so.1", RTLD_LAZY);
+  if (!handle) {{
+    PyErr_SetString(PyExc_RuntimeError, "Failed to open libcuda.so.1");
+    return NULL;
+  }}
+  // Clear any existing error
+  dlerror();
+  cuLaunchKernelEx_t cuLaunchKernelExHandle = (cuLaunchKernelEx_t)dlsym(handle, "cuLaunchKernelEx");
+  // Check for errors
+  const char *dlsym_error = dlerror();
+  if (dlsym_error) {{
+    PyErr_SetString(PyExc_RuntimeError, "Failed to retrieve cuLaunchKernelEx from libcuda.so.1");
+    return NULL;
+  }}
+  return cuLaunchKernelExHandle;
+}}
+static void _launch(int gridX, int gridY, int gridZ, int num_warps, int num_ctas, int launch_cooperative_grid, int launch_pdl, int shared_memory, CUstream stream, CUfunction function, CUdeviceptr global_scratch, CUdeviceptr profile_scratch{', ' + arg_decls if len(arg_decls) > 0 else ''}) {{
+  void *params[] = {{ {', '.join(params)} }};
+  if (gridX*gridY*gridZ > 0) {{
+    // 4 attributes that we can currently pass maximum
+    CUlaunchAttribute launchAttr[4];
+    static cuLaunchKernelEx_t cuLaunchKernelExHandle = NULL;
+    if (cuLaunchKernelExHandle == NULL) {{
+      cuLaunchKernelExHandle = getLaunchKernelExHandle();
+    }}
+    CUlaunchConfig config;
+    config.gridDimX = gridX * num_ctas;
+    config.gridDimY = gridY;
+    config.gridDimZ = gridZ;
+    config.blockDimX = 32 * num_warps;
+    config.blockDimY = 1;
+    config.blockDimZ = 1;
+    config.sharedMemBytes = shared_memory;
+    config.hStream = stream;
+    config.attrs = launchAttr;
+    int num_attrs = 0;
+    if (launch_pdl != 0) {{
+      CUlaunchAttribute pdlAttr = {{ .id = CU_LAUNCH_ATTRIBUTE_PROGRAMMATIC_STREAM_SERIALIZATION, .value = 1}};
+      launchAttr[num_attrs] = pdlAttr;
+      ++num_attrs;
+    }}
+    if (launch_cooperative_grid != 0) {{
+      CUlaunchAttribute coopAttr = {{ .id = CU_LAUNCH_ATTRIBUTE_COOPERATIVE, .value = 1}};
+      launchAttr[num_attrs] = coopAttr;
+      ++num_attrs;
+    }}
+    if (num_ctas != 1) {{
+      CUlaunchAttribute clusterAttr = {{}};
+      clusterAttr.id = CU_LAUNCH_ATTRIBUTE_CLUSTER_DIMENSION;
+      clusterAttr.value.clusterDim.x = num_ctas;
+      clusterAttr.value.clusterDim.y = 1;
+      clusterAttr.value.clusterDim.z = 1;
+      launchAttr[num_attrs] = clusterAttr;
+      ++num_attrs;
+      CUlaunchAttribute clusterSchedulingAttr = {{}};
+      clusterSchedulingAttr.id = CU_LAUNCH_ATTRIBUTE_CLUSTER_SCHEDULING_POLICY_PREFERENCE;
+      clusterSchedulingAttr.value.clusterSchedulingPolicyPreference = CU_CLUSTER_SCHEDULING_POLICY_SPREAD;
+      launchAttr[num_attrs] = clusterSchedulingAttr;
+      ++num_attrs;
+    }}
+    // num_ctas == 16 is non-portable. Does work for H100 and B200 tho
+    config.numAttrs = num_attrs;
+    if (num_ctas == 16) {{
+      CUDA_CHECK(cuFuncSetAttribute(
+          function,
+          CU_FUNC_ATTRIBUTE_NON_PORTABLE_CLUSTER_SIZE_ALLOWED,
+          1
+      ));
+    }}
+    CUDA_CHECK(cuLaunchKernelExHandle(&config, function, params, 0));
+  }}
+}}
+typedef struct _DevicePtrInfo {{
+    CUdeviceptr dev_ptr;
+    bool valid;
+}} DevicePtrInfo;
+static PyObject* data_ptr_str = NULL;
+static PyObject* py_tensor_map_type = NULL;
+static inline DevicePtrInfo getPointer(PyObject *obj, int idx) {{
+  DevicePtrInfo ptr_info;
+  ptr_info.dev_ptr = 0;
+  ptr_info.valid = true;
+  if (PyLong_Check(obj)) {{
+    ptr_info.dev_ptr = PyLong_AsUnsignedLongLong(obj);
+    return ptr_info;
+  }}
+  if (obj == Py_None) {{
+    // valid nullptr
+    return ptr_info;
+  }}
+  PyObject *ret = PyObject_CallMethodNoArgs(obj, data_ptr_str);
+  if (!ret) {{
+    PyErr_SetString(PyExc_TypeError, "Pointer argument must be either uint64 or have data_ptr method");
+    ptr_info.valid = false;
+    goto cleanup;
+  }}
+  if (!PyLong_Check(ret)) {{
+    PyErr_SetString(PyExc_TypeError, "data_ptr method of Pointer object must return 64-bit int");
+    ptr_info.valid = false;
+    goto cleanup;
+  }}
+  ptr_info.dev_ptr = PyLong_AsUnsignedLongLong(ret);
+  if(!ptr_info.dev_ptr)
+    return ptr_info;
+  uint64_t dev_ptr;
+  int status = cuPointerGetAttribute(&dev_ptr, CU_POINTER_ATTRIBUTE_DEVICE_POINTER, ptr_info.dev_ptr);
+  if (status == CUDA_ERROR_INVALID_VALUE) {{
+      PyErr_Format(PyExc_ValueError,
+                   "Pointer argument (at %d) cannot be accessed from Triton (cpu tensor?)", idx);
+      ptr_info.valid = false;
+  }} else if (status != CUDA_SUCCESS) {{
+      CUDA_CHECK(status);  // Catch any other cuda API errors
+      ptr_info.valid = false;
+  }}
+  ptr_info.dev_ptr = dev_ptr;
+cleanup:
+  Py_XDECREF(ret);
+  return ptr_info;
+}}
+static inline CUtensorMap* getTmaDesc(PyObject *obj) {{
+  if (sizeof(CUtensorMap*) != 8) {{
+    PyErr_SetString(PyExc_SystemError, "getTmaDesc() requires 64-bit compilation");
+    return NULL;
+  }}
+if (Py_TYPE(obj) != (PyTypeObject*)py_tensor_map_type) {{
+    PyErr_Format(PyExc_TypeError, "object must be of type PyCUtensorMap, got %s", Py_TYPE(obj)->tp_name);
+    return NULL;
+}}
+  CUtensorMap* map = &((PyCUtensorMapObject*)obj)->tensorMap;
+  uintptr_t align_128 = (uintptr_t)map & (128 - 1);
+  if (align_128 != 0) {{
+    PyErr_Format(PyExc_ValueError, "CUtensorMap must be aligned to 128B, but got (&map) mod 128 = %ld", align_128);
+    return NULL;
+  }}
+  return map;
+}}
+static void ensureCudaContext() {{
+  CUcontext pctx;
+  CUDA_CHECK(cuCtxGetCurrent(&pctx));
+  if (!pctx) {{
+    // Ensure device context.
+    CUdevice device;
+    CUDA_CHECK(cuDeviceGet(&device, 0));
+    CUDA_CHECK(cuDevicePrimaryCtxRetain(&pctx, device));
+    CUDA_CHECK(cuCtxSetCurrent(pctx));
+  }}
+}}
+static uint16_t pack_fp16(double f) {{
+    uint16_t result;
+    // from https://github.com/python/pythoncapi-compat
+#if 0x030600B1 <= PY_VERSION_HEX && PY_VERSION_HEX <= 0x030B00A1 && !defined(PYPY_VERSION)
+    _PyFloat_Pack2(f, (unsigned char*)&result, 1);
+#else
+    PyFloat_Pack2(f, (unsigned char*)&result, 1);
+#endif
+    return result;
+}}
+static uint16_t pack_bf16(double f) {{
+    float f32 = (float)f;
+    uint32_t u32 = *(uint32_t*)&f32;
+    return (uint16_t)(u32 >> 16);
+}}
+static uint32_t pack_fp32(double f) {{
+    float f32 = (float)f;
+    return *(uint32_t*)&f32;
+}}
+static uint64_t pack_fp64(double f) {{
+    return *(uint64_t*)&f;
+}}
+static PyObject* launch(PyObject* self, PyObject* args) {{
+  // ensure cuda context is valid before calling any CUDA APIs, e.g. before getPointer calls cuPointerGetAttributes
+  ensureCudaContext();
+  int gridX, gridY, gridZ;
+  uint64_t _stream;
+  uint64_t _function;
+  int launch_cooperative_grid;
+  int launch_pdl;
+  PyObject *launch_enter_hook = NULL;
+  PyObject *launch_exit_hook = NULL;
+  PyObject *kernel_metadata = NULL;
+  PyObject *launch_metadata = NULL;
+  PyObject *global_scratch_obj = NULL;
+  PyObject *profile_scratch_obj = NULL;
+  {newline.join([f"{_extracted_type(ty)} _arg{i};" for i, ty in signature.items()])}
+  if(!PyArg_ParseTuple(args, \"{format}\", &gridX, &gridY, &gridZ,
+                                           &_stream, &_function, &launch_cooperative_grid, &launch_pdl, &global_scratch_obj, &profile_scratch_obj,
+                                           &kernel_metadata, &launch_metadata,
+                                           &launch_enter_hook, &launch_exit_hook{args_list})) {{
+    return NULL;
+  }}
+  int num_warps, num_ctas, shared_memory;
+  if (!PyArg_ParseTuple(kernel_metadata, \"iii\", &num_warps, &num_ctas, &shared_memory)) {{
+    PyErr_SetString(PyExc_TypeError, "kernel_metadata must be a tuple");
+    return NULL;
+  }}
+  // extract launch metadata
+  if (launch_enter_hook != Py_None){{
+    PyObject* ret = PyObject_CallOneArg(launch_enter_hook, launch_metadata);
+    if (!ret)
+      return NULL;
+    Py_DECREF(ret);
+  }}
+  CUdeviceptr global_scratch = 0;
+  if (global_scratch_obj != Py_None) {{
+    DevicePtrInfo global_scratch_info = getPointer(global_scratch_obj, -1);
+    if (!global_scratch_info.valid) {{
+      return NULL;
+    }}
+    global_scratch = global_scratch_info.dev_ptr;
+  }}
+  CUdeviceptr profile_scratch = 0;
+  if (profile_scratch_obj != Py_None) {{
+    DevicePtrInfo profile_scratch_info = getPointer(profile_scratch_obj, -1);
+    if (!profile_scratch_info.valid) {{
+      return NULL;
+    }}
+    profile_scratch = profile_scratch_info.dev_ptr;
+  }}
+  // raise exception asap
+  {newline.join(ptr_decls)}
+  {newline.join(tma_decls)}
+  {newline.join(float_storage_decls)}
+  Py_BEGIN_ALLOW_THREADS;
+  _launch(gridX, gridY, gridZ, num_warps, num_ctas, launch_cooperative_grid, launch_pdl, shared_memory, (CUstream)_stream, (CUfunction)_function, global_scratch, profile_scratch{', ' + ', '.join(internal_args_list) if len(internal_args_list) > 0 else ''});
+  Py_END_ALLOW_THREADS;
+  if (PyErr_Occurred()) {{
+    return NULL;
+  }}
+  if(launch_exit_hook != Py_None){{
+    PyObject* ret = PyObject_CallOneArg(launch_exit_hook, launch_metadata);
+    if (!ret)
+      return NULL;
+    Py_DECREF(ret);
+  }}
+  Py_RETURN_NONE;
+}}
+static PyMethodDef ModuleMethods[] = {{
+  {{"launch", launch, METH_VARARGS, "Entry point for all kernels with this signature"}},
+  {{NULL, NULL, 0, NULL}} // sentinel
+}};
+static struct PyModuleDef ModuleDef = {{
+  PyModuleDef_HEAD_INIT,
+  \"__triton_launcher\",
+  NULL, //documentation
+  -1, //size
+  ModuleMethods
+}};
+PyMODINIT_FUNC PyInit___triton_launcher(void) {{
+  data_ptr_str = PyUnicode_InternFromString("data_ptr");
+  if(data_ptr_str == NULL) {{
+    return NULL;
+  }}
+  PyObject* driver_mod = PyImport_ImportModule("triton.backends.nvidia.driver");
+  if (driver_mod == NULL) {{
+    return NULL;
+  }}
+  py_tensor_map_type = PyObject_GetAttrString(driver_mod, "PyCUtensorMap");
+  if (py_tensor_map_type == NULL) {{
+    return NULL;
+  }}
+  PyObject *m = PyModule_Create(&ModuleDef);
+  if(m == NULL) {{
+    return NULL;
+  }}
+  PyModule_AddFunctions(m, ModuleMethods);
+  return m;
+}}
+"""
+    return src
+# The TMA dtype enum values are slightly different on host vs device...
+TMA_DTYPE_DEVICE_TO_HOST = dict((i, i) for i in range(16))
+TMA_DTYPE_DEVICE_TO_HOST[8] = 10
+TMA_DTYPE_DEVICE_TO_HOST[9] = 8
+TMA_DTYPE_DEVICE_TO_HOST[10] = 9
+def make_tensordesc_arg(arg, metadata):
+    if metadata is None:
+        # Currently the host side tensor descriptors get decomposed in
+        # the frontend to tensor desc, shape, and strides. We have no
+        # way to use these shape and strides when processing tensor
+        # descriptors which is why we provide our own decomposition
+        # above. Sadly this means we have to pass the shape and strides
+        # twice.
+        return [arg.base, *arg.shape, *arg.strides, arg.padding == "nan", *arg.shape, *arg.strides]
+    swizzle = metadata["swizzle"]
+    elem_size = metadata["elem_size"]
+    elem_type = metadata["elem_type"]
+    block_size = metadata["block_size"]
+    fp4_padded = metadata["fp4_padded"]
+    shape = arg.shape
+    strides = arg.strides
+    assert strides[-1] == 1
+    padding = 1 if arg.padding == "nan" else 0
+    if fp4_padded:
+        shape = list(shape)
+        shape[-1] *= 2
+    cu_tensor_map = triton.runtime.driver.active.utils.fill_tma_descriptor(
+        arg.base.data_ptr(),
+        swizzle,
+        elem_size,
+        TMA_DTYPE_DEVICE_TO_HOST[elem_type],
+        block_size,
+        shape,
+        strides,
+        padding,
+    )
+    return [cu_tensor_map, *shape, *strides]
+def wrap_handle_tensordesc(launcher, signature, tensordesc_meta):
+    has_tensor_desc_arg = any(isinstance(sig, str) and sig.startswith("tensordesc") for sig in signature.values())
+    if not has_tensor_desc_arg:
+        return launcher
+    tensordesc_indices = set(
+        [i for i, sig in enumerate(signature.values()) if isinstance(sig, str) and sig.startswith("tensordesc")])
+    assert not tensordesc_meta or len(tensordesc_meta) == len(tensordesc_indices)
+    if not tensordesc_meta:
+        tensordesc_meta = [None] * len(tensordesc_indices)
+    def inner(*args):
+        final_args = list(args[:_BASE_ARGS_FORMAT_LEN])
+        tensordesc_idx = 0
+        for i, arg in enumerate(args[_BASE_ARGS_FORMAT_LEN:]):
+            if i in tensordesc_indices:
+                final_args.extend(make_tensordesc_arg(arg, tensordesc_meta[tensordesc_idx]))
+                tensordesc_idx += 1
+            else:
+                final_args.append(arg)
+        return launcher(*final_args)
+    return inner
+class CudaLauncher(object):
+    def __init__(self, src, metadata):
+        constants = src.constants if hasattr(src, "constants") else dict()
+        arg_idx = lambda x: (src.fn.arg_names.index(x), ) if isinstance(x, str) else x
+        constants = {arg_idx(idx): value for idx, value in constants.items()}
+        signature = {idx: value for idx, value in src.signature.items()}
+        tensordesc_meta = getattr(metadata, "tensordesc_meta", None)
+        src = make_launcher(constants, signature, tensordesc_meta)
+        mod = compile_module_from_src(
+            src=src,
+            name="__triton_launcher",
+            library_dirs=library_dirs(),
+            include_dirs=include_dirs,
+            libraries=libraries,
+        )
+        self.num_ctas = getattr(metadata, "num_ctas", 1)
+        self.launch = wrap_handle_tensordesc(mod.launch, signature, tensordesc_meta)
+        self.global_scratch_size = metadata.global_scratch_size
+        self.global_scratch_align = metadata.global_scratch_align
+        self.profile_scratch_size = metadata.profile_scratch_size
+        self.profile_scratch_align = metadata.profile_scratch_align
+        self.launch_cooperative_grid = metadata.launch_cooperative_grid
+        self.launch_pdl = metadata.launch_pdl
+    def __call__(self, gridX, gridY, gridZ, stream, function, *args):
+        def allocate_scratch(size, align, allocator):
+            if size > 0:
+                grid_size = gridX * gridY * gridZ
+                alloc_size = grid_size * self.num_ctas * size
+                alloc_fn = allocator.get()
+                return alloc_fn(alloc_size, align, stream)
+            return None
+        global_scratch = allocate_scratch(self.global_scratch_size, self.global_scratch_align, _allocation._allocator)
+        profile_scratch = allocate_scratch(self.profile_scratch_size, self.profile_scratch_align,
+                                           _allocation._profile_allocator)
+        self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, self.launch_pdl,
+                    global_scratch, profile_scratch, *args)
+class CudaDriver(GPUDriver):
+    def __init__(self):
+        self.utils = CudaUtils()  # TODO: make static
+        self.launcher_cls = CudaLauncher
+        super().__init__()
+    def get_current_target(self):
+        device = self.get_current_device()
+        capability = self.get_device_capability(device)
+        capability = capability[0] * 10 + capability[1]
+        warp_size = 32
+        return GPUTarget("cuda", capability, warp_size)
+    def get_active_torch_device(self):
+        import torch
+        return torch.device("cuda", self.get_current_device())
+    def get_device_interface(self):
+        import torch
+        return torch.cuda
+    @staticmethod
+    def is_active():
+        try:
+            import torch
+            return torch.cuda.is_available() and (torch.version.hip is None)
+        except ImportError:
+            return False
+    def map_python_to_cpp_type(self, ty: str) -> str:
+        return ty_to_cpp(ty)
+    def get_benchmarker(self):
+        from triton.testing import do_bench
+        return do_bench
+    def get_empty_cache_for_benchmark(self):
+        import torch
+        # We maintain a buffer of 256 MB that we clear
+        # before each kernel call to make sure that the L2 cache
+        # doesn't contain any input data before the run
+        cache_size = 256 * 1024 * 1024
+        return torch.empty(int(cache_size // 4), dtype=torch.int, device='cuda')
+    def clear_cache(self, cache):
+        cache.zero_()

Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/cudaGL.h ADDED Viewed

	@@ -0,0 +1,608 @@

+/*
+ * Copyright 1993-2014 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+#ifndef CUDAGL_H
+#define CUDAGL_H
+#include <cuda.h>
+#include <GL/gl.h>
+#if defined(__CUDA_API_VERSION_INTERNAL) || defined(__DOXYGEN_ONLY__) || defined(CUDA_ENABLE_DEPRECATED)
+#define __CUDA_DEPRECATED
+#elif defined(_MSC_VER)
+#define __CUDA_DEPRECATED __declspec(deprecated)
+#elif defined(__GNUC__)
+#define __CUDA_DEPRECATED __attribute__((deprecated))
+#else
+#define __CUDA_DEPRECATED
+#endif
+#ifdef CUDA_FORCE_API_VERSION
+#error "CUDA_FORCE_API_VERSION is no longer supported."
+#endif
+#if defined(__CUDA_API_VERSION_INTERNAL) || defined(CUDA_API_PER_THREAD_DEFAULT_STREAM)
+    #define __CUDA_API_PER_THREAD_DEFAULT_STREAM
+    #define __CUDA_API_PTDS(api) api ## _ptds
+    #define __CUDA_API_PTSZ(api) api ## _ptsz
+#else
+    #define __CUDA_API_PTDS(api) api
+    #define __CUDA_API_PTSZ(api) api
+#endif
+#define cuGLCtxCreate            cuGLCtxCreate_v2
+#define cuGLMapBufferObject      __CUDA_API_PTDS(cuGLMapBufferObject_v2)
+#define cuGLMapBufferObjectAsync __CUDA_API_PTSZ(cuGLMapBufferObjectAsync_v2)
+#define cuGLGetDevices           cuGLGetDevices_v2
+#ifdef __cplusplus
+extern "C" {
+#endif
+/**
+ * \file cudaGL.h
+ * \brief Header file for the OpenGL interoperability functions of the
+ * low-level CUDA driver application programming interface.
+ */
+/**
+ * \defgroup CUDA_GL OpenGL Interoperability
+ * \ingroup CUDA_DRIVER
+ *
+ * ___MANBRIEF___ OpenGL interoperability functions of the low-level CUDA
+ * driver API (___CURRENT_FILE___) ___ENDMANBRIEF___
+ *
+ * This section describes the OpenGL interoperability functions of the
+ * low-level CUDA driver application programming interface. Note that mapping
+ * of OpenGL resources is performed with the graphics API agnostic, resource
+ * mapping interface described in \ref CUDA_GRAPHICS "Graphics Interoperability".
+ *
+ * @{
+ */
+#if defined(_WIN32)
+#if !defined(WGL_NV_gpu_affinity)
+typedef void* HGPUNV;
+#endif
+#endif /* _WIN32 */
+/**
+ * \brief Registers an OpenGL buffer object
+ *
+ * Registers the buffer object specified by \p buffer for access by
+ * CUDA.  A handle to the registered object is returned as \p
+ * pCudaResource.  The register flags \p Flags specify the intended usage,
+ * as follows:
+ *
+ * - ::CU_GRAPHICS_REGISTER_FLAGS_NONE: Specifies no hints about how this
+ *   resource will be used. It is therefore assumed that this resource will be
+ *   read from and written to by CUDA. This is the default value.
+ * - ::CU_GRAPHICS_REGISTER_FLAGS_READ_ONLY: Specifies that CUDA
+ *   will not write to this resource.
+ * - ::CU_GRAPHICS_REGISTER_FLAGS_WRITE_DISCARD: Specifies that
+ *   CUDA will not read from this resource and will write over the
+ *   entire contents of the resource, so none of the data previously
+ *   stored in the resource will be preserved.
+ *
+ * \param pCudaResource - Pointer to the returned object handle
+ * \param buffer - name of buffer object to be registered
+ * \param Flags - Register flags
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_HANDLE,
+ * ::CUDA_ERROR_ALREADY_MAPPED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_OPERATING_SYSTEM
+ * \notefnerr
+ *
+ * \sa
+ * ::cuGraphicsUnregisterResource,
+ * ::cuGraphicsMapResources,
+ * ::cuGraphicsResourceGetMappedPointer,
+ * ::cudaGraphicsGLRegisterBuffer
+ */
+CUresult CUDAAPI cuGraphicsGLRegisterBuffer(CUgraphicsResource *pCudaResource, GLuint buffer, unsigned int Flags);
+/**
+ * \brief Register an OpenGL texture or renderbuffer object
+ *
+ * Registers the texture or renderbuffer object specified by \p image for access by CUDA.
+ * A handle to the registered object is returned as \p pCudaResource.
+ *
+ * \p target must match the type of the object, and must be one of ::GL_TEXTURE_2D,
+ * ::GL_TEXTURE_RECTANGLE, ::GL_TEXTURE_CUBE_MAP, ::GL_TEXTURE_3D, ::GL_TEXTURE_2D_ARRAY,
+ * or ::GL_RENDERBUFFER.
+ *
+ * The register flags \p Flags specify the intended usage, as follows:
+ *
+ * - ::CU_GRAPHICS_REGISTER_FLAGS_NONE: Specifies no hints about how this
+ *   resource will be used. It is therefore assumed that this resource will be
+ *   read from and written to by CUDA. This is the default value.
+ * - ::CU_GRAPHICS_REGISTER_FLAGS_READ_ONLY: Specifies that CUDA
+ *   will not write to this resource.
+ * - ::CU_GRAPHICS_REGISTER_FLAGS_WRITE_DISCARD: Specifies that
+ *   CUDA will not read from this resource and will write over the
+ *   entire contents of the resource, so none of the data previously
+ *   stored in the resource will be preserved.
+ * - ::CU_GRAPHICS_REGISTER_FLAGS_SURFACE_LDST: Specifies that CUDA will
+ *   bind this resource to a surface reference.
+ * - ::CU_GRAPHICS_REGISTER_FLAGS_TEXTURE_GATHER: Specifies that CUDA will perform
+ *   texture gather operations on this resource.
+ *
+ * The following image formats are supported. For brevity's sake, the list is abbreviated.
+ * For ex., {GL_R, GL_RG} X {8, 16} would expand to the following 4 formats
+ * {GL_R8, GL_R16, GL_RG8, GL_RG16} :
+ * - GL_RED, GL_RG, GL_RGBA, GL_LUMINANCE, GL_ALPHA, GL_LUMINANCE_ALPHA, GL_INTENSITY
+ * - {GL_R, GL_RG, GL_RGBA} X {8, 16, 16F, 32F, 8UI, 16UI, 32UI, 8I, 16I, 32I}
+ * - {GL_LUMINANCE, GL_ALPHA, GL_LUMINANCE_ALPHA, GL_INTENSITY} X
+ * {8, 16, 16F_ARB, 32F_ARB, 8UI_EXT, 16UI_EXT, 32UI_EXT, 8I_EXT, 16I_EXT, 32I_EXT}
+ *
+ * The following image classes are currently disallowed:
+ * - Textures with borders
+ * - Multisampled renderbuffers
+ *
+ * \param pCudaResource - Pointer to the returned object handle
+ * \param image - name of texture or renderbuffer object to be registered
+ * \param target - Identifies the type of object specified by \p image
+ * \param Flags - Register flags
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_HANDLE,
+ * ::CUDA_ERROR_ALREADY_MAPPED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_OPERATING_SYSTEM
+ * \notefnerr
+ *
+ * \sa
+ * ::cuGraphicsUnregisterResource,
+ * ::cuGraphicsMapResources,
+ * ::cuGraphicsSubResourceGetMappedArray,
+ * ::cudaGraphicsGLRegisterImage
+ */
+CUresult CUDAAPI cuGraphicsGLRegisterImage(CUgraphicsResource *pCudaResource, GLuint image, GLenum target, unsigned int Flags);
+#ifdef _WIN32
+/**
+ * \brief Gets the CUDA device associated with hGpu
+ *
+ * Returns in \p *pDevice the CUDA device associated with a \p hGpu, if
+ * applicable.
+ *
+ * \param pDevice - Device associated with hGpu
+ * \param hGpu    - Handle to a GPU, as queried via ::WGL_NV_gpu_affinity()
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \notefnerr
+ *
+ * \sa ::cuGLMapBufferObject,
+ * ::cuGLRegisterBufferObject, ::cuGLUnmapBufferObject,
+ * ::cuGLUnregisterBufferObject, ::cuGLUnmapBufferObjectAsync,
+ * ::cuGLSetBufferObjectMapFlags,
+ * ::cudaWGLGetDevice
+ */
+CUresult CUDAAPI cuWGLGetDevice(CUdevice *pDevice, HGPUNV hGpu);
+#endif /* _WIN32 */
+/**
+ * CUDA devices corresponding to an OpenGL device
+ */
+typedef enum CUGLDeviceList_enum {
+    CU_GL_DEVICE_LIST_ALL            = 0x01, /**< The CUDA devices for all GPUs used by the current OpenGL context */
+    CU_GL_DEVICE_LIST_CURRENT_FRAME  = 0x02, /**< The CUDA devices for the GPUs used by the current OpenGL context in its currently rendering frame */
+    CU_GL_DEVICE_LIST_NEXT_FRAME     = 0x03, /**< The CUDA devices for the GPUs to be used by the current OpenGL context in the next frame */
+} CUGLDeviceList;
+/**
+ * \brief Gets the CUDA devices associated with the current OpenGL context
+ *
+ * Returns in \p *pCudaDeviceCount the number of CUDA-compatible devices
+ * corresponding to the current OpenGL context. Also returns in \p *pCudaDevices
+ * at most cudaDeviceCount of the CUDA-compatible devices corresponding to
+ * the current OpenGL context. If any of the GPUs being used by the current OpenGL
+ * context are not CUDA capable then the call will return CUDA_ERROR_NO_DEVICE.
+ *
+ * The \p deviceList argument may be any of the following:
+ * - ::CU_GL_DEVICE_LIST_ALL: Query all devices used by the current OpenGL context.
+ * - ::CU_GL_DEVICE_LIST_CURRENT_FRAME: Query the devices used by the current OpenGL context to
+ *   render the current frame (in SLI).
+ * - ::CU_GL_DEVICE_LIST_NEXT_FRAME: Query the devices used by the current OpenGL context to
+ *   render the next frame (in SLI). Note that this is a prediction, it can't be guaranteed that
+ *   this is correct in all cases.
+ *
+ * \param pCudaDeviceCount - Returned number of CUDA devices.
+ * \param pCudaDevices     - Returned CUDA devices.
+ * \param cudaDeviceCount  - The size of the output device array pCudaDevices.
+ * \param deviceList       - The set of devices to return.
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_NO_DEVICE,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_GRAPHICS_CONTEXT,
+ * ::CUDA_ERROR_OPERATING_SYSTEM
+ *
+ * \notefnerr
+ *
+ * \sa
+ * ::cuWGLGetDevice,
+ * ::cudaGLGetDevices
+ */
+CUresult CUDAAPI cuGLGetDevices(unsigned int *pCudaDeviceCount, CUdevice *pCudaDevices, unsigned int cudaDeviceCount, CUGLDeviceList deviceList);
+/**
+ * \defgroup CUDA_GL_DEPRECATED OpenGL Interoperability [DEPRECATED]
+ *
+ * ___MANBRIEF___ deprecated OpenGL interoperability functions of the low-level
+ * CUDA driver API (___CURRENT_FILE___) ___ENDMANBRIEF___
+ *
+ * This section describes deprecated OpenGL interoperability functionality.
+ *
+ * @{
+ */
+/** Flags to map or unmap a resource */
+typedef enum CUGLmap_flags_enum {
+    CU_GL_MAP_RESOURCE_FLAGS_NONE          = 0x00,
+    CU_GL_MAP_RESOURCE_FLAGS_READ_ONLY     = 0x01,
+    CU_GL_MAP_RESOURCE_FLAGS_WRITE_DISCARD = 0x02,
+} CUGLmap_flags;
+/**
+ * \brief Create a CUDA context for interoperability with OpenGL
+ *
+ * \deprecated This function is deprecated as of Cuda 5.0.
+ *
+ * This function is deprecated and should no longer be used.  It is
+ * no longer necessary to associate a CUDA context with an OpenGL
+ * context in order to achieve maximum interoperability performance.
+ *
+ * \param pCtx   - Returned CUDA context
+ * \param Flags  - Options for CUDA context creation
+ * \param device - Device on which to create the context
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_OUT_OF_MEMORY
+ * \notefnerr
+ *
+ * \sa ::cuCtxCreate, ::cuGLInit, ::cuGLMapBufferObject,
+ * ::cuGLRegisterBufferObject, ::cuGLUnmapBufferObject,
+ * ::cuGLUnregisterBufferObject, ::cuGLMapBufferObjectAsync,
+ * ::cuGLUnmapBufferObjectAsync, ::cuGLSetBufferObjectMapFlags,
+ * ::cuWGLGetDevice
+ */
+__CUDA_DEPRECATED CUresult CUDAAPI cuGLCtxCreate(CUcontext *pCtx, unsigned int Flags, CUdevice device );
+/**
+ * \brief Initializes OpenGL interoperability
+ *
+ * \deprecated This function is deprecated as of Cuda 3.0.
+ *
+ * Initializes OpenGL interoperability. This function is deprecated
+ * and calling it is no longer required. It may fail if the needed
+ * OpenGL driver facilities are not available.
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_UNKNOWN
+ * \notefnerr
+ *
+ * \sa ::cuGLMapBufferObject,
+ * ::cuGLRegisterBufferObject, ::cuGLUnmapBufferObject,
+ * ::cuGLUnregisterBufferObject, ::cuGLMapBufferObjectAsync,
+ * ::cuGLUnmapBufferObjectAsync, ::cuGLSetBufferObjectMapFlags,
+ * ::cuWGLGetDevice
+ */
+__CUDA_DEPRECATED CUresult CUDAAPI cuGLInit(void);
+/**
+ * \brief Registers an OpenGL buffer object
+ *
+ * \deprecated This function is deprecated as of Cuda 3.0.
+ *
+ * Registers the buffer object specified by \p buffer for access by
+ * CUDA. This function must be called before CUDA can map the buffer
+ * object.  There must be a valid OpenGL context bound to the current
+ * thread when this function is called, and the buffer name is
+ * resolved by that context.
+ *
+ * \param buffer - The name of the buffer object to register.
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_ALREADY_MAPPED
+ * \notefnerr
+ *
+ * \sa ::cuGraphicsGLRegisterBuffer
+ */
+__CUDA_DEPRECATED CUresult CUDAAPI cuGLRegisterBufferObject(GLuint buffer);
+/**
+ * \brief Maps an OpenGL buffer object
+ *
+ * \deprecated This function is deprecated as of Cuda 3.0.
+ *
+ * Maps the buffer object specified by \p buffer into the address space of the
+ * current CUDA context and returns in \p *dptr and \p *size the base pointer
+ * and size of the resulting mapping.
+ *
+ * There must be a valid OpenGL context bound to the current thread
+ * when this function is called.  This must be the same context, or a
+ * member of the same shareGroup, as the context that was bound when
+ * the buffer was registered.
+ *
+ * All streams in the current CUDA context are synchronized with the
+ * current GL context.
+ *
+ * \param dptr   - Returned mapped base pointer
+ * \param size   - Returned size of mapping
+ * \param buffer - The name of the buffer object to map
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_MAP_FAILED
+ * \notefnerr
+ *
+ * \sa ::cuGraphicsMapResources
+ */
+__CUDA_DEPRECATED CUresult CUDAAPI cuGLMapBufferObject(CUdeviceptr *dptr, size_t *size,  GLuint buffer);
+/**
+ * \brief Unmaps an OpenGL buffer object
+ *
+ * \deprecated This function is deprecated as of Cuda 3.0.
+ *
+ * Unmaps the buffer object specified by \p buffer for access by CUDA.
+ *
+ * There must be a valid OpenGL context bound to the current thread
+ * when this function is called.  This must be the same context, or a
+ * member of the same shareGroup, as the context that was bound when
+ * the buffer was registered.
+ *
+ * All streams in the current CUDA context are synchronized with the
+ * current GL context.
+ *
+ * \param buffer - Buffer object to unmap
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \notefnerr
+ *
+ * \sa ::cuGraphicsUnmapResources
+ */
+__CUDA_DEPRECATED CUresult CUDAAPI cuGLUnmapBufferObject(GLuint buffer);
+/**
+ * \brief Unregister an OpenGL buffer object
+ *
+ * \deprecated This function is deprecated as of Cuda 3.0.
+ *
+ * Unregisters the buffer object specified by \p buffer.  This
+ * releases any resources associated with the registered buffer.
+ * After this call, the buffer may no longer be mapped for access by
+ * CUDA.
+ *
+ * There must be a valid OpenGL context bound to the current thread
+ * when this function is called.  This must be the same context, or a
+ * member of the same shareGroup, as the context that was bound when
+ * the buffer was registered.
+ *
+ * \param buffer - Name of the buffer object to unregister
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \notefnerr
+ *
+ * \sa ::cuGraphicsUnregisterResource
+ */
+__CUDA_DEPRECATED CUresult CUDAAPI cuGLUnregisterBufferObject(GLuint buffer);
+/**
+ * \brief Set the map flags for an OpenGL buffer object
+ *
+ * \deprecated This function is deprecated as of Cuda 3.0.
+ *
+ * Sets the map flags for the buffer object specified by \p buffer.
+ *
+ * Changes to \p Flags will take effect the next time \p buffer is mapped.
+ * The \p Flags argument may be any of the following:
+ * - ::CU_GL_MAP_RESOURCE_FLAGS_NONE: Specifies no hints about how this
+ *   resource will be used. It is therefore assumed that this resource will be
+ *   read from and written to by CUDA kernels. This is the default value.
+ * - ::CU_GL_MAP_RESOURCE_FLAGS_READ_ONLY: Specifies that CUDA kernels which
+ *   access this resource will not write to this resource.
+ * - ::CU_GL_MAP_RESOURCE_FLAGS_WRITE_DISCARD: Specifies that CUDA kernels
+ *   which access this resource will not read from this resource and will
+ *   write over the entire contents of the resource, so none of the data
+ *   previously stored in the resource will be preserved.
+ *
+ * If \p buffer has not been registered for use with CUDA, then
+ * ::CUDA_ERROR_INVALID_HANDLE is returned. If \p buffer is presently
+ * mapped for access by CUDA, then ::CUDA_ERROR_ALREADY_MAPPED is returned.
+ *
+ * There must be a valid OpenGL context bound to the current thread
+ * when this function is called.  This must be the same context, or a
+ * member of the same shareGroup, as the context that was bound when
+ * the buffer was registered.
+ *
+ * \param buffer - Buffer object to unmap
+ * \param Flags  - Map flags
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_HANDLE,
+ * ::CUDA_ERROR_ALREADY_MAPPED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * \notefnerr
+ *
+ * \sa ::cuGraphicsResourceSetMapFlags
+ */
+__CUDA_DEPRECATED CUresult CUDAAPI cuGLSetBufferObjectMapFlags(GLuint buffer, unsigned int Flags);
+/**
+ * \brief Maps an OpenGL buffer object
+ *
+ * \deprecated This function is deprecated as of Cuda 3.0.
+ *
+ * Maps the buffer object specified by \p buffer into the address space of the
+ * current CUDA context and returns in \p *dptr and \p *size the base pointer
+ * and size of the resulting mapping.
+ *
+ * There must be a valid OpenGL context bound to the current thread
+ * when this function is called.  This must be the same context, or a
+ * member of the same shareGroup, as the context that was bound when
+ * the buffer was registered.
+ *
+ * Stream \p hStream in the current CUDA context is synchronized with
+ * the current GL context.
+ *
+ * \param dptr    - Returned mapped base pointer
+ * \param size    - Returned size of mapping
+ * \param buffer  - The name of the buffer object to map
+ * \param hStream - Stream to synchronize
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_MAP_FAILED
+ * \notefnerr
+ *
+ * \sa ::cuGraphicsMapResources
+ */
+__CUDA_DEPRECATED CUresult CUDAAPI cuGLMapBufferObjectAsync(CUdeviceptr *dptr, size_t *size,  GLuint buffer, CUstream hStream);
+/**
+ * \brief Unmaps an OpenGL buffer object
+ *
+ * \deprecated This function is deprecated as of Cuda 3.0.
+ *
+ * Unmaps the buffer object specified by \p buffer for access by CUDA.
+ *
+ * There must be a valid OpenGL context bound to the current thread
+ * when this function is called.  This must be the same context, or a
+ * member of the same shareGroup, as the context that was bound when
+ * the buffer was registered.
+ *
+ * Stream \p hStream in the current CUDA context is synchronized with
+ * the current GL context.
+ *
+ * \param buffer  - Name of the buffer object to unmap
+ * \param hStream - Stream to synchronize
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \notefnerr
+ *
+ * \sa ::cuGraphicsUnmapResources
+ */
+__CUDA_DEPRECATED CUresult CUDAAPI cuGLUnmapBufferObjectAsync(GLuint buffer, CUstream hStream);
+/** @} */ /* END CUDA_GL_DEPRECATED */
+/** @} */ /* END CUDA_GL */
+#if defined(__CUDA_API_VERSION_INTERNAL)
+    #undef cuGLCtxCreate
+    #undef cuGLMapBufferObject
+    #undef cuGLMapBufferObjectAsync
+    #undef cuGLGetDevices
+    CUresult CUDAAPI cuGLGetDevices(unsigned int *pCudaDeviceCount, CUdevice *pCudaDevices, unsigned int cudaDeviceCount, CUGLDeviceList deviceList);
+    CUresult CUDAAPI cuGLMapBufferObject_v2(CUdeviceptr *dptr, size_t *size,  GLuint buffer);
+    CUresult CUDAAPI cuGLMapBufferObjectAsync_v2(CUdeviceptr *dptr, size_t *size,  GLuint buffer, CUstream hStream);
+    CUresult CUDAAPI cuGLCtxCreate(CUcontext *pCtx, unsigned int Flags, CUdevice device );
+    CUresult CUDAAPI cuGLMapBufferObject(CUdeviceptr_v1 *dptr, unsigned int *size,  GLuint buffer);
+    CUresult CUDAAPI cuGLMapBufferObjectAsync(CUdeviceptr_v1 *dptr, unsigned int *size,  GLuint buffer, CUstream hStream);
+#endif /* __CUDA_API_VERSION_INTERNAL */
+#ifdef __cplusplus
+};
+#endif
+#undef __CUDA_DEPRECATED
+#endif

Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/cupti_pcsampling_util.h ADDED Viewed

	@@ -0,0 +1,402 @@

+#if !defined(_CUPTI_PCSAMPLING_UTIL_H_)
+#define _CUPTI_PCSAMPLING_UTIL_H_
+#include <cupti_pcsampling.h>
+#include <fstream>
+#include <cupti_common.h>
+#ifndef CUPTI_UTIL_STRUCT_SIZE
+#define CUPTI_UTIL_STRUCT_SIZE(type_, lastfield_)                     (offsetof(type_, lastfield_) + sizeof(((type_*)0)->lastfield_))
+#endif
+#ifndef CHECK_PC_SAMPLING_STRUCT_FIELD_EXISTS
+#define CHECK_PC_SAMPLING_STRUCT_FIELD_EXISTS(type, member, structSize)    \
+    (offsetof(type, member) < structSize)
+#endif
+#if defined(__cplusplus)
+extern "C" {
+#endif
+#if defined(__GNUC__)
+    #pragma GCC visibility push(default)
+#endif
+namespace CUPTI { namespace PcSamplingUtil {
+/**
+ * \defgroup CUPTI_PCSAMPLING_UTILITY CUPTI PC Sampling Utility API
+ * Functions, types, and enums that implement the CUPTI PC Sampling Utility API.
+ * @{
+ */
+/**
+ * \brief Header info will be stored in file.
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * Version of file format.
+   */
+  uint32_t version;
+  /**
+   * Total number of buffers present in the file.
+   */
+  uint32_t totalBuffers;
+} Header;
+/**
+ * \brief BufferInfo will be stored in the file for every buffer
+ *  i.e for every call of UtilDumpPcSamplingBufferInFile() API.
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * Total number of PC records.
+   */
+  uint64_t recordCount;
+  /**
+   * Count of all stall reasons supported on the GPU
+   */
+  size_t numStallReasons;
+  /**
+   * Total number of stall reasons in single record.
+   */
+  uint64_t numSelectedStallReasons;
+  /**
+   * Buffer size in Bytes.
+   */
+  uint64_t bufferByteSize;
+} BufferInfo;
+/**
+ * \brief All available stall reasons name and respective indexes
+ * will be stored in it.
+ */
+typedef struct PACKED_ALIGNMENT {
+  /**
+   * Number of all available stall reasons
+   */
+  size_t numStallReasons;
+  /**
+   * Stall reasons names of all available stall reasons
+   */
+  char **stallReasons;
+  /**
+   * Stall reason index of all available stall reasons
+   */
+  uint32_t *stallReasonIndex;
+} PcSamplingStallReasons;
+/**
+ * \brief CUPTI PC sampling buffer types.
+ *
+ */
+typedef enum {
+  /**
+   * Invalid buffer type.
+   */
+  PC_SAMPLING_BUFFER_INVALID             = 0,
+  /**
+   * Refers to CUpti_PCSamplingData buffer.
+   */
+  PC_SAMPLING_BUFFER_PC_TO_COUNTER_DATA  = 1
+} PcSamplingBufferType;
+/**
+ * \brief CUPTI PC sampling utility API result codes.
+ *
+ * Error and result codes returned by CUPTI PC sampling utility API.
+ */
+typedef enum {
+  /**
+   * No error
+   */
+  CUPTI_UTIL_SUCCESS                                       = 0,
+  /**
+   * One or more of the parameters are invalid.
+   */
+  CUPTI_UTIL_ERROR_INVALID_PARAMETER                       = 1,
+  /**
+   * Unable to create a new file
+   */
+  CUPTI_UTIL_ERROR_UNABLE_TO_CREATE_FILE                   = 2,
+  /**
+   * Unable to open a file
+   */
+  CUPTI_UTIL_ERROR_UNABLE_TO_OPEN_FILE                     = 3,
+  /**
+   * Read or write operation failed
+   */
+  CUPTI_UTIL_ERROR_READ_WRITE_OPERATION_FAILED             = 4,
+  /**
+   * Provided file handle is corrupted.
+   */
+  CUPTI_UTIL_ERROR_FILE_HANDLE_CORRUPTED                   = 5,
+  /**
+   * seek operation failed.
+   */
+  CUPTI_UTIL_ERROR_SEEK_OPERATION_FAILED                   = 6,
+  /**
+   * Unable to allocate enough memory to perform the requested
+   * operation.
+   */
+  CUPTI_UTIL_ERROR_OUT_OF_MEMORY                           = 7,
+  /**
+   * An unknown internal error has occurred.
+   */
+  CUPTI_UTIL_ERROR_UNKNOWN                                 = 999,
+  CUPTI_UTIL_ERROR_FORCE_INT                               = 0x7fffffff
+} CUptiUtilResult;
+/**
+ * \brief Params for \ref CuptiUtilPutPcSampData
+ */
+typedef struct {
+  /**
+   * Size of the data structure i.e. CUpti_PCSamplingDisableParamsSize
+   * CUPTI client should set the size of the structure. It will be used in CUPTI to check what fields are
+   * available in the structure. Used to preserve backward compatibility.
+   */
+  size_t size;
+  /**
+   * Type of buffer to store in file
+   */
+  PcSamplingBufferType bufferType;
+  /**
+   * PC sampling buffer.
+   */
+  void *pSamplingData;
+  /**
+   * Number of configured attributes
+   */
+  size_t numAttributes;
+  /**
+   * Refer \ref CUpti_PCSamplingConfigurationInfo
+   * It is expected to provide configuration details of at least
+   * CUPTI_PC_SAMPLING_CONFIGURATION_ATTR_TYPE_STALL_REASON attribute.
+   */
+  CUpti_PCSamplingConfigurationInfo *pPCSamplingConfigurationInfo;
+  /**
+   * Refer \ref PcSamplingStallReasons.
+   */
+  PcSamplingStallReasons *pPcSamplingStallReasons;
+  /**
+   * File name to store buffer into it.
+   */
+  const char* fileName;
+} CUptiUtil_PutPcSampDataParams;
+#define CUptiUtil_PutPcSampDataParamsSize                   CUPTI_UTIL_STRUCT_SIZE(CUptiUtil_PutPcSampDataParams, fileName)
+/**
+ * \brief Dump PC sampling data into the file.
+ *
+ * This API can be called multiple times.
+ * It will append buffer in the file.
+ * For every buffer it will store BufferInfo
+ * so that before retrieving data it will help to allocate buffer
+ * to store retrieved data.
+ * This API creates file if file does not present.
+ * If stallReasonIndex or stallReasons pointer of \ref CUptiUtil_PutPcSampDataParams is NULL
+ * then stall reasons data  will not be stored in file.
+ * It is expected to store all available stall reason data at least once to refer it during
+ * offline correlation.
+ *
+ * \retval CUPTI_UTIL_SUCCESS
+ * \retval CUPTI_UTIL_ERROR_INVALID_PARAMETER error out if buffer type is invalid
+ * or if either of pSamplingData, pParams pointer is NULL or stall reason configuration details not provided
+ * or filename is empty.
+ * \retval CUPTI_UTIL_ERROR_UNABLE_TO_CREATE_FILE
+ * \retval CUPTI_UTIL_ERROR_UNABLE_TO_OPEN_FILE
+ * \retval CUPTI_UTIL_ERROR_READ_WRITE_OPERATION_FAILED
+ */
+CUptiUtilResult CUPTIUTILAPI CuptiUtilPutPcSampData(CUptiUtil_PutPcSampDataParams *pParams);
+/**
+ * \brief Params for \ref CuptiUtilGetHeaderData
+ */
+typedef struct {
+  /**
+   * Size of the data structure i.e. CUpti_PCSamplingDisableParamsSize
+   * CUPTI client should set the size of the structure. It will be used in CUPTI to check what fields are
+   * available in the structure. Used to preserve backward compatibility.
+   */
+  size_t size;
+  /**
+   * File handle.
+   */
+  std::ifstream *fileHandler;
+  /**
+   * Header Info.
+   */
+  Header headerInfo;
+} CUptiUtil_GetHeaderDataParams;
+#define CUptiUtil_GetHeaderDataParamsSize                   CUPTI_UTIL_STRUCT_SIZE(CUptiUtil_GetHeaderDataParams, headerInfo)
+/**
+ * \brief Get header data of file.
+ *
+ * This API must be called once initially while retrieving data from file.
+ * \ref Header structure, it gives info about total number
+ * of buffers present in the file.
+ *
+ * \retval CUPTI_UTIL_SUCCESS
+ * \retval CUPTI_UTIL_ERROR_INVALID_PARAMETER error out if either of pParam or fileHandle is NULL or param struct size is incorrect.
+ * \retval CUPTI_UTIL_ERROR_FILE_HANDLE_CORRUPTED file handle is not in good state to read data from file
+ * \retval CUPTI_UTIL_ERROR_READ_WRITE_OPERATION_FAILED  failed to read data from file.
+ */
+CUptiUtilResult CUPTIUTILAPI CuptiUtilGetHeaderData(CUptiUtil_GetHeaderDataParams *pParams);
+/**
+ * \brief Params for \ref CuptiUtilGetBufferInfo
+ */
+typedef struct {
+  /**
+   * Size of the data structure i.e. CUpti_PCSamplingDisableParamsSize
+   * CUPTI client should set the size of the structure. It will be used in CUPTI to check what fields are
+   * available in the structure. Used to preserve backward compatibility.
+   */
+  size_t size;
+  /**
+   * File handle.
+   */
+  std::ifstream *fileHandler;
+  /**
+   * Buffer Info.
+   */
+  BufferInfo bufferInfoData;
+} CUptiUtil_GetBufferInfoParams;
+#define CUptiUtil_GetBufferInfoParamsSize                   CUPTI_UTIL_STRUCT_SIZE(CUptiUtil_GetBufferInfoParams, bufferInfoData)
+/**
+ * \brief Get buffer info data of file.
+ *
+ * This API must be called every time before calling CuptiUtilGetPcSampData API.
+ * \ref BufferInfo structure, it gives info about recordCount and stallReasonCount
+ * of every record in the buffer. This will help to allocate exact buffer to retrieve data into it.
+ *
+ * \retval CUPTI_UTIL_SUCCESS
+ * \retval CUPTI_UTIL_ERROR_INVALID_PARAMETER error out if either of pParam or fileHandle is NULL or param struct size is incorrect.
+ * \retval CUPTI_UTIL_ERROR_FILE_HANDLE_CORRUPTED file handle is not in good state to read data from file.
+ * \retval CUPTI_UTIL_ERROR_READ_WRITE_OPERATION_FAILED failed to read data from file.
+ */
+CUptiUtilResult CUPTIUTILAPI CuptiUtilGetBufferInfo(CUptiUtil_GetBufferInfoParams *pParams);
+/**
+ * \brief Params for \ref CuptiUtilGetPcSampData
+ */
+typedef struct {
+  /**
+   * Size of the data structure i.e. CUpti_PCSamplingDisableParamsSize
+   * CUPTI client should set the size of the structure. It will be used in CUPTI to check what fields are
+   * available in the structure. Used to preserve backward compatibility.
+   */
+  size_t size;
+  /**
+   * File handle.
+   */
+  std::ifstream *fileHandler;
+  /**
+   * Type of buffer to store in file
+   */
+  PcSamplingBufferType bufferType;
+  /**
+   * Pointer to collected buffer info using \ref CuptiUtilGetBufferInfo
+   */
+  BufferInfo *pBufferInfoData;
+  /**
+   * Pointer to allocated memory to store retrieved data from file.
+   */
+  void *pSamplingData;
+  /**
+   * Number of configuration attributes
+   */
+  size_t numAttributes;
+  /**
+   * Refer \ref CUpti_PCSamplingConfigurationInfo
+   */
+  CUpti_PCSamplingConfigurationInfo *pPCSamplingConfigurationInfo;
+  /**
+   * Refer \ref PcSamplingStallReasons.
+   * For stallReasons field of \ref PcSamplingStallReasons it is expected to
+   * allocate memory for each string element of array.
+   */
+  PcSamplingStallReasons *pPcSamplingStallReasons;
+} CUptiUtil_GetPcSampDataParams;
+#define CUptiUtil_GetPcSampDataParamsSize                   CUPTI_UTIL_STRUCT_SIZE(CUptiUtil_GetPcSampDataParams, pPcSamplingStallReasons)
+/**
+ * \brief Retrieve PC sampling data from file into allocated buffer.
+ *
+ * This API must be called after CuptiUtilGetBufferInfo API.
+ * It will retrieve data from file into allocated buffer.
+ *
+ * \retval CUPTI_UTIL_SUCCESS
+ * \retval CUPTI_UTIL_ERROR_INVALID_PARAMETER error out if buffer type is invalid
+ * or if either of pSampData, pParams is NULL. If pPcSamplingStallReasons is not NULL then
+ * error out if either of stallReasonIndex, stallReasons or stallReasons array element pointer is NULL.
+ * or filename is empty.
+ * \retval CUPTI_UTIL_ERROR_READ_WRITE_OPERATION_FAILED
+ * \retval CUPTI_UTIL_ERROR_FILE_HANDLE_CORRUPTED file handle is not in good state to read data from file.
+ */
+CUptiUtilResult CUPTIUTILAPI CuptiUtilGetPcSampData(CUptiUtil_GetPcSampDataParams *pParams);
+/**
+ * \brief Params for \ref CuptiUtilMergePcSampData
+ */
+typedef struct
+{
+  /**
+   * Size of the data structure i.e. CUpti_PCSamplingDisableParamsSize
+   * CUPTI client should set the size of the structure. It will be used in CUPTI to check what fields are
+   * available in the structure. Used to preserve backward compatibility.
+   */
+  size_t size;
+  /**
+   * Number of buffers to merge.
+   */
+  size_t numberOfBuffers;
+  /**
+   * Pointer to array of buffers to merge
+   */
+  CUpti_PCSamplingData *PcSampDataBuffer;
+  /**
+   * Pointer to array of merged buffers as per the range id.
+   */
+  CUpti_PCSamplingData **MergedPcSampDataBuffers;
+  /**
+   * Number of merged buffers.
+   */
+  size_t *numMergedBuffer;
+} CUptiUtil_MergePcSampDataParams;
+#define CUptiUtil_MergePcSampDataParamsSize                   CUPTI_UTIL_STRUCT_SIZE(CUptiUtil_MergePcSampDataParams, numMergedBuffer)
+/**
+ * \brief Merge PC sampling data range id wise.
+ *
+ * This API merge PC sampling data range id wise.
+ * It allocates memory for merged data and fill data in it
+ * and provide buffer pointer in MergedPcSampDataBuffers field.
+ * It is expected from user to free merge data buffers after use.
+ *
+ * \retval CUPTI_UTIL_SUCCESS
+ * \retval CUPTI_UTIL_ERROR_INVALID_PARAMETER error out if param struct size is invalid
+ * or count of buffers to merge is invalid i.e less than 1
+ * or either of PcSampDataBuffer, MergedPcSampDataBuffers, numMergedBuffer is NULL
+ * \retval CUPTI_UTIL_ERROR_OUT_OF_MEMORY Unable to allocate memory for merged buffer.
+ */
+CUptiUtilResult CUPTIUTILAPI CuptiUtilMergePcSampData(CUptiUtil_MergePcSampDataParams *pParams);
+/** @} */ /* END CUPTI_PCSAMPLING_UTILITY */
+} }
+#if defined(__GNUC__)
+    #pragma GCC visibility pop
+#endif
+#if defined(__cplusplus)
+}
+#endif
+#endif

Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/driver_types.h ADDED Viewed

The diff for this file is too large to render. See raw diff

Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/generated_cudaVDPAU_meta.h ADDED Viewed

	@@ -0,0 +1,46 @@

+// This file is generated.  Any changes you make will be lost during the next clean build.
+// Dependent includes
+#include <vdpau/vdpau.h>
+// CUDA public interface, for type definitions and cu* function prototypes
+#include "cudaVDPAU.h"
+// *************************************************************************
+//      Definitions of structs to hold parameters for each function
+// *************************************************************************
+typedef struct cuVDPAUGetDevice_params_st {
+    CUdevice *pDevice;
+    VdpDevice vdpDevice;
+    VdpGetProcAddress *vdpGetProcAddress;
+} cuVDPAUGetDevice_params;
+typedef struct cuVDPAUCtxCreate_v2_params_st {
+    CUcontext *pCtx;
+    unsigned int flags;
+    CUdevice device;
+    VdpDevice vdpDevice;
+    VdpGetProcAddress *vdpGetProcAddress;
+} cuVDPAUCtxCreate_v2_params;
+typedef struct cuGraphicsVDPAURegisterVideoSurface_params_st {
+    CUgraphicsResource *pCudaResource;
+    VdpVideoSurface vdpSurface;
+    unsigned int flags;
+} cuGraphicsVDPAURegisterVideoSurface_params;
+typedef struct cuGraphicsVDPAURegisterOutputSurface_params_st {
+    CUgraphicsResource *pCudaResource;
+    VdpOutputSurface vdpSurface;
+    unsigned int flags;
+} cuGraphicsVDPAURegisterOutputSurface_params;
+typedef struct cuVDPAUCtxCreate_params_st {
+    CUcontext *pCtx;
+    unsigned int flags;
+    CUdevice device;
+    VdpDevice vdpDevice;
+    VdpGetProcAddress *vdpGetProcAddress;
+} cuVDPAUCtxCreate_params;

Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/nvperf_target.h ADDED Viewed

	@@ -0,0 +1,626 @@

+#ifndef NVPERF_TARGET_H
+#define NVPERF_TARGET_H
+/*
+ * Copyright 2014-2024 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO USER:
+ *
+ * This source code is subject to NVIDIA ownership rights under U.S. and
+ * international Copyright laws.
+ *
+ * This software and the information contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and conditions
+ * of a form of NVIDIA software license agreement.
+ *
+ * NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE
+ * CODE FOR ANY PURPOSE.  IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR
+ * IMPLIED WARRANTY OF ANY KIND.  NVIDIA DISCLAIMS ALL WARRANTIES WITH
+ * REGARD TO THIS SOURCE CODE, INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL,
+ * OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS
+ * OF USE, DATA OR PROFITS,  WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
+ * OR OTHER TORTIOUS ACTION,  ARISING OUT OF OR IN CONNECTION WITH THE USE
+ * OR PERFORMANCE OF THIS SOURCE CODE.
+ *
+ * U.S. Government End Users.   This source code is a "commercial item" as
+ * that term is defined at  48 C.F.R. 2.101 (OCT 1995), consisting  of
+ * "commercial computer  software"  and "commercial computer software
+ * documentation" as such terms are  used in 48 C.F.R. 12.212 (SEPT 1995)
+ * and is provided to the U.S. Government only as a commercial end item.
+ * Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through
+ * 227.7202-4 (JUNE 1995), all U.S. Government End Users acquire the
+ * source code with only those rights set forth herein.
+ *
+ * Any use of this source code in individual and commercial software must
+ * include, in the user documentation and internal comments to the code,
+ * the above Disclaimer and U.S. Government End Users Notice.
+ */
+#include <stddef.h>
+#include <stdint.h>
+#include "nvperf_common.h"
+#if defined(__GNUC__) && defined(NVPA_SHARED_LIB)
+    #pragma GCC visibility push(default)
+    #if !defined(NVPW_LOCAL)
+        #define NVPW_LOCAL __attribute__ ((visibility ("hidden")))
+    #endif
+#else
+    #if !defined(NVPW_LOCAL)
+        #define NVPW_LOCAL
+    #endif
+#endif
+#ifdef __cplusplus
+extern "C" {
+#endif
+/**
+ *  @file   nvperf_target.h
+ */
+#ifndef NVPW_GPU_ARCHITECTURE_SUPPORT_LEVEL_DEFINED
+#define NVPW_GPU_ARCHITECTURE_SUPPORT_LEVEL_DEFINED
+    /// GPU architecture support level
+    typedef enum NVPW_GpuArchitectureSupportLevel
+    {
+        NVPW_GPU_ARCHITECTURE_SUPPORT_LEVEL_UNKNOWN = 0,
+        NVPW_GPU_ARCHITECTURE_SUPPORT_LEVEL_UNSUPPORTED,
+        NVPW_GPU_ARCHITECTURE_SUPPORT_LEVEL_SUPPORTED
+    } NVPW_GpuArchitectureSupportLevel;
+#endif //NVPW_GPU_ARCHITECTURE_SUPPORT_LEVEL_DEFINED
+#ifndef NVPW_SLI_SUPPORT_LEVEL_DEFINED
+#define NVPW_SLI_SUPPORT_LEVEL_DEFINED
+    /// SLI configuration support level
+    typedef enum NVPW_SliSupportLevel
+    {
+        NVPW_SLI_SUPPORT_LEVEL_UNKNOWN = 0,
+        NVPW_SLI_SUPPORT_LEVEL_UNSUPPORTED,
+        /// Only Non-SLI configurations are supported.
+        NVPW_SLI_SUPPORT_LEVEL_SUPPORTED_NON_SLI_CONFIGURATION
+    } NVPW_SliSupportLevel;
+#endif //NVPW_SLI_SUPPORT_LEVEL_DEFINED
+#ifndef NVPW_VGPU_SUPPORT_LEVEL_DEFINED
+#define NVPW_VGPU_SUPPORT_LEVEL_DEFINED
+    /// Virtualized GPU configuration support level
+    typedef enum NVPW_VGpuSupportLevel
+    {
+        NVPW_VGPU_SUPPORT_LEVEL_UNKNOWN = 0,
+        NVPW_VGPU_SUPPORT_LEVEL_UNSUPPORTED,
+        /// Supported but not allowed by system admin.
+        NVPW_VGPU_SUPPORT_LEVEL_SUPPORTED_DISALLOWED,
+        NVPW_VGPU_SUPPORT_LEVEL_SUPPORTED_ALLOWED,
+        NVPW_VGPU_SUPPORT_LEVEL_SUPPORTED_NON_VGPU_CONFIGURATION
+    } NVPW_VGpuSupportLevel;
+#endif //NVPW_VGPU_SUPPORT_LEVEL_DEFINED
+#ifndef NVPW_CONF_COMPUTE_SUPPORT_LEVEL_DEFINED
+#define NVPW_CONF_COMPUTE_SUPPORT_LEVEL_DEFINED
+    /// Confidential Compute mode support level
+    typedef enum NVPW_ConfidentialComputeSupportLevel
+    {
+        NVPW_CONF_COMPUTE_SUPPORT_LEVEL_UNKNOWN = 0,
+        NVPW_CONF_COMPUTE_SUPPORT_LEVEL_UNSUPPORTED,
+        NVPW_CONF_COMPUTE_SUPPORT_LEVEL_SUPPORTED_NON_CONF_COMPUTE_CONFIGURATION,
+        NVPW_CONF_COMPUTE_SUPPORT_LEVEL_SUPPORTED_CONF_COMPUTE_DEVTOOLS_MODE
+    } NVPW_ConfidentialComputeSupportLevel;
+#endif //NVPW_CONF_COMPUTE_SUPPORT_LEVEL_DEFINED
+#ifndef NVPW_CMP_SUPPORT_LEVEL_DEFINED
+#define NVPW_CMP_SUPPORT_LEVEL_DEFINED
+    /// CMP support level
+    typedef enum NVPW_CmpSupportLevel
+    {
+        NVPW_CMP_SUPPORT_LEVEL_UNKNOWN = 0,
+        NVPW_CMP_SUPPORT_LEVEL_UNSUPPORTED,
+        NVPW_CMP_SUPPORT_LEVEL_SUPPORTED_NON_CMP_CONFIGURATON
+    } NVPW_CmpSupportLevel;
+#endif //NVPW_CMP_SUPPORT_LEVEL_DEFINED
+#ifndef NVPW_WSL_SUPPORT_LEVEL_DEFINED
+#define NVPW_WSL_SUPPORT_LEVEL_DEFINED
+    /// WSL support level
+    typedef enum NVPW_WslSupportLevel
+    {
+        NVPW_WSL_SUPPORT_LEVEL_UNKNOWN = 0,
+        NVPW_WSL_SUPPORT_LEVEL_UNSUPPORTED_INSUFFICIENT_DRIVER_VERSION,
+        NVPW_WSL_SUPPORT_LEVEL_SUPPORTED,
+        NVPW_WSL_SUPPORT_LEVEL_SUPPORTED_NON_WSL_CONFIGURATION
+    } NVPW_WslSupportLevel;
+#endif //NVPW_WSL_SUPPORT_LEVEL_DEFINED
+#ifndef NVPW_MIG_SUPPORT_LEVEL_DEFINED
+#define NVPW_MIG_SUPPORT_LEVEL_DEFINED
+    /// MIG support level
+    typedef enum NVPW_MigSupportLevel
+    {
+        NVPW_MIG_SUPPORT_LEVEL_UNKNOWN = 0,
+        NVPW_MIG_SUPPORT_LEVEL_UNSUPPORTED,
+        NVPW_MIG_SUPPORT_LEVEL_SUPPORTED,
+        NVPW_MIG_SUPPORT_LEVEL_SUPPORTED_NON_MIG_CONFIGURATION
+    } NVPW_MigSupportLevel;
+#endif //NVPW_MIG_SUPPORT_LEVEL_DEFINED
+    typedef struct NVPW_InitializeTarget_Params
+    {
+        /// [in]
+        size_t structSize;
+        /// [in] assign to NULL
+        void* pPriv;
+    } NVPW_InitializeTarget_Params;
+#define NVPW_InitializeTarget_Params_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_InitializeTarget_Params, pPriv)
+    /// Load the target library.
+    NVPA_Status NVPW_InitializeTarget(NVPW_InitializeTarget_Params* pParams);
+    typedef struct NVPW_GetDeviceCount_Params
+    {
+        /// [in]
+        size_t structSize;
+        /// [in] assign to NULL
+        void* pPriv;
+        size_t numDevices;
+    } NVPW_GetDeviceCount_Params;
+#define NVPW_GetDeviceCount_Params_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_GetDeviceCount_Params, numDevices)
+    NVPA_Status NVPW_GetDeviceCount(NVPW_GetDeviceCount_Params* pParams);
+    typedef struct NVPW_Device_GetNames_Params
+    {
+        /// [in]
+        size_t structSize;
+        /// [in] assign to NULL
+        void* pPriv;
+        size_t deviceIndex;
+        const char* pDeviceName;
+        const char* pChipName;
+    } NVPW_Device_GetNames_Params;
+#define NVPW_Device_GetNames_Params_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_Device_GetNames_Params, pChipName)
+    NVPA_Status NVPW_Device_GetNames(NVPW_Device_GetNames_Params* pParams);
+    typedef struct NVPW_PciBusId
+    {
+        /// The PCI domain on which the device bus resides.
+        uint32_t domain;
+        ///  The bus on which the device resides.
+        uint16_t bus;
+        /// device ID.
+        uint16_t device;
+    } NVPW_PciBusId;
+#define NVPW_PciBusId_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_PciBusId, device)
+    typedef struct NVPW_Device_GetPciBusIds_Params
+    {
+        /// [in]
+        size_t structSize;
+        /// [in] assign to NULL
+        void* pPriv;
+        /// [in] caller-allocated array of NVPW_PciBusId, indexed by NVPW deviceIndex
+        NVPW_PciBusId* pBusIds;
+        /// [in] size of the pBusIDs array; use result from NVPW_GetDeviceCount
+        size_t numDevices;
+    } NVPW_Device_GetPciBusIds_Params;
+#define NVPW_Device_GetPciBusIds_Params_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_Device_GetPciBusIds_Params, numDevices)
+    NVPA_Status NVPW_Device_GetPciBusIds(NVPW_Device_GetPciBusIds_Params* pParams);
+#define NVPW_DEVICE_MIG_GPU_INSTANCE_ID_INVALID     0xFFFFFFFFu
+#define NVPW_DEVICE_MIG_GPU_INSTANCE_ID_FULLCHIP    0xFFFFFFFEu
+    typedef struct NVPW_Device_GetMigAttributes_Params
+    {
+        /// [in]
+        size_t structSize;
+        /// [in] assign to NULL
+        void* pPriv;
+        /// [in]
+        size_t deviceIndex;
+        /// [out]
+        NVPA_Bool isMigPartition;
+        /// [out]
+        uint32_t gpuInstanceId;
+        /// [out]
+        uint32_t computeInstanceId;
+    } NVPW_Device_GetMigAttributes_Params;
+#define NVPW_Device_GetMigAttributes_Params_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_Device_GetMigAttributes_Params, computeInstanceId)
+    NVPA_Status NVPW_Device_GetMigAttributes(NVPW_Device_GetMigAttributes_Params* pParams);
+    typedef struct NVPW_Adapter_GetDeviceIndex_Params
+    {
+        /// [in]
+        size_t structSize;
+        /// [in] assign to NULL
+        void* pPriv;
+        /// [in]
+        struct IDXGIAdapter* pAdapter;
+        /// [in]
+        size_t sliIndex;
+        /// [out]
+        size_t deviceIndex;
+    } NVPW_Adapter_GetDeviceIndex_Params;
+#define NVPW_Adapter_GetDeviceIndex_Params_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_Adapter_GetDeviceIndex_Params, deviceIndex)
+    NVPA_Status NVPW_Adapter_GetDeviceIndex(NVPW_Adapter_GetDeviceIndex_Params* pParams);
+    typedef struct NVPW_CounterData_GetNumRanges_Params
+    {
+        /// [in]
+        size_t structSize;
+        /// [in] assign to NULL
+        void* pPriv;
+        const uint8_t* pCounterDataImage;
+        size_t numRanges;
+    } NVPW_CounterData_GetNumRanges_Params;
+#define NVPW_CounterData_GetNumRanges_Params_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_CounterData_GetNumRanges_Params, numRanges)
+    NVPA_Status NVPW_CounterData_GetNumRanges(NVPW_CounterData_GetNumRanges_Params* pParams);
+    typedef struct NVPW_CounterData_GetChipName_Params
+    {
+        /// [in]
+        size_t structSize;
+        /// [in] assign to NULL
+        void* pPriv;
+        /// [in]
+        const uint8_t* pCounterDataImage;
+        /// [in]
+        size_t counterDataImageSize;
+        /// [out]
+        const char* pChipName;
+    } NVPW_CounterData_GetChipName_Params;
+#define NVPW_CounterData_GetChipName_Params_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_CounterData_GetChipName_Params, pChipName)
+    NVPA_Status NVPW_CounterData_GetChipName(NVPW_CounterData_GetChipName_Params* pParams);
+    typedef struct NVPW_Config_GetNumPasses_Params
+    {
+        /// [in]
+        size_t structSize;
+        /// [in] assign to NULL
+        void* pPriv;
+        /// [in]
+        const uint8_t* pConfig;
+        /// [out]
+        size_t numPipelinedPasses;
+        /// [out]
+        size_t numIsolatedPasses;
+    } NVPW_Config_GetNumPasses_Params;
+#define NVPW_Config_GetNumPasses_Params_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_Config_GetNumPasses_Params, numIsolatedPasses)
+    /// Total num passes = numPipelinedPasses + numIsolatedPasses * numNestingLevels
+    NVPA_Status NVPW_Config_GetNumPasses(NVPW_Config_GetNumPasses_Params* pParams);
+    typedef struct NVPW_Config_GetNumPasses_V2_Params
+    {
+        /// [in]
+        size_t structSize;
+        /// [in] assign to NULL
+        void* pPriv;
+        /// [in]
+        const uint8_t* pConfig;
+        /// [out]
+        size_t numPasses;
+    } NVPW_Config_GetNumPasses_V2_Params;
+#define NVPW_Config_GetNumPasses_V2_Params_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_Config_GetNumPasses_V2_Params, numPasses)
+    /// Total num passes = numPasses * numNestingLevels
+    NVPA_Status NVPW_Config_GetNumPasses_V2(NVPW_Config_GetNumPasses_V2_Params* pParams);
+#define NVPW_API_SET_CUDA_PROFILER             0x18209d0775b2f89dULL
+#define NVPW_API_SET_D3D11_PROFILER            0xca55c6738445db2bULL
+#define NVPW_API_SET_D3D12_PROFILER            0xc0c2d46dd7c7ad78ULL
+#define NVPW_API_SET_EGL_PROFILER              0x3c3747dae1f9565cULL
+#define NVPW_API_SET_GPU_PERIODICSAMPLER       0x9f4c2571fc0b2e8aULL
+#define NVPW_API_SET_METRICSEVALUATOR          0x0368a8768d811af9ULL
+#define NVPW_API_SET_METRICS_AD10X_COMP        0xbe57278e12cb5288ULL
+#define NVPW_API_SET_METRICS_AD10X_GRFX        0x5cbf0774f81bf491ULL
+#define NVPW_API_SET_METRICS_GA100_COMP        0x16b7d8c20d8b4915ULL
+#define NVPW_API_SET_METRICS_GA100_GRFX        0xc94eaabec04a94faULL
+#define NVPW_API_SET_METRICS_GA10X_COMP        0xb5d6391c2e299ab5ULL
+#define NVPW_API_SET_METRICS_GA10X_GRFX        0x6ebc121178b5ce0bULL
+#define NVPW_API_SET_METRICS_GV100_COMP        0x863705cc57919f72ULL
+#define NVPW_API_SET_METRICS_GV100_GRFX        0x9900da75d164fecfULL
+#define NVPW_API_SET_METRICS_GV11B_COMP        0xd3f79a859235848fULL
+#define NVPW_API_SET_METRICS_GV11B_GRFX        0xeb8e26220106e227ULL
+#define NVPW_API_SET_METRICS_TU10X_COMP        0x70f40be0afd35da8ULL
+#define NVPW_API_SET_METRICS_TU10X_GRFX        0xdf219cb838db6968ULL
+#define NVPW_API_SET_METRICS_TU11X_COMP        0xeb0069d7d0956678ULL
+#define NVPW_API_SET_METRICS_TU11X_GRFX        0x0977d9342bd62743ULL
+#define NVPW_API_SET_OPENGL_PROFILER           0xe4cd9ea40f2ee777ULL
+#define NVPW_API_SET_VULKAN_PROFILER           0x8c56b6a03d779689ULL
+#define NVPW_SDK_VERSION               0x1e128b6f001423fcULL
+    typedef struct NVPW_QueryVersionNumber_Params
+    {
+        /// [in]
+        size_t structSize;
+        /// [in] assign to NULL
+        void* pPriv;
+        /// [in]
+        uint64_t apiSet;
+        /// [out]
+        uint32_t major;
+        /// [out]
+        uint32_t minor;
+        /// [out]
+        uint32_t patch;
+        /// [out]
+        uint32_t relMajor;
+        /// [out]
+        uint32_t relMinor;
+        /// [out]
+        uint32_t relPatch;
+    } NVPW_QueryVersionNumber_Params;
+#define NVPW_QueryVersionNumber_Params_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_QueryVersionNumber_Params, relPatch)
+    /// Query version number of an API set
+    NVPA_Status NVPW_QueryVersionNumber(NVPW_QueryVersionNumber_Params* pParams);
+    typedef enum NVPW_Device_ClockStatus
+    {
+        /// clock status is unknown
+        NVPW_DEVICE_CLOCK_STATUS_UNKNOWN,
+        /// clocks are locked to rated tdp values - Deprecated, use NVPW_DEVICE_CLOCK_STATUS_LOCKED instead
+        NVPW_DEVICE_CLOCK_STATUS_LOCKED_TO_RATED_TDP,
+        /// clocks are not locked and can boost above rated tdp
+        NVPW_DEVICE_CLOCK_STATUS_BOOST_ENABLED,
+        /// clocks are not locked and will not go above rated tdp
+        NVPW_DEVICE_CLOCK_STATUS_BOOST_DISABLED,
+        /// clocks are locked
+        NVPW_DEVICE_CLOCK_STATUS_LOCKED,
+        /// clocks are not locked
+        NVPW_DEVICE_CLOCK_STATUS_UNLOCKED,
+        NVPW_DEVICE_CLOCK_STATUS__COUNT
+    } NVPW_Device_ClockStatus;
+    typedef enum NVPW_Device_ClockLevel
+    {
+        /// clock level is invalid
+        NVPW_DEVICE_CLOCK_LEVEL_INVALID,
+        /// clock level is at rated tdp
+        NVPW_DEVICE_CLOCK_LEVEL_RATED_TDP,
+        /// clock level is at turbo boost
+        NVPW_DEVICE_CLOCK_LEVEL_TURBO_BOOST,
+        NVPW_DEVICE_CLOCK_LEVEL__COUNT
+    } NVPW_Device_ClockLevel;
+    typedef struct NVPW_Device_GetClockStatus_Params
+    {
+        /// [in]
+        size_t structSize;
+        /// [in] assign to NULL
+        void* pPriv;
+        size_t deviceIndex;
+        /// [in]
+        NVPW_Device_ClockStatus clockStatus;
+        /// [in]
+        NVPW_Device_ClockLevel clockLevel;
+    } NVPW_Device_GetClockStatus_Params;
+#define NVPW_Device_GetClockStatus_Params_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_Device_GetClockStatus_Params, clockLevel)
+    NVPA_Status NVPW_Device_GetClockStatus(NVPW_Device_GetClockStatus_Params* pParams);
+    typedef enum NVPW_Device_ClockSetting
+    {
+        /// invalid op, specify valid clocks operation during profiling
+        NVPW_DEVICE_CLOCK_SETTING_INVALID,
+        /// default to driver/application config (normally unlocked and not boosted, but could be unlocked boosted, or
+        /// locked to rated TDP)
+        NVPW_DEVICE_CLOCK_SETTING_DEFAULT,
+        /// lock clocks at rated tdp base values
+        NVPW_DEVICE_CLOCK_SETTING_LOCK_TO_RATED_TDP,
+        /// lock clocks at turbo boost values
+        NVPW_DEVICE_CLOCK_SETTING_LOCK_TO_TURBO_BOOST,
+        NVPW_DEVICE_CLOCK_SETTING__COUNT
+    } NVPW_Device_ClockSetting;
+    typedef struct NVPW_Device_SetClockSetting_Params
+    {
+        /// [in]
+        size_t structSize;
+        /// [in] assign to NULL
+        void* pPriv;
+        size_t deviceIndex;
+        /// [in]
+        NVPW_Device_ClockSetting clockSetting;
+    } NVPW_Device_SetClockSetting_Params;
+#define NVPW_Device_SetClockSetting_Params_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_Device_SetClockSetting_Params, clockSetting)
+    NVPA_Status NVPW_Device_SetClockSetting(NVPW_Device_SetClockSetting_Params* pParams);
+    typedef struct NVPW_CounterData_GetRangeDescriptions_Params
+    {
+        /// [in]
+        size_t structSize;
+        /// [in] assign to NULL
+        void* pPriv;
+        const uint8_t* pCounterDataImage;
+        size_t rangeIndex;
+        /// [inout] Number of descriptions allocated in ppDescriptions
+        size_t numDescriptions;
+        const char** ppDescriptions;
+    } NVPW_CounterData_GetRangeDescriptions_Params;
+#define NVPW_CounterData_GetRangeDescriptions_Params_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_CounterData_GetRangeDescriptions_Params, ppDescriptions)
+    NVPA_Status NVPW_CounterData_GetRangeDescriptions(NVPW_CounterData_GetRangeDescriptions_Params* pParams);
+    typedef struct NVPW_Profiler_CounterData_GetRangeDescriptions_Params
+    {
+        /// [in]
+        size_t structSize;
+        /// [in] assign to NULL
+        void* pPriv;
+        const uint8_t* pCounterDataImage;
+        size_t rangeIndex;
+        /// [inout] Number of descriptions allocated in ppDescriptions
+        size_t numDescriptions;
+        const char** ppDescriptions;
+    } NVPW_Profiler_CounterData_GetRangeDescriptions_Params;
+#define NVPW_Profiler_CounterData_GetRangeDescriptions_Params_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_Profiler_CounterData_GetRangeDescriptions_Params, ppDescriptions)
+    NVPA_Status NVPW_Profiler_CounterData_GetRangeDescriptions(NVPW_Profiler_CounterData_GetRangeDescriptions_Params* pParams);
+#ifndef NVPW_PERIODIC_SAMPLER_COUNTER_DATA_APPEND_MODE_DEFINED
+#define NVPW_PERIODIC_SAMPLER_COUNTER_DATA_APPEND_MODE_DEFINED
+    typedef enum NVPW_PeriodicSampler_CounterData_AppendMode
+    {
+        NVPW_PERIODIC_SAMPLER_COUNTER_DATA_APPEND_MODE_LINEAR = 0,
+        NVPW_PERIODIC_SAMPLER_COUNTER_DATA_APPEND_MODE_CIRCULAR = 1,
+        NVPW_PERIODIC_SAMPLER_COUNTER_DATA_APPEND_MODE__COUNT
+    } NVPW_PeriodicSampler_CounterData_AppendMode;
+#endif //NVPW_PERIODIC_SAMPLER_COUNTER_DATA_APPEND_MODE_DEFINED
+    typedef struct NVPW_PeriodicSampler_CounterData_GetSampleTime_Params
+    {
+        /// [in]
+        size_t structSize;
+        /// [in] assign to NULL
+        void* pPriv;
+        /// [in]
+        const uint8_t* pCounterDataImage;
+        /// [in]
+        size_t rangeIndex;
+        /// [out]
+        uint64_t timestampStart;
+        /// [out]
+        uint64_t timestampEnd;
+    } NVPW_PeriodicSampler_CounterData_GetSampleTime_Params;
+#define NVPW_PeriodicSampler_CounterData_GetSampleTime_Params_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_PeriodicSampler_CounterData_GetSampleTime_Params, timestampEnd)
+    NVPA_Status NVPW_PeriodicSampler_CounterData_GetSampleTime(NVPW_PeriodicSampler_CounterData_GetSampleTime_Params* pParams);
+    typedef struct NVPW_PeriodicSampler_CounterData_TrimInPlace_Params
+    {
+        /// [in]
+        size_t structSize;
+        /// [in] assign to NULL
+        void* pPriv;
+        /// [in]
+        uint8_t* pCounterDataImage;
+        /// [in]
+        size_t counterDataImageSize;
+        /// [out]
+        size_t counterDataImageTrimmedSize;
+    } NVPW_PeriodicSampler_CounterData_TrimInPlace_Params;
+#define NVPW_PeriodicSampler_CounterData_TrimInPlace_Params_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_PeriodicSampler_CounterData_TrimInPlace_Params, counterDataImageTrimmedSize)
+    NVPA_Status NVPW_PeriodicSampler_CounterData_TrimInPlace(NVPW_PeriodicSampler_CounterData_TrimInPlace_Params* pParams);
+    typedef struct NVPW_PeriodicSampler_CounterData_GetInfo_Params
+    {
+        /// [in]
+        size_t structSize;
+        /// [in] assign to NULL
+        void* pPriv;
+        /// [in]
+        const uint8_t* pCounterDataImage;
+        /// [in]
+        size_t counterDataImageSize;
+        /// [out] total number of ranges in the counter data
+        size_t numTotalRanges;
+        /// [out] if in "linear" mode, this API returns the number of "populated" ranges; if it's in "circular" mode,
+        /// then it returns the last "populated" range index + 1, when there is no such range, it returns 0.
+        size_t numPopulatedRanges;
+        /// [out] if in "linear" mode, this API returns the number of "completed" ranges; if it's in "circular" mode,
+        /// then it returns the last "completed" range index + 1, when there is no such range, it returns 0.
+        size_t numCompletedRanges;
+    } NVPW_PeriodicSampler_CounterData_GetInfo_Params;
+#define NVPW_PeriodicSampler_CounterData_GetInfo_Params_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_PeriodicSampler_CounterData_GetInfo_Params, numCompletedRanges)
+    /// In periodic sampler, a range in counter data stores exactly one sample's data. For better performance, periodic
+    /// sampler may operate in an out-of-order fashion when populating sample data, i.e. it may not fully populate all
+    /// counters of a sample/range before starting to populate the next sample/range. As a result, we have two concepts
+    /// here, "populated" & "completed": a range is considered "populated" even if only partial counters have been
+    /// written; on the other hand, a range is only considered "completed" if all the collecting counters have been
+    /// written.
+    NVPA_Status NVPW_PeriodicSampler_CounterData_GetInfo(NVPW_PeriodicSampler_CounterData_GetInfo_Params* pParams);
+    typedef struct NVPW_PeriodicSampler_CounterData_GetTriggerCount_Params
+    {
+        /// [in]
+        size_t structSize;
+        /// [in] assign to NULL
+        void* pPriv;
+        /// [in]
+        const uint8_t* pCounterDataImage;
+        /// [in]
+        size_t counterDataImageSize;
+        /// [in]
+        size_t rangeIndex;
+        /// [out]
+        uint32_t triggerCount;
+    } NVPW_PeriodicSampler_CounterData_GetTriggerCount_Params;
+#define NVPW_PeriodicSampler_CounterData_GetTriggerCount_Params_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_PeriodicSampler_CounterData_GetTriggerCount_Params, triggerCount)
+    NVPA_Status NVPW_PeriodicSampler_CounterData_GetTriggerCount(NVPW_PeriodicSampler_CounterData_GetTriggerCount_Params* pParams);
+    typedef struct NVPW_PeriodicSampler_CounterData_IsDataComplete_Params
+    {
+        /// [in]
+        size_t structSize;
+        /// [in] assign to NULL
+        void* pPriv;
+        /// [in]
+        const uint8_t* pCounterDataImage;
+        /// [in]
+        size_t counterDataImageSize;
+        /// [in]
+        size_t rangeIndex;
+        /// [out]
+        NVPA_Bool isComplete;
+    } NVPW_PeriodicSampler_CounterData_IsDataComplete_Params;
+#define NVPW_PeriodicSampler_CounterData_IsDataComplete_Params_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_PeriodicSampler_CounterData_IsDataComplete_Params, isComplete)
+    /// Checks whether a given sample's data is complete. See also 'NVPW_PeriodicSampler_CounterData_GetInfo'
+    NVPA_Status NVPW_PeriodicSampler_CounterData_IsDataComplete(NVPW_PeriodicSampler_CounterData_IsDataComplete_Params* pParams);
+    typedef struct NVPW_TimestampReport
+    {
+        uint32_t payload;
+        uint8_t reserved0004[4];
+        uint64_t timestamp;
+    } NVPW_TimestampReport;
+#ifdef __cplusplus
+} // extern "C"
+#endif
+#if defined(__GNUC__) && defined(NVPA_SHARED_LIB)
+    #pragma GCC visibility pop
+#endif
+#endif // NVPERF_TARGET_H

Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/sm_32_atomic_functions.hpp ADDED Viewed

	@@ -0,0 +1,151 @@

+/*
+ * Copyright 1993-2023 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 35.235 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.35.235 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+#if !defined(__SM_32_ATOMIC_FUNCTIONS_HPP__)
+#define __SM_32_ATOMIC_FUNCTIONS_HPP__
+#ifdef __CUDA_ARCH__
+extern "C"
+{
+extern __device__ __device_builtin__ long long __illAtomicMin(long long *address, long long val);
+extern __device__ __device_builtin__ long long __illAtomicMax(long long *address, long long val);
+extern __device__ __device_builtin__ long long __llAtomicAnd(long long *address, long long val);
+extern __device__ __device_builtin__ long long __llAtomicOr(long long *address, long long val);
+extern __device__ __device_builtin__ long long __llAtomicXor(long long *address, long long val);
+extern __device__ __device_builtin__ unsigned long long __ullAtomicMin(unsigned long long *address, unsigned long long val);
+extern __device__ __device_builtin__ unsigned long long __ullAtomicMax(unsigned long long *address, unsigned long long val);
+extern __device__ __device_builtin__ unsigned long long __ullAtomicAnd(unsigned long long *address, unsigned long long val);
+extern __device__ __device_builtin__ unsigned long long __ullAtomicOr (unsigned long long *address, unsigned long long val);
+extern __device__ __device_builtin__ unsigned long long __ullAtomicXor(unsigned long long *address, unsigned long long val);
+}
+#endif /* __CUDA_ARCH__ */
+#if defined(__CUDACC_RTC__)
+#define __SM_32_ATOMIC_FUNCTIONS_DECL__ __device__
+#else /* !__CUDACC_RTC__ */
+#define __SM_32_ATOMIC_FUNCTIONS_DECL__ static __inline__ __device__
+#endif /* __CUDACC_RTC__ */
+#if defined(__cplusplus) && defined(__CUDACC__)
+#if defined(_NVHPC_CUDA) || !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 320
+/*******************************************************************************
+*                                                                              *
+*                                                                              *
+*                                                                              *
+*******************************************************************************/
+#include "cuda_runtime_api.h"
+/*******************************************************************************
+*                                                                              *
+*                                                                              *
+*                                                                              *
+*******************************************************************************/
+__SM_32_ATOMIC_FUNCTIONS_DECL__ long long atomicMin(long long *address, long long val)
+{
+    return __illAtomicMin(address, val);
+}
+__SM_32_ATOMIC_FUNCTIONS_DECL__ long long atomicMax(long long *address, long long val)
+{
+    return __illAtomicMax(address, val);
+}
+__SM_32_ATOMIC_FUNCTIONS_DECL__ long long atomicAnd(long long *address, long long val)
+{
+    return __llAtomicAnd(address, val);
+}
+__SM_32_ATOMIC_FUNCTIONS_DECL__ long long atomicOr(long long *address, long long val)
+{
+    return __llAtomicOr(address, val);
+}
+__SM_32_ATOMIC_FUNCTIONS_DECL__ long long atomicXor(long long *address, long long val)
+{
+    return __llAtomicXor(address, val);
+}
+__SM_32_ATOMIC_FUNCTIONS_DECL__ unsigned long long atomicMin(unsigned long long *address, unsigned long long val)
+{
+    return __ullAtomicMin(address, val);
+}
+__SM_32_ATOMIC_FUNCTIONS_DECL__ unsigned long long atomicMax(unsigned long long *address, unsigned long long val)
+{
+    return __ullAtomicMax(address, val);
+}
+__SM_32_ATOMIC_FUNCTIONS_DECL__ unsigned long long atomicAnd(unsigned long long *address, unsigned long long val)
+{
+    return __ullAtomicAnd(address, val);
+}
+__SM_32_ATOMIC_FUNCTIONS_DECL__ unsigned long long atomicOr(unsigned long long *address, unsigned long long val)
+{
+    return __ullAtomicOr(address, val);
+}
+__SM_32_ATOMIC_FUNCTIONS_DECL__ unsigned long long atomicXor(unsigned long long *address, unsigned long long val)
+{
+    return __ullAtomicXor(address, val);
+}
+#endif /* _NVHPC_CUDA || !__CUDA_ARCH__ || __CUDA_ARCH__ >= 320 */
+#endif /* __cplusplus && __CUDACC__ */
+#undef __SM_32_ATOMIC_FUNCTIONS_DECL__
+#endif /* !__SM_32_ATOMIC_FUNCTIONS_HPP__ */

Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/language/__pycache__/__init__.cpython-312.pyc ADDED Viewed

Binary file (7.17 kB). View file

Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/language/__pycache__/math.cpython-312.pyc ADDED Viewed

Binary file (14.6 kB). View file

Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/language/__pycache__/random.cpython-312.pyc ADDED Viewed

Binary file (9.64 kB). View file

Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/language/__pycache__/standard.cpython-312.pyc ADDED Viewed

Binary file (21.8 kB). View file

Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/language/__pycache__/target_info.cpython-312.pyc ADDED Viewed

Binary file (2.28 kB). View file

Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/language/extra/__init__.py ADDED Viewed

	@@ -0,0 +1,26 @@

+import pkgutil
+from importlib.util import module_from_spec
+from sys import modules
+_backends = []
+for module_finder, module_name, is_pkg in pkgutil.iter_modules(
+        __path__,
+        prefix=__name__ + ".",
+):
+    # skip .py files (like libdevice.py)
+    if not is_pkg:
+        continue
+    # import backends (like cuda and hip) that are included during setup.py
+    spec = module_finder.find_spec(module_name)
+    if spec is None or spec.loader is None:
+        continue
+    module = module_from_spec(spec)
+    spec.loader.exec_module(module)
+    _backends.append(module_name)
+    modules[module_name] = module
+__all__ = _backends
+del _backends

Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/language/extra/__pycache__/__init__.cpython-312.pyc ADDED Viewed

Binary file (911 Bytes). View file

Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/language/extra/__pycache__/libdevice.cpython-312.pyc ADDED Viewed

Binary file (20.4 kB). View file

Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/language/extra/cuda/__init__.py ADDED Viewed

	@@ -0,0 +1,16 @@

+from . import libdevice
+from .utils import (globaltimer, num_threads, num_warps, smid, convert_custom_float8_sm70, convert_custom_float8_sm80)
+from .gdc import (gdc_launch_dependents, gdc_wait)
+__all__ = [
+    "libdevice",
+    "globaltimer",
+    "num_threads",
+    "num_warps",
+    "smid",
+    "convert_custom_float8_sm70",
+    "convert_custom_float8_sm80",
+    "gdc_launch_dependents",
+    "gdc_wait",
+]

Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/language/extra/cuda/__pycache__/__init__.cpython-312.pyc ADDED Viewed

Binary file (585 Bytes). View file

Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/language/extra/cuda/__pycache__/gdc.cpython-312.pyc ADDED Viewed

Binary file (2.74 kB). View file

Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/language/extra/cuda/__pycache__/libdevice.cpython-312.pyc ADDED Viewed

Binary file (89.2 kB). View file

Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/language/extra/cuda/__pycache__/utils.cpython-312.pyc ADDED Viewed

Binary file (5.61 kB). View file

Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/language/extra/cuda/gdc.py ADDED Viewed

	@@ -0,0 +1,42 @@

+"""
+Grid Dependency Control (GDC) is a mechanism used when enabling programmatic dependent launch to launch and
+synchronize grids. These APIs expose GDC to the programmer.
+Programmatic dependent launch is supported on SM90 (Hopper) and beyond.
+For PTX reference on grid dependency control see https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-griddepcontrol.
+"""
+from triton.language import core
+@core.extern
+def gdc_wait(_semantic=None):
+    """
+    GDC wait is a blocking instruction that waits for all instructions in a prior kernel to complete before continuing.
+    This ensures all memory operations happening before the wait is visible to instructions after it,
+    e.g. if the prior kernel writes to address "x" the new values will be visible in this kernel after the wait.
+    This instruction is also safe to execute when programmatic dependent launch is disabled.
+    See https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-griddepcontrol for more details.
+    """
+    core.inline_asm_elementwise("griddepcontrol.wait; // dummy $0", "=r", [], dtype=core.int32, is_pure=False, pack=1,
+                                _semantic=_semantic)
+@core.extern
+def gdc_launch_dependents(_semantic=None):
+    """
+    This operation when launched with programmatic dependent launch signals that
+    the next program may launch once all programs in the current kernel
+    call this function or complete.
+    Repeated calls to this function have no effect past the first call, and the first call should be
+    treated by the programmer as a hint to the runtime system to launch the next kernel.
+    This instruction is also safe to execute when programmatic dependent launch is disabled.
+    See https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-griddepcontrol for more details.
+    """
+    core.inline_asm_elementwise("griddepcontrol.launch_dependents; // dummy $0", "=r", [], dtype=core.int32,
+                                is_pure=False, pack=1, _semantic=_semantic)

Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/language/extra/cuda/libdevice.py ADDED Viewed

	@@ -0,0 +1,1629 @@

+from triton.language import core
+@core.extern
+def clz(arg0, _semantic=None):
+    return core.extern_elementwise(
+        "", "", [arg0], {
+            (core.dtype("int32"), ): ("__nv_clz", core.dtype("int32")),
+            (core.dtype("int64"), ): ("__nv_clzll", core.dtype("int32")),
+        }, is_pure=True, _semantic=_semantic)
+@core.extern
+def popc(arg0, _semantic=None):
+    return core.extern_elementwise(
+        "", "", [arg0], {
+            (core.dtype("int32"), ): ("__nv_popc", core.dtype("int32")),
+            (core.dtype("int64"), ): ("__nv_popcll", core.dtype("int32")),
+        }, is_pure=True, _semantic=_semantic)
+@core.extern
+def byte_perm(arg0, arg1, arg2, _semantic=None):
+    return core.extern_elementwise("", "", [arg0, arg1, arg2], {
+        (core.dtype("int32"), core.dtype("int32"), core.dtype("int32")): ("__nv_byte_perm", core.dtype("int32")),
+    }, is_pure=True, _semantic=_semantic)
+@core.extern
+def mulhi(arg0, arg1, _semantic=None):
+    return core.extern_elementwise(
+        "", "", [arg0, arg1], {
+            (core.dtype("int32"), core.dtype("int32")): ("__nv_mulhi", core.dtype("int32")),
+            (core.dtype("uint32"), core.dtype("uint32")): ("__nv_umulhi", core.dtype("uint32")),
+            (core.dtype("int64"), core.dtype("int64")): ("__nv_mul64hi", core.dtype("int64")),
+            (core.dtype("uint64"), core.dtype("uint64")): ("__nv_umul64hi", core.dtype("uint64")),
+        }, is_pure=True, _semantic=_semantic)
+@core.extern
+def mul24(arg0, arg1, _semantic=None):
+    return core.extern_elementwise(
+        "", "", [arg0, arg1], {
+            (core.dtype("int32"), core.dtype("int32")): ("__nv_mul24", core.dtype("int32")),
+            (core.dtype("uint32"), core.dtype("uint32")): ("__nv_umul24", core.dtype("uint32")),
+        }, is_pure=True, _semantic=_semantic)
+@core.extern
+def brev(arg0, _semantic=None):
+    return core.extern_elementwise(
+        "", "", [arg0], {
+            (core.dtype("int32"), ): ("__nv_brev", core.dtype("int32")),
+            (core.dtype("int64"), ): ("__nv_brevll", core.dtype("int64")),
+        }, is_pure=True, _semantic=_semantic)
+@core.extern
+def sad(arg0, arg1, arg2, _semantic=None):
+    return core.extern_elementwise(
+        "", "", [arg0, arg1, arg2], {
+            (core.dtype("int32"), core.dtype("int32"), core.dtype("uint32")): ("__nv_sad", core.dtype("int32")),
+            (core.dtype("uint32"), core.dtype("uint32"), core.dtype("uint32")): ("__nv_usad", core.dtype("uint32")),
+        }, is_pure=True, _semantic=_semantic)
+@core.extern
+def abs(arg0, _semantic=None):
+    return core.extern_elementwise(
+        "", "", [arg0], {
+            (core.dtype("int32"), ): ("__nv_abs", core.dtype("int32")),
+            (core.dtype("int64"), ): ("__nv_llabs", core.dtype("int64")),
+            (core.dtype("fp32"), ): ("__nv_fabsf", core.dtype("fp32")),
+            (core.dtype("fp64"), ): ("__nv_fabs", core.dtype("fp64")),
+        }, is_pure=True, _semantic=_semantic)
+@core.extern
+def floor(arg0, _semantic=None):
+    return core.extern_elementwise(
+        "", "", [arg0], {
+            (core.dtype("fp32"), ): ("__nv_floorf", core.dtype("fp32")),
+            (core.dtype("fp64"), ): ("__nv_floor", core.dtype("fp64")),
+        }, is_pure=True, _semantic=_semantic)
+@core.extern
+def rcp64h(arg0, _semantic=None):
+    return core.extern_elementwise("", "", [arg0], {
+        (core.dtype("fp64"), ): ("__nv_rcp64h", core.dtype("fp64")),
+    }, is_pure=True, _semantic=_semantic)
+@core.extern
+def rsqrt(arg0, _semantic=None):
+    return core.extern_elementwise(
+        "", "", [arg0], {
+            (core.dtype("fp32"), ): ("__nv_rsqrtf", core.dtype("fp32")),
+            (core.dtype("fp64"), ): ("__nv_rsqrt", core.dtype("fp64")),
+        }, is_pure=True, _semantic=_semantic)
+@core.extern
+def ceil(arg0, _semantic=None):
+    return core.extern_elementwise(
+        "", "", [arg0], {
+            (core.dtype("fp64"), ): ("__nv_ceil", core.dtype("fp64")),
+            (core.dtype("fp32"), ): ("__nv_ceilf", core.dtype("fp32")),
+        }, is_pure=True, _semantic=_semantic)
+@core.extern
+def trunc(arg0, _semantic=None):
+    return core.extern_elementwise(
+        "", "", [arg0], {
+            (core.dtype("fp64"), ): ("__nv_trunc", core.dtype("fp64")),
+            (core.dtype("fp32"), ): ("__nv_truncf", core.dtype("fp32")),
+        }, is_pure=True, _semantic=_semantic)
+@core.extern
+def exp2(arg0, _semantic=None):
+    return core.extern_elementwise(
+        "", "", [arg0], {
+            (core.dtype("fp32"), ): ("__nv_exp2f", core.dtype("fp32")),
+            (core.dtype("fp64"), ): ("__nv_exp2", core.dtype("fp64")),
+        }, is_pure=True, _semantic=_semantic)
+@core.extern
+def saturatef(arg0, _semantic=None):
+    return core.extern_elementwise("", "", [arg0], {
+        (core.dtype("fp32"), ): ("__nv_saturatef", core.dtype("fp32")),
+    }, is_pure=True, _semantic=_semantic)
+@core.extern
+def fma_rn(arg0, arg1, arg2, _semantic=None):
+    return core.extern_elementwise(
+        "", "", [arg0, arg1, arg2], {
+            (core.dtype("fp32"), core.dtype("fp32"), core.dtype("fp32")): ("__nv_fmaf_rn", core.dtype("fp32")),
+            (core.dtype("fp64"), core.dtype("fp64"), core.dtype("fp64")): ("__nv_fma_rn", core.dtype("fp64")),
+        }, is_pure=True, _semantic=_semantic)
+@core.extern
+def fma_rz(arg0, arg1, arg2, _semantic=None):
+    return core.extern_elementwise(
+        "", "", [arg0, arg1, arg2], {
+            (core.dtype("fp32"), core.dtype("fp32"), core.dtype("fp32")): ("__nv_fmaf_rz", core.dtype("fp32")),
+            (core.dtype("fp64"), core.dtype("fp64"), core.dtype("fp64")): ("__nv_fma_rz", core.dtype("fp64")),
+        }, is_pure=True, _semantic=_semantic)
+@core.extern
+def fma_rd(arg0, arg1, arg2, _semantic=None):
+    return core.extern_elementwise(
+        "", "", [arg0, arg1, arg2], {
+            (core.dtype("fp32"), core.dtype("fp32"), core.dtype("fp32")): ("__nv_fmaf_rd", core.dtype("fp32")),
+            (core.dtype("fp64"), core.dtype("fp64"), core.dtype("fp64")): ("__nv_fma_rd", core.dtype("fp64")),
+        }, is_pure=True, _semantic=_semantic)
+@core.extern
+def fma_ru(arg0, arg1, arg2, _semantic=None):
+    return core.extern_elementwise(
+        "", "", [arg0, arg1, arg2], {
+            (core.dtype("fp32"), core.dtype("fp32"), core.dtype("fp32")): ("__nv_fmaf_ru", core.dtype("fp32")),
+            (core.dtype("fp64"), core.dtype("fp64"), core.dtype("fp64")): ("__nv_fma_ru", core.dtype("fp64")),
+        }, is_pure=True, _semantic=_semantic)
+@core.extern
+def fast_dividef(arg0, arg1, _semantic=None):
+    return core.extern_elementwise("", "", [arg0, arg1], {
+        (core.dtype("fp32"), core.dtype("fp32")): ("__nv_fast_fdividef", core.dtype("fp32")),
+    }, is_pure=True, _semantic=_semantic)
+@core.extern
+def div_rn(arg0, arg1, _semantic=None):
+    return core.extern_elementwise(
+        "", "", [arg0, arg1], {
+            (core.dtype("fp32"), core.dtype("fp32")): ("__nv_fdiv_rn", core.dtype("fp32")),
+            (core.dtype("fp64"), core.dtype("fp64")): ("__nv_ddiv_rn", core.dtype("fp64")),
+        }, is_pure=True, _semantic=_semantic)
+@core.extern
+def div_rz(arg0, arg1, _semantic=None):
+    return core.extern_elementwise(
+        "", "", [arg0, arg1], {
+            (core.dtype("fp32"), core.dtype("fp32")): ("__nv_fdiv_rz", core.dtype("fp32")),
+            (core.dtype("fp64"), core.dtype("fp64")): ("__nv_ddiv_rz", core.dtype("fp64")),
+        }, is_pure=True, _semantic=_semantic)
+@core.extern
+def div_rd(arg0, arg1, _semantic=None):
+    return core.extern_elementwise(
+        "", "", [arg0, arg1], {
+            (core.dtype("fp32"), core.dtype("fp32")): ("__nv_fdiv_rd", core.dtype("fp32")),
+            (core.dtype("fp64"), core.dtype("fp64")): ("__nv_ddiv_rd", core.dtype("fp64")),
+        }, is_pure=True, _semantic=_semantic)
+@core.extern
+def div_ru(arg0, arg1, _semantic=None):
+    return core.extern_elementwise(
+        "", "", [arg0, arg1], {
+            (core.dtype("fp32"), core.dtype("fp32")): ("__nv_fdiv_ru", core.dtype("fp32")),
+            (core.dtype("fp64"), core.dtype("fp64")): ("__nv_ddiv_ru", core.dtype("fp64")),
+        }, is_pure=True, _semantic=_semantic)
+@core.extern
+def rcp_rn(arg0, _semantic=None):
+    return core.extern_elementwise(
+        "", "", [arg0], {
+            (core.dtype("fp32"), ): ("__nv_frcp_rn", core.dtype("fp32")),
+            (core.dtype("fp64"), ): ("__nv_drcp_rn", core.dtype("fp64")),
+        }, is_pure=True, _semantic=_semantic)
+@core.extern
+def rcp_rz(arg0, _semantic=None):
+    return core.extern_elementwise(
+        "", "", [arg0], {
+            (core.dtype("fp32"), ): ("__nv_frcp_rz", core.dtype("fp32")),
+            (core.dtype("fp64"), ): ("__nv_drcp_rz", core.dtype("fp64")),
+        }, is_pure=True, _semantic=_semantic)
+@core.extern
+def rcp_rd(arg0, _semantic=None):
+    return core.extern_elementwise(
+        "", "", [arg0], {
+            (core.dtype("fp32"), ): ("__nv_frcp_rd", core.dtype("fp32")),
+            (core.dtype("fp64"), ): ("__nv_drcp_rd", core.dtype("fp64")),
+        }, is_pure=True, _semantic=_semantic)
+@core.extern
+def rcp_ru(arg0, _semantic=None):
+    return core.extern_elementwise(
+        "", "", [arg0], {
+            (core.dtype("fp32"), ): ("__nv_frcp_ru", core.dtype("fp32")),
+            (core.dtype("fp64"), ): ("__nv_drcp_ru", core.dtype("fp64")),
+        }, is_pure=True, _semantic=_semantic)
+@core.extern
+def sqrt_rn(arg0, _semantic=None):
+    return core.extern_elementwise(
+        "", "", [arg0], {
+            (core.dtype("fp32"), ): ("__nv_fsqrt_rn", core.dtype("fp32")),
+            (core.dtype("fp64"), ): ("__nv_dsqrt_rn", core.dtype("fp64")),
+        }, is_pure=True, _semantic=_semantic)
+@core.extern
+def sqrt_rz(arg0, _semantic=None):
+    return core.extern_elementwise(
+        "", "", [arg0], {
+            (core.dtype("fp32"), ): ("__nv_fsqrt_rz", core.dtype("fp32")),
+            (core.dtype("fp64"), ): ("__nv_dsqrt_rz", core.dtype("fp64")),
+        }, is_pure=True, _semantic=_semantic)
+@core.extern
+def sqrt_rd(arg0, _semantic=None):
+    return core.extern_elementwise(
+        "", "", [arg0], {
+            (core.dtype("fp32"), ): ("__nv_fsqrt_rd", core.dtype("fp32")),
+            (core.dtype("fp64"), ): ("__nv_dsqrt_rd", core.dtype("fp64")),
+        }, is_pure=True, _semantic=_semantic)
+@core.extern
+def sqrt_ru(arg0, _semantic=None):
+    return core.extern_elementwise(
+        "", "", [arg0], {
+            (core.dtype("fp32"), ): ("__nv_fsqrt_ru", core.dtype("fp32")),
+            (core.dtype("fp64"), ): ("__nv_dsqrt_ru", core.dtype("fp64")),
+        }, is_pure=True, _semantic=_semantic)
+@core.extern
+def sqrt(arg0, _semantic=None):
+    return core.extern_elementwise(
+        "", "", [arg0], {
+            (core.dtype("fp32"), ): ("__nv_sqrtf", core.dtype("fp32")),
+            (core.dtype("fp64"), ): ("__nv_sqrt", core.dtype("fp64")),
+        }, is_pure=True, _semantic=_semantic)
+@core.extern
+def add_rn(arg0, arg1, _semantic=None):
+    return core.extern_elementwise(
+        "", "", [arg0, arg1], {
+            (core.dtype("fp64"), core.dtype("fp64")): ("__nv_dadd_rn", core.dtype("fp64")),
+            (core.dtype("fp32"), core.dtype("fp32")): ("__nv_fadd_rn", core.dtype("fp32")),
+        }, is_pure=True, _semantic=_semantic)
+@core.extern
+def add_rz(arg0, arg1, _semantic=None):
+    return core.extern_elementwise(
+        "", "", [arg0, arg1], {
+            (core.dtype("fp64"), core.dtype("fp64")): ("__nv_dadd_rz", core.dtype("fp64")),
+            (core.dtype("fp32"), core.dtype("fp32")): ("__nv_fadd_rz", core.dtype("fp32")),
+        }, is_pure=True, _semantic=_semantic)
+@core.extern
+def add_rd(arg0, arg1, _semantic=None):
+    return core.extern_elementwise(
+        "", "", [arg0, arg1], {
+            (core.dtype("fp64"), core.dtype("fp64")): ("__nv_dadd_rd", core.dtype("fp64")),
+            (core.dtype("fp32"), core.dtype("fp32")): ("__nv_fadd_rd", core.dtype("fp32")),
+        }, is_pure=True, _semantic=_semantic)
+@core.extern
+def add_ru(arg0, arg1, _semantic=None):
+    return core.extern_elementwise(
+        "", "", [arg0, arg1], {
+            (core.dtype("fp64"), core.dtype("fp64")): ("__nv_dadd_ru", core.dtype("fp64")),
+            (core.dtype("fp32"), core.dtype("fp32")): ("__nv_fadd_ru", core.dtype("fp32")),
+        }, is_pure=True, _semantic=_semantic)
+@core.extern
+def mul_rn(arg0, arg1, _semantic=None):
+    return core.extern_elementwise(
+        "", "", [arg0, arg1], {
+            (core.dtype("fp64"), core.dtype("fp64")): ("__nv_dmul_rn", core.dtype("fp64")),
+            (core.dtype("fp32"), core.dtype("fp32")): ("__nv_fmul_rn", core.dtype("fp32")),
+        }, is_pure=True, _semantic=_semantic)
+@core.extern
+def mul_rz(arg0, arg1, _semantic=None):
+    return core.extern_elementwise(
+        "", "", [arg0, arg1], {
+            (core.dtype("fp64"), core.dtype("fp64")): ("__nv_dmul_rz", core.dtype("fp64")),
+            (core.dtype("fp32"), core.dtype("fp32")): ("__nv_fmul_rz", core.dtype("fp32")),
+        }, is_pure=True, _semantic=_semantic)
+@core.extern
+def mul_rd(arg0, arg1, _semantic=None):
+    return core.extern_elementwise(
+        "", "", [arg0, arg1], {
+            (core.dtype("fp64"), core.dtype("fp64")): ("__nv_dmul_rd", core.dtype("fp64")),
+            (core.dtype("fp32"), core.dtype("fp32")): ("__nv_fmul_rd", core.dtype("fp32")),
+        }, is_pure=True, _semantic=_semantic)
+@core.extern
+def mul_ru(arg0, arg1, _semantic=None):
+    return core.extern_elementwise(
+        "", "", [
+            arg0,
+            arg1,
+        ], {
+            (
+                core.dtype("fp64"),
+                core.dtype("fp64"),
+            ): ("__nv_dmul_ru", core.dtype("fp64")),
+            (
+                core.dtype("fp32"),
+                core.dtype("fp32"),
+            ): ("__nv_fmul_ru", core.dtype("fp32")),
+        }, is_pure=True, _semantic=_semantic)
+@core.extern
+def double2float_rn(arg0, _semantic=None):
+    return core.extern_elementwise("", "", [arg0], {
+        (core.dtype("fp64"), ): ("__nv_double2float_rn", core.dtype("fp32")),
+    }, is_pure=True, _semantic=_semantic)
+@core.extern
+def double2float_rz(arg0, _semantic=None):
+    return core.extern_elementwise("", "", [arg0], {
+        (core.dtype("fp64"), ): ("__nv_double2float_rz", core.dtype("fp32")),
+    }, is_pure=True, _semantic=_semantic)
+@core.extern
+def double2float_rd(arg0, _semantic=None):
+    return core.extern_elementwise("", "", [arg0], {
+        (core.dtype("fp64"), ): ("__nv_double2float_rd", core.dtype("fp32")),
+    }, is_pure=True, _semantic=_semantic)
+@core.extern
+def double2float_ru(arg0, _semantic=None):
+    return core.extern_elementwise("", "", [arg0], {
+        (core.dtype("fp64"), ): ("__nv_double2float_ru", core.dtype("fp32")),
+    }, is_pure=True, _semantic=_semantic)
+@core.extern
+def double2int_rn(arg0, _semantic=None):
+    return core.extern_elementwise("", "", [arg0], {
+        (core.dtype("fp64"), ): ("__nv_double2int_rn", core.dtype("int32")),
+    }, is_pure=True, _semantic=_semantic)
+@core.extern
+def double2int_rz(arg0, _semantic=None):
+    return core.extern_elementwise("", "", [arg0], {
+        (core.dtype("fp64"), ): ("__nv_double2int_rz", core.dtype("int32")),
+    }, is_pure=True, _semantic=_semantic)
+@core.extern
+def double2int_rd(arg0, _semantic=None):
+    return core.extern_elementwise("", "", [arg0], {
+        (core.dtype("fp64"), ): ("__nv_double2int_rd", core.dtype("int32")),
+    }, is_pure=True, _semantic=_semantic)
+@core.extern
+def double2int_ru(arg0, _semantic=None):
+    return core.extern_elementwise("", "", [arg0], {
+        (core.dtype("fp64"), ): ("__nv_double2int_ru", core.dtype("int32")),
+    }, is_pure=True, _semantic=_semantic)
+@core.extern
+def double2uint_rn(arg0, _semantic=None):
+    return core.extern_elementwise("", "", [arg0], {
+        (core.dtype("fp64"), ): ("__nv_double2uint_rn", core.dtype("int32")),
+    }, is_pure=True, _semantic=_semantic)
+@core.extern
+def double2uint_rz(arg0, _semantic=None):
+    return core.extern_elementwise("", "", [arg0], {
+        (core.dtype("fp64"), ): ("__nv_double2uint_rz", core.dtype("int32")),
+    }, is_pure=True, _semantic=_semantic)
+@core.extern
+def double2uint_rd(arg0, _semantic=None):
+    return core.extern_elementwise("", "", [arg0], {
+        (core.dtype("fp64"), ): ("__nv_double2uint_rd", core.dtype("int32")),
+    }, is_pure=True, _semantic=_semantic)
+@core.extern
+def double2uint_ru(arg0, _semantic=None):
+    return core.extern_elementwise("", "", [arg0], {
+        (core.dtype("fp64"), ): ("__nv_double2uint_ru", core.dtype("int32")),
+    }, is_pure=True, _semantic=_semantic)
+@core.extern
+def int2double_rn(arg0, _semantic=None):
+    return core.extern_elementwise("", "", [arg0], {
+        (core.dtype("int32"), ): ("__nv_int2double_rn", core.dtype("fp64")),
+    }, is_pure=True, _semantic=_semantic)
+@core.extern
+def uint2double_rn(arg0, _semantic=None):
+    return core.extern_elementwise("", "", [arg0], {
+        (core.dtype("uint32"), ): ("__nv_uint2double_rn", core.dtype("fp64")),
+    }, is_pure=True, _semantic=_semantic)
+@core.extern
+def float2int_rn(arg0, _semantic=None):
+    return core.extern_elementwise("", "", [arg0], {
+        (core.dtype("fp32"), ): ("__nv_float2int_rn", core.dtype("int32")),
+    }, is_pure=True, _semantic=_semantic)
+@core.extern
+def float2int_rz(arg0, _semantic=None):
+    return core.extern_elementwise("", "", [arg0], {
+        (core.dtype("fp32"), ): ("__nv_float2int_rz", core.dtype("int32")),
+    }, is_pure=True, _semantic=_semantic)
+@core.extern
+def float2int_rd(arg0, _semantic=None):
+    return core.extern_elementwise("", "", [arg0], {
+        (core.dtype("fp32"), ): ("__nv_float2int_rd", core.dtype("int32")),
+    }, is_pure=True, _semantic=_semantic)
+@core.extern
+def float2int_ru(arg0, _semantic=None):
+    return core.extern_elementwise("", "", [arg0], {
+        (core.dtype("fp32"), ): ("__nv_float2int_ru", core.dtype("int32")),
+    }, is_pure=True, _semantic=_semantic)
+@core.extern
+def float2uint_rn(arg0, _semantic=None):
+    return core.extern_elementwise("", "", [arg0], {
+        (core.dtype("fp32"), ): ("__nv_float2uint_rn", core.dtype("int32")),
+    }, is_pure=True, _semantic=_semantic)
+@core.extern
+def float2uint_rz(arg0, _semantic=None):
+    return core.extern_elementwise("", "", [arg0], {
+        (core.dtype("fp32"), ): ("__nv_float2uint_rz", core.dtype("int32")),
+    }, is_pure=True, _semantic=_semantic)
+@core.extern
+def float2uint_rd(arg0, _semantic=None):
+    return core.extern_elementwise("", "", [arg0], {
+        (core.dtype("fp32"), ): ("__nv_float2uint_rd", core.dtype("int32")),
+    }, is_pure=True, _semantic=_semantic)
+@core.extern
+def float2uint_ru(arg0, _semantic=None):
+    return core.extern_elementwise("", "", [arg0], {
+        (core.dtype("fp32"), ): ("__nv_float2uint_ru", core.dtype("int32")),
+    }, is_pure=True, _semantic=_semantic)
+@core.extern
+def int2float_rn(arg0, _semantic=None):
+    return core.extern_elementwise("", "", [arg0], {
+        (core.dtype("int32"), ): ("__nv_int2float_rn", core.dtype("fp32")),
+    }, is_pure=True, _semantic=_semantic)
+@core.extern
+def int2float_rz(arg0, _semantic=None):
+    return core.extern_elementwise("", "", [arg0], {
+        (core.dtype("int32"), ): ("__nv_int2float_rz", core.dtype("fp32")),
+    }, is_pure=True, _semantic=_semantic)
+@core.extern
+def int2float_rd(arg0, _semantic=None):
+    return core.extern_elementwise("", "", [arg0], {
+        (core.dtype("int32"), ): ("__nv_int2float_rd", core.dtype("fp32")),
+    }, is_pure=True, _semantic=_semantic)
+@core.extern
+def int2float_ru(arg0, _semantic=None):
+    return core.extern_elementwise("", "", [arg0], {
+        (core.dtype("int32"), ): ("__nv_int2float_ru", core.dtype("fp32")),
+    }, is_pure=True, _semantic=_semantic)
+@core.extern
+def uint2float_rn(arg0, _semantic=None):
+    return core.extern_elementwise("", "", [arg0], {
+        (core.dtype("uint32"), ): ("__nv_uint2float_rn", core.dtype("fp32")),
+    }, is_pure=True, _semantic=_semantic)
+@core.extern
+def uint2float_rz(arg0, _semantic=None):
+    return core.extern_elementwise("", "", [arg0], {
+        (core.dtype("uint32"), ): ("__nv_uint2float_rz", core.dtype("fp32")),
+    }, is_pure=True, _semantic=_semantic)
+@core.extern
+def uint2float_rd(arg0, _semantic=None):
+    return core.extern_elementwise("", "", [arg0], {
+        (core.dtype("uint32"), ): ("__nv_uint2float_rd", core.dtype("fp32")),
+    }, is_pure=True, _semantic=_semantic)
+@core.extern
+def uint2float_ru(arg0, _semantic=None):
+    return core.extern_elementwise("", "", [arg0], {
+        (core.dtype("uint32"), ): ("__nv_uint2float_ru", core.dtype("fp32")),
+    }, is_pure=True, _semantic=_semantic)
+@core.extern
+def hiloint2double(arg0, arg1, _semantic=None):
+    return core.extern_elementwise("", "", [arg0, arg1], {
+        (core.dtype("int32"), core.dtype("int32")): ("__nv_hiloint2double", core.dtype("fp64")),
+    }, is_pure=True, _semantic=_semantic)
+@core.extern
+def double2loint(arg0, _semantic=None):
+    return core.extern_elementwise("", "", [arg0], {
+        (core.dtype("fp64"), ): ("__nv_double2loint", core.dtype("int32")),
+    }, is_pure=True, _semantic=_semantic)
+@core.extern
+def double2hiint(arg0, _semantic=None):
+    return core.extern_elementwise("", "", [arg0], {
+        (core.dtype("fp64"), ): ("__nv_double2hiint", core.dtype("int32")),
+    }, is_pure=True, _semantic=_semantic)
+@core.extern
+def float2ll_rn(arg0, _semantic=None):
+    return core.extern_elementwise("", "", [arg0], {
+        (core.dtype("fp32"), ): ("__nv_float2ll_rn", core.dtype("int64")),
+    }, is_pure=True, _semantic=_semantic)
+@core.extern
+def float2ll_rz(arg0, _semantic=None):
+    return core.extern_elementwise("", "", [arg0], {
+        (core.dtype("fp32"), ): ("__nv_float2ll_rz", core.dtype("int64")),
+    }, is_pure=True, _semantic=_semantic)
+@core.extern
+def float2ll_rd(arg0, _semantic=None):
+    return core.extern_elementwise("", "", [arg0], {
+        (core.dtype("fp32"), ): ("__nv_float2ll_rd", core.dtype("int64")),
+    }, is_pure=True, _semantic=_semantic)
+@core.extern
+def float2ll_ru(arg0, _semantic=None):
+    return core.extern_elementwise("", "", [arg0], {
+        (core.dtype("fp32"), ): ("__nv_float2ll_ru", core.dtype("int64")),
+    }, is_pure=True, _semantic=_semantic)
+@core.extern
+def float2ull_rn(arg0, _semantic=None):
+    return core.extern_elementwise("", "", [arg0], {
+        (core.dtype("fp32"), ): ("__nv_float2ull_rn", core.dtype("int64")),
+    }, is_pure=True, _semantic=_semantic)
+@core.extern
+def float2ull_rz(arg0, _semantic=None):
+    return core.extern_elementwise("", "", [arg0], {
+        (core.dtype("fp32"), ): ("__nv_float2ull_rz", core.dtype("int64")),
+    }, is_pure=True, _semantic=_semantic)
+@core.extern
+def float2ull_rd(arg0, _semantic=None):
+    return core.extern_elementwise("", "", [arg0], {
+        (core.dtype("fp32"), ): ("__nv_float2ull_rd", core.dtype("int64")),
+    }, is_pure=True, _semantic=_semantic)
+@core.extern
+def float2ull_ru(arg0, _semantic=None):
+    return core.extern_elementwise("", "", [arg0], {
+        (core.dtype("fp32"), ): ("__nv_float2ull_ru", core.dtype("int64")),
+    }, is_pure=True, _semantic=_semantic)
+@core.extern
+def double2ll_rn(arg0, _semantic=None):
+    return core.extern_elementwise("", "", [arg0], {
+        (core.dtype("fp64"), ): ("__nv_double2ll_rn", core.dtype("int64")),
+    }, is_pure=True, _semantic=_semantic)
+@core.extern
+def double2ll_rz(arg0, _semantic=None):
+    return core.extern_elementwise("", "", [arg0], {
+        (core.dtype("fp64"), ): ("__nv_double2ll_rz", core.dtype("int64")),
+    }, is_pure=True, _semantic=_semantic)
+@core.extern
+def double2ll_rd(arg0, _semantic=None):
+    return core.extern_elementwise("", "", [arg0], {
+        (core.dtype("fp64"), ): ("__nv_double2ll_rd", core.dtype("int64")),
+    }, is_pure=True, _semantic=_semantic)
+@core.extern
+def double2ll_ru(arg0, _semantic=None):
+    return core.extern_elementwise("", "", [arg0], {
+        (core.dtype("fp64"), ): ("__nv_double2ll_ru", core.dtype("int64")),
+    }, is_pure=True, _semantic=_semantic)
+@core.extern
+def double2ull_rn(arg0, _semantic=None):
+    return core.extern_elementwise("", "", [arg0], {
+        (core.dtype("fp64"), ): ("__nv_double2ull_rn", core.dtype("int64")),
+    }, is_pure=True, _semantic=_semantic)
+@core.extern
+def double2ull_rz(arg0, _semantic=None):
+    return core.extern_elementwise("", "", [arg0], {
+        (core.dtype("fp64"), ): ("__nv_double2ull_rz", core.dtype("int64")),
+    }, is_pure=True, _semantic=_semantic)
+@core.extern
+def double2ull_rd(arg0, _semantic=None):
+    return core.extern_elementwise("", "", [arg0], {
+        (core.dtype("fp64"), ): ("__nv_double2ull_rd", core.dtype("int64")),
+    }, is_pure=True, _semantic=_semantic)
+@core.extern
+def double2ull_ru(arg0, _semantic=None):
+    return core.extern_elementwise("", "", [arg0], {
+        (core.dtype("fp64"), ): ("__nv_double2ull_ru", core.dtype("int64")),
+    }, is_pure=True, _semantic=_semantic)
+@core.extern
+def ll2float_rn(arg0, _semantic=None):
+    return core.extern_elementwise("", "", [arg0], {
+        (core.dtype("int64"), ): ("__nv_ll2float_rn", core.dtype("fp32")),
+    }, is_pure=True, _semantic=_semantic)
+@core.extern
+def ll2float_rz(arg0, _semantic=None):
+    return core.extern_elementwise("", "", [arg0], {
+        (core.dtype("int64"), ): ("__nv_ll2float_rz", core.dtype("fp32")),
+    }, is_pure=True, _semantic=_semantic)
+@core.extern
+def ll2float_rd(arg0, _semantic=None):
+    return core.extern_elementwise("", "", [arg0], {
+        (core.dtype("int64"), ): ("__nv_ll2float_rd", core.dtype("fp32")),
+    }, is_pure=True, _semantic=_semantic)
+@core.extern
+def ll2float_ru(arg0, _semantic=None):
+    return core.extern_elementwise("", "", [arg0], {
+        (core.dtype("int64"), ): ("__nv_ll2float_ru", core.dtype("fp32")),
+    }, is_pure=True, _semantic=_semantic)
+@core.extern
+def ull2float_rn(arg0, _semantic=None):
+    return core.extern_elementwise("", "", [arg0], {
+        (core.dtype("uint64"), ): ("__nv_ull2float_rn", core.dtype("fp32")),
+    }, is_pure=True, _semantic=_semantic)
+@core.extern
+def ull2float_rz(arg0, _semantic=None):
+    return core.extern_elementwise("", "", [arg0], {
+        (core.dtype("uint64"), ): ("__nv_ull2float_rz", core.dtype("fp32")),
+    }, is_pure=True, _semantic=_semantic)
+@core.extern
+def ull2float_rd(arg0, _semantic=None):
+    return core.extern_elementwise("", "", [arg0], {
+        (core.dtype("uint64"), ): ("__nv_ull2float_rd", core.dtype("fp32")),
+    }, is_pure=True, _semantic=_semantic)
+@core.extern
+def ull2float_ru(arg0, _semantic=None):
+    return core.extern_elementwise("", "", [arg0], {
+        (core.dtype("uint64"), ): ("__nv_ull2float_ru", core.dtype("fp32")),
+    }, is_pure=True, _semantic=_semantic)
+@core.extern
+def ll2double_rn(arg0, _semantic=None):
+    return core.extern_elementwise("", "", [arg0], {
+        (core.dtype("int64"), ): ("__nv_ll2double_rn", core.dtype("fp64")),
+    }, is_pure=True, _semantic=_semantic)
+@core.extern
+def ll2double_rz(arg0, _semantic=None):
+    return core.extern_elementwise("", "", [arg0], {
+        (core.dtype("int64"), ): ("__nv_ll2double_rz", core.dtype("fp64")),
+    }, is_pure=True, _semantic=_semantic)
+@core.extern
+def ll2double_rd(arg0, _semantic=None):
+    return core.extern_elementwise("", "", [arg0], {
+        (core.dtype("int64"), ): ("__nv_ll2double_rd", core.dtype("fp64")),
+    }, is_pure=True, _semantic=_semantic)
+@core.extern
+def ll2double_ru(arg0, _semantic=None):
+    return core.extern_elementwise("", "", [arg0], {
+        (core.dtype("int64"), ): ("__nv_ll2double_ru", core.dtype("fp64")),
+    }, is_pure=True, _semantic=_semantic)
+@core.extern
+def ull2double_rn(arg0, _semantic=None):
+    return core.extern_elementwise("", "", [arg0], {
+        (core.dtype("uint64"), ): ("__nv_ull2double_rn", core.dtype("fp64")),
+    }, is_pure=True, _semantic=_semantic)
+@core.extern
+def ull2double_rz(arg0, _semantic=None):
+    return core.extern_elementwise("", "", [arg0], {
+        (core.dtype("uint64"), ): ("__nv_ull2double_rz", core.dtype("fp64")),
+    }, is_pure=True, _semantic=_semantic)
+@core.extern
+def ull2double_rd(arg0, _semantic=None):
+    return core.extern_elementwise("", "", [arg0], {
+        (core.dtype("uint64"), ): ("__nv_ull2double_rd", core.dtype("fp64")),
+    }, is_pure=True, _semantic=_semantic)
+@core.extern
+def ull2double_ru(arg0, _semantic=None):
+    return core.extern_elementwise("", "", [arg0], {
+        (core.dtype("uint64"), ): ("__nv_ull2double_ru", core.dtype("fp64")),
+    }, is_pure=True, _semantic=_semantic)
+@core.extern
+def int_as_float(arg0, _semantic=None):
+    return core.extern_elementwise("", "", [arg0], {
+        (core.dtype("int32"), ): ("__nv_int_as_float", core.dtype("fp32")),
+    }, is_pure=True, _semantic=_semantic)
+@core.extern
+def float_as_int(arg0, _semantic=None):
+    return core.extern_elementwise("", "", [arg0], {
+        (core.dtype("fp32"), ): ("__nv_float_as_int", core.dtype("int32")),
+    }, is_pure=True, _semantic=_semantic)
+@core.extern
+def uint_as_float(arg0, _semantic=None):
+    return core.extern_elementwise("", "", [arg0], {
+        (core.dtype("uint32"), ): ("__nv_uint_as_float", core.dtype("fp32")),
+    }, is_pure=True, _semantic=_semantic)
+@core.extern
+def float_as_uint(arg0, _semantic=None):
+    return core.extern_elementwise("", "", [arg0], {
+        (core.dtype("fp32"), ): ("__nv_float_as_uint", core.dtype("int32")),
+    }, is_pure=True, _semantic=_semantic)
+@core.extern
+def longlong_as_double(arg0, _semantic=None):
+    return core.extern_elementwise("", "", [arg0], {
+        (core.dtype("int64"), ): ("__nv_longlong_as_double", core.dtype("fp64")),
+    }, is_pure=True, _semantic=_semantic)
+@core.extern
+def double_as_longlong(arg0, _semantic=None):
+    return core.extern_elementwise("", "", [arg0], {
+        (core.dtype("fp64"), ): ("__nv_double_as_longlong", core.dtype("int64")),
+    }, is_pure=True, _semantic=_semantic)
+@core.extern
+def fast_sinf(arg0, _semantic=None):
+    return core.extern_elementwise("", "", [arg0], {
+        (core.dtype("fp32"), ): ("__nv_fast_sinf", core.dtype("fp32")),
+    }, is_pure=True, _semantic=_semantic)
+@core.extern
+def fast_cosf(arg0, _semantic=None):
+    return core.extern_elementwise("", "", [arg0], {
+        (core.dtype("fp32"), ): ("__nv_fast_cosf", core.dtype("fp32")),
+    }, is_pure=True, _semantic=_semantic)
+@core.extern
+def fast_log2f(arg0, _semantic=None):
+    return core.extern_elementwise("", "", [arg0], {
+        (core.dtype("fp32"), ): ("__nv_fast_log2f", core.dtype("fp32")),
+    }, is_pure=True, _semantic=_semantic)
+@core.extern
+def fast_logf(arg0, _semantic=None):
+    return core.extern_elementwise("", "", [arg0], {
+        (core.dtype("fp32"), ): ("__nv_fast_logf", core.dtype("fp32")),
+    }, is_pure=True, _semantic=_semantic)
+@core.extern
+def fast_expf(arg0, _semantic=None):
+    return core.extern_elementwise("", "", [arg0], {
+        (core.dtype("fp32"), ): ("__nv_fast_expf", core.dtype("fp32")),
+    }, is_pure=True, _semantic=_semantic)
+@core.extern
+def fast_tanf(arg0, _semantic=None):
+    return core.extern_elementwise("", "", [arg0], {
+        (core.dtype("fp32"), ): ("__nv_fast_tanf", core.dtype("fp32")),
+    }, is_pure=True, _semantic=_semantic)
+@core.extern
+def fast_exp10f(arg0, _semantic=None):
+    return core.extern_elementwise("", "", [arg0], {
+        (core.dtype("fp32"), ): ("__nv_fast_exp10f", core.dtype("fp32")),
+    }, is_pure=True, _semantic=_semantic)
+@core.extern
+def fast_log10f(arg0, _semantic=None):
+    return core.extern_elementwise("", "", [arg0], {
+        (core.dtype("fp32"), ): ("__nv_fast_log10f", core.dtype("fp32")),
+    }, is_pure=True, _semantic=_semantic)
+@core.extern
+def fast_powf(arg0, arg1, _semantic=None):
+    return core.extern_elementwise("", "", [arg0, arg1], {
+        (core.dtype("fp32"), core.dtype("fp32")): ("__nv_fast_powf", core.dtype("fp32")),
+    }, is_pure=True, _semantic=_semantic)
+@core.extern
+def hadd(arg0, arg1, _semantic=None):
+    return core.extern_elementwise(
+        "", "", [arg0, arg1], {
+            (core.dtype("int32"), core.dtype("int32")): ("__nv_hadd", core.dtype("int32")),
+            (core.dtype("uint32"), core.dtype("uint32")): ("__nv_uhadd", core.dtype("uint32")),
+        }, is_pure=True, _semantic=_semantic)
+@core.extern
+def rhadd(arg0, arg1, _semantic=None):
+    return core.extern_elementwise(
+        "", "", [arg0, arg1], {
+            (core.dtype("int32"), core.dtype("int32")): ("__nv_rhadd", core.dtype("int32")),
+            (core.dtype("uint32"), core.dtype("uint32")): ("__nv_urhadd", core.dtype("uint32")),
+        }, is_pure=True, _semantic=_semantic)
+@core.extern
+def sub_rn(arg0, arg1, _semantic=None):
+    return core.extern_elementwise(
+        "", "", [arg0, arg1], {
+            (core.dtype("fp32"), core.dtype("fp32")): ("__nv_fsub_rn", core.dtype("fp32")),
+            (core.dtype("fp64"), core.dtype("fp64")): ("__nv_dsub_rn", core.dtype("fp64")),
+        }, is_pure=True, _semantic=_semantic)
+@core.extern
+def sub_rz(arg0, arg1, _semantic=None):
+    return core.extern_elementwise(
+        "", "", [arg0, arg1], {
+            (core.dtype("fp32"), core.dtype("fp32")): ("__nv_fsub_rz", core.dtype("fp32")),
+            (core.dtype("fp64"), core.dtype("fp64")): ("__nv_dsub_rz", core.dtype("fp64")),
+        }, is_pure=True, _semantic=_semantic)
+@core.extern
+def sub_rd(arg0, arg1, _semantic=None):
+    return core.extern_elementwise(
+        "", "", [arg0, arg1], {
+            (core.dtype("fp32"), core.dtype("fp32")): ("__nv_fsub_rd", core.dtype("fp32")),
+            (core.dtype("fp64"), core.dtype("fp64")): ("__nv_dsub_rd", core.dtype("fp64")),
+        }, is_pure=True, _semantic=_semantic)
+@core.extern
+def sub_ru(arg0, arg1, _semantic=None):
+    return core.extern_elementwise(
+        "", "", [arg0, arg1], {
+            (core.dtype("fp32"), core.dtype("fp32")): ("__nv_fsub_ru", core.dtype("fp32")),
+            (core.dtype("fp64"), core.dtype("fp64")): ("__nv_dsub_ru", core.dtype("fp64")),
+        }, is_pure=True, _semantic=_semantic)
+@core.extern
+def rsqrt_rn(arg0, _semantic=None):
+    return core.extern_elementwise("", "", [
+        arg0,
+    ], {
+        (core.dtype("fp32"), ): ("__nv_frsqrt_rn", core.dtype("fp32")),
+    }, is_pure=True, _semantic=_semantic)
+@core.extern
+def ffs(arg0, _semantic=None):
+    return core.extern_elementwise(
+        "", "", [
+            arg0,
+        ], {
+            (core.dtype("int32"), ): ("__nv_ffs", core.dtype("int32")),
+            (core.dtype("int64"), ): ("__nv_ffsll", core.dtype("int32")),
+        }, is_pure=True, _semantic=_semantic)
+@core.extern
+def rint(arg0, _semantic=None):
+    return core.extern_elementwise(
+        "", "", [
+            arg0,
+        ], {
+            (core.dtype("fp32"), ): ("__nv_rintf", core.dtype("fp32")),
+            (core.dtype("fp64"), ): ("__nv_rint", core.dtype("fp64")),
+        }, is_pure=True, _semantic=_semantic)
+@core.extern
+def llrint(arg0, _semantic=None):
+    return core.extern_elementwise(
+        "", "", [
+            arg0,
+        ], {
+            (core.dtype("fp32"), ): ("__nv_llrintf", core.dtype("int64")),
+            (core.dtype("fp64"), ): ("__nv_llrint", core.dtype("int64")),
+        }, is_pure=True, _semantic=_semantic)
+@core.extern
+def nearbyint(arg0, _semantic=None):
+    return core.extern_elementwise(
+        "", "", [
+            arg0,
+        ], {
+            (core.dtype("fp32"), ): ("__nv_nearbyintf", core.dtype("fp32")),
+            (core.dtype("fp64"), ): ("__nv_nearbyint", core.dtype("fp64")),
+        }, is_pure=True, _semantic=_semantic)
+@core.extern
+def isnan(arg0, _semantic=None):
+    return core.extern_elementwise(
+        "", "", [
+            arg0,
+        ], {
+            (core.dtype("fp32"), ): ("__nv_isnanf", core.dtype("int32")),
+            (core.dtype("fp64"), ): ("__nv_isnand", core.dtype("int32")),
+        }, is_pure=True, _semantic=_semantic).to(core.int1, _semantic=_semantic)
+@core.extern
+def signbit(arg0, _semantic=None):
+    return core.extern_elementwise(
+        "", "", [
+            arg0,
+        ], {
+            (core.dtype("fp32"), ): ("__nv_signbitf", core.dtype("int32")),
+            (core.dtype("fp64"), ): ("__nv_signbitd", core.dtype("int32")),
+        }, is_pure=True, _semantic=_semantic)
+@core.extern
+def copysign(arg0, arg1, _semantic=None):
+    return core.extern_elementwise(
+        "", "", [arg0, arg1], {
+            (core.dtype("fp32"), core.dtype("fp32")): ("__nv_copysignf", core.dtype("fp32")),
+            (core.dtype("fp64"), core.dtype("fp64")): ("__nv_copysign", core.dtype("fp64")),
+        }, is_pure=True, _semantic=_semantic)
+@core.extern
+def finitef(arg0, _semantic=None):
+    return core.extern_elementwise("", "", [arg0], {
+        (core.dtype("fp32"), ): ("__nv_finitef", core.dtype("int32")),
+    }, is_pure=True, _semantic=_semantic).to(core.int1, _semantic=_semantic)
+@core.extern
+def isinf(arg0, _semantic=None):
+    return core.extern_elementwise(
+        "", "", [arg0], {
+            (core.dtype("fp32"), ): ("__nv_isinff", core.dtype("int32")),
+            (core.dtype("fp64"), ): ("__nv_isinfd", core.dtype("int32")),
+        }, is_pure=True, _semantic=_semantic).to(core.int1, _semantic=_semantic)
+@core.extern
+def nextafter(arg0, arg1, _semantic=None):
+    return core.extern_elementwise(
+        "", "", [arg0, arg1], {
+            (core.dtype("fp32"), core.dtype("fp32")): ("__nv_nextafterf", core.dtype("fp32")),
+            (core.dtype("fp64"), core.dtype("fp64")): ("__nv_nextafter", core.dtype("fp64")),
+        }, is_pure=True, _semantic=_semantic)
+@core.extern
+def sin(arg0, _semantic=None):
+    return core.extern_elementwise(
+        "", "", [arg0], {
+            (core.dtype("fp32"), ): ("__nv_sinf", core.dtype("fp32")),
+            (core.dtype("fp64"), ): ("__nv_sin", core.dtype("fp64")),
+        }, is_pure=True, _semantic=_semantic)
+@core.extern
+def cos(arg0, _semantic=None):
+    return core.extern_elementwise(
+        "", "", [arg0], {
+            (core.dtype("fp32"), ): ("__nv_cosf", core.dtype("fp32")),
+            (core.dtype("fp64"), ): ("__nv_cos", core.dtype("fp64")),
+        }, is_pure=True, _semantic=_semantic)
+@core.extern
+def sinpi(arg0, _semantic=None):
+    return core.extern_elementwise(
+        "", "", [arg0], {
+            (core.dtype("fp32"), ): ("__nv_sinpif", core.dtype("fp32")),
+            (core.dtype("fp64"), ): ("__nv_sinpi", core.dtype("fp64")),
+        }, is_pure=True, _semantic=_semantic)
+@core.extern
+def cospi(arg0, _semantic=None):
+    return core.extern_elementwise(
+        "", "", [arg0], {
+            (core.dtype("fp32"), ): ("__nv_cospif", core.dtype("fp32")),
+            (core.dtype("fp64"), ): ("__nv_cospi", core.dtype("fp64")),
+        }, is_pure=True, _semantic=_semantic)
+@core.extern
+def tan(arg0, _semantic=None):
+    return core.extern_elementwise(
+        "", "", [arg0], {
+            (core.dtype("fp32"), ): ("__nv_tanf", core.dtype("fp32")),
+            (core.dtype("fp64"), ): ("__nv_tan", core.dtype("fp64")),
+        }, is_pure=True, _semantic=_semantic)
+@core.extern
+def log2(arg0, _semantic=None):
+    return core.extern_elementwise(
+        "", "", [arg0], {
+            (core.dtype("fp32"), ): ("__nv_log2f", core.dtype("fp32")),
+            (core.dtype("fp64"), ): ("__nv_log2", core.dtype("fp64")),
+        }, is_pure=True, _semantic=_semantic)
+@core.extern
+def exp(arg0, _semantic=None):
+    return core.extern_elementwise(
+        "", "", [arg0], {
+            (core.dtype("fp32"), ): ("__nv_expf", core.dtype("fp32")),
+            (core.dtype("fp64"), ): ("__nv_exp", core.dtype("fp64")),
+        }, is_pure=True, _semantic=_semantic)
+@core.extern
+def exp10(arg0, _semantic=None):
+    return core.extern_elementwise(
+        "", "", [arg0], {
+            (core.dtype("fp32"), ): ("__nv_exp10f", core.dtype("fp32")),
+            (core.dtype("fp64"), ): ("__nv_exp10", core.dtype("fp64")),
+        }, is_pure=True, _semantic=_semantic)
+@core.extern
+def cosh(arg0, _semantic=None):
+    return core.extern_elementwise(
+        "", "", [arg0], {
+            (core.dtype("fp32"), ): ("__nv_coshf", core.dtype("fp32")),
+            (core.dtype("fp64"), ): ("__nv_cosh", core.dtype("fp64")),
+        }, is_pure=True, _semantic=_semantic)
+@core.extern
+def sinh(arg0, _semantic=None):
+    return core.extern_elementwise(
+        "", "", [arg0], {
+            (core.dtype("fp32"), ): ("__nv_sinhf", core.dtype("fp32")),
+            (core.dtype("fp64"), ): ("__nv_sinh", core.dtype("fp64")),
+        }, is_pure=True, _semantic=_semantic)
+@core.extern
+def tanh(arg0, _semantic=None):
+    return core.extern_elementwise(
+        "", "", [arg0], {
+            (core.dtype("fp32"), ): ("__nv_tanhf", core.dtype("fp32")),
+            (core.dtype("fp64"), ): ("__nv_tanh", core.dtype("fp64")),
+        }, is_pure=True, _semantic=_semantic)
+@core.extern
+def atan2(arg0, arg1, _semantic=None):
+    return core.extern_elementwise(
+        "", "", [arg0, arg1], {
+            (core.dtype("fp32"), core.dtype("fp32")): ("__nv_atan2f", core.dtype("fp32")),
+            (core.dtype("fp64"), core.dtype("fp64")): ("__nv_atan2", core.dtype("fp64")),
+        }, is_pure=True, _semantic=_semantic)
+@core.extern
+def atan(arg0, _semantic=None):
+    return core.extern_elementwise(
+        "", "", [arg0], {
+            (core.dtype("fp32"), ): ("__nv_atanf", core.dtype("fp32")),
+            (core.dtype("fp64"), ): ("__nv_atan", core.dtype("fp64")),
+        }, is_pure=True, _semantic=_semantic)
+@core.extern
+def asin(arg0, _semantic=None):
+    return core.extern_elementwise(
+        "", "", [arg0], {
+            (core.dtype("fp32"), ): ("__nv_asinf", core.dtype("fp32")),
+            (core.dtype("fp64"), ): ("__nv_asin", core.dtype("fp64")),
+        }, is_pure=True, _semantic=_semantic)
+@core.extern
+def acos(arg0, _semantic=None):
+    return core.extern_elementwise(
+        "", "", [arg0], {
+            (core.dtype("fp32"), ): ("__nv_acosf", core.dtype("fp32")),
+            (core.dtype("fp64"), ): ("__nv_acos", core.dtype("fp64")),
+        }, is_pure=True, _semantic=_semantic)
+@core.extern
+def log(arg0, _semantic=None):
+    return core.extern_elementwise(
+        "", "", [arg0], {
+            (core.dtype("fp32"), ): ("__nv_logf", core.dtype("fp32")),
+            (core.dtype("fp64"), ): ("__nv_log", core.dtype("fp64")),
+        }, is_pure=True, _semantic=_semantic)
+@core.extern
+def log10(arg0, _semantic=None):
+    return core.extern_elementwise(
+        "", "", [arg0], {
+            (core.dtype("fp32"), ): ("__nv_log10f", core.dtype("fp32")),
+            (core.dtype("fp64"), ): ("__nv_log10", core.dtype("fp64")),
+        }, is_pure=True, _semantic=_semantic)
+@core.extern
+def log1p(arg0, _semantic=None):
+    return core.extern_elementwise(
+        "", "", [arg0], {
+            (core.dtype("fp32"), ): ("__nv_log1pf", core.dtype("fp32")),
+            (core.dtype("fp64"), ): ("__nv_log1p", core.dtype("fp64")),
+        }, is_pure=True, _semantic=_semantic)
+@core.extern
+def acosh(arg0, _semantic=None):
+    return core.extern_elementwise(
+        "", "", [arg0], {
+            (core.dtype("fp32"), ): ("__nv_acoshf", core.dtype("fp32")),
+            (core.dtype("fp64"), ): ("__nv_acosh", core.dtype("fp64")),
+        }, is_pure=True, _semantic=_semantic)
+@core.extern
+def asinh(arg0, _semantic=None):
+    return core.extern_elementwise(
+        "", "", [arg0], {
+            (core.dtype("fp32"), ): ("__nv_asinhf", core.dtype("fp32")),
+            (core.dtype("fp64"), ): ("__nv_asinh", core.dtype("fp64")),
+        }, is_pure=True, _semantic=_semantic)
+@core.extern
+def atanh(arg0, _semantic=None):
+    return core.extern_elementwise(
+        "", "", [arg0], {
+            (core.dtype("fp32"), ): ("__nv_atanhf", core.dtype("fp32")),
+            (core.dtype("fp64"), ): ("__nv_atanh", core.dtype("fp64")),
+        }, is_pure=True, _semantic=_semantic)
+@core.extern
+def expm1(arg0, _semantic=None):
+    return core.extern_elementwise(
+        "", "", [arg0], {
+            (core.dtype("fp32"), ): ("__nv_expm1f", core.dtype("fp32")),
+            (core.dtype("fp64"), ): ("__nv_expm1", core.dtype("fp64")),
+        }, is_pure=True, _semantic=_semantic)
+@core.extern
+def hypot(arg0, arg1, _semantic=None):
+    return core.extern_elementwise(
+        "", "", [arg0, arg1], {
+            (core.dtype("fp32"), core.dtype("fp32")): ("__nv_hypotf", core.dtype("fp32")),
+            (core.dtype("fp64"), core.dtype("fp64")): ("__nv_hypot", core.dtype("fp64")),
+        }, is_pure=True, _semantic=_semantic)
+@core.extern
+def rhypot(arg0, arg1, _semantic=None):
+    return core.extern_elementwise(
+        "", "", [arg0, arg1], {
+            (core.dtype("fp32"), core.dtype("fp32")): ("__nv_rhypotf", core.dtype("fp32")),
+            (core.dtype("fp64"), core.dtype("fp64")): ("__nv_rhypot", core.dtype("fp64")),
+        }, is_pure=True, _semantic=_semantic)
+@core.extern
+def norm3d(arg0, arg1, arg2, _semantic=None):
+    return core.extern_elementwise(
+        "", "", [arg0, arg1, arg2], {
+            (core.dtype("fp32"), core.dtype("fp32"), core.dtype("fp32")): ("__nv_norm3df", core.dtype("fp32")),
+            (core.dtype("fp64"), core.dtype("fp64"), core.dtype("fp64")): ("__nv_norm3d", core.dtype("fp64")),
+        }, is_pure=True, _semantic=_semantic)
+@core.extern
+def rnorm3d(arg0, arg1, arg2, _semantic=None):
+    return core.extern_elementwise(
+        "", "", [arg0, arg1, arg2], {
+            (core.dtype("fp32"), core.dtype("fp32"), core.dtype("fp32")): ("__nv_rnorm3df", core.dtype("fp32")),
+            (core.dtype("fp64"), core.dtype("fp64"), core.dtype("fp64")): ("__nv_rnorm3d", core.dtype("fp64")),
+        }, is_pure=True, _semantic=_semantic)
+@core.extern
+def norm4d(arg0, arg1, arg2, arg3, _semantic=None):
+    return core.extern_elementwise(
+        "", "", [arg0, arg1, arg2, arg3], {
+            (core.dtype("fp32"), core.dtype("fp32"), core.dtype("fp32"), core.dtype("fp32")):
+            ("__nv_norm4df", core.dtype("fp32")),
+            (core.dtype("fp64"), core.dtype("fp64"), core.dtype("fp64"), core.dtype("fp64")):
+            ("__nv_norm4d", core.dtype("fp64")),
+        }, is_pure=True, _semantic=_semantic)
+@core.extern
+def rnorm4d(arg0, arg1, arg2, arg3, _semantic=None):
+    return core.extern_elementwise(
+        "", "", [arg0, arg1, arg2, arg3], {
+            (core.dtype("fp32"), core.dtype("fp32"), core.dtype("fp32"), core.dtype("fp32")):
+            ("__nv_rnorm4df", core.dtype("fp32")),
+            (core.dtype("fp64"), core.dtype("fp64"), core.dtype("fp64"), core.dtype("fp64")):
+            ("__nv_rnorm4d", core.dtype("fp64")),
+        }, is_pure=True, _semantic=_semantic)
+@core.extern
+def cbrt(arg0, _semantic=None):
+    return core.extern_elementwise(
+        "", "", [arg0], {
+            (core.dtype("fp32"), ): ("__nv_cbrtf", core.dtype("fp32")),
+            (core.dtype("fp64"), ): ("__nv_cbrt", core.dtype("fp64")),
+        }, is_pure=True, _semantic=_semantic)
+@core.extern
+def rcbrt(arg0, _semantic=None):
+    return core.extern_elementwise(
+        "", "", [arg0], {
+            (core.dtype("fp32"), ): ("__nv_rcbrtf", core.dtype("fp32")),
+            (core.dtype("fp64"), ): ("__nv_rcbrt", core.dtype("fp64")),
+        }, is_pure=True, _semantic=_semantic)
+@core.extern
+def j0(arg0, _semantic=None):
+    return core.extern_elementwise("", "", [arg0], {
+        (core.dtype("fp32"), ): ("__nv_j0f", core.dtype("fp32")),
+        (core.dtype("fp64"), ): ("__nv_j0", core.dtype("fp64")),
+    }, is_pure=True, _semantic=_semantic)
+@core.extern
+def j1(arg0, _semantic=None):
+    return core.extern_elementwise("", "", [arg0], {
+        (core.dtype("fp32"), ): ("__nv_j1f", core.dtype("fp32")),
+        (core.dtype("fp64"), ): ("__nv_j1", core.dtype("fp64")),
+    }, is_pure=True, _semantic=_semantic)
+@core.extern
+def y0(arg0, _semantic=None):
+    return core.extern_elementwise("", "", [arg0], {
+        (core.dtype("fp32"), ): ("__nv_y0f", core.dtype("fp32")),
+        (core.dtype("fp64"), ): ("__nv_y0", core.dtype("fp64")),
+    }, is_pure=True, _semantic=_semantic)
+@core.extern
+def y1(arg0, _semantic=None):
+    return core.extern_elementwise("", "", [arg0], {
+        (core.dtype("fp32"), ): ("__nv_y1f", core.dtype("fp32")),
+        (core.dtype("fp64"), ): ("__nv_y1", core.dtype("fp64")),
+    }, is_pure=True, _semantic=_semantic)
+@core.extern
+def yn(arg0, arg1, _semantic=None):
+    return core.extern_elementwise(
+        "", "", [arg0, arg1], {
+            (core.dtype("int32"), core.dtype("fp32")): ("__nv_ynf", core.dtype("fp32")),
+            (core.dtype("int32"), core.dtype("fp64")): ("__nv_yn", core.dtype("fp64")),
+        }, is_pure=True, _semantic=_semantic)
+@core.extern
+def jn(arg0, arg1, _semantic=None):
+    return core.extern_elementwise(
+        "", "", [arg0, arg1], {
+            (core.dtype("int32"), core.dtype("fp32")): ("__nv_jnf", core.dtype("fp32")),
+            (core.dtype("int32"), core.dtype("fp64")): ("__nv_jn", core.dtype("fp64")),
+        }, is_pure=True, _semantic=_semantic)
+@core.extern
+def cyl_bessel_i0(arg0, _semantic=None):
+    return core.extern_elementwise(
+        "", "", [arg0], {
+            (core.dtype("fp32"), ): ("__nv_cyl_bessel_i0f", core.dtype("fp32")),
+            (core.dtype("fp64"), ): ("__nv_cyl_bessel_i0", core.dtype("fp64")),
+        }, is_pure=True, _semantic=_semantic)
+@core.extern
+def cyl_bessel_i1(arg0, _semantic=None):
+    return core.extern_elementwise(
+        "", "", [arg0], {
+            (core.dtype("fp32"), ): ("__nv_cyl_bessel_i1f", core.dtype("fp32")),
+            (core.dtype("fp64"), ): ("__nv_cyl_bessel_i1", core.dtype("fp64")),
+        }, is_pure=True, _semantic=_semantic)
+@core.extern
+def erf(arg0, _semantic=None):
+    return core.extern_elementwise(
+        "", "", [arg0], {
+            (core.dtype("fp32"), ): ("__nv_erff", core.dtype("fp32")),
+            (core.dtype("fp64"), ): ("__nv_erf", core.dtype("fp64")),
+        }, is_pure=True, _semantic=_semantic)
+@core.extern
+def erfinv(arg0, _semantic=None):
+    return core.extern_elementwise(
+        "", "", [arg0], {
+            (core.dtype("fp32"), ): ("__nv_erfinvf", core.dtype("fp32")),
+            (core.dtype("fp64"), ): ("__nv_erfinv", core.dtype("fp64")),
+        }, is_pure=True, _semantic=_semantic)
+@core.extern
+def erfc(arg0, _semantic=None):
+    return core.extern_elementwise(
+        "", "", [arg0], {
+            (core.dtype("fp32"), ): ("__nv_erfcf", core.dtype("fp32")),
+            (core.dtype("fp64"), ): ("__nv_erfc", core.dtype("fp64")),
+        }, is_pure=True, _semantic=_semantic)
+@core.extern
+def erfcx(arg0, _semantic=None):
+    return core.extern_elementwise(
+        "", "", [arg0], {
+            (core.dtype("fp32"), ): ("__nv_erfcxf", core.dtype("fp32")),
+            (core.dtype("fp64"), ): ("__nv_erfcx", core.dtype("fp64")),
+        }, is_pure=True, _semantic=_semantic)
+@core.extern
+def erfcinv(arg0, _semantic=None):
+    return core.extern_elementwise(
+        "", "", [arg0], {
+            (core.dtype("fp32"), ): ("__nv_erfcinvf", core.dtype("fp32")),
+            (core.dtype("fp64"), ): ("__nv_erfcinv", core.dtype("fp64")),
+        }, is_pure=True, _semantic=_semantic)
+@core.extern
+def normcdfinv(arg0, _semantic=None):
+    return core.extern_elementwise(
+        "", "", [arg0], {
+            (core.dtype("fp32"), ): ("__nv_normcdfinvf", core.dtype("fp32")),
+            (core.dtype("fp64"), ): ("__nv_normcdfinv", core.dtype("fp64")),
+        }, is_pure=True, _semantic=_semantic)
+@core.extern
+def normcdf(arg0, _semantic=None):
+    return core.extern_elementwise(
+        "", "", [arg0], {
+            (core.dtype("fp32"), ): ("__nv_normcdff", core.dtype("fp32")),
+            (core.dtype("fp64"), ): ("__nv_normcdf", core.dtype("fp64")),
+        }, is_pure=True, _semantic=_semantic)
+@core.extern
+def lgamma(arg0, _semantic=None):
+    return core.extern_elementwise(
+        "", "", [arg0], {
+            (core.dtype("fp32"), ): ("__nv_lgammaf", core.dtype("fp32")),
+            (core.dtype("fp64"), ): ("__nv_lgamma", core.dtype("fp64")),
+        }, is_pure=True, _semantic=_semantic)
+@core.extern
+def ldexp(arg0, arg1, _semantic=None):
+    return core.extern_elementwise(
+        "", "", [arg0, arg1], {
+            (core.dtype("fp32"), core.dtype("int32")): ("__nv_ldexpf", core.dtype("fp32")),
+            (core.dtype("fp64"), core.dtype("int32")): ("__nv_ldexp", core.dtype("fp64")),
+        }, is_pure=True, _semantic=_semantic)
+@core.extern
+def scalbn(arg0, arg1, _semantic=None):
+    return core.extern_elementwise(
+        "", "", [arg0, arg1], {
+            (core.dtype("fp32"), core.dtype("int32")): ("__nv_scalbnf", core.dtype("fp32")),
+            (core.dtype("fp64"), core.dtype("int32")): ("__nv_scalbn", core.dtype("fp64")),
+        }, is_pure=True, _semantic=_semantic)
+@core.extern
+def fmod(arg0, arg1, _semantic=None):
+    return core.extern_elementwise(
+        "", "", [arg0, arg1], {
+            (core.dtype("fp32"), core.dtype("fp32")): ("__nv_fmodf", core.dtype("fp32")),
+            (core.dtype("fp64"), core.dtype("fp64")): ("__nv_fmod", core.dtype("fp64")),
+        }, is_pure=True, _semantic=_semantic)
+@core.extern
+def remainder(arg0, arg1, _semantic=None):
+    return core.extern_elementwise(
+        "", "", [arg0, arg1], {
+            (core.dtype("fp32"), core.dtype("fp32")): ("__nv_remainderf", core.dtype("fp32")),
+            (core.dtype("fp64"), core.dtype("fp64")): ("__nv_remainder", core.dtype("fp64")),
+        }, is_pure=True, _semantic=_semantic)
+@core.extern
+def fma(arg0, arg1, arg2, _semantic=None):
+    return core.extern_elementwise(
+        "", "", [arg0, arg1, arg2], {
+            (core.dtype("fp32"), core.dtype("fp32"), core.dtype("fp32")): ("__nv_fmaf", core.dtype("fp32")),
+            (core.dtype("fp64"), core.dtype("fp64"), core.dtype("fp64")): ("__nv_fma", core.dtype("fp64")),
+        }, is_pure=True, _semantic=_semantic)
+@core.extern
+def pow(arg0, arg1, _semantic=None):
+    return core.extern_elementwise(
+        "", "", [arg0, arg1], {
+            (core.dtype("fp32"), core.dtype("int32")): ("__nv_powif", core.dtype("fp32")),
+            (core.dtype("fp64"), core.dtype("int32")): ("__nv_powi", core.dtype("fp64")),
+            (core.dtype("fp32"), core.dtype("fp32")): ("__nv_powf", core.dtype("fp32")),
+            (core.dtype("fp64"), core.dtype("fp64")): ("__nv_pow", core.dtype("fp64")),
+        }, is_pure=True, _semantic=_semantic)
+@core.extern
+def tgamma(arg0, _semantic=None):
+    return core.extern_elementwise(
+        "", "", [arg0], {
+            (core.dtype("fp32"), ): ("__nv_tgammaf", core.dtype("fp32")),
+            (core.dtype("fp64"), ): ("__nv_tgamma", core.dtype("fp64")),
+        }, is_pure=True, _semantic=_semantic)
+@core.extern
+def round(arg0, _semantic=None):
+    return core.extern_elementwise(
+        "", "", [arg0], {
+            (core.dtype("fp32"), ): ("__nv_roundf", core.dtype("fp32")),
+            (core.dtype("fp64"), ): ("__nv_round", core.dtype("fp64")),
+        }, is_pure=True, _semantic=_semantic)
+@core.extern
+def llround(arg0, _semantic=None):
+    return core.extern_elementwise(
+        "", "", [arg0], {
+            (core.dtype("fp32"), ): ("__nv_llroundf", core.dtype("int64")),
+            (core.dtype("fp64"), ): ("__nv_llround", core.dtype("int64")),
+        }, is_pure=True, _semantic=_semantic)
+@core.extern
+def fdim(arg0, arg1, _semantic=None):
+    return core.extern_elementwise(
+        "", "", [arg0, arg1], {
+            (core.dtype("fp32"), core.dtype("fp32")): ("__nv_fdimf", core.dtype("fp32")),
+            (core.dtype("fp64"), core.dtype("fp64")): ("__nv_fdim", core.dtype("fp64")),
+        }, is_pure=True, _semantic=_semantic)
+@core.extern
+def ilogb(arg0, _semantic=None):
+    return core.extern_elementwise(
+        "", "", [arg0], {
+            (core.dtype("fp32"), ): ("__nv_ilogbf", core.dtype("int32")),
+            (core.dtype("fp64"), ): ("__nv_ilogb", core.dtype("int32")),
+        }, is_pure=True, _semantic=_semantic)
+@core.extern
+def logb(arg0, _semantic=None):
+    return core.extern_elementwise(
+        "", "", [arg0], {
+            (core.dtype("fp32"), ): ("__nv_logbf", core.dtype("fp32")),
+            (core.dtype("fp64"), ): ("__nv_logb", core.dtype("fp64")),
+        }, is_pure=True, _semantic=_semantic)
+@core.extern
+def isfinited(arg0, _semantic=None):
+    return core.extern_elementwise("", "", [arg0], {
+        (core.dtype("fp64"), ): ("__nv_isfinited", core.dtype("int32")),
+    }, is_pure=True, _semantic=_semantic).to(core.int1, _semantic=_semantic)

Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/language/extra/cuda/utils.py ADDED Viewed

	@@ -0,0 +1,109 @@

+from triton.language import core
+@core.extern
+def globaltimer(_semantic=None):
+    return core.inline_asm_elementwise("mov.u64 $0, %globaltimer;", "=l", [], dtype=core.int64, is_pure=False, pack=1,
+                                       _semantic=_semantic)
+@core.extern
+def smid(_semantic=None):
+    return core.inline_asm_elementwise("mov.u32 $0, %smid;", "=r", [], dtype=core.int32, is_pure=True, pack=1,
+                                       _semantic=_semantic)
+@core.builtin
+def num_threads(_semantic=None):
+    return core.constexpr(_semantic.builder.options.num_warps * 32)
+@core.builtin
+def num_warps(_semantic=None):
+    return core.constexpr(_semantic.builder.options.num_warps)
+# ----- FP8E4M3B15 ------
+# This data-type is a variant of the standard FP8E4M3 format.
+# It was designed for fast software conversion to FP16 on
+# nvidia GPUs that do not support it natively.
+# This is the same format as FP8E4M3Nv, but:
+#   - the exponent bias is 15 instead of 7
+#   - 0xff and 0x7f are mapped to +-1.750 instead of +-nan
+@core.builtin
+def convert_fp8e4b15_to_float16(arg, _semantic=None):
+    return core.inline_asm_elementwise(
+        "{                                      \n"
+        ".reg .b32 a<2>, b<2>;                  \n"
+        "prmt.b32 a0, 0, $2, 0x5746;            \n"
+        "and.b32 b0, a0, 0x7f007f00;            \n"
+        "and.b32 b1, a0, 0x00ff00ff;            \n"
+        "and.b32 a1, a0, 0x00800080;            \n"
+        "shr.b32  b0, b0, 1;                    \n"
+        "add.u32 b1, b1, a1;                    \n"
+        "lop3.b32 $0, b0, 0x80008000, a0, 0xf8; \n"
+        "shl.b32 $1, b1, 7;                     \n"
+        "}                                      \n", "=r,=r,r", [arg], dtype=core.float16, is_pure=True, pack=4,
+        _semantic=_semantic)
+@core.builtin
+def convert_float16_to_fp8e4b15(arg, has_minx2, _semantic=None):
+    asm = """{
+            .reg .pred p<4>;
+            .reg .b32 a<2>, b<2>;
+            .reg .b16 c<4>;
+            .reg .b16 max_val_f16;
+            .reg .b32 max_val_f16x2;
+            mov.b16 max_val_f16,   0x3F00;
+            mov.b32 max_val_f16x2, 0x3F003F00;
+            and.b32 a0, $1, 0x7fff7fff;
+            and.b32 a1, $2, 0x7fff7fff;"""
+    if has_minx2:
+        asm += """min.f16x2 a0, a0, max_val_f16x2;
+                  min.f16x2 a1, a1, max_val_f16x2;"""
+    else:
+        asm += """setp.lt.f16x2  p0|p1, a0, max_val_f16x2;
+                  setp.lt.f16x2  p2|p3, a1, max_val_f16x2;
+                  mov.b32 {c0, c1}, a0;
+                  mov.b32 {c2, c3}, a1;
+                  selp.b16  c0, c0, max_val_f16, p0;
+                  selp.b16  c1, c1, max_val_f16, p1;
+                  selp.b16  c2, c2, max_val_f16, p2;
+                  selp.b16  c3, c3, max_val_f16, p3;
+                  mov.b32 a0, {c0, c1};
+                  mov.b32 a1, {c2, c3};"""
+    asm += """mad.lo.u32 a0, a0, 2, 0x00800080;
+              mad.lo.u32 a1, a1, 2, 0x00800080;
+              lop3.b32 b0, $1, 0x80008000, a0, 0xea;
+              lop3.b32 b1, $2, 0x80008000, a1, 0xea;
+              prmt.b32 $0, b0, b1, 0x7531;
+              }"""
+    return core.inline_asm_elementwise(asm, "=r,r,r", [arg], dtype=core.float8e4b15, is_pure=True, pack=4,
+                                       _semantic=_semantic)
+@core.builtin
+def convert_custom_float8(arg, dst_ty, fp_downcast_rounding, has_minx2, _semantic=None):
+    if arg.type.scalar.is_fp8e4b15():
+        upcast_val = convert_fp8e4b15_to_float16(arg, _semantic=_semantic)
+        if dst_ty.scalar.is_fp32():
+            upcast_val = upcast_val.to(core.float32, _semantic=_semantic)
+        return upcast_val
+    assert arg.type.scalar.is_fp16() or arg.type.scalar.is_fp32()
+    downcast_val = arg
+    if arg.type.scalar.is_fp32():
+        downcast_val = downcast_val.to(core.float16, fp_downcast_rounding="rtz", _semantic=_semantic)
+    downcast_val = convert_float16_to_fp8e4b15(downcast_val, has_minx2=has_minx2, _semantic=_semantic)
+    return downcast_val
+@core.builtin
+def convert_custom_float8_sm80(arg, dst_ty, fp_downcast_rounding=None, _semantic=None):
+    return convert_custom_float8(arg, dst_ty, fp_downcast_rounding, has_minx2=True, _semantic=_semantic)
+@core.builtin
+def convert_custom_float8_sm70(arg, dst_ty, fp_downcast_rounding=None, _semantic=None):
+    return convert_custom_float8(arg, dst_ty, fp_downcast_rounding, has_minx2=False, _semantic=_semantic)

Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/language/extra/hip/__init__.py ADDED Viewed

	@@ -0,0 +1,5 @@

+from . import libdevice
+from .utils import memrealtime
+__all__ = ["libdevice", "memrealtime"]

Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/language/extra/hip/__pycache__/__init__.cpython-312.pyc ADDED Viewed

Binary file (347 Bytes). View file