lsmpp commited on
Commit
d57e24e
·
verified ·
1 Parent(s): 642ad6a

Add files using upload-large-folder tool

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .gitattributes +248 -0
  2. .venv/lib/python3.12/site-packages/importlib_metadata/__pycache__/__init__.cpython-312.pyc +0 -0
  3. .venv/lib/python3.12/site-packages/importlib_metadata/__pycache__/_adapters.cpython-312.pyc +0 -0
  4. .venv/lib/python3.12/site-packages/importlib_metadata/__pycache__/_collections.cpython-312.pyc +0 -0
  5. .venv/lib/python3.12/site-packages/importlib_metadata/__pycache__/_compat.cpython-312.pyc +0 -0
  6. .venv/lib/python3.12/site-packages/importlib_metadata/__pycache__/_functools.cpython-312.pyc +0 -0
  7. .venv/lib/python3.12/site-packages/importlib_metadata/__pycache__/_itertools.cpython-312.pyc +0 -0
  8. .venv/lib/python3.12/site-packages/importlib_metadata/__pycache__/_meta.cpython-312.pyc +0 -0
  9. .venv/lib/python3.12/site-packages/importlib_metadata/__pycache__/_text.cpython-312.pyc +0 -0
  10. .venv/lib/python3.12/site-packages/importlib_metadata/__pycache__/_typing.cpython-312.pyc +0 -0
  11. .venv/lib/python3.12/site-packages/importlib_metadata/compat/__init__.py +0 -0
  12. .venv/lib/python3.12/site-packages/importlib_metadata/compat/__pycache__/__init__.cpython-312.pyc +0 -0
  13. .venv/lib/python3.12/site-packages/importlib_metadata/compat/__pycache__/py311.cpython-312.pyc +0 -0
  14. .venv/lib/python3.12/site-packages/importlib_metadata/compat/__pycache__/py39.cpython-312.pyc +0 -0
  15. .venv/lib/python3.12/site-packages/importlib_metadata/compat/py311.py +22 -0
  16. .venv/lib/python3.12/site-packages/importlib_metadata/compat/py39.py +42 -0
  17. .venv/lib/python3.12/site-packages/nvidia/cudnn/include/cudnn.h +68 -0
  18. .venv/lib/python3.12/site-packages/nvidia/cudnn/include/cudnn_adv.h +669 -0
  19. .venv/lib/python3.12/site-packages/nvidia/cudnn/include/cudnn_adv_v9.h +669 -0
  20. .venv/lib/python3.12/site-packages/nvidia/cudnn/include/cudnn_backend.h +60 -0
  21. .venv/lib/python3.12/site-packages/nvidia/cudnn/include/cudnn_backend_v9.h +60 -0
  22. .venv/lib/python3.12/site-packages/nvidia/cudnn/include/cudnn_cnn.h +693 -0
  23. .venv/lib/python3.12/site-packages/nvidia/cudnn/include/cudnn_cnn_v9.h +693 -0
  24. .venv/lib/python3.12/site-packages/nvidia/cudnn/include/cudnn_graph.h +992 -0
  25. .venv/lib/python3.12/site-packages/nvidia/cudnn/include/cudnn_graph_v9.h +992 -0
  26. .venv/lib/python3.12/site-packages/nvidia/cudnn/include/cudnn_ops.h +1316 -0
  27. .venv/lib/python3.12/site-packages/nvidia/cudnn/include/cudnn_ops_v9.h +1316 -0
  28. .venv/lib/python3.12/site-packages/nvidia/cudnn/include/cudnn_v9.h +68 -0
  29. .venv/lib/python3.12/site-packages/nvidia/cudnn/include/cudnn_version.h +70 -0
  30. .venv/lib/python3.12/site-packages/nvidia/cudnn/include/cudnn_version_v9.h +70 -0
  31. .venv/lib/python3.12/site-packages/nvidia_nccl_cu12-2.27.3.dist-info/licenses/License.txt +39 -0
  32. .venv/lib/python3.12/site-packages/sklearn/__check_build/__init__.py +54 -0
  33. .venv/lib/python3.12/site-packages/sklearn/__check_build/_check_build.cpython-312-x86_64-linux-gnu.so +0 -0
  34. .venv/lib/python3.12/site-packages/sklearn/__check_build/_check_build.pyx +2 -0
  35. .venv/lib/python3.12/site-packages/sklearn/__check_build/meson.build +6 -0
  36. .venv/lib/python3.12/site-packages/sklearn/__pycache__/__init__.cpython-312.pyc +0 -0
  37. .venv/lib/python3.12/site-packages/sklearn/__pycache__/_built_with_meson.cpython-312.pyc +0 -0
  38. .venv/lib/python3.12/site-packages/sklearn/__pycache__/_config.cpython-312.pyc +0 -0
  39. .venv/lib/python3.12/site-packages/sklearn/__pycache__/_distributor_init.cpython-312.pyc +0 -0
  40. .venv/lib/python3.12/site-packages/sklearn/__pycache__/base.cpython-312.pyc +0 -0
  41. .venv/lib/python3.12/site-packages/sklearn/__pycache__/exceptions.cpython-312.pyc +0 -0
  42. .venv/lib/python3.12/site-packages/sklearn/_build_utils/__init__.py +0 -0
  43. .venv/lib/python3.12/site-packages/sklearn/_build_utils/tempita.py +62 -0
  44. .venv/lib/python3.12/site-packages/sklearn/_build_utils/version.py +16 -0
  45. .venv/lib/python3.12/site-packages/sklearn/_loss/__init__.py +33 -0
  46. .venv/lib/python3.12/site-packages/sklearn/_loss/_loss.pxd +101 -0
  47. .venv/lib/python3.12/site-packages/sklearn/_loss/_loss.pyx.tp +1505 -0
  48. .venv/lib/python3.12/site-packages/sklearn/_loss/link.py +282 -0
  49. .venv/lib/python3.12/site-packages/sklearn/_loss/loss.py +1181 -0
  50. .venv/lib/python3.12/site-packages/sklearn/_loss/meson.build +23 -0
.gitattributes CHANGED
@@ -809,3 +809,251 @@ illustrious_generated/3e2afaad2b7d.png filter=lfs diff=lfs merge=lfs -text
809
  illustrious_generated/04d6bfa98264.png filter=lfs diff=lfs merge=lfs -text
810
  illustrious_generated/62a8fa0ac7dd.png filter=lfs diff=lfs merge=lfs -text
811
  illustrious_generated/d190d03f64a7.png filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
809
  illustrious_generated/04d6bfa98264.png filter=lfs diff=lfs merge=lfs -text
810
  illustrious_generated/62a8fa0ac7dd.png filter=lfs diff=lfs merge=lfs -text
811
  illustrious_generated/d190d03f64a7.png filter=lfs diff=lfs merge=lfs -text
812
+ illustrious_generated/f6342e8db68a.png filter=lfs diff=lfs merge=lfs -text
813
+ illustrious_generated/f7ca451e1933.png filter=lfs diff=lfs merge=lfs -text
814
+ illustrious_generated/6b3c44df8332.png filter=lfs diff=lfs merge=lfs -text
815
+ illustrious_generated/ed13e74032fb.png filter=lfs diff=lfs merge=lfs -text
816
+ illustrious_generated/faa1e7049117.png filter=lfs diff=lfs merge=lfs -text
817
+ illustrious_generated/c17212cc7fda.png filter=lfs diff=lfs merge=lfs -text
818
+ illustrious_generated/6c268f463a2b.png filter=lfs diff=lfs merge=lfs -text
819
+ illustrious_generated/a364591ba4c1.png filter=lfs diff=lfs merge=lfs -text
820
+ illustrious_generated/2ea3ba7918b4.png filter=lfs diff=lfs merge=lfs -text
821
+ illustrious_generated/2ffb09f5cbc0.png filter=lfs diff=lfs merge=lfs -text
822
+ illustrious_generated/0d55065059c0.png filter=lfs diff=lfs merge=lfs -text
823
+ illustrious_generated/85e9723ae8cf.png filter=lfs diff=lfs merge=lfs -text
824
+ illustrious_generated/e89ab638d462.png filter=lfs diff=lfs merge=lfs -text
825
+ illustrious_generated/224c2084abb8.png filter=lfs diff=lfs merge=lfs -text
826
+ illustrious_generated/0b77d88bc5f0.png filter=lfs diff=lfs merge=lfs -text
827
+ illustrious_generated/91076903bce5.png filter=lfs diff=lfs merge=lfs -text
828
+ illustrious_generated/7acda55248bc.png filter=lfs diff=lfs merge=lfs -text
829
+ illustrious_generated/ee32c9618a12.png filter=lfs diff=lfs merge=lfs -text
830
+ illustrious_generated/698a4bf05f13.png filter=lfs diff=lfs merge=lfs -text
831
+ illustrious_generated/bf97f1eaffeb.png filter=lfs diff=lfs merge=lfs -text
832
+ illustrious_generated/62daa562132c.png filter=lfs diff=lfs merge=lfs -text
833
+ illustrious_generated/9ee7e057c8a2.png filter=lfs diff=lfs merge=lfs -text
834
+ illustrious_generated/427d956c743b.png filter=lfs diff=lfs merge=lfs -text
835
+ illustrious_generated/06da7f820423.png filter=lfs diff=lfs merge=lfs -text
836
+ illustrious_generated/92bcab0aaba1.png filter=lfs diff=lfs merge=lfs -text
837
+ illustrious_generated/502a84449b45.png filter=lfs diff=lfs merge=lfs -text
838
+ illustrious_generated/d99abaed93ba.png filter=lfs diff=lfs merge=lfs -text
839
+ illustrious_generated/3a12bf82c05e.png filter=lfs diff=lfs merge=lfs -text
840
+ illustrious_generated/433a115b55a3.png filter=lfs diff=lfs merge=lfs -text
841
+ illustrious_generated/574012fe8664.png filter=lfs diff=lfs merge=lfs -text
842
+ illustrious_generated/7d22dc2a6fb2.png filter=lfs diff=lfs merge=lfs -text
843
+ illustrious_generated/4f23c350b644.png filter=lfs diff=lfs merge=lfs -text
844
+ illustrious_generated/e24085ea542f.png filter=lfs diff=lfs merge=lfs -text
845
+ illustrious_generated/3cc7f3366f7a.png filter=lfs diff=lfs merge=lfs -text
846
+ illustrious_generated/5242430c6777.png filter=lfs diff=lfs merge=lfs -text
847
+ illustrious_generated/6fe5f96649a3.png filter=lfs diff=lfs merge=lfs -text
848
+ illustrious_generated/12875eda15eb.png filter=lfs diff=lfs merge=lfs -text
849
+ illustrious_generated/eac29190186c.png filter=lfs diff=lfs merge=lfs -text
850
+ illustrious_generated/c1276a9fc21b.png filter=lfs diff=lfs merge=lfs -text
851
+ illustrious_generated/a891e5d92031.png filter=lfs diff=lfs merge=lfs -text
852
+ illustrious_generated/0367ba694b76.png filter=lfs diff=lfs merge=lfs -text
853
+ illustrious_generated/f84f116882be.png filter=lfs diff=lfs merge=lfs -text
854
+ illustrious_generated/b8e81c1a4bd1.png filter=lfs diff=lfs merge=lfs -text
855
+ illustrious_generated/392a7a129a01.png filter=lfs diff=lfs merge=lfs -text
856
+ illustrious_generated/1506e01a5598.png filter=lfs diff=lfs merge=lfs -text
857
+ illustrious_generated/cbd5827b38ea.png filter=lfs diff=lfs merge=lfs -text
858
+ illustrious_generated/b80b59fe722f.png filter=lfs diff=lfs merge=lfs -text
859
+ illustrious_generated/a2ca03055273.png filter=lfs diff=lfs merge=lfs -text
860
+ illustrious_generated/b58cf17494db.png filter=lfs diff=lfs merge=lfs -text
861
+ illustrious_generated/4c587778617b.png filter=lfs diff=lfs merge=lfs -text
862
+ illustrious_generated/7c5200560049.png filter=lfs diff=lfs merge=lfs -text
863
+ illustrious_generated/b78d0c1f0687.png filter=lfs diff=lfs merge=lfs -text
864
+ illustrious_generated/5c6f22f08540.png filter=lfs diff=lfs merge=lfs -text
865
+ illustrious_generated/9b2b12c21a2b.png filter=lfs diff=lfs merge=lfs -text
866
+ illustrious_generated/ec96a311c2cb.png filter=lfs diff=lfs merge=lfs -text
867
+ illustrious_generated/a28e4715fc8c.png filter=lfs diff=lfs merge=lfs -text
868
+ illustrious_generated/00f5e16a2236.png filter=lfs diff=lfs merge=lfs -text
869
+ illustrious_generated/0ef8c1ed2c6c.png filter=lfs diff=lfs merge=lfs -text
870
+ illustrious_generated/f214facc5681.png filter=lfs diff=lfs merge=lfs -text
871
+ illustrious_generated/f41b4fc2c7d5.png filter=lfs diff=lfs merge=lfs -text
872
+ illustrious_generated/9e9a0ce3d676.png filter=lfs diff=lfs merge=lfs -text
873
+ illustrious_generated/26d2ef2d7d03.png filter=lfs diff=lfs merge=lfs -text
874
+ illustrious_generated/1e774fcc188d.png filter=lfs diff=lfs merge=lfs -text
875
+ illustrious_generated/7eab3f4f0c8e.png filter=lfs diff=lfs merge=lfs -text
876
+ illustrious_generated/f8631de95d70.png filter=lfs diff=lfs merge=lfs -text
877
+ illustrious_generated/8d95e57fcb27.png filter=lfs diff=lfs merge=lfs -text
878
+ illustrious_generated/7ac791baad53.png filter=lfs diff=lfs merge=lfs -text
879
+ illustrious_generated/7b8529c066a0.png filter=lfs diff=lfs merge=lfs -text
880
+ illustrious_generated/7d8509931e4e.png filter=lfs diff=lfs merge=lfs -text
881
+ illustrious_generated/9fafd1175b72.png filter=lfs diff=lfs merge=lfs -text
882
+ illustrious_generated/7023242de1c0.png filter=lfs diff=lfs merge=lfs -text
883
+ illustrious_generated/99d5b088ccd4.png filter=lfs diff=lfs merge=lfs -text
884
+ illustrious_generated/2bac6ab4413e.png filter=lfs diff=lfs merge=lfs -text
885
+ illustrious_generated/00ff6449b55d.png filter=lfs diff=lfs merge=lfs -text
886
+ illustrious_generated/7b900f6e27b1.png filter=lfs diff=lfs merge=lfs -text
887
+ illustrious_generated/69e10254baf5.png filter=lfs diff=lfs merge=lfs -text
888
+ illustrious_generated/93d9e9abc98e.png filter=lfs diff=lfs merge=lfs -text
889
+ illustrious_generated/095dc81d1160.png filter=lfs diff=lfs merge=lfs -text
890
+ illustrious_generated/3315198d28df.png filter=lfs diff=lfs merge=lfs -text
891
+ illustrious_generated/2549abad7eff.png filter=lfs diff=lfs merge=lfs -text
892
+ illustrious_generated/8a90db3476ef.png filter=lfs diff=lfs merge=lfs -text
893
+ illustrious_generated/72473c769552.png filter=lfs diff=lfs merge=lfs -text
894
+ illustrious_generated/bbf3fb096202.png filter=lfs diff=lfs merge=lfs -text
895
+ illustrious_generated/c5e0eb8a2241.png filter=lfs diff=lfs merge=lfs -text
896
+ illustrious_generated/8fa96985fc06.png filter=lfs diff=lfs merge=lfs -text
897
+ illustrious_generated/645e3b996530.png filter=lfs diff=lfs merge=lfs -text
898
+ illustrious_generated/b9fdc64b985c.png filter=lfs diff=lfs merge=lfs -text
899
+ illustrious_generated/fa67e15ca2bf.png filter=lfs diff=lfs merge=lfs -text
900
+ illustrious_generated/9f5c49f2e362.png filter=lfs diff=lfs merge=lfs -text
901
+ illustrious_generated/e8318516b273.png filter=lfs diff=lfs merge=lfs -text
902
+ illustrious_generated/e801a5ce2da6.png filter=lfs diff=lfs merge=lfs -text
903
+ illustrious_generated/cd9145683d1e.png filter=lfs diff=lfs merge=lfs -text
904
+ illustrious_generated/275253c8ad6b.png filter=lfs diff=lfs merge=lfs -text
905
+ illustrious_generated/f2a6e0c5c432.png filter=lfs diff=lfs merge=lfs -text
906
+ illustrious_generated/586dbda7c6ff.png filter=lfs diff=lfs merge=lfs -text
907
+ illustrious_generated/dff506d177c0.png filter=lfs diff=lfs merge=lfs -text
908
+ illustrious_generated/c8846919f3a8.png filter=lfs diff=lfs merge=lfs -text
909
+ illustrious_generated/afbdb8dce1e5.png filter=lfs diff=lfs merge=lfs -text
910
+ illustrious_generated/fd4c46f2141f.png filter=lfs diff=lfs merge=lfs -text
911
+ illustrious_generated/ee36cea22c91.png filter=lfs diff=lfs merge=lfs -text
912
+ illustrious_generated/6ca60a86b836.png filter=lfs diff=lfs merge=lfs -text
913
+ illustrious_generated/11c7f55b2aab.png filter=lfs diff=lfs merge=lfs -text
914
+ illustrious_generated/d684bc0d0627.png filter=lfs diff=lfs merge=lfs -text
915
+ illustrious_generated/4f1602c01d5b.png filter=lfs diff=lfs merge=lfs -text
916
+ illustrious_generated/45c709323899.png filter=lfs diff=lfs merge=lfs -text
917
+ illustrious_generated/d7bc7c5ba632.png filter=lfs diff=lfs merge=lfs -text
918
+ illustrious_generated/0e0acc59ef85.png filter=lfs diff=lfs merge=lfs -text
919
+ illustrious_generated/1c7a7ed6f359.png filter=lfs diff=lfs merge=lfs -text
920
+ illustrious_generated/31cbd66704bb.png filter=lfs diff=lfs merge=lfs -text
921
+ illustrious_generated/dd8a48931525.png filter=lfs diff=lfs merge=lfs -text
922
+ illustrious_generated/7368d4c82b5f.png filter=lfs diff=lfs merge=lfs -text
923
+ illustrious_generated/c7e1a60c0f5d.png filter=lfs diff=lfs merge=lfs -text
924
+ illustrious_generated/be56d67f1e08.png filter=lfs diff=lfs merge=lfs -text
925
+ illustrious_generated/269ee6e9a79c.png filter=lfs diff=lfs merge=lfs -text
926
+ illustrious_generated/2bb0e99b92bc.png filter=lfs diff=lfs merge=lfs -text
927
+ illustrious_generated/afd28993674d.png filter=lfs diff=lfs merge=lfs -text
928
+ illustrious_generated/585afc2017e2.png filter=lfs diff=lfs merge=lfs -text
929
+ illustrious_generated/f9c5bdc8bef5.png filter=lfs diff=lfs merge=lfs -text
930
+ illustrious_generated/8f338d47820a.png filter=lfs diff=lfs merge=lfs -text
931
+ illustrious_generated/e0443895d658.png filter=lfs diff=lfs merge=lfs -text
932
+ illustrious_generated/67ea9c16fed3.png filter=lfs diff=lfs merge=lfs -text
933
+ illustrious_generated/78dfdb4f0521.png filter=lfs diff=lfs merge=lfs -text
934
+ illustrious_generated/fff7c0390e8a.png filter=lfs diff=lfs merge=lfs -text
935
+ illustrious_generated/c63799030196.png filter=lfs diff=lfs merge=lfs -text
936
+ illustrious_generated/fc061ac787c7.png filter=lfs diff=lfs merge=lfs -text
937
+ illustrious_generated/26185801988b.png filter=lfs diff=lfs merge=lfs -text
938
+ illustrious_generated/656abae8d0b6.png filter=lfs diff=lfs merge=lfs -text
939
+ illustrious_generated/5c4a2ea8f842.png filter=lfs diff=lfs merge=lfs -text
940
+ illustrious_generated/2286bf835a6b.png filter=lfs diff=lfs merge=lfs -text
941
+ illustrious_generated/dc7501a6f47f.png filter=lfs diff=lfs merge=lfs -text
942
+ illustrious_generated/38b5363061d5.png filter=lfs diff=lfs merge=lfs -text
943
+ illustrious_generated/451e48977b1a.png filter=lfs diff=lfs merge=lfs -text
944
+ illustrious_generated/f7621703575c.png filter=lfs diff=lfs merge=lfs -text
945
+ illustrious_generated/891dc839571c.png filter=lfs diff=lfs merge=lfs -text
946
+ illustrious_generated/d1e30fd687b5.png filter=lfs diff=lfs merge=lfs -text
947
+ illustrious_generated/d1413371999b.png filter=lfs diff=lfs merge=lfs -text
948
+ illustrious_generated/0ad3307ea09c.png filter=lfs diff=lfs merge=lfs -text
949
+ illustrious_generated/6fba429dafc5.png filter=lfs diff=lfs merge=lfs -text
950
+ illustrious_generated/481f3834876a.png filter=lfs diff=lfs merge=lfs -text
951
+ illustrious_generated/1e54c0c78134.png filter=lfs diff=lfs merge=lfs -text
952
+ illustrious_generated/a564e408f362.png filter=lfs diff=lfs merge=lfs -text
953
+ illustrious_generated/ec6650b62802.png filter=lfs diff=lfs merge=lfs -text
954
+ illustrious_generated/9f447e4cf3d7.png filter=lfs diff=lfs merge=lfs -text
955
+ illustrious_generated/790ece21df10.png filter=lfs diff=lfs merge=lfs -text
956
+ illustrious_generated/75e576f27cb6.png filter=lfs diff=lfs merge=lfs -text
957
+ illustrious_generated/205b715d279f.png filter=lfs diff=lfs merge=lfs -text
958
+ illustrious_generated/060e926dcc0a.png filter=lfs diff=lfs merge=lfs -text
959
+ illustrious_generated/733c86338921.png filter=lfs diff=lfs merge=lfs -text
960
+ illustrious_generated/b9f37572031b.png filter=lfs diff=lfs merge=lfs -text
961
+ illustrious_generated/43eeb1fb403b.png filter=lfs diff=lfs merge=lfs -text
962
+ illustrious_generated/d22ef7243fac.png filter=lfs diff=lfs merge=lfs -text
963
+ illustrious_generated/162e3face5a7.png filter=lfs diff=lfs merge=lfs -text
964
+ illustrious_generated/765bf9d23c7e.png filter=lfs diff=lfs merge=lfs -text
965
+ illustrious_generated/47418c15a58f.png filter=lfs diff=lfs merge=lfs -text
966
+ illustrious_generated/3030bee9df5a.png filter=lfs diff=lfs merge=lfs -text
967
+ illustrious_generated/e4acb93d313c.png filter=lfs diff=lfs merge=lfs -text
968
+ illustrious_generated/08e454ab01c2.png filter=lfs diff=lfs merge=lfs -text
969
+ illustrious_generated/3f43e650c7d7.png filter=lfs diff=lfs merge=lfs -text
970
+ illustrious_generated/085929212457.png filter=lfs diff=lfs merge=lfs -text
971
+ illustrious_generated/91d346543b7c.png filter=lfs diff=lfs merge=lfs -text
972
+ illustrious_generated/891abd7c9fa3.png filter=lfs diff=lfs merge=lfs -text
973
+ illustrious_generated/1927adcb399a.png filter=lfs diff=lfs merge=lfs -text
974
+ illustrious_generated/7e49e6b5a30b.png filter=lfs diff=lfs merge=lfs -text
975
+ illustrious_generated/2cd36314054f.png filter=lfs diff=lfs merge=lfs -text
976
+ illustrious_generated/b569d3590c66.png filter=lfs diff=lfs merge=lfs -text
977
+ illustrious_generated/9e8dc59217e8.png filter=lfs diff=lfs merge=lfs -text
978
+ illustrious_generated/c2c3bea0e9d5.png filter=lfs diff=lfs merge=lfs -text
979
+ illustrious_generated/05972b153525.png filter=lfs diff=lfs merge=lfs -text
980
+ illustrious_generated/c9bf921e364a.png filter=lfs diff=lfs merge=lfs -text
981
+ illustrious_generated/13cdedc9c525.png filter=lfs diff=lfs merge=lfs -text
982
+ illustrious_generated/d8641bfcdd46.png filter=lfs diff=lfs merge=lfs -text
983
+ illustrious_generated/34afbd2725c8.png filter=lfs diff=lfs merge=lfs -text
984
+ illustrious_generated/f0d97f98333f.png filter=lfs diff=lfs merge=lfs -text
985
+ illustrious_generated/76b2de1037cb.png filter=lfs diff=lfs merge=lfs -text
986
+ illustrious_generated/a370eb471cd7.png filter=lfs diff=lfs merge=lfs -text
987
+ illustrious_generated/f5ab32c63fb8.png filter=lfs diff=lfs merge=lfs -text
988
+ illustrious_generated/5718f8172842.png filter=lfs diff=lfs merge=lfs -text
989
+ illustrious_generated/b7f508ecce88.png filter=lfs diff=lfs merge=lfs -text
990
+ illustrious_generated/5f147d77f3ed.png filter=lfs diff=lfs merge=lfs -text
991
+ illustrious_generated/ac9d950baac7.png filter=lfs diff=lfs merge=lfs -text
992
+ illustrious_generated/8b674edb3a4e.png filter=lfs diff=lfs merge=lfs -text
993
+ illustrious_generated/8ad0a744de62.png filter=lfs diff=lfs merge=lfs -text
994
+ illustrious_generated/5b8f74bcc260.png filter=lfs diff=lfs merge=lfs -text
995
+ illustrious_generated/78026f131004.png filter=lfs diff=lfs merge=lfs -text
996
+ illustrious_generated/d305fe437c6f.png filter=lfs diff=lfs merge=lfs -text
997
+ illustrious_generated/7cce990ade4c.png filter=lfs diff=lfs merge=lfs -text
998
+ illustrious_generated/c76729f0f827.png filter=lfs diff=lfs merge=lfs -text
999
+ illustrious_generated/0706f94ebdc3.png filter=lfs diff=lfs merge=lfs -text
1000
+ illustrious_generated/22af9def0424.png filter=lfs diff=lfs merge=lfs -text
1001
+ illustrious_generated/43877698ad33.png filter=lfs diff=lfs merge=lfs -text
1002
+ illustrious_generated/5a0201bebc6d.png filter=lfs diff=lfs merge=lfs -text
1003
+ illustrious_generated/7ad096e9b528.png filter=lfs diff=lfs merge=lfs -text
1004
+ illustrious_generated/46edb49b5dbf.png filter=lfs diff=lfs merge=lfs -text
1005
+ illustrious_generated/bd65b176bfe6.png filter=lfs diff=lfs merge=lfs -text
1006
+ illustrious_generated/073f299a3b06.png filter=lfs diff=lfs merge=lfs -text
1007
+ illustrious_generated/fc885c9be9af.png filter=lfs diff=lfs merge=lfs -text
1008
+ illustrious_generated/bcfc32b88c98.png filter=lfs diff=lfs merge=lfs -text
1009
+ illustrious_generated/e55e6cf94025.png filter=lfs diff=lfs merge=lfs -text
1010
+ illustrious_generated/b4a9600f3647.png filter=lfs diff=lfs merge=lfs -text
1011
+ illustrious_generated/d7ef34bf47ee.png filter=lfs diff=lfs merge=lfs -text
1012
+ illustrious_generated/8cbc6e1dbe62.png filter=lfs diff=lfs merge=lfs -text
1013
+ illustrious_generated/8633a3dff7ea.png filter=lfs diff=lfs merge=lfs -text
1014
+ illustrious_generated/cb335826ba02.png filter=lfs diff=lfs merge=lfs -text
1015
+ illustrious_generated/3048ba382498.png filter=lfs diff=lfs merge=lfs -text
1016
+ illustrious_generated/eca43ddadd85.png filter=lfs diff=lfs merge=lfs -text
1017
+ illustrious_generated/365e7d0f97c2.png filter=lfs diff=lfs merge=lfs -text
1018
+ illustrious_generated/e71b25950c5d.png filter=lfs diff=lfs merge=lfs -text
1019
+ illustrious_generated/59a595c825c8.png filter=lfs diff=lfs merge=lfs -text
1020
+ illustrious_generated/82ee8177ef04.png filter=lfs diff=lfs merge=lfs -text
1021
+ illustrious_generated/36915299353b.png filter=lfs diff=lfs merge=lfs -text
1022
+ illustrious_generated/ca07713b354c.png filter=lfs diff=lfs merge=lfs -text
1023
+ illustrious_generated/fbebd175667e.png filter=lfs diff=lfs merge=lfs -text
1024
+ illustrious_generated/dacfbbcd3fb3.png filter=lfs diff=lfs merge=lfs -text
1025
+ illustrious_generated/8a371dac467c.png filter=lfs diff=lfs merge=lfs -text
1026
+ illustrious_generated/40c498965cbd.png filter=lfs diff=lfs merge=lfs -text
1027
+ illustrious_generated/190beb9306ef.png filter=lfs diff=lfs merge=lfs -text
1028
+ illustrious_generated/bb2041beb345.png filter=lfs diff=lfs merge=lfs -text
1029
+ illustrious_generated/6f1c05af41ca.png filter=lfs diff=lfs merge=lfs -text
1030
+ illustrious_generated/9f741bd68919.png filter=lfs diff=lfs merge=lfs -text
1031
+ illustrious_generated/9bb815cccb98.png filter=lfs diff=lfs merge=lfs -text
1032
+ illustrious_generated/41d42d8f4842.png filter=lfs diff=lfs merge=lfs -text
1033
+ illustrious_generated/13166cbea867.png filter=lfs diff=lfs merge=lfs -text
1034
+ illustrious_generated/e2812aff73e9.png filter=lfs diff=lfs merge=lfs -text
1035
+ illustrious_generated/954594f7f0a6.png filter=lfs diff=lfs merge=lfs -text
1036
+ illustrious_generated/c4b5bff2dbc1.png filter=lfs diff=lfs merge=lfs -text
1037
+ illustrious_generated/980b174e831c.png filter=lfs diff=lfs merge=lfs -text
1038
+ illustrious_generated/ed89a47fd589.png filter=lfs diff=lfs merge=lfs -text
1039
+ illustrious_generated/a8e5c9011eef.png filter=lfs diff=lfs merge=lfs -text
1040
+ illustrious_generated/f1de13ffcad6.png filter=lfs diff=lfs merge=lfs -text
1041
+ illustrious_generated/574fba2c6515.png filter=lfs diff=lfs merge=lfs -text
1042
+ illustrious_generated/591e156ad5fd.png filter=lfs diff=lfs merge=lfs -text
1043
+ illustrious_generated/aef907db00ce.png filter=lfs diff=lfs merge=lfs -text
1044
+ illustrious_generated/3967f8d787ab.png filter=lfs diff=lfs merge=lfs -text
1045
+ illustrious_generated/a1ec0d3b0b0e.png filter=lfs diff=lfs merge=lfs -text
1046
+ illustrious_generated/9da135f5f21e.png filter=lfs diff=lfs merge=lfs -text
1047
+ illustrious_generated/8fd9fbffb954.png filter=lfs diff=lfs merge=lfs -text
1048
+ illustrious_generated/24e5b9fe7d38.png filter=lfs diff=lfs merge=lfs -text
1049
+ illustrious_generated/495f1b55919f.png filter=lfs diff=lfs merge=lfs -text
1050
+ illustrious_generated/19ff2ce2a961.png filter=lfs diff=lfs merge=lfs -text
1051
+ illustrious_generated/e39fecdd2676.png filter=lfs diff=lfs merge=lfs -text
1052
+ illustrious_generated/7663094bacec.png filter=lfs diff=lfs merge=lfs -text
1053
+ illustrious_generated/6d5feb7de870.png filter=lfs diff=lfs merge=lfs -text
1054
+ illustrious_generated/abe90752beb0.png filter=lfs diff=lfs merge=lfs -text
1055
+ illustrious_generated/cae43d7fd0f8.png filter=lfs diff=lfs merge=lfs -text
1056
+ illustrious_generated/3f5c59c8ee7b.png filter=lfs diff=lfs merge=lfs -text
1057
+ illustrious_generated/49712a2e71f1.png filter=lfs diff=lfs merge=lfs -text
1058
+ illustrious_generated/6346f39915f3.png filter=lfs diff=lfs merge=lfs -text
1059
+ illustrious_generated/4c6ea9681419.png filter=lfs diff=lfs merge=lfs -text
.venv/lib/python3.12/site-packages/importlib_metadata/__pycache__/__init__.cpython-312.pyc ADDED
Binary file (56.9 kB). View file
 
.venv/lib/python3.12/site-packages/importlib_metadata/__pycache__/_adapters.cpython-312.pyc ADDED
Binary file (5.93 kB). View file
 
.venv/lib/python3.12/site-packages/importlib_metadata/__pycache__/_collections.cpython-312.pyc ADDED
Binary file (1.98 kB). View file
 
.venv/lib/python3.12/site-packages/importlib_metadata/__pycache__/_compat.cpython-312.pyc ADDED
Binary file (2.26 kB). View file
 
.venv/lib/python3.12/site-packages/importlib_metadata/__pycache__/_functools.cpython-312.pyc ADDED
Binary file (3.49 kB). View file
 
.venv/lib/python3.12/site-packages/importlib_metadata/__pycache__/_itertools.cpython-312.pyc ADDED
Binary file (6.49 kB). View file
 
.venv/lib/python3.12/site-packages/importlib_metadata/__pycache__/_meta.cpython-312.pyc ADDED
Binary file (3.58 kB). View file
 
.venv/lib/python3.12/site-packages/importlib_metadata/__pycache__/_text.cpython-312.pyc ADDED
Binary file (3.89 kB). View file
 
.venv/lib/python3.12/site-packages/importlib_metadata/__pycache__/_typing.cpython-312.pyc ADDED
Binary file (399 Bytes). View file
 
.venv/lib/python3.12/site-packages/importlib_metadata/compat/__init__.py ADDED
File without changes
.venv/lib/python3.12/site-packages/importlib_metadata/compat/__pycache__/__init__.cpython-312.pyc ADDED
Binary file (202 Bytes). View file
 
.venv/lib/python3.12/site-packages/importlib_metadata/compat/__pycache__/py311.cpython-312.pyc ADDED
Binary file (1.27 kB). View file
 
.venv/lib/python3.12/site-packages/importlib_metadata/compat/__pycache__/py39.cpython-312.pyc ADDED
Binary file (1.71 kB). View file
 
.venv/lib/python3.12/site-packages/importlib_metadata/compat/py311.py ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import pathlib
3
+ import sys
4
+ import types
5
+
6
+
7
+ def wrap(path): # pragma: no cover
8
+ """
9
+ Workaround for https://github.com/python/cpython/issues/84538
10
+ to add backward compatibility for walk_up=True.
11
+ An example affected package is dask-labextension, which uses
12
+ jupyter-packaging to install JupyterLab javascript files outside
13
+ of site-packages.
14
+ """
15
+
16
+ def relative_to(root, *, walk_up=False):
17
+ return pathlib.Path(os.path.relpath(path, root))
18
+
19
+ return types.SimpleNamespace(relative_to=relative_to)
20
+
21
+
22
+ relative_fix = wrap if sys.version_info < (3, 12) else lambda x: x
.venv/lib/python3.12/site-packages/importlib_metadata/compat/py39.py ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Compatibility layer with Python 3.8/3.9
3
+ """
4
+
5
+ from __future__ import annotations
6
+
7
+ from typing import TYPE_CHECKING, Any
8
+
9
+ if TYPE_CHECKING: # pragma: no cover
10
+ # Prevent circular imports on runtime.
11
+ from .. import Distribution, EntryPoint
12
+ else:
13
+ Distribution = EntryPoint = Any
14
+
15
+ from .._typing import md_none
16
+
17
+
18
+ def normalized_name(dist: Distribution) -> str | None:
19
+ """
20
+ Honor name normalization for distributions that don't provide ``_normalized_name``.
21
+ """
22
+ try:
23
+ return dist._normalized_name
24
+ except AttributeError:
25
+ from .. import Prepared # -> delay to prevent circular imports.
26
+
27
+ return Prepared.normalize(
28
+ getattr(dist, "name", None) or md_none(dist.metadata)['Name']
29
+ )
30
+
31
+
32
+ def ep_matches(ep: EntryPoint, **params) -> bool:
33
+ """
34
+ Workaround for ``EntryPoint`` objects without the ``matches`` method.
35
+ """
36
+ try:
37
+ return ep.matches(**params)
38
+ except AttributeError:
39
+ from .. import EntryPoint # -> delay to prevent circular imports.
40
+
41
+ # Reconstruct the EntryPoint object to make sure it is compatible.
42
+ return EntryPoint(ep.name, ep.value, ep.group).matches(**params)
.venv/lib/python3.12/site-packages/nvidia/cudnn/include/cudnn.h ADDED
@@ -0,0 +1,68 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * Copyright 2014-2023 NVIDIA Corporation. All rights reserved.
3
+ *
4
+ * NOTICE TO LICENSEE:
5
+ *
6
+ * This source code and/or documentation ("Licensed Deliverables") are
7
+ * subject to NVIDIA intellectual property rights under U.S. and
8
+ * international Copyright laws.
9
+ *
10
+ * These Licensed Deliverables contained herein is PROPRIETARY and
11
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
12
+ * conditions of a form of NVIDIA software license agreement by and
13
+ * between NVIDIA and Licensee ("License Agreement") or electronically
14
+ * accepted by Licensee. Notwithstanding any terms or conditions to
15
+ * the contrary in the License Agreement, reproduction or disclosure
16
+ * of the Licensed Deliverables to any third party without the express
17
+ * written consent of NVIDIA is prohibited.
18
+ *
19
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
20
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
21
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
22
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
23
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
24
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
25
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
26
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
27
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
28
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
29
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
30
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
31
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
32
+ * OF THESE LICENSED DELIVERABLES.
33
+ *
34
+ * U.S. Government End Users. These Licensed Deliverables are a
35
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
36
+ * 1995), consisting of "commercial computer software" and "commercial
37
+ * computer software documentation" as such terms are used in 48
38
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
39
+ * only as a commercial end item. Consistent with 48 C.F.R.12.212 and
40
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
41
+ * U.S. Government End Users acquire the Licensed Deliverables with
42
+ * only those rights set forth herein.
43
+ *
44
+ * Any use of the Licensed Deliverables in individual and commercial
45
+ * software must include, in the user documentation and internal
46
+ * comments to the code, the above Disclaimer and U.S. Government End
47
+ * Users Notice.
48
+ */
49
+
50
+ /* cudnn : Neural Networks Library */
51
+
52
+ #if !defined(CUDNN_H_)
53
+ #define CUDNN_H_
54
+ #if defined(__cplusplus)
55
+ extern "C" {
56
+ #endif
57
+
58
+ #include <cuda_runtime_api.h>
59
+ #include "cudnn_version.h"
60
+ #include "cudnn_graph.h"
61
+ #include "cudnn_ops.h"
62
+ #include "cudnn_adv.h"
63
+ #include "cudnn_cnn.h"
64
+
65
+ #if defined(__cplusplus)
66
+ }
67
+ #endif
68
+ #endif /* CUDNN_H_ */
.venv/lib/python3.12/site-packages/nvidia/cudnn/include/cudnn_adv.h ADDED
@@ -0,0 +1,669 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * Copyright 2014-2023 NVIDIA Corporation. All rights reserved.
3
+ *
4
+ * NOTICE TO LICENSEE:
5
+ *
6
+ * This source code and/or documentation ("Licensed Deliverables") are
7
+ * subject to NVIDIA intellectual property rights under U.S. and
8
+ * international Copyright laws.
9
+ *
10
+ * These Licensed Deliverables contained herein is PROPRIETARY and
11
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
12
+ * conditions of a form of NVIDIA software license agreement by and
13
+ * between NVIDIA and Licensee ("License Agreement") or electronically
14
+ * accepted by Licensee. Notwithstanding any terms or conditions to
15
+ * the contrary in the License Agreement, reproduction or disclosure
16
+ * of the Licensed Deliverables to any third party without the express
17
+ * written consent of NVIDIA is prohibited.
18
+ *
19
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
20
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
21
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
22
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
23
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
24
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
25
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
26
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
27
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
28
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
29
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
30
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
31
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
32
+ * OF THESE LICENSED DELIVERABLES.
33
+ *
34
+ * U.S. Government End Users. These Licensed Deliverables are a
35
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
36
+ * 1995), consisting of "commercial computer software" and "commercial
37
+ * computer software documentation" as such terms are used in 48
38
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
39
+ * only as a commercial end item. Consistent with 48 C.F.R.12.212 and
40
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
41
+ * U.S. Government End Users acquire the Licensed Deliverables with
42
+ * only those rights set forth herein.
43
+ *
44
+ * Any use of the Licensed Deliverables in individual and commercial
45
+ * software must include, in the user documentation and internal
46
+ * comments to the code, the above Disclaimer and U.S. Government End
47
+ * Users Notice.
48
+ */
49
+
50
+ /* cudnn_adv : cuDNN's advanced and experimental features.
51
+
52
+ */
53
+
54
+ #if !defined(CUDNN_ADV_H_)
55
+ #define CUDNN_ADV_H_
56
+
57
+ #include <stdint.h>
58
+
59
+ #include "cudnn_version.h"
60
+ #include "cudnn_ops.h"
61
+
62
+ /* These version numbers are autogenerated, do not edit manually. */
63
+ #define CUDNN_ADV_MAJOR 9
64
+ #define CUDNN_ADV_MINOR 10
65
+ #define CUDNN_ADV_PATCH 2
66
+
67
+ #if (CUDNN_ADV_MAJOR != CUDNN_MAJOR) || (CUDNN_ADV_MINOR != CUDNN_MINOR) || (CUDNN_ADV_PATCH != CUDNN_PATCHLEVEL)
68
+ #error Version mismatch in cuDNN ADV INFER!!!
69
+ #endif
70
+
71
+ #if defined(__cplusplus)
72
+ extern "C" {
73
+ #endif
74
+
75
+ /* BASIC RNN API */
76
+
77
+ typedef enum {
78
+ CUDNN_RNN_ALGO_STANDARD = 0,
79
+ CUDNN_RNN_ALGO_PERSIST_STATIC = 1,
80
+ CUDNN_RNN_ALGO_PERSIST_DYNAMIC = 2,
81
+ CUDNN_RNN_ALGO_PERSIST_STATIC_SMALL_H = 3,
82
+ CUDNN_RNN_ALGO_COUNT = 4,
83
+ } cudnnRNNAlgo_t;
84
+
85
+ typedef enum {
86
+ CUDNN_FWD_MODE_INFERENCE = 0,
87
+ CUDNN_FWD_MODE_TRAINING = 1,
88
+ } cudnnForwardMode_t;
89
+
90
+ typedef enum {
91
+ CUDNN_RNN_RELU = 0, /* basic RNN cell type with ReLu activation */
92
+ CUDNN_RNN_TANH = 1, /* basic RNN cell type with tanh activation */
93
+ CUDNN_LSTM = 2, /* LSTM with optional recurrent projection and clipping */
94
+ CUDNN_GRU = 3, /* Using h' = tanh(r * Uh(t-1) + Wx) and h = (1 - z) * h' + z * h(t-1); */
95
+ } cudnnRNNMode_t;
96
+
97
+ typedef enum {
98
+ CUDNN_RNN_NO_BIAS = 0, /* rnn cell formulas do not use biases */
99
+ CUDNN_RNN_SINGLE_INP_BIAS = 1, /* rnn cell formulas use one input bias in input GEMM */
100
+ CUDNN_RNN_DOUBLE_BIAS = 2, /* default, rnn cell formulas use two bias vectors */
101
+ CUDNN_RNN_SINGLE_REC_BIAS = 3 /* rnn cell formulas use one recurrent bias in recurrent GEMM */
102
+ } cudnnRNNBiasMode_t;
103
+
104
+ typedef enum {
105
+ CUDNN_UNIDIRECTIONAL = 0, /* single direction network */
106
+ CUDNN_BIDIRECTIONAL = 1, /* output concatination at each layer */
107
+ } cudnnDirectionMode_t;
108
+
109
+ typedef enum {
110
+ CUDNN_LINEAR_INPUT = 0, /* adjustable weight matrix in first layer input GEMM */
111
+ CUDNN_SKIP_INPUT = 1, /* fixed identity matrix in the first layer input GEMM */
112
+ } cudnnRNNInputMode_t;
113
+
114
+ typedef enum {
115
+ CUDNN_RNN_CLIP_NONE = 0, /* disables LSTM cell clipping */
116
+ CUDNN_RNN_CLIP_MINMAX = 1, /* enables LSTM cell clipping */
117
+ } cudnnRNNClipMode_t;
118
+
119
+ typedef enum {
120
+ CUDNN_RNN_DATA_LAYOUT_SEQ_MAJOR_UNPACKED = 0, /* padded, outer stride from one time-step to the next */
121
+ CUDNN_RNN_DATA_LAYOUT_SEQ_MAJOR_PACKED = 1, /* sequence length sorted and packed as in basic RNN api */
122
+ CUDNN_RNN_DATA_LAYOUT_BATCH_MAJOR_UNPACKED = 2, /* padded, outer stride from one batch to the next */
123
+ } cudnnRNNDataLayout_t;
124
+
125
+ /* For auxFlags in cudnnSetRNNDescriptor_v8() */
126
+ #define CUDNN_RNN_PADDED_IO_DISABLED 0
127
+ #define CUDNN_RNN_PADDED_IO_ENABLED (1U << 0)
128
+
129
+ struct cudnnRNNStruct;
130
+ typedef struct cudnnRNNStruct *cudnnRNNDescriptor_t;
131
+
132
+ struct cudnnRNNDataStruct;
133
+ typedef struct cudnnRNNDataStruct *cudnnRNNDataDescriptor_t;
134
+
135
+ cudnnStatus_t CUDNNWINAPI
136
+ cudnnCreateRNNDescriptor(cudnnRNNDescriptor_t *rnnDesc);
137
+
138
+ cudnnStatus_t CUDNNWINAPI
139
+ cudnnDestroyRNNDescriptor(cudnnRNNDescriptor_t rnnDesc);
140
+
141
+ /*
142
+ * mathPrec in cudnnSetRNNDescriptor_v8() specifies compute precision.
143
+ * Compute precision is further modified by mathType that sets the
144
+ * preferred option for using NVIDIA Tensor Cores. dataType specify
145
+ * input/output data type and weight/bias type.
146
+ */
147
+
148
+ cudnnStatus_t CUDNNWINAPI
149
+ cudnnSetRNNDescriptor_v8(cudnnRNNDescriptor_t rnnDesc,
150
+ cudnnRNNAlgo_t algo,
151
+ cudnnRNNMode_t cellMode,
152
+ cudnnRNNBiasMode_t biasMode,
153
+ cudnnDirectionMode_t dirMode,
154
+ cudnnRNNInputMode_t inputMode,
155
+ cudnnDataType_t dataType,
156
+ cudnnDataType_t mathPrec,
157
+ cudnnMathType_t mathType,
158
+ int32_t inputSize,
159
+ int32_t hiddenSize,
160
+ int32_t projSize,
161
+ int32_t numLayers,
162
+ cudnnDropoutDescriptor_t dropoutDesc,
163
+ uint32_t auxFlags);
164
+
165
+ cudnnStatus_t CUDNNWINAPI
166
+ cudnnGetRNNDescriptor_v8(cudnnRNNDescriptor_t rnnDesc,
167
+ cudnnRNNAlgo_t *algo,
168
+ cudnnRNNMode_t *cellMode,
169
+ cudnnRNNBiasMode_t *biasMode,
170
+ cudnnDirectionMode_t *dirMode,
171
+ cudnnRNNInputMode_t *inputMode,
172
+ cudnnDataType_t *dataType,
173
+ cudnnDataType_t *mathPrec,
174
+ cudnnMathType_t *mathType,
175
+ int32_t *inputSize,
176
+ int32_t *hiddenSize,
177
+ int32_t *projSize,
178
+ int32_t *numLayers,
179
+ cudnnDropoutDescriptor_t *dropoutDesc,
180
+ uint32_t *auxFlags);
181
+
182
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
183
+ cudnnRNNSetClip_v8(cudnnRNNDescriptor_t rnnDesc,
184
+ cudnnRNNClipMode_t clipMode,
185
+ cudnnNanPropagation_t clipNanOpt,
186
+ double lclip,
187
+ double rclip);
188
+
189
+ cudnnStatus_t CUDNNWINAPI
190
+ cudnnRNNSetClip_v9(cudnnRNNDescriptor_t rnnDesc, cudnnRNNClipMode_t clipMode, double lclip, double rclip);
191
+
192
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
193
+ cudnnRNNGetClip_v8(cudnnRNNDescriptor_t rnnDesc,
194
+ cudnnRNNClipMode_t *clipMode,
195
+ cudnnNanPropagation_t *clipNanOpt,
196
+ double *lclip,
197
+ double *rclip);
198
+
199
+ cudnnStatus_t CUDNNWINAPI
200
+ cudnnRNNGetClip_v9(cudnnRNNDescriptor_t rnnDesc, cudnnRNNClipMode_t *clipMode, double *lclip, double *rclip);
201
+
202
+ cudnnStatus_t CUDNNWINAPI
203
+ cudnnBuildRNNDynamic(cudnnHandle_t handle, cudnnRNNDescriptor_t rnnDesc, int miniBatch);
204
+
205
+ cudnnStatus_t CUDNNWINAPI
206
+ cudnnGetRNNTempSpaceSizes(cudnnHandle_t handle,
207
+ cudnnRNNDescriptor_t rnnDesc,
208
+ cudnnForwardMode_t fwdMode,
209
+ cudnnRNNDataDescriptor_t xDesc,
210
+ size_t *workSpaceSize,
211
+ size_t *reserveSpaceSize);
212
+
213
+ cudnnStatus_t CUDNNWINAPI
214
+ cudnnGetRNNWeightSpaceSize(cudnnHandle_t handle, cudnnRNNDescriptor_t rnnDesc, size_t *weightSpaceSize);
215
+
216
+ cudnnStatus_t CUDNNWINAPI
217
+ cudnnGetRNNWeightParams(cudnnHandle_t handle,
218
+ cudnnRNNDescriptor_t rnnDesc,
219
+ int32_t pseudoLayer,
220
+ size_t weightSpaceSize,
221
+ const void *weightSpace,
222
+ int32_t linLayerID,
223
+ cudnnTensorDescriptor_t mDesc,
224
+ void **mAddr,
225
+ cudnnTensorDescriptor_t bDesc,
226
+ void **bAddr);
227
+
228
+ cudnnStatus_t CUDNNWINAPI
229
+ cudnnCreateRNNDataDescriptor(cudnnRNNDataDescriptor_t *rnnDataDesc);
230
+
231
+ cudnnStatus_t CUDNNWINAPI
232
+ cudnnDestroyRNNDataDescriptor(cudnnRNNDataDescriptor_t rnnDataDesc);
233
+
234
+ cudnnStatus_t CUDNNWINAPI
235
+ cudnnSetRNNDataDescriptor(cudnnRNNDataDescriptor_t rnnDataDesc,
236
+ cudnnDataType_t dataType,
237
+ cudnnRNNDataLayout_t layout,
238
+ int maxSeqLength,
239
+ int batchSize,
240
+ int vectorSize,
241
+ const int seqLengthArray[], /* length of each sequence in the batch */
242
+ void *paddingFill); /* symbol for filling padding position in output */
243
+
244
+ cudnnStatus_t CUDNNWINAPI
245
+ cudnnGetRNNDataDescriptor(cudnnRNNDataDescriptor_t rnnDataDesc,
246
+ cudnnDataType_t *dataType,
247
+ cudnnRNNDataLayout_t *layout,
248
+ int *maxSeqLength,
249
+ int *batchSize,
250
+ int *vectorSize,
251
+ int arrayLengthRequested,
252
+ int seqLengthArray[],
253
+ void *paddingFill);
254
+
255
+ cudnnStatus_t CUDNNWINAPI
256
+ cudnnRNNForward(cudnnHandle_t handle,
257
+ cudnnRNNDescriptor_t rnnDesc,
258
+ cudnnForwardMode_t fwdMode,
259
+ const int32_t devSeqLengths[],
260
+ cudnnRNNDataDescriptor_t xDesc,
261
+ const void *x,
262
+ cudnnRNNDataDescriptor_t yDesc,
263
+ void *y,
264
+ cudnnTensorDescriptor_t hDesc,
265
+ const void *hx,
266
+ void *hy,
267
+ cudnnTensorDescriptor_t cDesc,
268
+ const void *cx,
269
+ void *cy,
270
+ size_t weightSpaceSize,
271
+ const void *weightSpace,
272
+ size_t workSpaceSize,
273
+ void *workSpace,
274
+ size_t reserveSpaceSize,
275
+ void *reserveSpace);
276
+
277
+ /* Sequence data descriptor */
278
+
279
+ typedef enum {
280
+ CUDNN_SEQDATA_TIME_DIM = 0, /* index in time */
281
+ CUDNN_SEQDATA_BATCH_DIM = 1, /* index in batch */
282
+ CUDNN_SEQDATA_BEAM_DIM = 2, /* index in beam */
283
+ CUDNN_SEQDATA_VECT_DIM = 3 /* index in vector */
284
+ } cudnnSeqDataAxis_t;
285
+
286
+ struct cudnnSeqDataStruct;
287
+ typedef struct cudnnSeqDataStruct *cudnnSeqDataDescriptor_t CUDNN_DEPRECATED;
288
+
289
+ #define CUDNN_SEQDATA_DIM_COUNT 4 /* dimension count */
290
+
291
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
292
+ cudnnCreateSeqDataDescriptor(cudnnSeqDataDescriptor_t *seqDataDesc);
293
+
294
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
295
+ cudnnDestroySeqDataDescriptor(cudnnSeqDataDescriptor_t seqDataDesc);
296
+
297
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
298
+ cudnnSetSeqDataDescriptor(cudnnSeqDataDescriptor_t seqDataDesc,
299
+ cudnnDataType_t dataType,
300
+ int nbDims,
301
+ const int dimA[],
302
+ const cudnnSeqDataAxis_t axes[],
303
+ size_t seqLengthArraySize,
304
+ const int seqLengthArray[],
305
+ void *paddingFill);
306
+
307
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
308
+ cudnnGetSeqDataDescriptor(const cudnnSeqDataDescriptor_t seqDataDesc,
309
+ cudnnDataType_t *dataType,
310
+ int *nbDims,
311
+ int nbDimsRequested,
312
+ int dimA[],
313
+ cudnnSeqDataAxis_t axes[],
314
+ size_t *seqLengthArraySize,
315
+ size_t seqLengthSizeRequested,
316
+ int seqLengthArray[],
317
+ void *paddingFill);
318
+
319
+ /* Multihead Attention */
320
+
321
+ /*
322
+ * Multi-head attention options passed via 'attnMode' in cudnnSetAttnDescriptor().
323
+ * Use the bitwise OR operator to combine several settings listed below. Additional
324
+ * minor options can be added here w/o changing or introducing new API functions.
325
+ */
326
+ #define CUDNN_ATTN_QUERYMAP_ALL_TO_ONE 0 /* multiple Q-s map to a single (K,V) set when beam size > 1 */
327
+ #define CUDNN_ATTN_QUERYMAP_ONE_TO_ONE (1U << 0) /* multiple Q-s map to multiple (K,V) sets when beam size > 1 */
328
+ #define CUDNN_ATTN_DISABLE_PROJ_BIASES 0 /* no biases in attention input and output projections */
329
+ #define CUDNN_ATTN_ENABLE_PROJ_BIASES (1U << 1) /* use biases in attention input and output projections */
330
+
331
+ struct cudnnAttnStruct;
332
+ typedef struct cudnnAttnStruct *cudnnAttnDescriptor_t CUDNN_DEPRECATED;
333
+
334
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
335
+ cudnnCreateAttnDescriptor(cudnnAttnDescriptor_t *attnDesc);
336
+
337
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
338
+ cudnnDestroyAttnDescriptor(cudnnAttnDescriptor_t attnDesc);
339
+
340
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
341
+ cudnnSetAttnDescriptor(cudnnAttnDescriptor_t attnDesc,
342
+ unsigned attnMode,
343
+ int nHeads,
344
+ double smScaler,
345
+ cudnnDataType_t dataType,
346
+ cudnnDataType_t computePrec,
347
+ cudnnMathType_t mathType,
348
+ cudnnDropoutDescriptor_t attnDropoutDesc,
349
+ cudnnDropoutDescriptor_t postDropoutDesc,
350
+ int qSize,
351
+ int kSize,
352
+ int vSize,
353
+ int qProjSize,
354
+ int kProjSize,
355
+ int vProjSize,
356
+ int oProjSize,
357
+ int qoMaxSeqLength,
358
+ int kvMaxSeqLength,
359
+ int maxBatchSize,
360
+ int maxBeamSize);
361
+
362
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
363
+ cudnnGetAttnDescriptor(cudnnAttnDescriptor_t attnDesc,
364
+ unsigned *attnMode,
365
+ int *nHeads,
366
+ double *smScaler,
367
+ cudnnDataType_t *dataType,
368
+ cudnnDataType_t *computePrec,
369
+ cudnnMathType_t *mathType,
370
+ cudnnDropoutDescriptor_t *attnDropoutDesc,
371
+ cudnnDropoutDescriptor_t *postDropoutDesc,
372
+ int *qSize,
373
+ int *kSize,
374
+ int *vSize,
375
+ int *qProjSize,
376
+ int *kProjSize,
377
+ int *vProjSize,
378
+ int *oProjSize,
379
+ int *qoMaxSeqLength,
380
+ int *kvMaxSeqLength,
381
+ int *maxBatchSize,
382
+ int *maxBeamSize);
383
+
384
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
385
+ cudnnGetMultiHeadAttnBuffers(cudnnHandle_t handle,
386
+ const cudnnAttnDescriptor_t attnDesc,
387
+ size_t *weightSizeInBytes,
388
+ size_t *workSpaceSizeInBytes,
389
+ size_t *reserveSpaceSizeInBytes);
390
+
391
+ typedef enum {
392
+ CUDNN_MH_ATTN_Q_WEIGHTS = 0, /* input projection weights for 'queries' */
393
+ CUDNN_MH_ATTN_K_WEIGHTS = 1, /* input projection weights for 'keys' */
394
+ CUDNN_MH_ATTN_V_WEIGHTS = 2, /* input projection weights for 'values' */
395
+ CUDNN_MH_ATTN_O_WEIGHTS = 3, /* output projection weights */
396
+ CUDNN_MH_ATTN_Q_BIASES = 4, /* input projection bias tensor for 'queries' */
397
+ CUDNN_MH_ATTN_K_BIASES = 5, /* input projection bias for 'keys' */
398
+ CUDNN_MH_ATTN_V_BIASES = 6, /* input projection bias for 'values' */
399
+ CUDNN_MH_ATTN_O_BIASES = 7, /* output projection biases */
400
+ } cudnnMultiHeadAttnWeightKind_t;
401
+
402
+ #define CUDNN_ATTN_WKIND_COUNT 8 /* Number of attention weight/bias tensors */
403
+
404
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
405
+ cudnnGetMultiHeadAttnWeights(cudnnHandle_t handle,
406
+ const cudnnAttnDescriptor_t attnDesc,
407
+ cudnnMultiHeadAttnWeightKind_t wKind,
408
+ size_t weightSizeInBytes,
409
+ const void *weights,
410
+ cudnnTensorDescriptor_t wDesc,
411
+ void **wAddr);
412
+
413
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
414
+ cudnnMultiHeadAttnForward(cudnnHandle_t handle,
415
+ const cudnnAttnDescriptor_t attnDesc,
416
+ int currIdx,
417
+ const int loWinIdx[],
418
+ const int hiWinIdx[],
419
+ const int devSeqLengthsQO[],
420
+ const int devSeqLengthsKV[],
421
+ const cudnnSeqDataDescriptor_t qDesc,
422
+ const void *queries,
423
+ const void *residuals,
424
+ const cudnnSeqDataDescriptor_t kDesc,
425
+ const void *keys,
426
+ const cudnnSeqDataDescriptor_t vDesc,
427
+ const void *values,
428
+ const cudnnSeqDataDescriptor_t oDesc,
429
+ void *out,
430
+ size_t weightSizeInBytes,
431
+ const void *weights,
432
+ size_t workSpaceSizeInBytes,
433
+ void *workSpace,
434
+ size_t reserveSpaceSizeInBytes,
435
+ void *reserveSpace);
436
+
437
+ /*
438
+ * \brief Cross-library version checker.
439
+ * This function is implemented differently in each sub-library. Each sublib
440
+ * checks whether its own version matches that of its dependencies.
441
+ * \returns CUDNN_STATUS_SUCCESS if the version check passes,
442
+ * CUDNN_STATUS_SUBLIBRARY_VERSION_MISMATCH if the versions are inconsistent.
443
+ */
444
+ cudnnStatus_t CUDNNWINAPI
445
+ cudnnAdvVersionCheck(void);
446
+
447
+ typedef enum {
448
+ CUDNN_WGRAD_MODE_ADD = 0, /* add partial gradients to wgrad output buffers */
449
+ CUDNN_WGRAD_MODE_SET = 1, /* write partial gradients to wgrad output buffers */
450
+ } cudnnWgradMode_t;
451
+
452
+ cudnnStatus_t CUDNNWINAPI
453
+ cudnnRNNBackwardData_v8(cudnnHandle_t handle,
454
+ cudnnRNNDescriptor_t rnnDesc,
455
+ const int32_t devSeqLengths[],
456
+ cudnnRNNDataDescriptor_t yDesc,
457
+ const void *y,
458
+ const void *dy,
459
+ cudnnRNNDataDescriptor_t xDesc,
460
+ void *dx,
461
+ cudnnTensorDescriptor_t hDesc,
462
+ const void *hx,
463
+ const void *dhy,
464
+ void *dhx,
465
+ cudnnTensorDescriptor_t cDesc,
466
+ const void *cx,
467
+ const void *dcy,
468
+ void *dcx,
469
+ size_t weightSpaceSize,
470
+ const void *weightSpace,
471
+ size_t workSpaceSize,
472
+ void *workSpace,
473
+ size_t reserveSpaceSize,
474
+ void *reserveSpace);
475
+
476
+ cudnnStatus_t CUDNNWINAPI
477
+ cudnnRNNBackwardWeights_v8(cudnnHandle_t handle,
478
+ cudnnRNNDescriptor_t rnnDesc,
479
+ cudnnWgradMode_t addGrad,
480
+ const int32_t devSeqLengths[],
481
+ cudnnRNNDataDescriptor_t xDesc,
482
+ const void *x,
483
+ cudnnTensorDescriptor_t hDesc,
484
+ const void *hx,
485
+ cudnnRNNDataDescriptor_t yDesc,
486
+ const void *y,
487
+ size_t weightSpaceSize,
488
+ void *dweightSpace,
489
+ size_t workSpaceSize,
490
+ void *workSpace,
491
+ size_t reserveSpaceSize,
492
+ void *reserveSpace);
493
+
494
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
495
+ cudnnMultiHeadAttnBackwardData(cudnnHandle_t handle,
496
+ const cudnnAttnDescriptor_t attnDesc,
497
+ const int loWinIdx[],
498
+ const int hiWinIdx[],
499
+ const int devSeqLengthsDQDO[],
500
+ const int devSeqLengthsDKDV[],
501
+ const cudnnSeqDataDescriptor_t doDesc,
502
+ const void *dout,
503
+ const cudnnSeqDataDescriptor_t dqDesc,
504
+ void *dqueries,
505
+ const void *queries,
506
+ const cudnnSeqDataDescriptor_t dkDesc,
507
+ void *dkeys,
508
+ const void *keys,
509
+ const cudnnSeqDataDescriptor_t dvDesc,
510
+ void *dvalues,
511
+ const void *values,
512
+ size_t weightSizeInBytes,
513
+ const void *weights,
514
+ size_t workSpaceSizeInBytes,
515
+ void *workSpace,
516
+ size_t reserveSpaceSizeInBytes,
517
+ void *reserveSpace);
518
+
519
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
520
+ cudnnMultiHeadAttnBackwardWeights(cudnnHandle_t handle,
521
+ const cudnnAttnDescriptor_t attnDesc,
522
+ cudnnWgradMode_t addGrad,
523
+ const cudnnSeqDataDescriptor_t qDesc,
524
+ const void *queries,
525
+ const cudnnSeqDataDescriptor_t kDesc,
526
+ const void *keys,
527
+ const cudnnSeqDataDescriptor_t vDesc,
528
+ const void *values,
529
+ const cudnnSeqDataDescriptor_t doDesc,
530
+ const void *dout,
531
+ size_t weightSizeInBytes,
532
+ const void *weights,
533
+ void *dweights,
534
+ size_t workSpaceSizeInBytes,
535
+ void *workSpace,
536
+ size_t reserveSpaceSizeInBytes,
537
+ void *reserveSpace);
538
+
539
+ /*
540
+ * CTC (Connectionist Temporal Classification) loss descriptor create/destory/set/get functions
541
+ */
542
+ /* Input normalization mode for loss function */
543
+ typedef enum {
544
+ CUDNN_LOSS_NORMALIZATION_NONE = 0,
545
+ CUDNN_LOSS_NORMALIZATION_SOFTMAX = 1,
546
+ } cudnnLossNormalizationMode_t;
547
+
548
+ cudnnStatus_t CUDNNWINAPI
549
+ cudnnCreateCTCLossDescriptor(cudnnCTCLossDescriptor_t *ctcLossDesc);
550
+
551
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
552
+ cudnnSetCTCLossDescriptor(cudnnCTCLossDescriptor_t ctcLossDesc, cudnnDataType_t compType);
553
+
554
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
555
+ cudnnSetCTCLossDescriptorEx(cudnnCTCLossDescriptor_t ctcLossDesc,
556
+ cudnnDataType_t compType,
557
+ cudnnLossNormalizationMode_t normMode,
558
+ cudnnNanPropagation_t gradMode);
559
+
560
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
561
+ cudnnSetCTCLossDescriptor_v8(cudnnCTCLossDescriptor_t ctcLossDesc,
562
+ cudnnDataType_t compType,
563
+ cudnnLossNormalizationMode_t normMode,
564
+ cudnnNanPropagation_t gradMode,
565
+ int maxLabelLength);
566
+
567
+ cudnnStatus_t CUDNNWINAPI
568
+ cudnnSetCTCLossDescriptor_v9(cudnnCTCLossDescriptor_t ctcLossDesc,
569
+ cudnnDataType_t compType,
570
+ cudnnLossNormalizationMode_t normMode,
571
+ cudnnCTCGradMode_t ctcGradMode,
572
+ int maxLabelLength);
573
+
574
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
575
+ cudnnGetCTCLossDescriptor(cudnnCTCLossDescriptor_t ctcLossDesc, cudnnDataType_t *compType);
576
+
577
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
578
+ cudnnGetCTCLossDescriptorEx(cudnnCTCLossDescriptor_t ctcLossDesc,
579
+ cudnnDataType_t *compType,
580
+ cudnnLossNormalizationMode_t *normMode,
581
+ cudnnNanPropagation_t *gradMode);
582
+
583
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
584
+ cudnnGetCTCLossDescriptor_v8(cudnnCTCLossDescriptor_t ctcLossDesc,
585
+ cudnnDataType_t *compType,
586
+ cudnnLossNormalizationMode_t *normMode,
587
+ cudnnNanPropagation_t *gradMode,
588
+ int *maxLabelLength);
589
+
590
+ cudnnStatus_t CUDNNWINAPI
591
+ cudnnGetCTCLossDescriptor_v9(cudnnCTCLossDescriptor_t ctcLossDesc,
592
+ cudnnDataType_t *compType,
593
+ cudnnLossNormalizationMode_t *normMode,
594
+ cudnnCTCGradMode_t *ctcGradMode,
595
+ int *maxLabelLength);
596
+
597
+ cudnnStatus_t CUDNNWINAPI
598
+ cudnnDestroyCTCLossDescriptor(cudnnCTCLossDescriptor_t ctcLossDesc);
599
+
600
+ /* return the ctc costs and gradients, given the probabilities and labels */
601
+ cudnnStatus_t CUDNNWINAPI
602
+ cudnnCTCLoss(
603
+ cudnnHandle_t handle,
604
+ const cudnnTensorDescriptor_t probsDesc, /* Tensor descriptor for probabilities, the dimensions are T,N,A (T is the
605
+ timing steps, N is the mini batch size, A is the alphabet size) */
606
+ const void *probs, /* probabilities after softmax, in GPU memory */
607
+ const int hostLabels[], /* labels, in CPU memory */
608
+ const int hostLabelLengths[], /* the length of each label, in CPU memory */
609
+ const int hostInputLengths[], /* the lengths of timing steps in each batch, in CPU memory */
610
+ void *costs, /* the returned costs of CTC, in GPU memory */
611
+ const cudnnTensorDescriptor_t gradientsDesc, /* Tensor descriptor for gradients, the dimensions are T,N,A */
612
+ void *gradients, /* the returned CTC gradients, in GPU memory, to compute costs only, set it to NULL */
613
+ cudnnCTCLossAlgo_t algo, /* algorithm selected, supported now 0 and 1 */
614
+ cudnnCTCLossDescriptor_t ctcLossDesc,
615
+ void *workspace, /* pointer to the workspace, in GPU memory */
616
+ size_t workSpaceSizeInBytes); /* size of the workspace */
617
+
618
+ /* return the ctc costs and gradients, given the probabilities and labels */
619
+ cudnnStatus_t CUDNNWINAPI
620
+ cudnnCTCLoss_v8(
621
+ cudnnHandle_t handle,
622
+ cudnnCTCLossAlgo_t algo, /* algorithm selected, supported now 0 and 1 */
623
+ cudnnCTCLossDescriptor_t ctcLossDesc,
624
+ const cudnnTensorDescriptor_t probsDesc, /* Tensor descriptor for probabilities, the dimensions are T,N,A (T is the
625
+ timing steps, N is the mini batch size, A is the alphabet size) */
626
+ const void *probs, /* probabilities after softmax, in GPU memory */
627
+ const int labels[], /* labels, in GPU memory */
628
+ const int labelLengths[], /* the length of each label, in GPU memory */
629
+ const int inputLengths[], /* the lengths of timing steps in each batch, in GPU memory */
630
+ void *costs, /* the returned costs of CTC, in GPU memory */
631
+ const cudnnTensorDescriptor_t gradientsDesc, /* Tensor descriptor for gradients, the dimensions are T,N,A */
632
+ void *gradients, /* the returned CTC gradients, in GPU memory, to compute costs only, set it to NULL */
633
+ size_t workSpaceSizeInBytes, /* size of the workspace */
634
+ void *workspace); /* pointer to the workspace, in GPU memory */
635
+
636
+ /* return the workspace size needed for ctc */
637
+ cudnnStatus_t CUDNNWINAPI
638
+ cudnnGetCTCLossWorkspaceSize(
639
+ cudnnHandle_t handle,
640
+ const cudnnTensorDescriptor_t probsDesc, /* Tensor descriptor for probabilities, the dimensions are T,N,A (T is the
641
+ timing steps, N is the mini batch size, A is the alphabet size) */
642
+ const cudnnTensorDescriptor_t gradientsDesc, /* Tensor descriptor for gradients, the
643
+ dimensions are T,N,A. To compute costs
644
+ only, set it to NULL */
645
+ const int *labels, /* labels, in CPU memory */
646
+ const int *labelLengths, /* the length of each label, in CPU memory */
647
+ const int *inputLengths, /* the lengths of timing steps in each batch, in CPU memory */
648
+ cudnnCTCLossAlgo_t algo, /* algorithm selected, supported now 0 and 1 */
649
+ cudnnCTCLossDescriptor_t ctcLossDesc,
650
+ size_t *sizeInBytes); /* pointer to the returned workspace size */
651
+
652
+ /* return the workspace size needed for ctc */
653
+ cudnnStatus_t CUDNNWINAPI
654
+ cudnnGetCTCLossWorkspaceSize_v8(
655
+ cudnnHandle_t handle,
656
+ cudnnCTCLossAlgo_t algo, /* algorithm selected, supported now 0 and 1 */
657
+ cudnnCTCLossDescriptor_t ctcLossDesc,
658
+ const cudnnTensorDescriptor_t probsDesc, /* Tensor descriptor for probabilities, the dimensions are T,N,A (T is the
659
+ timing steps, N is the mini batch size, A is the alphabet size) */
660
+ const cudnnTensorDescriptor_t gradientsDesc, /* Tensor descriptor for gradients, the
661
+ dimensions are T,N,A. To compute costs
662
+ only, set it to NULL */
663
+ size_t *sizeInBytes); /* pointer to the returned workspace size */
664
+
665
+ #if defined(__cplusplus)
666
+ }
667
+ #endif
668
+
669
+ #endif /* CUDNN_ADV_H_ */
.venv/lib/python3.12/site-packages/nvidia/cudnn/include/cudnn_adv_v9.h ADDED
@@ -0,0 +1,669 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * Copyright 2014-2023 NVIDIA Corporation. All rights reserved.
3
+ *
4
+ * NOTICE TO LICENSEE:
5
+ *
6
+ * This source code and/or documentation ("Licensed Deliverables") are
7
+ * subject to NVIDIA intellectual property rights under U.S. and
8
+ * international Copyright laws.
9
+ *
10
+ * These Licensed Deliverables contained herein is PROPRIETARY and
11
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
12
+ * conditions of a form of NVIDIA software license agreement by and
13
+ * between NVIDIA and Licensee ("License Agreement") or electronically
14
+ * accepted by Licensee. Notwithstanding any terms or conditions to
15
+ * the contrary in the License Agreement, reproduction or disclosure
16
+ * of the Licensed Deliverables to any third party without the express
17
+ * written consent of NVIDIA is prohibited.
18
+ *
19
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
20
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
21
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
22
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
23
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
24
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
25
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
26
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
27
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
28
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
29
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
30
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
31
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
32
+ * OF THESE LICENSED DELIVERABLES.
33
+ *
34
+ * U.S. Government End Users. These Licensed Deliverables are a
35
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
36
+ * 1995), consisting of "commercial computer software" and "commercial
37
+ * computer software documentation" as such terms are used in 48
38
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
39
+ * only as a commercial end item. Consistent with 48 C.F.R.12.212 and
40
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
41
+ * U.S. Government End Users acquire the Licensed Deliverables with
42
+ * only those rights set forth herein.
43
+ *
44
+ * Any use of the Licensed Deliverables in individual and commercial
45
+ * software must include, in the user documentation and internal
46
+ * comments to the code, the above Disclaimer and U.S. Government End
47
+ * Users Notice.
48
+ */
49
+
50
+ /* cudnn_adv : cuDNN's advanced and experimental features.
51
+
52
+ */
53
+
54
+ #if !defined(CUDNN_ADV_H_)
55
+ #define CUDNN_ADV_H_
56
+
57
+ #include <stdint.h>
58
+
59
+ #include "cudnn_version.h"
60
+ #include "cudnn_ops.h"
61
+
62
+ /* These version numbers are autogenerated, do not edit manually. */
63
+ #define CUDNN_ADV_MAJOR 9
64
+ #define CUDNN_ADV_MINOR 10
65
+ #define CUDNN_ADV_PATCH 2
66
+
67
+ #if (CUDNN_ADV_MAJOR != CUDNN_MAJOR) || (CUDNN_ADV_MINOR != CUDNN_MINOR) || (CUDNN_ADV_PATCH != CUDNN_PATCHLEVEL)
68
+ #error Version mismatch in cuDNN ADV INFER!!!
69
+ #endif
70
+
71
+ #if defined(__cplusplus)
72
+ extern "C" {
73
+ #endif
74
+
75
+ /* BASIC RNN API */
76
+
77
+ typedef enum {
78
+ CUDNN_RNN_ALGO_STANDARD = 0,
79
+ CUDNN_RNN_ALGO_PERSIST_STATIC = 1,
80
+ CUDNN_RNN_ALGO_PERSIST_DYNAMIC = 2,
81
+ CUDNN_RNN_ALGO_PERSIST_STATIC_SMALL_H = 3,
82
+ CUDNN_RNN_ALGO_COUNT = 4,
83
+ } cudnnRNNAlgo_t;
84
+
85
+ typedef enum {
86
+ CUDNN_FWD_MODE_INFERENCE = 0,
87
+ CUDNN_FWD_MODE_TRAINING = 1,
88
+ } cudnnForwardMode_t;
89
+
90
+ typedef enum {
91
+ CUDNN_RNN_RELU = 0, /* basic RNN cell type with ReLu activation */
92
+ CUDNN_RNN_TANH = 1, /* basic RNN cell type with tanh activation */
93
+ CUDNN_LSTM = 2, /* LSTM with optional recurrent projection and clipping */
94
+ CUDNN_GRU = 3, /* Using h' = tanh(r * Uh(t-1) + Wx) and h = (1 - z) * h' + z * h(t-1); */
95
+ } cudnnRNNMode_t;
96
+
97
+ typedef enum {
98
+ CUDNN_RNN_NO_BIAS = 0, /* rnn cell formulas do not use biases */
99
+ CUDNN_RNN_SINGLE_INP_BIAS = 1, /* rnn cell formulas use one input bias in input GEMM */
100
+ CUDNN_RNN_DOUBLE_BIAS = 2, /* default, rnn cell formulas use two bias vectors */
101
+ CUDNN_RNN_SINGLE_REC_BIAS = 3 /* rnn cell formulas use one recurrent bias in recurrent GEMM */
102
+ } cudnnRNNBiasMode_t;
103
+
104
+ typedef enum {
105
+ CUDNN_UNIDIRECTIONAL = 0, /* single direction network */
106
+ CUDNN_BIDIRECTIONAL = 1, /* output concatination at each layer */
107
+ } cudnnDirectionMode_t;
108
+
109
+ typedef enum {
110
+ CUDNN_LINEAR_INPUT = 0, /* adjustable weight matrix in first layer input GEMM */
111
+ CUDNN_SKIP_INPUT = 1, /* fixed identity matrix in the first layer input GEMM */
112
+ } cudnnRNNInputMode_t;
113
+
114
+ typedef enum {
115
+ CUDNN_RNN_CLIP_NONE = 0, /* disables LSTM cell clipping */
116
+ CUDNN_RNN_CLIP_MINMAX = 1, /* enables LSTM cell clipping */
117
+ } cudnnRNNClipMode_t;
118
+
119
+ typedef enum {
120
+ CUDNN_RNN_DATA_LAYOUT_SEQ_MAJOR_UNPACKED = 0, /* padded, outer stride from one time-step to the next */
121
+ CUDNN_RNN_DATA_LAYOUT_SEQ_MAJOR_PACKED = 1, /* sequence length sorted and packed as in basic RNN api */
122
+ CUDNN_RNN_DATA_LAYOUT_BATCH_MAJOR_UNPACKED = 2, /* padded, outer stride from one batch to the next */
123
+ } cudnnRNNDataLayout_t;
124
+
125
+ /* For auxFlags in cudnnSetRNNDescriptor_v8() */
126
+ #define CUDNN_RNN_PADDED_IO_DISABLED 0
127
+ #define CUDNN_RNN_PADDED_IO_ENABLED (1U << 0)
128
+
129
+ struct cudnnRNNStruct;
130
+ typedef struct cudnnRNNStruct *cudnnRNNDescriptor_t;
131
+
132
+ struct cudnnRNNDataStruct;
133
+ typedef struct cudnnRNNDataStruct *cudnnRNNDataDescriptor_t;
134
+
135
+ cudnnStatus_t CUDNNWINAPI
136
+ cudnnCreateRNNDescriptor(cudnnRNNDescriptor_t *rnnDesc);
137
+
138
+ cudnnStatus_t CUDNNWINAPI
139
+ cudnnDestroyRNNDescriptor(cudnnRNNDescriptor_t rnnDesc);
140
+
141
+ /*
142
+ * mathPrec in cudnnSetRNNDescriptor_v8() specifies compute precision.
143
+ * Compute precision is further modified by mathType that sets the
144
+ * preferred option for using NVIDIA Tensor Cores. dataType specify
145
+ * input/output data type and weight/bias type.
146
+ */
147
+
148
+ cudnnStatus_t CUDNNWINAPI
149
+ cudnnSetRNNDescriptor_v8(cudnnRNNDescriptor_t rnnDesc,
150
+ cudnnRNNAlgo_t algo,
151
+ cudnnRNNMode_t cellMode,
152
+ cudnnRNNBiasMode_t biasMode,
153
+ cudnnDirectionMode_t dirMode,
154
+ cudnnRNNInputMode_t inputMode,
155
+ cudnnDataType_t dataType,
156
+ cudnnDataType_t mathPrec,
157
+ cudnnMathType_t mathType,
158
+ int32_t inputSize,
159
+ int32_t hiddenSize,
160
+ int32_t projSize,
161
+ int32_t numLayers,
162
+ cudnnDropoutDescriptor_t dropoutDesc,
163
+ uint32_t auxFlags);
164
+
165
+ cudnnStatus_t CUDNNWINAPI
166
+ cudnnGetRNNDescriptor_v8(cudnnRNNDescriptor_t rnnDesc,
167
+ cudnnRNNAlgo_t *algo,
168
+ cudnnRNNMode_t *cellMode,
169
+ cudnnRNNBiasMode_t *biasMode,
170
+ cudnnDirectionMode_t *dirMode,
171
+ cudnnRNNInputMode_t *inputMode,
172
+ cudnnDataType_t *dataType,
173
+ cudnnDataType_t *mathPrec,
174
+ cudnnMathType_t *mathType,
175
+ int32_t *inputSize,
176
+ int32_t *hiddenSize,
177
+ int32_t *projSize,
178
+ int32_t *numLayers,
179
+ cudnnDropoutDescriptor_t *dropoutDesc,
180
+ uint32_t *auxFlags);
181
+
182
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
183
+ cudnnRNNSetClip_v8(cudnnRNNDescriptor_t rnnDesc,
184
+ cudnnRNNClipMode_t clipMode,
185
+ cudnnNanPropagation_t clipNanOpt,
186
+ double lclip,
187
+ double rclip);
188
+
189
+ cudnnStatus_t CUDNNWINAPI
190
+ cudnnRNNSetClip_v9(cudnnRNNDescriptor_t rnnDesc, cudnnRNNClipMode_t clipMode, double lclip, double rclip);
191
+
192
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
193
+ cudnnRNNGetClip_v8(cudnnRNNDescriptor_t rnnDesc,
194
+ cudnnRNNClipMode_t *clipMode,
195
+ cudnnNanPropagation_t *clipNanOpt,
196
+ double *lclip,
197
+ double *rclip);
198
+
199
+ cudnnStatus_t CUDNNWINAPI
200
+ cudnnRNNGetClip_v9(cudnnRNNDescriptor_t rnnDesc, cudnnRNNClipMode_t *clipMode, double *lclip, double *rclip);
201
+
202
+ cudnnStatus_t CUDNNWINAPI
203
+ cudnnBuildRNNDynamic(cudnnHandle_t handle, cudnnRNNDescriptor_t rnnDesc, int miniBatch);
204
+
205
+ cudnnStatus_t CUDNNWINAPI
206
+ cudnnGetRNNTempSpaceSizes(cudnnHandle_t handle,
207
+ cudnnRNNDescriptor_t rnnDesc,
208
+ cudnnForwardMode_t fwdMode,
209
+ cudnnRNNDataDescriptor_t xDesc,
210
+ size_t *workSpaceSize,
211
+ size_t *reserveSpaceSize);
212
+
213
+ cudnnStatus_t CUDNNWINAPI
214
+ cudnnGetRNNWeightSpaceSize(cudnnHandle_t handle, cudnnRNNDescriptor_t rnnDesc, size_t *weightSpaceSize);
215
+
216
+ cudnnStatus_t CUDNNWINAPI
217
+ cudnnGetRNNWeightParams(cudnnHandle_t handle,
218
+ cudnnRNNDescriptor_t rnnDesc,
219
+ int32_t pseudoLayer,
220
+ size_t weightSpaceSize,
221
+ const void *weightSpace,
222
+ int32_t linLayerID,
223
+ cudnnTensorDescriptor_t mDesc,
224
+ void **mAddr,
225
+ cudnnTensorDescriptor_t bDesc,
226
+ void **bAddr);
227
+
228
+ cudnnStatus_t CUDNNWINAPI
229
+ cudnnCreateRNNDataDescriptor(cudnnRNNDataDescriptor_t *rnnDataDesc);
230
+
231
+ cudnnStatus_t CUDNNWINAPI
232
+ cudnnDestroyRNNDataDescriptor(cudnnRNNDataDescriptor_t rnnDataDesc);
233
+
234
+ cudnnStatus_t CUDNNWINAPI
235
+ cudnnSetRNNDataDescriptor(cudnnRNNDataDescriptor_t rnnDataDesc,
236
+ cudnnDataType_t dataType,
237
+ cudnnRNNDataLayout_t layout,
238
+ int maxSeqLength,
239
+ int batchSize,
240
+ int vectorSize,
241
+ const int seqLengthArray[], /* length of each sequence in the batch */
242
+ void *paddingFill); /* symbol for filling padding position in output */
243
+
244
+ cudnnStatus_t CUDNNWINAPI
245
+ cudnnGetRNNDataDescriptor(cudnnRNNDataDescriptor_t rnnDataDesc,
246
+ cudnnDataType_t *dataType,
247
+ cudnnRNNDataLayout_t *layout,
248
+ int *maxSeqLength,
249
+ int *batchSize,
250
+ int *vectorSize,
251
+ int arrayLengthRequested,
252
+ int seqLengthArray[],
253
+ void *paddingFill);
254
+
255
+ cudnnStatus_t CUDNNWINAPI
256
+ cudnnRNNForward(cudnnHandle_t handle,
257
+ cudnnRNNDescriptor_t rnnDesc,
258
+ cudnnForwardMode_t fwdMode,
259
+ const int32_t devSeqLengths[],
260
+ cudnnRNNDataDescriptor_t xDesc,
261
+ const void *x,
262
+ cudnnRNNDataDescriptor_t yDesc,
263
+ void *y,
264
+ cudnnTensorDescriptor_t hDesc,
265
+ const void *hx,
266
+ void *hy,
267
+ cudnnTensorDescriptor_t cDesc,
268
+ const void *cx,
269
+ void *cy,
270
+ size_t weightSpaceSize,
271
+ const void *weightSpace,
272
+ size_t workSpaceSize,
273
+ void *workSpace,
274
+ size_t reserveSpaceSize,
275
+ void *reserveSpace);
276
+
277
+ /* Sequence data descriptor */
278
+
279
+ typedef enum {
280
+ CUDNN_SEQDATA_TIME_DIM = 0, /* index in time */
281
+ CUDNN_SEQDATA_BATCH_DIM = 1, /* index in batch */
282
+ CUDNN_SEQDATA_BEAM_DIM = 2, /* index in beam */
283
+ CUDNN_SEQDATA_VECT_DIM = 3 /* index in vector */
284
+ } cudnnSeqDataAxis_t;
285
+
286
+ struct cudnnSeqDataStruct;
287
+ typedef struct cudnnSeqDataStruct *cudnnSeqDataDescriptor_t CUDNN_DEPRECATED;
288
+
289
+ #define CUDNN_SEQDATA_DIM_COUNT 4 /* dimension count */
290
+
291
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
292
+ cudnnCreateSeqDataDescriptor(cudnnSeqDataDescriptor_t *seqDataDesc);
293
+
294
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
295
+ cudnnDestroySeqDataDescriptor(cudnnSeqDataDescriptor_t seqDataDesc);
296
+
297
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
298
+ cudnnSetSeqDataDescriptor(cudnnSeqDataDescriptor_t seqDataDesc,
299
+ cudnnDataType_t dataType,
300
+ int nbDims,
301
+ const int dimA[],
302
+ const cudnnSeqDataAxis_t axes[],
303
+ size_t seqLengthArraySize,
304
+ const int seqLengthArray[],
305
+ void *paddingFill);
306
+
307
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
308
+ cudnnGetSeqDataDescriptor(const cudnnSeqDataDescriptor_t seqDataDesc,
309
+ cudnnDataType_t *dataType,
310
+ int *nbDims,
311
+ int nbDimsRequested,
312
+ int dimA[],
313
+ cudnnSeqDataAxis_t axes[],
314
+ size_t *seqLengthArraySize,
315
+ size_t seqLengthSizeRequested,
316
+ int seqLengthArray[],
317
+ void *paddingFill);
318
+
319
+ /* Multihead Attention */
320
+
321
+ /*
322
+ * Multi-head attention options passed via 'attnMode' in cudnnSetAttnDescriptor().
323
+ * Use the bitwise OR operator to combine several settings listed below. Additional
324
+ * minor options can be added here w/o changing or introducing new API functions.
325
+ */
326
+ #define CUDNN_ATTN_QUERYMAP_ALL_TO_ONE 0 /* multiple Q-s map to a single (K,V) set when beam size > 1 */
327
+ #define CUDNN_ATTN_QUERYMAP_ONE_TO_ONE (1U << 0) /* multiple Q-s map to multiple (K,V) sets when beam size > 1 */
328
+ #define CUDNN_ATTN_DISABLE_PROJ_BIASES 0 /* no biases in attention input and output projections */
329
+ #define CUDNN_ATTN_ENABLE_PROJ_BIASES (1U << 1) /* use biases in attention input and output projections */
330
+
331
+ struct cudnnAttnStruct;
332
+ typedef struct cudnnAttnStruct *cudnnAttnDescriptor_t CUDNN_DEPRECATED;
333
+
334
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
335
+ cudnnCreateAttnDescriptor(cudnnAttnDescriptor_t *attnDesc);
336
+
337
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
338
+ cudnnDestroyAttnDescriptor(cudnnAttnDescriptor_t attnDesc);
339
+
340
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
341
+ cudnnSetAttnDescriptor(cudnnAttnDescriptor_t attnDesc,
342
+ unsigned attnMode,
343
+ int nHeads,
344
+ double smScaler,
345
+ cudnnDataType_t dataType,
346
+ cudnnDataType_t computePrec,
347
+ cudnnMathType_t mathType,
348
+ cudnnDropoutDescriptor_t attnDropoutDesc,
349
+ cudnnDropoutDescriptor_t postDropoutDesc,
350
+ int qSize,
351
+ int kSize,
352
+ int vSize,
353
+ int qProjSize,
354
+ int kProjSize,
355
+ int vProjSize,
356
+ int oProjSize,
357
+ int qoMaxSeqLength,
358
+ int kvMaxSeqLength,
359
+ int maxBatchSize,
360
+ int maxBeamSize);
361
+
362
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
363
+ cudnnGetAttnDescriptor(cudnnAttnDescriptor_t attnDesc,
364
+ unsigned *attnMode,
365
+ int *nHeads,
366
+ double *smScaler,
367
+ cudnnDataType_t *dataType,
368
+ cudnnDataType_t *computePrec,
369
+ cudnnMathType_t *mathType,
370
+ cudnnDropoutDescriptor_t *attnDropoutDesc,
371
+ cudnnDropoutDescriptor_t *postDropoutDesc,
372
+ int *qSize,
373
+ int *kSize,
374
+ int *vSize,
375
+ int *qProjSize,
376
+ int *kProjSize,
377
+ int *vProjSize,
378
+ int *oProjSize,
379
+ int *qoMaxSeqLength,
380
+ int *kvMaxSeqLength,
381
+ int *maxBatchSize,
382
+ int *maxBeamSize);
383
+
384
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
385
+ cudnnGetMultiHeadAttnBuffers(cudnnHandle_t handle,
386
+ const cudnnAttnDescriptor_t attnDesc,
387
+ size_t *weightSizeInBytes,
388
+ size_t *workSpaceSizeInBytes,
389
+ size_t *reserveSpaceSizeInBytes);
390
+
391
+ typedef enum {
392
+ CUDNN_MH_ATTN_Q_WEIGHTS = 0, /* input projection weights for 'queries' */
393
+ CUDNN_MH_ATTN_K_WEIGHTS = 1, /* input projection weights for 'keys' */
394
+ CUDNN_MH_ATTN_V_WEIGHTS = 2, /* input projection weights for 'values' */
395
+ CUDNN_MH_ATTN_O_WEIGHTS = 3, /* output projection weights */
396
+ CUDNN_MH_ATTN_Q_BIASES = 4, /* input projection bias tensor for 'queries' */
397
+ CUDNN_MH_ATTN_K_BIASES = 5, /* input projection bias for 'keys' */
398
+ CUDNN_MH_ATTN_V_BIASES = 6, /* input projection bias for 'values' */
399
+ CUDNN_MH_ATTN_O_BIASES = 7, /* output projection biases */
400
+ } cudnnMultiHeadAttnWeightKind_t;
401
+
402
+ #define CUDNN_ATTN_WKIND_COUNT 8 /* Number of attention weight/bias tensors */
403
+
404
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
405
+ cudnnGetMultiHeadAttnWeights(cudnnHandle_t handle,
406
+ const cudnnAttnDescriptor_t attnDesc,
407
+ cudnnMultiHeadAttnWeightKind_t wKind,
408
+ size_t weightSizeInBytes,
409
+ const void *weights,
410
+ cudnnTensorDescriptor_t wDesc,
411
+ void **wAddr);
412
+
413
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
414
+ cudnnMultiHeadAttnForward(cudnnHandle_t handle,
415
+ const cudnnAttnDescriptor_t attnDesc,
416
+ int currIdx,
417
+ const int loWinIdx[],
418
+ const int hiWinIdx[],
419
+ const int devSeqLengthsQO[],
420
+ const int devSeqLengthsKV[],
421
+ const cudnnSeqDataDescriptor_t qDesc,
422
+ const void *queries,
423
+ const void *residuals,
424
+ const cudnnSeqDataDescriptor_t kDesc,
425
+ const void *keys,
426
+ const cudnnSeqDataDescriptor_t vDesc,
427
+ const void *values,
428
+ const cudnnSeqDataDescriptor_t oDesc,
429
+ void *out,
430
+ size_t weightSizeInBytes,
431
+ const void *weights,
432
+ size_t workSpaceSizeInBytes,
433
+ void *workSpace,
434
+ size_t reserveSpaceSizeInBytes,
435
+ void *reserveSpace);
436
+
437
+ /*
438
+ * \brief Cross-library version checker.
439
+ * This function is implemented differently in each sub-library. Each sublib
440
+ * checks whether its own version matches that of its dependencies.
441
+ * \returns CUDNN_STATUS_SUCCESS if the version check passes,
442
+ * CUDNN_STATUS_SUBLIBRARY_VERSION_MISMATCH if the versions are inconsistent.
443
+ */
444
+ cudnnStatus_t CUDNNWINAPI
445
+ cudnnAdvVersionCheck(void);
446
+
447
+ typedef enum {
448
+ CUDNN_WGRAD_MODE_ADD = 0, /* add partial gradients to wgrad output buffers */
449
+ CUDNN_WGRAD_MODE_SET = 1, /* write partial gradients to wgrad output buffers */
450
+ } cudnnWgradMode_t;
451
+
452
+ cudnnStatus_t CUDNNWINAPI
453
+ cudnnRNNBackwardData_v8(cudnnHandle_t handle,
454
+ cudnnRNNDescriptor_t rnnDesc,
455
+ const int32_t devSeqLengths[],
456
+ cudnnRNNDataDescriptor_t yDesc,
457
+ const void *y,
458
+ const void *dy,
459
+ cudnnRNNDataDescriptor_t xDesc,
460
+ void *dx,
461
+ cudnnTensorDescriptor_t hDesc,
462
+ const void *hx,
463
+ const void *dhy,
464
+ void *dhx,
465
+ cudnnTensorDescriptor_t cDesc,
466
+ const void *cx,
467
+ const void *dcy,
468
+ void *dcx,
469
+ size_t weightSpaceSize,
470
+ const void *weightSpace,
471
+ size_t workSpaceSize,
472
+ void *workSpace,
473
+ size_t reserveSpaceSize,
474
+ void *reserveSpace);
475
+
476
+ cudnnStatus_t CUDNNWINAPI
477
+ cudnnRNNBackwardWeights_v8(cudnnHandle_t handle,
478
+ cudnnRNNDescriptor_t rnnDesc,
479
+ cudnnWgradMode_t addGrad,
480
+ const int32_t devSeqLengths[],
481
+ cudnnRNNDataDescriptor_t xDesc,
482
+ const void *x,
483
+ cudnnTensorDescriptor_t hDesc,
484
+ const void *hx,
485
+ cudnnRNNDataDescriptor_t yDesc,
486
+ const void *y,
487
+ size_t weightSpaceSize,
488
+ void *dweightSpace,
489
+ size_t workSpaceSize,
490
+ void *workSpace,
491
+ size_t reserveSpaceSize,
492
+ void *reserveSpace);
493
+
494
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
495
+ cudnnMultiHeadAttnBackwardData(cudnnHandle_t handle,
496
+ const cudnnAttnDescriptor_t attnDesc,
497
+ const int loWinIdx[],
498
+ const int hiWinIdx[],
499
+ const int devSeqLengthsDQDO[],
500
+ const int devSeqLengthsDKDV[],
501
+ const cudnnSeqDataDescriptor_t doDesc,
502
+ const void *dout,
503
+ const cudnnSeqDataDescriptor_t dqDesc,
504
+ void *dqueries,
505
+ const void *queries,
506
+ const cudnnSeqDataDescriptor_t dkDesc,
507
+ void *dkeys,
508
+ const void *keys,
509
+ const cudnnSeqDataDescriptor_t dvDesc,
510
+ void *dvalues,
511
+ const void *values,
512
+ size_t weightSizeInBytes,
513
+ const void *weights,
514
+ size_t workSpaceSizeInBytes,
515
+ void *workSpace,
516
+ size_t reserveSpaceSizeInBytes,
517
+ void *reserveSpace);
518
+
519
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
520
+ cudnnMultiHeadAttnBackwardWeights(cudnnHandle_t handle,
521
+ const cudnnAttnDescriptor_t attnDesc,
522
+ cudnnWgradMode_t addGrad,
523
+ const cudnnSeqDataDescriptor_t qDesc,
524
+ const void *queries,
525
+ const cudnnSeqDataDescriptor_t kDesc,
526
+ const void *keys,
527
+ const cudnnSeqDataDescriptor_t vDesc,
528
+ const void *values,
529
+ const cudnnSeqDataDescriptor_t doDesc,
530
+ const void *dout,
531
+ size_t weightSizeInBytes,
532
+ const void *weights,
533
+ void *dweights,
534
+ size_t workSpaceSizeInBytes,
535
+ void *workSpace,
536
+ size_t reserveSpaceSizeInBytes,
537
+ void *reserveSpace);
538
+
539
+ /*
540
+ * CTC (Connectionist Temporal Classification) loss descriptor create/destory/set/get functions
541
+ */
542
+ /* Input normalization mode for loss function */
543
+ typedef enum {
544
+ CUDNN_LOSS_NORMALIZATION_NONE = 0,
545
+ CUDNN_LOSS_NORMALIZATION_SOFTMAX = 1,
546
+ } cudnnLossNormalizationMode_t;
547
+
548
+ cudnnStatus_t CUDNNWINAPI
549
+ cudnnCreateCTCLossDescriptor(cudnnCTCLossDescriptor_t *ctcLossDesc);
550
+
551
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
552
+ cudnnSetCTCLossDescriptor(cudnnCTCLossDescriptor_t ctcLossDesc, cudnnDataType_t compType);
553
+
554
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
555
+ cudnnSetCTCLossDescriptorEx(cudnnCTCLossDescriptor_t ctcLossDesc,
556
+ cudnnDataType_t compType,
557
+ cudnnLossNormalizationMode_t normMode,
558
+ cudnnNanPropagation_t gradMode);
559
+
560
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
561
+ cudnnSetCTCLossDescriptor_v8(cudnnCTCLossDescriptor_t ctcLossDesc,
562
+ cudnnDataType_t compType,
563
+ cudnnLossNormalizationMode_t normMode,
564
+ cudnnNanPropagation_t gradMode,
565
+ int maxLabelLength);
566
+
567
+ cudnnStatus_t CUDNNWINAPI
568
+ cudnnSetCTCLossDescriptor_v9(cudnnCTCLossDescriptor_t ctcLossDesc,
569
+ cudnnDataType_t compType,
570
+ cudnnLossNormalizationMode_t normMode,
571
+ cudnnCTCGradMode_t ctcGradMode,
572
+ int maxLabelLength);
573
+
574
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
575
+ cudnnGetCTCLossDescriptor(cudnnCTCLossDescriptor_t ctcLossDesc, cudnnDataType_t *compType);
576
+
577
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
578
+ cudnnGetCTCLossDescriptorEx(cudnnCTCLossDescriptor_t ctcLossDesc,
579
+ cudnnDataType_t *compType,
580
+ cudnnLossNormalizationMode_t *normMode,
581
+ cudnnNanPropagation_t *gradMode);
582
+
583
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
584
+ cudnnGetCTCLossDescriptor_v8(cudnnCTCLossDescriptor_t ctcLossDesc,
585
+ cudnnDataType_t *compType,
586
+ cudnnLossNormalizationMode_t *normMode,
587
+ cudnnNanPropagation_t *gradMode,
588
+ int *maxLabelLength);
589
+
590
+ cudnnStatus_t CUDNNWINAPI
591
+ cudnnGetCTCLossDescriptor_v9(cudnnCTCLossDescriptor_t ctcLossDesc,
592
+ cudnnDataType_t *compType,
593
+ cudnnLossNormalizationMode_t *normMode,
594
+ cudnnCTCGradMode_t *ctcGradMode,
595
+ int *maxLabelLength);
596
+
597
+ cudnnStatus_t CUDNNWINAPI
598
+ cudnnDestroyCTCLossDescriptor(cudnnCTCLossDescriptor_t ctcLossDesc);
599
+
600
+ /* return the ctc costs and gradients, given the probabilities and labels */
601
+ cudnnStatus_t CUDNNWINAPI
602
+ cudnnCTCLoss(
603
+ cudnnHandle_t handle,
604
+ const cudnnTensorDescriptor_t probsDesc, /* Tensor descriptor for probabilities, the dimensions are T,N,A (T is the
605
+ timing steps, N is the mini batch size, A is the alphabet size) */
606
+ const void *probs, /* probabilities after softmax, in GPU memory */
607
+ const int hostLabels[], /* labels, in CPU memory */
608
+ const int hostLabelLengths[], /* the length of each label, in CPU memory */
609
+ const int hostInputLengths[], /* the lengths of timing steps in each batch, in CPU memory */
610
+ void *costs, /* the returned costs of CTC, in GPU memory */
611
+ const cudnnTensorDescriptor_t gradientsDesc, /* Tensor descriptor for gradients, the dimensions are T,N,A */
612
+ void *gradients, /* the returned CTC gradients, in GPU memory, to compute costs only, set it to NULL */
613
+ cudnnCTCLossAlgo_t algo, /* algorithm selected, supported now 0 and 1 */
614
+ cudnnCTCLossDescriptor_t ctcLossDesc,
615
+ void *workspace, /* pointer to the workspace, in GPU memory */
616
+ size_t workSpaceSizeInBytes); /* size of the workspace */
617
+
618
+ /* return the ctc costs and gradients, given the probabilities and labels */
619
+ cudnnStatus_t CUDNNWINAPI
620
+ cudnnCTCLoss_v8(
621
+ cudnnHandle_t handle,
622
+ cudnnCTCLossAlgo_t algo, /* algorithm selected, supported now 0 and 1 */
623
+ cudnnCTCLossDescriptor_t ctcLossDesc,
624
+ const cudnnTensorDescriptor_t probsDesc, /* Tensor descriptor for probabilities, the dimensions are T,N,A (T is the
625
+ timing steps, N is the mini batch size, A is the alphabet size) */
626
+ const void *probs, /* probabilities after softmax, in GPU memory */
627
+ const int labels[], /* labels, in GPU memory */
628
+ const int labelLengths[], /* the length of each label, in GPU memory */
629
+ const int inputLengths[], /* the lengths of timing steps in each batch, in GPU memory */
630
+ void *costs, /* the returned costs of CTC, in GPU memory */
631
+ const cudnnTensorDescriptor_t gradientsDesc, /* Tensor descriptor for gradients, the dimensions are T,N,A */
632
+ void *gradients, /* the returned CTC gradients, in GPU memory, to compute costs only, set it to NULL */
633
+ size_t workSpaceSizeInBytes, /* size of the workspace */
634
+ void *workspace); /* pointer to the workspace, in GPU memory */
635
+
636
+ /* return the workspace size needed for ctc */
637
+ cudnnStatus_t CUDNNWINAPI
638
+ cudnnGetCTCLossWorkspaceSize(
639
+ cudnnHandle_t handle,
640
+ const cudnnTensorDescriptor_t probsDesc, /* Tensor descriptor for probabilities, the dimensions are T,N,A (T is the
641
+ timing steps, N is the mini batch size, A is the alphabet size) */
642
+ const cudnnTensorDescriptor_t gradientsDesc, /* Tensor descriptor for gradients, the
643
+ dimensions are T,N,A. To compute costs
644
+ only, set it to NULL */
645
+ const int *labels, /* labels, in CPU memory */
646
+ const int *labelLengths, /* the length of each label, in CPU memory */
647
+ const int *inputLengths, /* the lengths of timing steps in each batch, in CPU memory */
648
+ cudnnCTCLossAlgo_t algo, /* algorithm selected, supported now 0 and 1 */
649
+ cudnnCTCLossDescriptor_t ctcLossDesc,
650
+ size_t *sizeInBytes); /* pointer to the returned workspace size */
651
+
652
+ /* return the workspace size needed for ctc */
653
+ cudnnStatus_t CUDNNWINAPI
654
+ cudnnGetCTCLossWorkspaceSize_v8(
655
+ cudnnHandle_t handle,
656
+ cudnnCTCLossAlgo_t algo, /* algorithm selected, supported now 0 and 1 */
657
+ cudnnCTCLossDescriptor_t ctcLossDesc,
658
+ const cudnnTensorDescriptor_t probsDesc, /* Tensor descriptor for probabilities, the dimensions are T,N,A (T is the
659
+ timing steps, N is the mini batch size, A is the alphabet size) */
660
+ const cudnnTensorDescriptor_t gradientsDesc, /* Tensor descriptor for gradients, the
661
+ dimensions are T,N,A. To compute costs
662
+ only, set it to NULL */
663
+ size_t *sizeInBytes); /* pointer to the returned workspace size */
664
+
665
+ #if defined(__cplusplus)
666
+ }
667
+ #endif
668
+
669
+ #endif /* CUDNN_ADV_H_ */
.venv/lib/python3.12/site-packages/nvidia/cudnn/include/cudnn_backend.h ADDED
@@ -0,0 +1,60 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * Copyright 2014-2023 NVIDIA Corporation. All rights reserved.
3
+ *
4
+ * NOTICE TO LICENSEE:
5
+ *
6
+ * This source code and/or documentation ("Licensed Deliverables") are
7
+ * subject to NVIDIA intellectual property rights under U.S. and
8
+ * international Copyright laws.
9
+ *
10
+ * These Licensed Deliverables contained herein is PROPRIETARY and
11
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
12
+ * conditions of a form of NVIDIA software license agreement by and
13
+ * between NVIDIA and Licensee ("License Agreement") or electronically
14
+ * accepted by Licensee. Notwithstanding any terms or conditions to
15
+ * the contrary in the License Agreement, reproduction or disclosure
16
+ * of the Licensed Deliverables to any third party without the express
17
+ * written consent of NVIDIA is prohibited.
18
+ *
19
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
20
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
21
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
22
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
23
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
24
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
25
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
26
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
27
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
28
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
29
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
30
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
31
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
32
+ * OF THESE LICENSED DELIVERABLES.
33
+ *
34
+ * U.S. Government End Users. These Licensed Deliverables are a
35
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
36
+ * 1995), consisting of "commercial computer software" and "commercial
37
+ * computer software documentation" as such terms are used in 48
38
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
39
+ * only as a commercial end item. Consistent with 48 C.F.R.12.212 and
40
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
41
+ * U.S. Government End Users acquire the Licensed Deliverables with
42
+ * only those rights set forth herein.
43
+ *
44
+ * Any use of the Licensed Deliverables in individual and commercial
45
+ * software must include, in the user documentation and internal
46
+ * comments to the code, the above Disclaimer and U.S. Government End
47
+ * Users Notice.
48
+ */
49
+
50
+ #ifndef _CUDNN_BACKEND_H_
51
+ #define _CUDNN_BACKEND_H_
52
+
53
+ /*
54
+ * The content of this header has been moved into cudnn_graph.h.
55
+ * This header is kept for the backward compatibility purpose.
56
+ */
57
+
58
+ #include "cudnn_graph.h"
59
+
60
+ #endif /* _CUDNN_BACKEND_H_ */
.venv/lib/python3.12/site-packages/nvidia/cudnn/include/cudnn_backend_v9.h ADDED
@@ -0,0 +1,60 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * Copyright 2014-2023 NVIDIA Corporation. All rights reserved.
3
+ *
4
+ * NOTICE TO LICENSEE:
5
+ *
6
+ * This source code and/or documentation ("Licensed Deliverables") are
7
+ * subject to NVIDIA intellectual property rights under U.S. and
8
+ * international Copyright laws.
9
+ *
10
+ * These Licensed Deliverables contained herein is PROPRIETARY and
11
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
12
+ * conditions of a form of NVIDIA software license agreement by and
13
+ * between NVIDIA and Licensee ("License Agreement") or electronically
14
+ * accepted by Licensee. Notwithstanding any terms or conditions to
15
+ * the contrary in the License Agreement, reproduction or disclosure
16
+ * of the Licensed Deliverables to any third party without the express
17
+ * written consent of NVIDIA is prohibited.
18
+ *
19
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
20
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
21
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
22
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
23
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
24
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
25
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
26
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
27
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
28
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
29
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
30
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
31
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
32
+ * OF THESE LICENSED DELIVERABLES.
33
+ *
34
+ * U.S. Government End Users. These Licensed Deliverables are a
35
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
36
+ * 1995), consisting of "commercial computer software" and "commercial
37
+ * computer software documentation" as such terms are used in 48
38
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
39
+ * only as a commercial end item. Consistent with 48 C.F.R.12.212 and
40
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
41
+ * U.S. Government End Users acquire the Licensed Deliverables with
42
+ * only those rights set forth herein.
43
+ *
44
+ * Any use of the Licensed Deliverables in individual and commercial
45
+ * software must include, in the user documentation and internal
46
+ * comments to the code, the above Disclaimer and U.S. Government End
47
+ * Users Notice.
48
+ */
49
+
50
+ #ifndef _CUDNN_BACKEND_H_
51
+ #define _CUDNN_BACKEND_H_
52
+
53
+ /*
54
+ * The content of this header has been moved into cudnn_graph.h.
55
+ * This header is kept for the backward compatibility purpose.
56
+ */
57
+
58
+ #include "cudnn_graph.h"
59
+
60
+ #endif /* _CUDNN_BACKEND_H_ */
.venv/lib/python3.12/site-packages/nvidia/cudnn/include/cudnn_cnn.h ADDED
@@ -0,0 +1,693 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * Copyright 2014-2023 NVIDIA Corporation. All rights reserved.
3
+ *
4
+ * NOTICE TO LICENSEE:
5
+ *
6
+ * This source code and/or documentation ("Licensed Deliverables") are
7
+ * subject to NVIDIA intellectual property rights under U.S. and
8
+ * international Copyright laws.
9
+ *
10
+ * These Licensed Deliverables contained herein is PROPRIETARY and
11
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
12
+ * conditions of a form of NVIDIA software license agreement by and
13
+ * between NVIDIA and Licensee ("License Agreement") or electronically
14
+ * accepted by Licensee. Notwithstanding any terms or conditions to
15
+ * the contrary in the License Agreement, reproduction or disclosure
16
+ * of the Licensed Deliverables to any third party without the express
17
+ * written consent of NVIDIA is prohibited.
18
+ *
19
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
20
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
21
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
22
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
23
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
24
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
25
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
26
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
27
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
28
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
29
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
30
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
31
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
32
+ * OF THESE LICENSED DELIVERABLES.
33
+ *
34
+ * U.S. Government End Users. These Licensed Deliverables are a
35
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
36
+ * 1995), consisting of "commercial computer software" and "commercial
37
+ * computer software documentation" as such terms are used in 48
38
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
39
+ * only as a commercial end item. Consistent with 48 C.F.R.12.212 and
40
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
41
+ * U.S. Government End Users acquire the Licensed Deliverables with
42
+ * only those rights set forth herein.
43
+ *
44
+ * Any use of the Licensed Deliverables in individual and commercial
45
+ * software must include, in the user documentation and internal
46
+ * comments to the code, the above Disclaimer and U.S. Government End
47
+ * Users Notice.
48
+ */
49
+
50
+ /*
51
+ * cudnn_cnn : cuDNN's basic definitions and CNN functions.
52
+ */
53
+
54
+ #if !defined(CUDNN_CNN_H_)
55
+ #define CUDNN_CNN_H_
56
+
57
+ #pragma once
58
+ #include <stdint.h>
59
+
60
+ #include "cudnn_version.h"
61
+ #include "cudnn_ops.h"
62
+
63
+ /* These version numbers are autogenerated, do not edit manually. */
64
+ #define CUDNN_CNN_MAJOR 9
65
+ #define CUDNN_CNN_MINOR 10
66
+ #define CUDNN_CNN_PATCH 2
67
+
68
+ #if (CUDNN_CNN_MAJOR != CUDNN_MAJOR) || (CUDNN_CNN_MINOR != CUDNN_MINOR) || (CUDNN_CNN_PATCH != CUDNN_PATCHLEVEL)
69
+ #error Version mismatch in cuDNN CNN INFER!!!
70
+ #endif
71
+
72
+ #if defined(__cplusplus)
73
+ extern "C" {
74
+ #endif
75
+
76
+ typedef struct cudnnConvolutionStruct *cudnnConvolutionDescriptor_t CUDNN_DEPRECATED;
77
+
78
+ typedef struct cudnnConvolutionFwdAlgoPerfStruct {
79
+ cudnnConvolutionFwdAlgo_t algo;
80
+ cudnnStatus_t status;
81
+ float time;
82
+ size_t memory;
83
+ cudnnDeterminism_t determinism;
84
+ cudnnMathType_t mathType;
85
+ int reserved[3];
86
+ } cudnnConvolutionFwdAlgoPerf_t CUDNN_DEPRECATED;
87
+
88
+ /* Create an instance of convolution descriptor */
89
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
90
+ cudnnCreateConvolutionDescriptor(cudnnConvolutionDescriptor_t *convDesc);
91
+
92
+ /* Destroy an instance of convolution descriptor */
93
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
94
+ cudnnDestroyConvolutionDescriptor(cudnnConvolutionDescriptor_t convDesc);
95
+
96
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
97
+ cudnnSetConvolutionMathType(cudnnConvolutionDescriptor_t convDesc, cudnnMathType_t mathType);
98
+
99
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
100
+ cudnnGetConvolutionMathType(cudnnConvolutionDescriptor_t convDesc, cudnnMathType_t *mathType);
101
+
102
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
103
+ cudnnSetConvolutionGroupCount(cudnnConvolutionDescriptor_t convDesc, int groupCount);
104
+
105
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
106
+ cudnnGetConvolutionGroupCount(cudnnConvolutionDescriptor_t convDesc, int *groupCount);
107
+
108
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
109
+ cudnnSetConvolutionReorderType(cudnnConvolutionDescriptor_t convDesc, cudnnReorderType_t reorderType);
110
+
111
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
112
+ cudnnGetConvolutionReorderType(cudnnConvolutionDescriptor_t convDesc, cudnnReorderType_t *reorderType);
113
+
114
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
115
+ cudnnSetConvolution2dDescriptor(cudnnConvolutionDescriptor_t convDesc,
116
+ int pad_h, /* zero-padding height */
117
+ int pad_w, /* zero-padding width */
118
+ int u, /* vertical filter stride */
119
+ int v, /* horizontal filter stride */
120
+ int dilation_h, /* filter dilation in the vertical dimension */
121
+ int dilation_w, /* filter dilation in the horizontal dimension */
122
+ cudnnConvolutionMode_t mode,
123
+ cudnnDataType_t computeType);
124
+
125
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
126
+ cudnnGetConvolution2dDescriptor(const cudnnConvolutionDescriptor_t convDesc,
127
+ int *pad_h, /* zero-padding height */
128
+ int *pad_w, /* zero-padding width */
129
+ int *u, /* vertical filter stride */
130
+ int *v, /* horizontal filter stride */
131
+ int *dilation_h, /* filter dilation in the vertical dimension */
132
+ int *dilation_w, /* filter dilation in the horizontal dimension */
133
+ cudnnConvolutionMode_t *mode,
134
+ cudnnDataType_t *computeType);
135
+
136
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
137
+ cudnnSetConvolutionNdDescriptor(cudnnConvolutionDescriptor_t convDesc,
138
+ int arrayLength, /* nbDims-2 size */
139
+ const int padA[],
140
+ const int filterStrideA[],
141
+ const int dilationA[],
142
+ cudnnConvolutionMode_t mode,
143
+ cudnnDataType_t computeType); /* convolution data type */
144
+
145
+ /* Helper function to return the dimensions of the output tensor given a convolution descriptor */
146
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
147
+ cudnnGetConvolutionNdDescriptor(const cudnnConvolutionDescriptor_t convDesc,
148
+ int arrayLengthRequested,
149
+ int *arrayLength,
150
+ int padA[],
151
+ int strideA[],
152
+ int dilationA[],
153
+ cudnnConvolutionMode_t *mode,
154
+ cudnnDataType_t *computeType); /* convolution data type */
155
+
156
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
157
+ cudnnGetConvolution2dForwardOutputDim(const cudnnConvolutionDescriptor_t convDesc,
158
+ const cudnnTensorDescriptor_t inputTensorDesc,
159
+ const cudnnFilterDescriptor_t filterDesc,
160
+ int *n,
161
+ int *c,
162
+ int *h,
163
+ int *w);
164
+
165
+ /* Helper function to return the dimensions of the output tensor given a convolution descriptor */
166
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
167
+ cudnnGetConvolutionNdForwardOutputDim(const cudnnConvolutionDescriptor_t convDesc,
168
+ const cudnnTensorDescriptor_t inputTensorDesc,
169
+ const cudnnFilterDescriptor_t filterDesc,
170
+ int nbDims,
171
+ int tensorOuputDimA[]);
172
+
173
+ /* helper function to provide the convolution forward algo that fit best the requirement */
174
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
175
+ cudnnGetConvolutionForwardAlgorithmMaxCount(cudnnHandle_t handle, int *count);
176
+
177
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
178
+ cudnnGetConvolutionForwardAlgorithm_v7(cudnnHandle_t handle,
179
+ const cudnnTensorDescriptor_t srcDesc,
180
+ const cudnnFilterDescriptor_t filterDesc,
181
+ const cudnnConvolutionDescriptor_t convDesc,
182
+ const cudnnTensorDescriptor_t destDesc,
183
+ const int requestedAlgoCount,
184
+ int *returnedAlgoCount,
185
+ cudnnConvolutionFwdAlgoPerf_t *perfResults);
186
+
187
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
188
+ cudnnFindConvolutionForwardAlgorithm(cudnnHandle_t handle,
189
+ const cudnnTensorDescriptor_t xDesc,
190
+ const cudnnFilterDescriptor_t wDesc,
191
+ const cudnnConvolutionDescriptor_t convDesc,
192
+ const cudnnTensorDescriptor_t yDesc,
193
+ const int requestedAlgoCount,
194
+ int *returnedAlgoCount,
195
+ cudnnConvolutionFwdAlgoPerf_t *perfResults);
196
+
197
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
198
+ cudnnFindConvolutionForwardAlgorithmEx(cudnnHandle_t handle,
199
+ const cudnnTensorDescriptor_t xDesc,
200
+ const void *x,
201
+ const cudnnFilterDescriptor_t wDesc,
202
+ const void *w,
203
+ const cudnnConvolutionDescriptor_t convDesc,
204
+ const cudnnTensorDescriptor_t yDesc,
205
+ void *y,
206
+ const int requestedAlgoCount,
207
+ int *returnedAlgoCount,
208
+ cudnnConvolutionFwdAlgoPerf_t *perfResults,
209
+ void *workSpace,
210
+ size_t workSpaceSizeInBytes);
211
+
212
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
213
+ cudnnIm2Col(cudnnHandle_t handle,
214
+ const cudnnTensorDescriptor_t xDesc,
215
+ const void *x,
216
+ const cudnnFilterDescriptor_t wDesc,
217
+ const cudnnConvolutionDescriptor_t convDesc,
218
+ void *colBuffer);
219
+
220
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
221
+ cudnnReorderFilterAndBias(cudnnHandle_t handle,
222
+ const cudnnFilterDescriptor_t filterDesc,
223
+ cudnnReorderType_t reorderType,
224
+ const void *filterData,
225
+ void *reorderedFilterData,
226
+ int reorderBias,
227
+ const void *biasData,
228
+ void *reorderedBiasData);
229
+
230
+ /* Helper function to return the minimum size of the workspace to be passed to the convolution given an algo*/
231
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
232
+ cudnnGetConvolutionForwardWorkspaceSize(cudnnHandle_t handle,
233
+ const cudnnTensorDescriptor_t xDesc,
234
+ const cudnnFilterDescriptor_t wDesc,
235
+ const cudnnConvolutionDescriptor_t convDesc,
236
+ const cudnnTensorDescriptor_t yDesc,
237
+ cudnnConvolutionFwdAlgo_t algo,
238
+ size_t *sizeInBytes);
239
+
240
+ /* Convolution functions: All of the form "output = alpha * Op(inputs) + beta * output" */
241
+
242
+ /* Function to perform the forward pass for batch convolution */
243
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
244
+ cudnnConvolutionForward(cudnnHandle_t handle,
245
+ const void *alpha,
246
+ const cudnnTensorDescriptor_t xDesc,
247
+ const void *x,
248
+ const cudnnFilterDescriptor_t wDesc,
249
+ const void *w,
250
+ const cudnnConvolutionDescriptor_t convDesc,
251
+ cudnnConvolutionFwdAlgo_t algo,
252
+ void *workSpace,
253
+ size_t workSpaceSizeInBytes,
254
+ const void *beta,
255
+ const cudnnTensorDescriptor_t yDesc,
256
+ void *y);
257
+
258
+ /* Fused conv/bias/activation operation : y = Act( alpha1 * conv(x) + alpha2 * z + bias ) */
259
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
260
+ cudnnConvolutionBiasActivationForward(cudnnHandle_t handle,
261
+ const void *alpha1,
262
+ const cudnnTensorDescriptor_t xDesc,
263
+ const void *x,
264
+ const cudnnFilterDescriptor_t wDesc,
265
+ const void *w,
266
+ const cudnnConvolutionDescriptor_t convDesc,
267
+ cudnnConvolutionFwdAlgo_t algo,
268
+ void *workSpace,
269
+ size_t workSpaceSizeInBytes,
270
+ const void *alpha2,
271
+ const cudnnTensorDescriptor_t zDesc,
272
+ const void *z,
273
+ const cudnnTensorDescriptor_t biasDesc,
274
+ const void *bias,
275
+ const cudnnActivationDescriptor_t activationDesc,
276
+ const cudnnTensorDescriptor_t yDesc,
277
+ void *y);
278
+
279
+ /* helper function to provide the convolution backward data algo that fit best the requirement */
280
+
281
+ typedef struct cudnnConvolutionBwdDataAlgoPerfStruct {
282
+ cudnnConvolutionBwdDataAlgo_t algo;
283
+ cudnnStatus_t status;
284
+ float time;
285
+ size_t memory;
286
+ cudnnDeterminism_t determinism;
287
+ cudnnMathType_t mathType;
288
+ int reserved[3];
289
+ } cudnnConvolutionBwdDataAlgoPerf_t CUDNN_DEPRECATED;
290
+
291
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
292
+ cudnnGetConvolutionBackwardDataAlgorithmMaxCount(cudnnHandle_t handle, int *count);
293
+
294
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
295
+ cudnnFindConvolutionBackwardDataAlgorithm(cudnnHandle_t handle,
296
+ const cudnnFilterDescriptor_t wDesc,
297
+ const cudnnTensorDescriptor_t dyDesc,
298
+ const cudnnConvolutionDescriptor_t convDesc,
299
+ const cudnnTensorDescriptor_t dxDesc,
300
+ const int requestedAlgoCount,
301
+ int *returnedAlgoCount,
302
+ cudnnConvolutionBwdDataAlgoPerf_t *perfResults);
303
+
304
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
305
+ cudnnFindConvolutionBackwardDataAlgorithmEx(cudnnHandle_t handle,
306
+ const cudnnFilterDescriptor_t wDesc,
307
+ const void *w,
308
+ const cudnnTensorDescriptor_t dyDesc,
309
+ const void *dy,
310
+ const cudnnConvolutionDescriptor_t convDesc,
311
+ const cudnnTensorDescriptor_t dxDesc,
312
+ void *dx,
313
+ const int requestedAlgoCount,
314
+ int *returnedAlgoCount,
315
+ cudnnConvolutionBwdDataAlgoPerf_t *perfResults,
316
+ void *workSpace,
317
+ size_t workSpaceSizeInBytes);
318
+
319
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
320
+ cudnnGetConvolutionBackwardDataAlgorithm_v7(cudnnHandle_t handle,
321
+ const cudnnFilterDescriptor_t filterDesc,
322
+ const cudnnTensorDescriptor_t diffDesc,
323
+ const cudnnConvolutionDescriptor_t convDesc,
324
+ const cudnnTensorDescriptor_t gradDesc,
325
+ const int requestedAlgoCount,
326
+ int *returnedAlgoCount,
327
+ cudnnConvolutionBwdDataAlgoPerf_t *perfResults);
328
+
329
+ /*
330
+ * convolution algorithm (which requires potentially some workspace)
331
+ */
332
+
333
+ /* Helper function to return the minimum size of the workspace to be passed to the convolution given an algo*/
334
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
335
+ cudnnGetConvolutionBackwardDataWorkspaceSize(cudnnHandle_t handle,
336
+ const cudnnFilterDescriptor_t wDesc,
337
+ const cudnnTensorDescriptor_t dyDesc,
338
+ const cudnnConvolutionDescriptor_t convDesc,
339
+ const cudnnTensorDescriptor_t dxDesc,
340
+ cudnnConvolutionBwdDataAlgo_t algo,
341
+ size_t *sizeInBytes);
342
+
343
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
344
+ cudnnConvolutionBackwardData(cudnnHandle_t handle,
345
+ const void *alpha,
346
+ const cudnnFilterDescriptor_t wDesc,
347
+ const void *w,
348
+ const cudnnTensorDescriptor_t dyDesc,
349
+ const void *dy,
350
+ const cudnnConvolutionDescriptor_t convDesc,
351
+ cudnnConvolutionBwdDataAlgo_t algo,
352
+ void *workSpace,
353
+ size_t workSpaceSizeInBytes,
354
+ const void *beta,
355
+ const cudnnTensorDescriptor_t dxDesc,
356
+ void *dx);
357
+
358
+ /* Helper function to calculate folding descriptors for dgrad */
359
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
360
+ cudnnGetFoldedConvBackwardDataDescriptors(const cudnnHandle_t handle,
361
+ const cudnnFilterDescriptor_t filterDesc,
362
+ const cudnnTensorDescriptor_t diffDesc,
363
+ const cudnnConvolutionDescriptor_t convDesc,
364
+ const cudnnTensorDescriptor_t gradDesc,
365
+ const cudnnTensorFormat_t transformFormat,
366
+ cudnnFilterDescriptor_t foldedFilterDesc,
367
+ cudnnTensorDescriptor_t paddedDiffDesc,
368
+ cudnnConvolutionDescriptor_t foldedConvDesc,
369
+ cudnnTensorDescriptor_t foldedGradDesc,
370
+ cudnnTensorTransformDescriptor_t filterFoldTransDesc,
371
+ cudnnTensorTransformDescriptor_t diffPadTransDesc,
372
+ cudnnTensorTransformDescriptor_t gradFoldTransDesc,
373
+ cudnnTensorTransformDescriptor_t gradUnfoldTransDesc);
374
+
375
+ /* cudnnFusedOps... */
376
+ struct cudnnFusedOpsConstParamStruct;
377
+ typedef struct cudnnFusedOpsConstParamStruct *cudnnFusedOpsConstParamPack_t CUDNN_DEPRECATED;
378
+
379
+ struct cudnnFusedOpsVariantParamStruct;
380
+ typedef struct cudnnFusedOpsVariantParamStruct *cudnnFusedOpsVariantParamPack_t CUDNN_DEPRECATED;
381
+
382
+ struct cudnnFusedOpsPlanStruct;
383
+ typedef struct cudnnFusedOpsPlanStruct *cudnnFusedOpsPlan_t CUDNN_DEPRECATED;
384
+
385
+ typedef enum {
386
+ /* each op in [ ] can be disabled by passing NULL ptr */
387
+ /* [per channel scale], [per channel bias], [activation], convolution, [generate BN stats] */
388
+ CUDNN_FUSED_SCALE_BIAS_ACTIVATION_CONV_BNSTATS = 0,
389
+ /* [per channel scale], [per channel bias], [activation], convolutionBackwardWeights */
390
+ CUDNN_FUSED_SCALE_BIAS_ACTIVATION_WGRAD = 1,
391
+ /* utility for BN training in BN-conv fusion */
392
+ /* computes the equivalent scale and bias from ySum ySqSum and learned scale, bias */
393
+ /* optionally update running stats and generate saved stats */
394
+ CUDNN_FUSED_BN_FINALIZE_STATISTICS_TRAINING = 2,
395
+ /* utility for BN inference in BN-conv fusion */
396
+ /* computes the equivalent scale and bias from learned running stats and learned scale, bias */
397
+ CUDNN_FUSED_BN_FINALIZE_STATISTICS_INFERENCE = 3,
398
+ /* reserved for future use: convolution, [per channel scale], [per channel bias], [residual add], [activation] */
399
+ CUDNN_FUSED_CONV_SCALE_BIAS_ADD_ACTIVATION = 4,
400
+ /* reserved for future use: [per channel scale], [per channel bias], [residual add], activation, bitmask */
401
+ CUDNN_FUSED_SCALE_BIAS_ADD_ACTIVATION_GEN_BITMASK = 5,
402
+ /* reserved for future use */
403
+ CUDNN_FUSED_DACTIVATION_FORK_DBATCHNORM = 6,
404
+ } cudnnFusedOps_t CUDNN_DEPRECATED;
405
+
406
+ typedef enum {
407
+ /* set XDESC: pass previously initialized cudnnTensorDescriptor_t */
408
+ /* get XDESC: pass previously created cudnnTensorDescriptor_t */
409
+ CUDNN_PARAM_XDESC = 0,
410
+ /* set/get XDATA_PLACEHOLDER: pass cudnnFusedOpsPointerPlaceHolder_t* */
411
+ CUDNN_PARAM_XDATA_PLACEHOLDER = 1,
412
+ /* set/get BN_MODE: pass cudnnBatchNormMode_t* */
413
+ CUDNN_PARAM_BN_MODE = 2,
414
+ /* set CUDNN_PARAM_BN_EQSCALEBIAS_DESC: pass previously initialized cudnnTensorDescriptor_t */
415
+ /* get CUDNN_PARAM_BN_EQSCALEBIAS_DESC: pass previously created cudnnTensorDescriptor_t */
416
+ CUDNN_PARAM_BN_EQSCALEBIAS_DESC = 3,
417
+ /* set/get BN_EQSCALE_PLACEHOLDER: pass cudnnFusedOpsPointerPlaceHolder_t* */
418
+ CUDNN_PARAM_BN_EQSCALE_PLACEHOLDER = 4,
419
+ /* set/get BN_EQBIAS_PLACEHOLDER: pass cudnnFusedOpsPointerPlaceHolder_t* */
420
+ CUDNN_PARAM_BN_EQBIAS_PLACEHOLDER = 5,
421
+ /* set ACTIVATION_DESC: pass previously initialized cudnnActivationDescriptor_t */
422
+ /* get ACTIVATION_DESC: pass previously created cudnnActivationDescriptor_t */
423
+ CUDNN_PARAM_ACTIVATION_DESC = 6,
424
+ /* set CONV_DESC: pass previously initialized cudnnConvolutionDescriptor_t */
425
+ /* get CONV_DESC: pass previously created cudnnConvolutionDescriptor_t */
426
+ CUDNN_PARAM_CONV_DESC = 7,
427
+ /* set WDESC: pass previously initialized cudnnFilterDescriptor_t */
428
+ /* get WDESC: pass previously created cudnnFilterDescriptor_t */
429
+ CUDNN_PARAM_WDESC = 8,
430
+ /* set/get WDATA_PLACEHOLDER: pass cudnnFusedOpsPointerPlaceHolder_t* */
431
+ CUDNN_PARAM_WDATA_PLACEHOLDER = 9,
432
+ /* set DWDESC: pass previously initialized cudnnFilterDescriptor_t */
433
+ /* get DWDESC: pass previously created cudnnFilterDescriptor_t */
434
+ CUDNN_PARAM_DWDESC = 10,
435
+ /* set/get DWDATA_PLACEHOLDER: pass cudnnFusedOpsPointerPlaceHolder_t* */
436
+ CUDNN_PARAM_DWDATA_PLACEHOLDER = 11,
437
+ /* set YDESC: pass previously initialized cudnnTensorDescriptor_t */
438
+ /* get YDESC: pass previously created cudnnTensorDescriptor_t */
439
+ CUDNN_PARAM_YDESC = 12,
440
+ /* set/get YDATA_PLACEHOLDER: pass cudnnFusedOpsPointerPlaceHolder_t* */
441
+ CUDNN_PARAM_YDATA_PLACEHOLDER = 13,
442
+ /* set DYDESC: pass previously initialized cudnnTensorDescriptor_t */
443
+ /* get DYDESC: pass previously created cudnnTensorDescriptor_t */
444
+ CUDNN_PARAM_DYDESC = 14,
445
+ /* set/get DYDATA_PLACEHOLDER: pass cudnnFusedOpsPointerPlaceHolder_t* */
446
+ CUDNN_PARAM_DYDATA_PLACEHOLDER = 15,
447
+ /* set YSTATS_DESC: pass previously initialized cudnnTensorDescriptor_t */
448
+ /* get YSTATS_DESC: pass previously created cudnnTensorDescriptor_t */
449
+ CUDNN_PARAM_YSTATS_DESC = 16,
450
+ /* set/get YSUM_PLACEHOLDER: pass cudnnFusedOpsPointerPlaceHolder_t* */
451
+ CUDNN_PARAM_YSUM_PLACEHOLDER = 17,
452
+ /* set/get YSQSUM_PLACEHOLDER: pass cudnnFusedOpsPointerPlaceHolder_t* */
453
+ CUDNN_PARAM_YSQSUM_PLACEHOLDER = 18,
454
+ /* set CUDNN_PARAM_BN_SCALEBIAS_MEANVAR_DESC: pass previously initialized cudnnTensorDescriptor_t */
455
+ /* get CUDNN_PARAM_BN_SCALEBIAS_MEANVAR_DESC: pass previously created cudnnTensorDescriptor_t */
456
+ CUDNN_PARAM_BN_SCALEBIAS_MEANVAR_DESC = 19,
457
+ /* set/get CUDNN_PARAM_BN_SCALE_PLACEHOLDER: pass cudnnFusedOpsPointerPlaceHolder_t* */
458
+ CUDNN_PARAM_BN_SCALE_PLACEHOLDER = 20,
459
+ /* set/get CUDNN_PARAM_BN_BIAS_PLACEHOLDER: pass cudnnFusedOpsPointerPlaceHolder_t* */
460
+ CUDNN_PARAM_BN_BIAS_PLACEHOLDER = 21,
461
+ /* set/get CUDNN_PARAM_BN_SAVED_MEAN_PLACEHOLDER: pass cudnnFusedOpsPointerPlaceHolder_t* */
462
+ CUDNN_PARAM_BN_SAVED_MEAN_PLACEHOLDER = 22,
463
+ /* set/get CUDNN_PARAM_BN_SAVED_INVSTD_PLACEHOLDER: pass cudnnFusedOpsPointerPlaceHolder_t* */
464
+ CUDNN_PARAM_BN_SAVED_INVSTD_PLACEHOLDER = 23,
465
+ /* set/get CUDNN_PARAM_BN_RUNNING_MEAN_PLACEHOLDER: pass cudnnFusedOpsPointerPlaceHolder_t* */
466
+ CUDNN_PARAM_BN_RUNNING_MEAN_PLACEHOLDER = 24,
467
+ /* set/get CUDNN_PARAM_BN_RUNNING_VAR_PLACEHOLDER: pass cudnnFusedOpsPointerPlaceHolder_t* */
468
+ CUDNN_PARAM_BN_RUNNING_VAR_PLACEHOLDER = 25,
469
+
470
+ /* set ZDESC: pass previously initialized cudnnTensorDescriptor_t */
471
+ /* get ZDESC: pass previously created cudnnTensorDescriptor_t */
472
+ CUDNN_PARAM_ZDESC = 26,
473
+ /* set/get ZDATA_PLACEHOLDER: pass cudnnFusedOpsPointerPlaceHolder_t* */
474
+ CUDNN_PARAM_ZDATA_PLACEHOLDER = 27,
475
+ /* set BN_Z_EQSCALEBIAS_DESC: pass previously initialized cudnnTensorDescriptor_t */
476
+ /* get BN_Z_EQSCALEBIAS_DESC: pass previously created cudnnTensorDescriptor_t */
477
+ CUDNN_PARAM_BN_Z_EQSCALEBIAS_DESC = 28,
478
+ /* set/get BN_Z_EQSCALE_PLACEHOLDER: pass cudnnFusedOpsPointerPlaceHolder_t* */
479
+ CUDNN_PARAM_BN_Z_EQSCALE_PLACEHOLDER = 29,
480
+ /* set/get BN_Z_EQBIAS_PLACEHOLDER: pass cudnnFusedOpsPointerPlaceHolder_t* */
481
+ CUDNN_PARAM_BN_Z_EQBIAS_PLACEHOLDER = 30,
482
+
483
+ /* set ACTIVATION_BITMASK_DESC: pass previously initialized cudnnTensorDescriptor_t */
484
+ /* get ACTIVATION_BITMASK_DESC: pass previously created cudnnTensorDescriptor_t */
485
+ CUDNN_PARAM_ACTIVATION_BITMASK_DESC = 31,
486
+ /* set/get ACTIVATION_BITMASK_PLACEHOLDER: pass cudnnFusedOpsPointerPlaceHolder_t* */
487
+ CUDNN_PARAM_ACTIVATION_BITMASK_PLACEHOLDER = 32,
488
+
489
+ /* set DXDESC: pass previously initialized cudnnTensorDescriptor_t */
490
+ /* get DXDESC: pass previously created cudnnTensorDescriptor_t */
491
+ CUDNN_PARAM_DXDESC = 33,
492
+ /* set/get DXDATA_PLACEHOLDER: pass cudnnFusedOpsPointerPlaceHolder_t* */
493
+ CUDNN_PARAM_DXDATA_PLACEHOLDER = 34,
494
+ /* set DZDESC: pass previously initialized cudnnTensorDescriptor_t */
495
+ /* get DZDESC: pass previously created cudnnTensorDescriptor_t */
496
+ CUDNN_PARAM_DZDESC = 35,
497
+ /* set/get DZDATA_PLACEHOLDER: pass cudnnFusedOpsPointerPlaceHolder_t* */
498
+ CUDNN_PARAM_DZDATA_PLACEHOLDER = 36,
499
+ /* set/get CUDNN_PARAM_BN_DSCALE_PLACEHOLDER: pass cudnnFusedOpsPointerPlaceHolder_t* */
500
+ CUDNN_PARAM_BN_DSCALE_PLACEHOLDER = 37,
501
+ /* set/get CUDNN_PARAM_BN_DBIAS_PLACEHOLDER: pass cudnnFusedOpsPointerPlaceHolder_t* */
502
+ CUDNN_PARAM_BN_DBIAS_PLACEHOLDER = 38,
503
+ } cudnnFusedOpsConstParamLabel_t CUDNN_DEPRECATED;
504
+
505
+ typedef enum {
506
+ CUDNN_PTR_NULL = 0,
507
+ CUDNN_PTR_ELEM_ALIGNED = 1,
508
+ CUDNN_PTR_16B_ALIGNED = 2,
509
+ } cudnnFusedOpsPointerPlaceHolder_t CUDNN_DEPRECATED;
510
+
511
+ typedef enum {
512
+ /* set: pass void* pointing to dev memory */
513
+ /* get: pass void** pointing to host memory */
514
+ CUDNN_PTR_XDATA = 0,
515
+ CUDNN_PTR_BN_EQSCALE = 1,
516
+ CUDNN_PTR_BN_EQBIAS = 2,
517
+ CUDNN_PTR_WDATA = 3,
518
+ CUDNN_PTR_DWDATA = 4,
519
+ CUDNN_PTR_YDATA = 5,
520
+ CUDNN_PTR_DYDATA = 6,
521
+ CUDNN_PTR_YSUM = 7,
522
+ CUDNN_PTR_YSQSUM = 8,
523
+ CUDNN_PTR_WORKSPACE = 9,
524
+ CUDNN_PTR_BN_SCALE = 10,
525
+ CUDNN_PTR_BN_BIAS = 11,
526
+ CUDNN_PTR_BN_SAVED_MEAN = 12,
527
+ CUDNN_PTR_BN_SAVED_INVSTD = 13,
528
+ CUDNN_PTR_BN_RUNNING_MEAN = 14,
529
+ CUDNN_PTR_BN_RUNNING_VAR = 15,
530
+ CUDNN_PTR_ZDATA = 16,
531
+ CUDNN_PTR_BN_Z_EQSCALE = 17,
532
+ CUDNN_PTR_BN_Z_EQBIAS = 18,
533
+ CUDNN_PTR_ACTIVATION_BITMASK = 19,
534
+ CUDNN_PTR_DXDATA = 20,
535
+ CUDNN_PTR_DZDATA = 21,
536
+ CUDNN_PTR_BN_DSCALE = 22,
537
+ CUDNN_PTR_BN_DBIAS = 23,
538
+
539
+ /* set/get: pass size_t* pointing to host memory */
540
+ CUDNN_SCALAR_SIZE_T_WORKSPACE_SIZE_IN_BYTES = 100,
541
+ /* set/get: pass int64_t* pointing to host memory */
542
+ CUDNN_SCALAR_INT64_T_BN_ACCUMULATION_COUNT = 101,
543
+ /* set/get: pass double* pointing to host memory */
544
+ CUDNN_SCALAR_DOUBLE_BN_EXP_AVG_FACTOR = 102,
545
+ /* set/get: pass double* pointing to host memory */
546
+ CUDNN_SCALAR_DOUBLE_BN_EPSILON = 103,
547
+ } cudnnFusedOpsVariantParamLabel_t CUDNN_DEPRECATED;
548
+
549
+ cudnnStatus_t CUDNNWINAPI
550
+ cudnnCnnVersionCheck(void);
551
+
552
+ /* helper function to provide the convolution backward filter algo that fit best the requirement */
553
+
554
+ typedef struct cudnnConvolutionBwdFilterAlgoPerfStruct {
555
+ cudnnConvolutionBwdFilterAlgo_t algo;
556
+ cudnnStatus_t status;
557
+ float time;
558
+ size_t memory;
559
+ cudnnDeterminism_t determinism;
560
+ cudnnMathType_t mathType;
561
+ int reserved[3];
562
+ } cudnnConvolutionBwdFilterAlgoPerf_t CUDNN_DEPRECATED;
563
+
564
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
565
+ cudnnGetConvolutionBackwardFilterAlgorithmMaxCount(cudnnHandle_t handle, int *count);
566
+
567
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
568
+ cudnnFindConvolutionBackwardFilterAlgorithm(cudnnHandle_t handle,
569
+ const cudnnTensorDescriptor_t xDesc,
570
+ const cudnnTensorDescriptor_t dyDesc,
571
+ const cudnnConvolutionDescriptor_t convDesc,
572
+ const cudnnFilterDescriptor_t dwDesc,
573
+ const int requestedAlgoCount,
574
+ int *returnedAlgoCount,
575
+ cudnnConvolutionBwdFilterAlgoPerf_t *perfResults);
576
+
577
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
578
+ cudnnFindConvolutionBackwardFilterAlgorithmEx(cudnnHandle_t handle,
579
+ const cudnnTensorDescriptor_t xDesc,
580
+ const void *x,
581
+ const cudnnTensorDescriptor_t dyDesc,
582
+ const void *y,
583
+ const cudnnConvolutionDescriptor_t convDesc,
584
+ const cudnnFilterDescriptor_t dwDesc,
585
+ void *dw,
586
+ const int requestedAlgoCount,
587
+ int *returnedAlgoCount,
588
+ cudnnConvolutionBwdFilterAlgoPerf_t *perfResults,
589
+ void *workSpace,
590
+ size_t workSpaceSizeInBytes);
591
+
592
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
593
+ cudnnGetConvolutionBackwardFilterAlgorithm_v7(cudnnHandle_t handle,
594
+ const cudnnTensorDescriptor_t srcDesc,
595
+ const cudnnTensorDescriptor_t diffDesc,
596
+ const cudnnConvolutionDescriptor_t convDesc,
597
+ const cudnnFilterDescriptor_t gradDesc,
598
+ const int requestedAlgoCount,
599
+ int *returnedAlgoCount,
600
+ cudnnConvolutionBwdFilterAlgoPerf_t *perfResults);
601
+
602
+ /*
603
+ * convolution algorithm (which requires potentially some workspace)
604
+ */
605
+
606
+ /* Helper function to return the minimum size of the workspace to be passed to the convolution given an algo*/
607
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
608
+ cudnnGetConvolutionBackwardFilterWorkspaceSize(cudnnHandle_t handle,
609
+ const cudnnTensorDescriptor_t xDesc,
610
+ const cudnnTensorDescriptor_t dyDesc,
611
+ const cudnnConvolutionDescriptor_t convDesc,
612
+ const cudnnFilterDescriptor_t gradDesc,
613
+ cudnnConvolutionBwdFilterAlgo_t algo,
614
+ size_t *sizeInBytes);
615
+
616
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
617
+ cudnnConvolutionBackwardFilter(cudnnHandle_t handle,
618
+ const void *alpha,
619
+ const cudnnTensorDescriptor_t xDesc,
620
+ const void *x,
621
+ const cudnnTensorDescriptor_t dyDesc,
622
+ const void *dy,
623
+ const cudnnConvolutionDescriptor_t convDesc,
624
+ cudnnConvolutionBwdFilterAlgo_t algo,
625
+ void *workSpace,
626
+ size_t workSpaceSizeInBytes,
627
+ const void *beta,
628
+ const cudnnFilterDescriptor_t dwDesc,
629
+ void *dw);
630
+
631
+ /* Function to compute the bias gradient for batch convolution */
632
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
633
+ cudnnConvolutionBackwardBias(cudnnHandle_t handle,
634
+ const void *alpha,
635
+ const cudnnTensorDescriptor_t dyDesc,
636
+ const void *dy,
637
+ const void *beta,
638
+ const cudnnTensorDescriptor_t dbDesc,
639
+ void *db);
640
+
641
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
642
+ cudnnCreateFusedOpsConstParamPack(cudnnFusedOpsConstParamPack_t *constPack, cudnnFusedOps_t ops);
643
+
644
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
645
+ cudnnDestroyFusedOpsConstParamPack(cudnnFusedOpsConstParamPack_t constPack);
646
+
647
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
648
+ cudnnSetFusedOpsConstParamPackAttribute(cudnnFusedOpsConstParamPack_t constPack,
649
+ cudnnFusedOpsConstParamLabel_t paramLabel,
650
+ const void *param);
651
+
652
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
653
+ cudnnGetFusedOpsConstParamPackAttribute(const cudnnFusedOpsConstParamPack_t constPack,
654
+ cudnnFusedOpsConstParamLabel_t paramLabel,
655
+ void *param,
656
+ int *isNULL);
657
+
658
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
659
+ cudnnCreateFusedOpsVariantParamPack(cudnnFusedOpsVariantParamPack_t *varPack, cudnnFusedOps_t ops);
660
+
661
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
662
+ cudnnDestroyFusedOpsVariantParamPack(cudnnFusedOpsVariantParamPack_t varPack);
663
+
664
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
665
+ cudnnSetFusedOpsVariantParamPackAttribute(cudnnFusedOpsVariantParamPack_t varPack,
666
+ cudnnFusedOpsVariantParamLabel_t paramLabel,
667
+ void *ptr);
668
+
669
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
670
+ cudnnGetFusedOpsVariantParamPackAttribute(const cudnnFusedOpsVariantParamPack_t varPack,
671
+ cudnnFusedOpsVariantParamLabel_t paramLabel,
672
+ void *ptr);
673
+
674
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
675
+ cudnnCreateFusedOpsPlan(cudnnFusedOpsPlan_t *plan, cudnnFusedOps_t ops);
676
+
677
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
678
+ cudnnDestroyFusedOpsPlan(cudnnFusedOpsPlan_t plan);
679
+
680
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
681
+ cudnnMakeFusedOpsPlan(cudnnHandle_t handle,
682
+ cudnnFusedOpsPlan_t plan,
683
+ const cudnnFusedOpsConstParamPack_t constPack,
684
+ size_t *workspaceSizeInBytes);
685
+
686
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
687
+ cudnnFusedOpsExecute(cudnnHandle_t handle, const cudnnFusedOpsPlan_t plan, cudnnFusedOpsVariantParamPack_t varPack);
688
+
689
+ #if defined(__cplusplus)
690
+ }
691
+ #endif
692
+
693
+ #endif /* CUDNN_CNN_H_ */
.venv/lib/python3.12/site-packages/nvidia/cudnn/include/cudnn_cnn_v9.h ADDED
@@ -0,0 +1,693 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * Copyright 2014-2023 NVIDIA Corporation. All rights reserved.
3
+ *
4
+ * NOTICE TO LICENSEE:
5
+ *
6
+ * This source code and/or documentation ("Licensed Deliverables") are
7
+ * subject to NVIDIA intellectual property rights under U.S. and
8
+ * international Copyright laws.
9
+ *
10
+ * These Licensed Deliverables contained herein is PROPRIETARY and
11
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
12
+ * conditions of a form of NVIDIA software license agreement by and
13
+ * between NVIDIA and Licensee ("License Agreement") or electronically
14
+ * accepted by Licensee. Notwithstanding any terms or conditions to
15
+ * the contrary in the License Agreement, reproduction or disclosure
16
+ * of the Licensed Deliverables to any third party without the express
17
+ * written consent of NVIDIA is prohibited.
18
+ *
19
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
20
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
21
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
22
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
23
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
24
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
25
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
26
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
27
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
28
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
29
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
30
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
31
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
32
+ * OF THESE LICENSED DELIVERABLES.
33
+ *
34
+ * U.S. Government End Users. These Licensed Deliverables are a
35
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
36
+ * 1995), consisting of "commercial computer software" and "commercial
37
+ * computer software documentation" as such terms are used in 48
38
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
39
+ * only as a commercial end item. Consistent with 48 C.F.R.12.212 and
40
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
41
+ * U.S. Government End Users acquire the Licensed Deliverables with
42
+ * only those rights set forth herein.
43
+ *
44
+ * Any use of the Licensed Deliverables in individual and commercial
45
+ * software must include, in the user documentation and internal
46
+ * comments to the code, the above Disclaimer and U.S. Government End
47
+ * Users Notice.
48
+ */
49
+
50
+ /*
51
+ * cudnn_cnn : cuDNN's basic definitions and CNN functions.
52
+ */
53
+
54
+ #if !defined(CUDNN_CNN_H_)
55
+ #define CUDNN_CNN_H_
56
+
57
+ #pragma once
58
+ #include <stdint.h>
59
+
60
+ #include "cudnn_version.h"
61
+ #include "cudnn_ops.h"
62
+
63
+ /* These version numbers are autogenerated, do not edit manually. */
64
+ #define CUDNN_CNN_MAJOR 9
65
+ #define CUDNN_CNN_MINOR 10
66
+ #define CUDNN_CNN_PATCH 2
67
+
68
+ #if (CUDNN_CNN_MAJOR != CUDNN_MAJOR) || (CUDNN_CNN_MINOR != CUDNN_MINOR) || (CUDNN_CNN_PATCH != CUDNN_PATCHLEVEL)
69
+ #error Version mismatch in cuDNN CNN INFER!!!
70
+ #endif
71
+
72
+ #if defined(__cplusplus)
73
+ extern "C" {
74
+ #endif
75
+
76
+ typedef struct cudnnConvolutionStruct *cudnnConvolutionDescriptor_t CUDNN_DEPRECATED;
77
+
78
+ typedef struct cudnnConvolutionFwdAlgoPerfStruct {
79
+ cudnnConvolutionFwdAlgo_t algo;
80
+ cudnnStatus_t status;
81
+ float time;
82
+ size_t memory;
83
+ cudnnDeterminism_t determinism;
84
+ cudnnMathType_t mathType;
85
+ int reserved[3];
86
+ } cudnnConvolutionFwdAlgoPerf_t CUDNN_DEPRECATED;
87
+
88
+ /* Create an instance of convolution descriptor */
89
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
90
+ cudnnCreateConvolutionDescriptor(cudnnConvolutionDescriptor_t *convDesc);
91
+
92
+ /* Destroy an instance of convolution descriptor */
93
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
94
+ cudnnDestroyConvolutionDescriptor(cudnnConvolutionDescriptor_t convDesc);
95
+
96
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
97
+ cudnnSetConvolutionMathType(cudnnConvolutionDescriptor_t convDesc, cudnnMathType_t mathType);
98
+
99
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
100
+ cudnnGetConvolutionMathType(cudnnConvolutionDescriptor_t convDesc, cudnnMathType_t *mathType);
101
+
102
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
103
+ cudnnSetConvolutionGroupCount(cudnnConvolutionDescriptor_t convDesc, int groupCount);
104
+
105
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
106
+ cudnnGetConvolutionGroupCount(cudnnConvolutionDescriptor_t convDesc, int *groupCount);
107
+
108
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
109
+ cudnnSetConvolutionReorderType(cudnnConvolutionDescriptor_t convDesc, cudnnReorderType_t reorderType);
110
+
111
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
112
+ cudnnGetConvolutionReorderType(cudnnConvolutionDescriptor_t convDesc, cudnnReorderType_t *reorderType);
113
+
114
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
115
+ cudnnSetConvolution2dDescriptor(cudnnConvolutionDescriptor_t convDesc,
116
+ int pad_h, /* zero-padding height */
117
+ int pad_w, /* zero-padding width */
118
+ int u, /* vertical filter stride */
119
+ int v, /* horizontal filter stride */
120
+ int dilation_h, /* filter dilation in the vertical dimension */
121
+ int dilation_w, /* filter dilation in the horizontal dimension */
122
+ cudnnConvolutionMode_t mode,
123
+ cudnnDataType_t computeType);
124
+
125
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
126
+ cudnnGetConvolution2dDescriptor(const cudnnConvolutionDescriptor_t convDesc,
127
+ int *pad_h, /* zero-padding height */
128
+ int *pad_w, /* zero-padding width */
129
+ int *u, /* vertical filter stride */
130
+ int *v, /* horizontal filter stride */
131
+ int *dilation_h, /* filter dilation in the vertical dimension */
132
+ int *dilation_w, /* filter dilation in the horizontal dimension */
133
+ cudnnConvolutionMode_t *mode,
134
+ cudnnDataType_t *computeType);
135
+
136
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
137
+ cudnnSetConvolutionNdDescriptor(cudnnConvolutionDescriptor_t convDesc,
138
+ int arrayLength, /* nbDims-2 size */
139
+ const int padA[],
140
+ const int filterStrideA[],
141
+ const int dilationA[],
142
+ cudnnConvolutionMode_t mode,
143
+ cudnnDataType_t computeType); /* convolution data type */
144
+
145
+ /* Helper function to return the dimensions of the output tensor given a convolution descriptor */
146
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
147
+ cudnnGetConvolutionNdDescriptor(const cudnnConvolutionDescriptor_t convDesc,
148
+ int arrayLengthRequested,
149
+ int *arrayLength,
150
+ int padA[],
151
+ int strideA[],
152
+ int dilationA[],
153
+ cudnnConvolutionMode_t *mode,
154
+ cudnnDataType_t *computeType); /* convolution data type */
155
+
156
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
157
+ cudnnGetConvolution2dForwardOutputDim(const cudnnConvolutionDescriptor_t convDesc,
158
+ const cudnnTensorDescriptor_t inputTensorDesc,
159
+ const cudnnFilterDescriptor_t filterDesc,
160
+ int *n,
161
+ int *c,
162
+ int *h,
163
+ int *w);
164
+
165
+ /* Helper function to return the dimensions of the output tensor given a convolution descriptor */
166
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
167
+ cudnnGetConvolutionNdForwardOutputDim(const cudnnConvolutionDescriptor_t convDesc,
168
+ const cudnnTensorDescriptor_t inputTensorDesc,
169
+ const cudnnFilterDescriptor_t filterDesc,
170
+ int nbDims,
171
+ int tensorOuputDimA[]);
172
+
173
+ /* helper function to provide the convolution forward algo that fit best the requirement */
174
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
175
+ cudnnGetConvolutionForwardAlgorithmMaxCount(cudnnHandle_t handle, int *count);
176
+
177
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
178
+ cudnnGetConvolutionForwardAlgorithm_v7(cudnnHandle_t handle,
179
+ const cudnnTensorDescriptor_t srcDesc,
180
+ const cudnnFilterDescriptor_t filterDesc,
181
+ const cudnnConvolutionDescriptor_t convDesc,
182
+ const cudnnTensorDescriptor_t destDesc,
183
+ const int requestedAlgoCount,
184
+ int *returnedAlgoCount,
185
+ cudnnConvolutionFwdAlgoPerf_t *perfResults);
186
+
187
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
188
+ cudnnFindConvolutionForwardAlgorithm(cudnnHandle_t handle,
189
+ const cudnnTensorDescriptor_t xDesc,
190
+ const cudnnFilterDescriptor_t wDesc,
191
+ const cudnnConvolutionDescriptor_t convDesc,
192
+ const cudnnTensorDescriptor_t yDesc,
193
+ const int requestedAlgoCount,
194
+ int *returnedAlgoCount,
195
+ cudnnConvolutionFwdAlgoPerf_t *perfResults);
196
+
197
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
198
+ cudnnFindConvolutionForwardAlgorithmEx(cudnnHandle_t handle,
199
+ const cudnnTensorDescriptor_t xDesc,
200
+ const void *x,
201
+ const cudnnFilterDescriptor_t wDesc,
202
+ const void *w,
203
+ const cudnnConvolutionDescriptor_t convDesc,
204
+ const cudnnTensorDescriptor_t yDesc,
205
+ void *y,
206
+ const int requestedAlgoCount,
207
+ int *returnedAlgoCount,
208
+ cudnnConvolutionFwdAlgoPerf_t *perfResults,
209
+ void *workSpace,
210
+ size_t workSpaceSizeInBytes);
211
+
212
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
213
+ cudnnIm2Col(cudnnHandle_t handle,
214
+ const cudnnTensorDescriptor_t xDesc,
215
+ const void *x,
216
+ const cudnnFilterDescriptor_t wDesc,
217
+ const cudnnConvolutionDescriptor_t convDesc,
218
+ void *colBuffer);
219
+
220
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
221
+ cudnnReorderFilterAndBias(cudnnHandle_t handle,
222
+ const cudnnFilterDescriptor_t filterDesc,
223
+ cudnnReorderType_t reorderType,
224
+ const void *filterData,
225
+ void *reorderedFilterData,
226
+ int reorderBias,
227
+ const void *biasData,
228
+ void *reorderedBiasData);
229
+
230
+ /* Helper function to return the minimum size of the workspace to be passed to the convolution given an algo*/
231
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
232
+ cudnnGetConvolutionForwardWorkspaceSize(cudnnHandle_t handle,
233
+ const cudnnTensorDescriptor_t xDesc,
234
+ const cudnnFilterDescriptor_t wDesc,
235
+ const cudnnConvolutionDescriptor_t convDesc,
236
+ const cudnnTensorDescriptor_t yDesc,
237
+ cudnnConvolutionFwdAlgo_t algo,
238
+ size_t *sizeInBytes);
239
+
240
+ /* Convolution functions: All of the form "output = alpha * Op(inputs) + beta * output" */
241
+
242
+ /* Function to perform the forward pass for batch convolution */
243
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
244
+ cudnnConvolutionForward(cudnnHandle_t handle,
245
+ const void *alpha,
246
+ const cudnnTensorDescriptor_t xDesc,
247
+ const void *x,
248
+ const cudnnFilterDescriptor_t wDesc,
249
+ const void *w,
250
+ const cudnnConvolutionDescriptor_t convDesc,
251
+ cudnnConvolutionFwdAlgo_t algo,
252
+ void *workSpace,
253
+ size_t workSpaceSizeInBytes,
254
+ const void *beta,
255
+ const cudnnTensorDescriptor_t yDesc,
256
+ void *y);
257
+
258
+ /* Fused conv/bias/activation operation : y = Act( alpha1 * conv(x) + alpha2 * z + bias ) */
259
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
260
+ cudnnConvolutionBiasActivationForward(cudnnHandle_t handle,
261
+ const void *alpha1,
262
+ const cudnnTensorDescriptor_t xDesc,
263
+ const void *x,
264
+ const cudnnFilterDescriptor_t wDesc,
265
+ const void *w,
266
+ const cudnnConvolutionDescriptor_t convDesc,
267
+ cudnnConvolutionFwdAlgo_t algo,
268
+ void *workSpace,
269
+ size_t workSpaceSizeInBytes,
270
+ const void *alpha2,
271
+ const cudnnTensorDescriptor_t zDesc,
272
+ const void *z,
273
+ const cudnnTensorDescriptor_t biasDesc,
274
+ const void *bias,
275
+ const cudnnActivationDescriptor_t activationDesc,
276
+ const cudnnTensorDescriptor_t yDesc,
277
+ void *y);
278
+
279
+ /* helper function to provide the convolution backward data algo that fit best the requirement */
280
+
281
+ typedef struct cudnnConvolutionBwdDataAlgoPerfStruct {
282
+ cudnnConvolutionBwdDataAlgo_t algo;
283
+ cudnnStatus_t status;
284
+ float time;
285
+ size_t memory;
286
+ cudnnDeterminism_t determinism;
287
+ cudnnMathType_t mathType;
288
+ int reserved[3];
289
+ } cudnnConvolutionBwdDataAlgoPerf_t CUDNN_DEPRECATED;
290
+
291
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
292
+ cudnnGetConvolutionBackwardDataAlgorithmMaxCount(cudnnHandle_t handle, int *count);
293
+
294
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
295
+ cudnnFindConvolutionBackwardDataAlgorithm(cudnnHandle_t handle,
296
+ const cudnnFilterDescriptor_t wDesc,
297
+ const cudnnTensorDescriptor_t dyDesc,
298
+ const cudnnConvolutionDescriptor_t convDesc,
299
+ const cudnnTensorDescriptor_t dxDesc,
300
+ const int requestedAlgoCount,
301
+ int *returnedAlgoCount,
302
+ cudnnConvolutionBwdDataAlgoPerf_t *perfResults);
303
+
304
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
305
+ cudnnFindConvolutionBackwardDataAlgorithmEx(cudnnHandle_t handle,
306
+ const cudnnFilterDescriptor_t wDesc,
307
+ const void *w,
308
+ const cudnnTensorDescriptor_t dyDesc,
309
+ const void *dy,
310
+ const cudnnConvolutionDescriptor_t convDesc,
311
+ const cudnnTensorDescriptor_t dxDesc,
312
+ void *dx,
313
+ const int requestedAlgoCount,
314
+ int *returnedAlgoCount,
315
+ cudnnConvolutionBwdDataAlgoPerf_t *perfResults,
316
+ void *workSpace,
317
+ size_t workSpaceSizeInBytes);
318
+
319
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
320
+ cudnnGetConvolutionBackwardDataAlgorithm_v7(cudnnHandle_t handle,
321
+ const cudnnFilterDescriptor_t filterDesc,
322
+ const cudnnTensorDescriptor_t diffDesc,
323
+ const cudnnConvolutionDescriptor_t convDesc,
324
+ const cudnnTensorDescriptor_t gradDesc,
325
+ const int requestedAlgoCount,
326
+ int *returnedAlgoCount,
327
+ cudnnConvolutionBwdDataAlgoPerf_t *perfResults);
328
+
329
+ /*
330
+ * convolution algorithm (which requires potentially some workspace)
331
+ */
332
+
333
+ /* Helper function to return the minimum size of the workspace to be passed to the convolution given an algo*/
334
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
335
+ cudnnGetConvolutionBackwardDataWorkspaceSize(cudnnHandle_t handle,
336
+ const cudnnFilterDescriptor_t wDesc,
337
+ const cudnnTensorDescriptor_t dyDesc,
338
+ const cudnnConvolutionDescriptor_t convDesc,
339
+ const cudnnTensorDescriptor_t dxDesc,
340
+ cudnnConvolutionBwdDataAlgo_t algo,
341
+ size_t *sizeInBytes);
342
+
343
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
344
+ cudnnConvolutionBackwardData(cudnnHandle_t handle,
345
+ const void *alpha,
346
+ const cudnnFilterDescriptor_t wDesc,
347
+ const void *w,
348
+ const cudnnTensorDescriptor_t dyDesc,
349
+ const void *dy,
350
+ const cudnnConvolutionDescriptor_t convDesc,
351
+ cudnnConvolutionBwdDataAlgo_t algo,
352
+ void *workSpace,
353
+ size_t workSpaceSizeInBytes,
354
+ const void *beta,
355
+ const cudnnTensorDescriptor_t dxDesc,
356
+ void *dx);
357
+
358
+ /* Helper function to calculate folding descriptors for dgrad */
359
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
360
+ cudnnGetFoldedConvBackwardDataDescriptors(const cudnnHandle_t handle,
361
+ const cudnnFilterDescriptor_t filterDesc,
362
+ const cudnnTensorDescriptor_t diffDesc,
363
+ const cudnnConvolutionDescriptor_t convDesc,
364
+ const cudnnTensorDescriptor_t gradDesc,
365
+ const cudnnTensorFormat_t transformFormat,
366
+ cudnnFilterDescriptor_t foldedFilterDesc,
367
+ cudnnTensorDescriptor_t paddedDiffDesc,
368
+ cudnnConvolutionDescriptor_t foldedConvDesc,
369
+ cudnnTensorDescriptor_t foldedGradDesc,
370
+ cudnnTensorTransformDescriptor_t filterFoldTransDesc,
371
+ cudnnTensorTransformDescriptor_t diffPadTransDesc,
372
+ cudnnTensorTransformDescriptor_t gradFoldTransDesc,
373
+ cudnnTensorTransformDescriptor_t gradUnfoldTransDesc);
374
+
375
+ /* cudnnFusedOps... */
376
+ struct cudnnFusedOpsConstParamStruct;
377
+ typedef struct cudnnFusedOpsConstParamStruct *cudnnFusedOpsConstParamPack_t CUDNN_DEPRECATED;
378
+
379
+ struct cudnnFusedOpsVariantParamStruct;
380
+ typedef struct cudnnFusedOpsVariantParamStruct *cudnnFusedOpsVariantParamPack_t CUDNN_DEPRECATED;
381
+
382
+ struct cudnnFusedOpsPlanStruct;
383
+ typedef struct cudnnFusedOpsPlanStruct *cudnnFusedOpsPlan_t CUDNN_DEPRECATED;
384
+
385
+ typedef enum {
386
+ /* each op in [ ] can be disabled by passing NULL ptr */
387
+ /* [per channel scale], [per channel bias], [activation], convolution, [generate BN stats] */
388
+ CUDNN_FUSED_SCALE_BIAS_ACTIVATION_CONV_BNSTATS = 0,
389
+ /* [per channel scale], [per channel bias], [activation], convolutionBackwardWeights */
390
+ CUDNN_FUSED_SCALE_BIAS_ACTIVATION_WGRAD = 1,
391
+ /* utility for BN training in BN-conv fusion */
392
+ /* computes the equivalent scale and bias from ySum ySqSum and learned scale, bias */
393
+ /* optionally update running stats and generate saved stats */
394
+ CUDNN_FUSED_BN_FINALIZE_STATISTICS_TRAINING = 2,
395
+ /* utility for BN inference in BN-conv fusion */
396
+ /* computes the equivalent scale and bias from learned running stats and learned scale, bias */
397
+ CUDNN_FUSED_BN_FINALIZE_STATISTICS_INFERENCE = 3,
398
+ /* reserved for future use: convolution, [per channel scale], [per channel bias], [residual add], [activation] */
399
+ CUDNN_FUSED_CONV_SCALE_BIAS_ADD_ACTIVATION = 4,
400
+ /* reserved for future use: [per channel scale], [per channel bias], [residual add], activation, bitmask */
401
+ CUDNN_FUSED_SCALE_BIAS_ADD_ACTIVATION_GEN_BITMASK = 5,
402
+ /* reserved for future use */
403
+ CUDNN_FUSED_DACTIVATION_FORK_DBATCHNORM = 6,
404
+ } cudnnFusedOps_t CUDNN_DEPRECATED;
405
+
406
+ typedef enum {
407
+ /* set XDESC: pass previously initialized cudnnTensorDescriptor_t */
408
+ /* get XDESC: pass previously created cudnnTensorDescriptor_t */
409
+ CUDNN_PARAM_XDESC = 0,
410
+ /* set/get XDATA_PLACEHOLDER: pass cudnnFusedOpsPointerPlaceHolder_t* */
411
+ CUDNN_PARAM_XDATA_PLACEHOLDER = 1,
412
+ /* set/get BN_MODE: pass cudnnBatchNormMode_t* */
413
+ CUDNN_PARAM_BN_MODE = 2,
414
+ /* set CUDNN_PARAM_BN_EQSCALEBIAS_DESC: pass previously initialized cudnnTensorDescriptor_t */
415
+ /* get CUDNN_PARAM_BN_EQSCALEBIAS_DESC: pass previously created cudnnTensorDescriptor_t */
416
+ CUDNN_PARAM_BN_EQSCALEBIAS_DESC = 3,
417
+ /* set/get BN_EQSCALE_PLACEHOLDER: pass cudnnFusedOpsPointerPlaceHolder_t* */
418
+ CUDNN_PARAM_BN_EQSCALE_PLACEHOLDER = 4,
419
+ /* set/get BN_EQBIAS_PLACEHOLDER: pass cudnnFusedOpsPointerPlaceHolder_t* */
420
+ CUDNN_PARAM_BN_EQBIAS_PLACEHOLDER = 5,
421
+ /* set ACTIVATION_DESC: pass previously initialized cudnnActivationDescriptor_t */
422
+ /* get ACTIVATION_DESC: pass previously created cudnnActivationDescriptor_t */
423
+ CUDNN_PARAM_ACTIVATION_DESC = 6,
424
+ /* set CONV_DESC: pass previously initialized cudnnConvolutionDescriptor_t */
425
+ /* get CONV_DESC: pass previously created cudnnConvolutionDescriptor_t */
426
+ CUDNN_PARAM_CONV_DESC = 7,
427
+ /* set WDESC: pass previously initialized cudnnFilterDescriptor_t */
428
+ /* get WDESC: pass previously created cudnnFilterDescriptor_t */
429
+ CUDNN_PARAM_WDESC = 8,
430
+ /* set/get WDATA_PLACEHOLDER: pass cudnnFusedOpsPointerPlaceHolder_t* */
431
+ CUDNN_PARAM_WDATA_PLACEHOLDER = 9,
432
+ /* set DWDESC: pass previously initialized cudnnFilterDescriptor_t */
433
+ /* get DWDESC: pass previously created cudnnFilterDescriptor_t */
434
+ CUDNN_PARAM_DWDESC = 10,
435
+ /* set/get DWDATA_PLACEHOLDER: pass cudnnFusedOpsPointerPlaceHolder_t* */
436
+ CUDNN_PARAM_DWDATA_PLACEHOLDER = 11,
437
+ /* set YDESC: pass previously initialized cudnnTensorDescriptor_t */
438
+ /* get YDESC: pass previously created cudnnTensorDescriptor_t */
439
+ CUDNN_PARAM_YDESC = 12,
440
+ /* set/get YDATA_PLACEHOLDER: pass cudnnFusedOpsPointerPlaceHolder_t* */
441
+ CUDNN_PARAM_YDATA_PLACEHOLDER = 13,
442
+ /* set DYDESC: pass previously initialized cudnnTensorDescriptor_t */
443
+ /* get DYDESC: pass previously created cudnnTensorDescriptor_t */
444
+ CUDNN_PARAM_DYDESC = 14,
445
+ /* set/get DYDATA_PLACEHOLDER: pass cudnnFusedOpsPointerPlaceHolder_t* */
446
+ CUDNN_PARAM_DYDATA_PLACEHOLDER = 15,
447
+ /* set YSTATS_DESC: pass previously initialized cudnnTensorDescriptor_t */
448
+ /* get YSTATS_DESC: pass previously created cudnnTensorDescriptor_t */
449
+ CUDNN_PARAM_YSTATS_DESC = 16,
450
+ /* set/get YSUM_PLACEHOLDER: pass cudnnFusedOpsPointerPlaceHolder_t* */
451
+ CUDNN_PARAM_YSUM_PLACEHOLDER = 17,
452
+ /* set/get YSQSUM_PLACEHOLDER: pass cudnnFusedOpsPointerPlaceHolder_t* */
453
+ CUDNN_PARAM_YSQSUM_PLACEHOLDER = 18,
454
+ /* set CUDNN_PARAM_BN_SCALEBIAS_MEANVAR_DESC: pass previously initialized cudnnTensorDescriptor_t */
455
+ /* get CUDNN_PARAM_BN_SCALEBIAS_MEANVAR_DESC: pass previously created cudnnTensorDescriptor_t */
456
+ CUDNN_PARAM_BN_SCALEBIAS_MEANVAR_DESC = 19,
457
+ /* set/get CUDNN_PARAM_BN_SCALE_PLACEHOLDER: pass cudnnFusedOpsPointerPlaceHolder_t* */
458
+ CUDNN_PARAM_BN_SCALE_PLACEHOLDER = 20,
459
+ /* set/get CUDNN_PARAM_BN_BIAS_PLACEHOLDER: pass cudnnFusedOpsPointerPlaceHolder_t* */
460
+ CUDNN_PARAM_BN_BIAS_PLACEHOLDER = 21,
461
+ /* set/get CUDNN_PARAM_BN_SAVED_MEAN_PLACEHOLDER: pass cudnnFusedOpsPointerPlaceHolder_t* */
462
+ CUDNN_PARAM_BN_SAVED_MEAN_PLACEHOLDER = 22,
463
+ /* set/get CUDNN_PARAM_BN_SAVED_INVSTD_PLACEHOLDER: pass cudnnFusedOpsPointerPlaceHolder_t* */
464
+ CUDNN_PARAM_BN_SAVED_INVSTD_PLACEHOLDER = 23,
465
+ /* set/get CUDNN_PARAM_BN_RUNNING_MEAN_PLACEHOLDER: pass cudnnFusedOpsPointerPlaceHolder_t* */
466
+ CUDNN_PARAM_BN_RUNNING_MEAN_PLACEHOLDER = 24,
467
+ /* set/get CUDNN_PARAM_BN_RUNNING_VAR_PLACEHOLDER: pass cudnnFusedOpsPointerPlaceHolder_t* */
468
+ CUDNN_PARAM_BN_RUNNING_VAR_PLACEHOLDER = 25,
469
+
470
+ /* set ZDESC: pass previously initialized cudnnTensorDescriptor_t */
471
+ /* get ZDESC: pass previously created cudnnTensorDescriptor_t */
472
+ CUDNN_PARAM_ZDESC = 26,
473
+ /* set/get ZDATA_PLACEHOLDER: pass cudnnFusedOpsPointerPlaceHolder_t* */
474
+ CUDNN_PARAM_ZDATA_PLACEHOLDER = 27,
475
+ /* set BN_Z_EQSCALEBIAS_DESC: pass previously initialized cudnnTensorDescriptor_t */
476
+ /* get BN_Z_EQSCALEBIAS_DESC: pass previously created cudnnTensorDescriptor_t */
477
+ CUDNN_PARAM_BN_Z_EQSCALEBIAS_DESC = 28,
478
+ /* set/get BN_Z_EQSCALE_PLACEHOLDER: pass cudnnFusedOpsPointerPlaceHolder_t* */
479
+ CUDNN_PARAM_BN_Z_EQSCALE_PLACEHOLDER = 29,
480
+ /* set/get BN_Z_EQBIAS_PLACEHOLDER: pass cudnnFusedOpsPointerPlaceHolder_t* */
481
+ CUDNN_PARAM_BN_Z_EQBIAS_PLACEHOLDER = 30,
482
+
483
+ /* set ACTIVATION_BITMASK_DESC: pass previously initialized cudnnTensorDescriptor_t */
484
+ /* get ACTIVATION_BITMASK_DESC: pass previously created cudnnTensorDescriptor_t */
485
+ CUDNN_PARAM_ACTIVATION_BITMASK_DESC = 31,
486
+ /* set/get ACTIVATION_BITMASK_PLACEHOLDER: pass cudnnFusedOpsPointerPlaceHolder_t* */
487
+ CUDNN_PARAM_ACTIVATION_BITMASK_PLACEHOLDER = 32,
488
+
489
+ /* set DXDESC: pass previously initialized cudnnTensorDescriptor_t */
490
+ /* get DXDESC: pass previously created cudnnTensorDescriptor_t */
491
+ CUDNN_PARAM_DXDESC = 33,
492
+ /* set/get DXDATA_PLACEHOLDER: pass cudnnFusedOpsPointerPlaceHolder_t* */
493
+ CUDNN_PARAM_DXDATA_PLACEHOLDER = 34,
494
+ /* set DZDESC: pass previously initialized cudnnTensorDescriptor_t */
495
+ /* get DZDESC: pass previously created cudnnTensorDescriptor_t */
496
+ CUDNN_PARAM_DZDESC = 35,
497
+ /* set/get DZDATA_PLACEHOLDER: pass cudnnFusedOpsPointerPlaceHolder_t* */
498
+ CUDNN_PARAM_DZDATA_PLACEHOLDER = 36,
499
+ /* set/get CUDNN_PARAM_BN_DSCALE_PLACEHOLDER: pass cudnnFusedOpsPointerPlaceHolder_t* */
500
+ CUDNN_PARAM_BN_DSCALE_PLACEHOLDER = 37,
501
+ /* set/get CUDNN_PARAM_BN_DBIAS_PLACEHOLDER: pass cudnnFusedOpsPointerPlaceHolder_t* */
502
+ CUDNN_PARAM_BN_DBIAS_PLACEHOLDER = 38,
503
+ } cudnnFusedOpsConstParamLabel_t CUDNN_DEPRECATED;
504
+
505
+ typedef enum {
506
+ CUDNN_PTR_NULL = 0,
507
+ CUDNN_PTR_ELEM_ALIGNED = 1,
508
+ CUDNN_PTR_16B_ALIGNED = 2,
509
+ } cudnnFusedOpsPointerPlaceHolder_t CUDNN_DEPRECATED;
510
+
511
+ typedef enum {
512
+ /* set: pass void* pointing to dev memory */
513
+ /* get: pass void** pointing to host memory */
514
+ CUDNN_PTR_XDATA = 0,
515
+ CUDNN_PTR_BN_EQSCALE = 1,
516
+ CUDNN_PTR_BN_EQBIAS = 2,
517
+ CUDNN_PTR_WDATA = 3,
518
+ CUDNN_PTR_DWDATA = 4,
519
+ CUDNN_PTR_YDATA = 5,
520
+ CUDNN_PTR_DYDATA = 6,
521
+ CUDNN_PTR_YSUM = 7,
522
+ CUDNN_PTR_YSQSUM = 8,
523
+ CUDNN_PTR_WORKSPACE = 9,
524
+ CUDNN_PTR_BN_SCALE = 10,
525
+ CUDNN_PTR_BN_BIAS = 11,
526
+ CUDNN_PTR_BN_SAVED_MEAN = 12,
527
+ CUDNN_PTR_BN_SAVED_INVSTD = 13,
528
+ CUDNN_PTR_BN_RUNNING_MEAN = 14,
529
+ CUDNN_PTR_BN_RUNNING_VAR = 15,
530
+ CUDNN_PTR_ZDATA = 16,
531
+ CUDNN_PTR_BN_Z_EQSCALE = 17,
532
+ CUDNN_PTR_BN_Z_EQBIAS = 18,
533
+ CUDNN_PTR_ACTIVATION_BITMASK = 19,
534
+ CUDNN_PTR_DXDATA = 20,
535
+ CUDNN_PTR_DZDATA = 21,
536
+ CUDNN_PTR_BN_DSCALE = 22,
537
+ CUDNN_PTR_BN_DBIAS = 23,
538
+
539
+ /* set/get: pass size_t* pointing to host memory */
540
+ CUDNN_SCALAR_SIZE_T_WORKSPACE_SIZE_IN_BYTES = 100,
541
+ /* set/get: pass int64_t* pointing to host memory */
542
+ CUDNN_SCALAR_INT64_T_BN_ACCUMULATION_COUNT = 101,
543
+ /* set/get: pass double* pointing to host memory */
544
+ CUDNN_SCALAR_DOUBLE_BN_EXP_AVG_FACTOR = 102,
545
+ /* set/get: pass double* pointing to host memory */
546
+ CUDNN_SCALAR_DOUBLE_BN_EPSILON = 103,
547
+ } cudnnFusedOpsVariantParamLabel_t CUDNN_DEPRECATED;
548
+
549
+ cudnnStatus_t CUDNNWINAPI
550
+ cudnnCnnVersionCheck(void);
551
+
552
+ /* helper function to provide the convolution backward filter algo that fit best the requirement */
553
+
554
+ typedef struct cudnnConvolutionBwdFilterAlgoPerfStruct {
555
+ cudnnConvolutionBwdFilterAlgo_t algo;
556
+ cudnnStatus_t status;
557
+ float time;
558
+ size_t memory;
559
+ cudnnDeterminism_t determinism;
560
+ cudnnMathType_t mathType;
561
+ int reserved[3];
562
+ } cudnnConvolutionBwdFilterAlgoPerf_t CUDNN_DEPRECATED;
563
+
564
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
565
+ cudnnGetConvolutionBackwardFilterAlgorithmMaxCount(cudnnHandle_t handle, int *count);
566
+
567
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
568
+ cudnnFindConvolutionBackwardFilterAlgorithm(cudnnHandle_t handle,
569
+ const cudnnTensorDescriptor_t xDesc,
570
+ const cudnnTensorDescriptor_t dyDesc,
571
+ const cudnnConvolutionDescriptor_t convDesc,
572
+ const cudnnFilterDescriptor_t dwDesc,
573
+ const int requestedAlgoCount,
574
+ int *returnedAlgoCount,
575
+ cudnnConvolutionBwdFilterAlgoPerf_t *perfResults);
576
+
577
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
578
+ cudnnFindConvolutionBackwardFilterAlgorithmEx(cudnnHandle_t handle,
579
+ const cudnnTensorDescriptor_t xDesc,
580
+ const void *x,
581
+ const cudnnTensorDescriptor_t dyDesc,
582
+ const void *y,
583
+ const cudnnConvolutionDescriptor_t convDesc,
584
+ const cudnnFilterDescriptor_t dwDesc,
585
+ void *dw,
586
+ const int requestedAlgoCount,
587
+ int *returnedAlgoCount,
588
+ cudnnConvolutionBwdFilterAlgoPerf_t *perfResults,
589
+ void *workSpace,
590
+ size_t workSpaceSizeInBytes);
591
+
592
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
593
+ cudnnGetConvolutionBackwardFilterAlgorithm_v7(cudnnHandle_t handle,
594
+ const cudnnTensorDescriptor_t srcDesc,
595
+ const cudnnTensorDescriptor_t diffDesc,
596
+ const cudnnConvolutionDescriptor_t convDesc,
597
+ const cudnnFilterDescriptor_t gradDesc,
598
+ const int requestedAlgoCount,
599
+ int *returnedAlgoCount,
600
+ cudnnConvolutionBwdFilterAlgoPerf_t *perfResults);
601
+
602
+ /*
603
+ * convolution algorithm (which requires potentially some workspace)
604
+ */
605
+
606
+ /* Helper function to return the minimum size of the workspace to be passed to the convolution given an algo*/
607
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
608
+ cudnnGetConvolutionBackwardFilterWorkspaceSize(cudnnHandle_t handle,
609
+ const cudnnTensorDescriptor_t xDesc,
610
+ const cudnnTensorDescriptor_t dyDesc,
611
+ const cudnnConvolutionDescriptor_t convDesc,
612
+ const cudnnFilterDescriptor_t gradDesc,
613
+ cudnnConvolutionBwdFilterAlgo_t algo,
614
+ size_t *sizeInBytes);
615
+
616
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
617
+ cudnnConvolutionBackwardFilter(cudnnHandle_t handle,
618
+ const void *alpha,
619
+ const cudnnTensorDescriptor_t xDesc,
620
+ const void *x,
621
+ const cudnnTensorDescriptor_t dyDesc,
622
+ const void *dy,
623
+ const cudnnConvolutionDescriptor_t convDesc,
624
+ cudnnConvolutionBwdFilterAlgo_t algo,
625
+ void *workSpace,
626
+ size_t workSpaceSizeInBytes,
627
+ const void *beta,
628
+ const cudnnFilterDescriptor_t dwDesc,
629
+ void *dw);
630
+
631
+ /* Function to compute the bias gradient for batch convolution */
632
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
633
+ cudnnConvolutionBackwardBias(cudnnHandle_t handle,
634
+ const void *alpha,
635
+ const cudnnTensorDescriptor_t dyDesc,
636
+ const void *dy,
637
+ const void *beta,
638
+ const cudnnTensorDescriptor_t dbDesc,
639
+ void *db);
640
+
641
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
642
+ cudnnCreateFusedOpsConstParamPack(cudnnFusedOpsConstParamPack_t *constPack, cudnnFusedOps_t ops);
643
+
644
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
645
+ cudnnDestroyFusedOpsConstParamPack(cudnnFusedOpsConstParamPack_t constPack);
646
+
647
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
648
+ cudnnSetFusedOpsConstParamPackAttribute(cudnnFusedOpsConstParamPack_t constPack,
649
+ cudnnFusedOpsConstParamLabel_t paramLabel,
650
+ const void *param);
651
+
652
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
653
+ cudnnGetFusedOpsConstParamPackAttribute(const cudnnFusedOpsConstParamPack_t constPack,
654
+ cudnnFusedOpsConstParamLabel_t paramLabel,
655
+ void *param,
656
+ int *isNULL);
657
+
658
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
659
+ cudnnCreateFusedOpsVariantParamPack(cudnnFusedOpsVariantParamPack_t *varPack, cudnnFusedOps_t ops);
660
+
661
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
662
+ cudnnDestroyFusedOpsVariantParamPack(cudnnFusedOpsVariantParamPack_t varPack);
663
+
664
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
665
+ cudnnSetFusedOpsVariantParamPackAttribute(cudnnFusedOpsVariantParamPack_t varPack,
666
+ cudnnFusedOpsVariantParamLabel_t paramLabel,
667
+ void *ptr);
668
+
669
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
670
+ cudnnGetFusedOpsVariantParamPackAttribute(const cudnnFusedOpsVariantParamPack_t varPack,
671
+ cudnnFusedOpsVariantParamLabel_t paramLabel,
672
+ void *ptr);
673
+
674
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
675
+ cudnnCreateFusedOpsPlan(cudnnFusedOpsPlan_t *plan, cudnnFusedOps_t ops);
676
+
677
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
678
+ cudnnDestroyFusedOpsPlan(cudnnFusedOpsPlan_t plan);
679
+
680
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
681
+ cudnnMakeFusedOpsPlan(cudnnHandle_t handle,
682
+ cudnnFusedOpsPlan_t plan,
683
+ const cudnnFusedOpsConstParamPack_t constPack,
684
+ size_t *workspaceSizeInBytes);
685
+
686
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
687
+ cudnnFusedOpsExecute(cudnnHandle_t handle, const cudnnFusedOpsPlan_t plan, cudnnFusedOpsVariantParamPack_t varPack);
688
+
689
+ #if defined(__cplusplus)
690
+ }
691
+ #endif
692
+
693
+ #endif /* CUDNN_CNN_H_ */
.venv/lib/python3.12/site-packages/nvidia/cudnn/include/cudnn_graph.h ADDED
@@ -0,0 +1,992 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * Copyright 2014-2023 NVIDIA Corporation. All rights reserved.
3
+ *
4
+ * NOTICE TO LICENSEE:
5
+ *
6
+ * This source code and/or documentation ("Licensed Deliverables") are
7
+ * subject to NVIDIA intellectual property rights under U.S. and
8
+ * international Copyright laws.
9
+ *
10
+ * These Licensed Deliverables contained herein is PROPRIETARY and
11
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
12
+ * conditions of a form of NVIDIA software license agreement by and
13
+ * between NVIDIA and Licensee ("License Agreement") or electronically
14
+ * accepted by Licensee. Notwithstanding any terms or conditions to
15
+ * the contrary in the License Agreement, reproduction or disclosure
16
+ * of the Licensed Deliverables to any third party without the express
17
+ * written consent of NVIDIA is prohibited.
18
+ *
19
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
20
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
21
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
22
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
23
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
24
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
25
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
26
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
27
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
28
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
29
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
30
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
31
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
32
+ * OF THESE LICENSED DELIVERABLES.
33
+ *
34
+ * U.S. Government End Users. These Licensed Deliverables are a
35
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
36
+ * 1995), consisting of "commercial computer software" and "commercial
37
+ * computer software documentation" as such terms are used in 48
38
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
39
+ * only as a commercial end item. Consistent with 48 C.F.R.12.212 and
40
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
41
+ * U.S. Government End Users acquire the Licensed Deliverables with
42
+ * only those rights set forth herein.
43
+ *
44
+ * Any use of the Licensed Deliverables in individual and commercial
45
+ * software must include, in the user documentation and internal
46
+ * comments to the code, the above Disclaimer and U.S. Government End
47
+ * Users Notice.
48
+ */
49
+
50
+ /*
51
+ * cudnn_graph : cuDNN's basic definitions operations.
52
+ */
53
+
54
+ #if !defined(CUDNN_GRAPH_H_)
55
+ #define CUDNN_GRAPH_H_
56
+
57
+ #include <cuda_runtime_api.h>
58
+ #include <library_types.h>
59
+
60
+ #include <stdint.h>
61
+
62
+ #include "cudnn_version.h"
63
+
64
+ /* These version numbers are autogenerated, do not edit manually. */
65
+ #define CUDNN_GRAPH_MAJOR 9
66
+ #define CUDNN_GRAPH_MINOR 10
67
+ #define CUDNN_GRAPH_PATCH 2
68
+
69
+ #if (CUDNN_GRAPH_MAJOR != CUDNN_MAJOR) || (CUDNN_GRAPH_MINOR != CUDNN_MINOR) || (CUDNN_GRAPH_PATCH != CUDNN_PATCHLEVEL)
70
+ #error Version mismatch in cuDNN GRAPH!!!
71
+ #endif
72
+
73
+ #ifndef CUDNNWINAPI
74
+ #ifdef _WIN32
75
+ #define CUDNNWINAPI __stdcall
76
+ #else
77
+ #define CUDNNWINAPI
78
+ #endif
79
+ #endif
80
+
81
+ /* Warnings for deprecated API-s are enabled using the CUDNN_WARN_DEPRECATED macro */
82
+ #if defined(CUDNN_WARN_DEPRECATED) && (defined(__GNUC__) || defined(__clang__))
83
+ /* GCC, Intel C/C++, Cray C/C++, CLANG, IBM XL C/C++ little endian */
84
+ #define CUDNN_DEPRECATED __attribute__((deprecated))
85
+ #define CUDNN_DEPRECATED_ENUM __attribute__((deprecated))
86
+ #elif defined(CUDNN_WARN_DEPRECATED) && defined(_MSC_VER)
87
+ /* Microsoft Visual C++ */
88
+ #define CUDNN_DEPRECATED __declspec(deprecated)
89
+ #define CUDNN_DEPRECATED_ENUM __declspec(deprecated)
90
+ #elif defined(CUDNN_WARN_DEPRECATED) && (__cplusplus >= 201402L)
91
+ /* C++14 compilers */
92
+ #define CUDNN_DEPRECATED [[deprecated]]
93
+ #define CUDNN_DEPRECATED_ENUM [[deprecated]]
94
+ #else
95
+ /* No support for the deprecated attribute */
96
+ #define CUDNN_DEPRECATED
97
+ #define CUDNN_DEPRECATED_ENUM
98
+ #endif
99
+
100
+ #if defined(__cplusplus)
101
+ extern "C" {
102
+ #endif
103
+
104
+ struct cudnnContext;
105
+ typedef struct cudnnContext *cudnnHandle_t;
106
+
107
+ size_t CUDNNWINAPI
108
+ cudnnGetVersion(void);
109
+
110
+ size_t CUDNNWINAPI
111
+ cudnnGetMaxDeviceVersion(void);
112
+
113
+ /* Returns CUDA Runtime version statically linked against cudnn */
114
+ size_t CUDNNWINAPI
115
+ cudnnGetCudartVersion(void);
116
+
117
+ /*
118
+ * CUDNN return codes
119
+ */
120
+ typedef enum {
121
+ CUDNN_STATUS_SUCCESS = 0,
122
+
123
+ /* Uncategorized errors */
124
+ CUDNN_STATUS_NOT_INITIALIZED = 1001,
125
+ CUDNN_STATUS_SUBLIBRARY_VERSION_MISMATCH = 1002,
126
+ CUDNN_STATUS_SERIALIZATION_VERSION_MISMATCH = 1003,
127
+ CUDNN_STATUS_DEPRECATED = 1004,
128
+ CUDNN_STATUS_LICENSE_ERROR = 1005,
129
+ CUDNN_STATUS_RUNTIME_IN_PROGRESS = 1006,
130
+ CUDNN_STATUS_RUNTIME_FP_OVERFLOW = 1007,
131
+ CUDNN_STATUS_SUBLIBRARY_LOADING_FAILED = 1008,
132
+
133
+ CUDNN_STATUS_BAD_PARAM = 2000,
134
+ CUDNN_STATUS_BAD_PARAM_NULL_POINTER = 2002,
135
+ CUDNN_STATUS_BAD_PARAM_MISALIGNED_POINTER = 2003,
136
+ CUDNN_STATUS_BAD_PARAM_NOT_FINALIZED = 2004,
137
+ CUDNN_STATUS_BAD_PARAM_OUT_OF_BOUND = 2005,
138
+ CUDNN_STATUS_BAD_PARAM_SIZE_INSUFFICIENT = 2006,
139
+ CUDNN_STATUS_BAD_PARAM_STREAM_MISMATCH = 2007,
140
+ CUDNN_STATUS_BAD_PARAM_SHAPE_MISMATCH = 2008,
141
+ CUDNN_STATUS_BAD_PARAM_DUPLICATED_ENTRIES = 2009,
142
+ CUDNN_STATUS_BAD_PARAM_ATTRIBUTE_TYPE = 2010,
143
+ CUDNN_STATUS_BAD_PARAM_CUDA_GRAPH_MISMATCH = 2011,
144
+ CUDNN_STATUS_BAD_PARAM_DESCRIPTOR_TYPE = 2012,
145
+
146
+ CUDNN_STATUS_NOT_SUPPORTED = 3000,
147
+ CUDNN_STATUS_NOT_SUPPORTED_GRAPH_PATTERN = 3001,
148
+ CUDNN_STATUS_NOT_SUPPORTED_SHAPE = 3002,
149
+ CUDNN_STATUS_NOT_SUPPORTED_DATA_TYPE = 3003,
150
+ CUDNN_STATUS_NOT_SUPPORTED_LAYOUT = 3004,
151
+ CUDNN_STATUS_NOT_SUPPORTED_INCOMPATIBLE_CUDA_DRIVER = 3005,
152
+ CUDNN_STATUS_NOT_SUPPORTED_INCOMPATIBLE_CUDART = 3006,
153
+ CUDNN_STATUS_NOT_SUPPORTED_ARCH_MISMATCH = 3007,
154
+ CUDNN_STATUS_NOT_SUPPORTED_RUNTIME_PREREQUISITE_MISSING = 3008,
155
+ CUDNN_STATUS_NOT_SUPPORTED_SUBLIBRARY_UNAVAILABLE = 3009,
156
+ CUDNN_STATUS_NOT_SUPPORTED_SHARED_MEMORY_INSUFFICIENT = 3010,
157
+ CUDNN_STATUS_NOT_SUPPORTED_PADDING = 3011,
158
+ CUDNN_STATUS_NOT_SUPPORTED_BAD_LAUNCH_PARAM = 3012,
159
+ CUDNN_STATUS_NOT_SUPPORTED_CUDA_GRAPH_NATIVE_API = 3013,
160
+
161
+ CUDNN_STATUS_INTERNAL_ERROR = 4000,
162
+ CUDNN_STATUS_INTERNAL_ERROR_COMPILATION_FAILED = 4001,
163
+ CUDNN_STATUS_INTERNAL_ERROR_UNEXPECTED_VALUE = 4002,
164
+ CUDNN_STATUS_INTERNAL_ERROR_HOST_ALLOCATION_FAILED = 4003,
165
+ CUDNN_STATUS_INTERNAL_ERROR_DEVICE_ALLOCATION_FAILED = 4004,
166
+ CUDNN_STATUS_INTERNAL_ERROR_BAD_LAUNCH_PARAM = 4005,
167
+ CUDNN_STATUS_INTERNAL_ERROR_TEXTURE_CREATION_FAILED = 4006,
168
+
169
+ CUDNN_STATUS_EXECUTION_FAILED = 5000,
170
+ CUDNN_STATUS_EXECUTION_FAILED_CUDA_DRIVER = 5001,
171
+ CUDNN_STATUS_EXECUTION_FAILED_CUBLAS = 5002,
172
+ CUDNN_STATUS_EXECUTION_FAILED_CUDART = 5003,
173
+ CUDNN_STATUS_EXECUTION_FAILED_CURAND = 5004,
174
+
175
+ CUDNN_STATUS_ALLOC_FAILED CUDNN_DEPRECATED_ENUM = CUDNN_STATUS_INTERNAL_ERROR_HOST_ALLOCATION_FAILED,
176
+ CUDNN_STATUS_INVALID_VALUE CUDNN_DEPRECATED_ENUM = 2001 /* please transition to CUDNN_STATUS_BAD_PARAM instead */,
177
+ CUDNN_STATUS_ARCH_MISMATCH CUDNN_DEPRECATED_ENUM = CUDNN_STATUS_NOT_SUPPORTED_ARCH_MISMATCH,
178
+ CUDNN_STATUS_MAPPING_ERROR CUDNN_DEPRECATED_ENUM = CUDNN_STATUS_INTERNAL_ERROR_TEXTURE_CREATION_FAILED,
179
+ CUDNN_STATUS_RUNTIME_PREREQUISITE_MISSING CUDNN_DEPRECATED_ENUM =
180
+ CUDNN_STATUS_NOT_SUPPORTED_RUNTIME_PREREQUISITE_MISSING,
181
+ CUDNN_STATUS_VERSION_MISMATCH CUDNN_DEPRECATED_ENUM = CUDNN_STATUS_SUBLIBRARY_VERSION_MISMATCH,
182
+ } cudnnStatus_t;
183
+
184
+ #define CUDNN_STATUS_FULL_ERROR_CODE(category, specific_err) ((cudnnStatus_t)(0 + (category) + (specific_err)))
185
+ #define CUDNN_STATUS_CATEGORY(full_error_code) ((full_error_code) / 1000 * 1000)
186
+ #define CUDNN_STATUS_SPECIFIC_ERROR(full_error_code) ((full_error_code) % 1000)
187
+
188
+ /* human-readable error messages */
189
+ const char *CUDNNWINAPI
190
+ cudnnGetErrorString(cudnnStatus_t status);
191
+
192
+ void CUDNNWINAPI
193
+ cudnnGetLastErrorString(char *message, size_t max_size);
194
+
195
+ /* Forward definition in this version only */
196
+ typedef struct cudnnRuntimeTag_t cudnnRuntimeTag_t CUDNN_DEPRECATED;
197
+
198
+ typedef enum {
199
+ CUDNN_ERRQUERY_RAWCODE = 0,
200
+ CUDNN_ERRQUERY_NONBLOCKING = 1,
201
+ CUDNN_ERRQUERY_BLOCKING = 2,
202
+ } cudnnErrQueryMode_t;
203
+
204
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
205
+ cudnnQueryRuntimeError(cudnnHandle_t handle, cudnnStatus_t *rstatus, cudnnErrQueryMode_t mode, cudnnRuntimeTag_t *tag);
206
+
207
+ cudnnStatus_t CUDNNWINAPI
208
+ cudnnGetProperty(libraryPropertyType type, int *value);
209
+
210
+ cudnnStatus_t CUDNNWINAPI
211
+ cudnnCreate(cudnnHandle_t *handle);
212
+ cudnnStatus_t CUDNNWINAPI
213
+ cudnnDestroy(cudnnHandle_t handle);
214
+ cudnnStatus_t CUDNNWINAPI
215
+ cudnnSetStream(cudnnHandle_t handle, cudaStream_t streamId);
216
+ cudnnStatus_t CUDNNWINAPI
217
+ cudnnGetStream(cudnnHandle_t handle, cudaStream_t *streamId);
218
+ /*
219
+ * CUDNN data type
220
+ */
221
+ typedef enum {
222
+ CUDNN_DATA_FLOAT = 0,
223
+ CUDNN_DATA_DOUBLE = 1,
224
+ CUDNN_DATA_HALF = 2,
225
+ CUDNN_DATA_INT8 = 3,
226
+ CUDNN_DATA_INT32 = 4,
227
+ CUDNN_DATA_INT8x4 CUDNN_DEPRECATED_ENUM = 5,
228
+ CUDNN_DATA_UINT8 = 6,
229
+ CUDNN_DATA_UINT8x4 CUDNN_DEPRECATED_ENUM = 7,
230
+ CUDNN_DATA_INT8x32 CUDNN_DEPRECATED_ENUM = 8,
231
+ CUDNN_DATA_BFLOAT16 = 9,
232
+ CUDNN_DATA_INT64 = 10,
233
+ CUDNN_DATA_BOOLEAN = 11,
234
+ CUDNN_DATA_FP8_E4M3 = 12,
235
+ CUDNN_DATA_FP8_E5M2 = 13,
236
+ CUDNN_DATA_FAST_FLOAT_FOR_FP8 = 14,
237
+ CUDNN_DATA_FP8_E8M0 = 15,
238
+ CUDNN_DATA_FP4_E2M1 = 16,
239
+ } cudnnDataType_t;
240
+
241
+ /*
242
+ * CUDNN math type
243
+ */
244
+ typedef enum {
245
+ CUDNN_DEFAULT_MATH = 0,
246
+ CUDNN_TENSOR_OP_MATH = 1,
247
+ CUDNN_TENSOR_OP_MATH_ALLOW_CONVERSION = 2,
248
+ CUDNN_FMA_MATH = 3,
249
+ } cudnnMathType_t;
250
+
251
+ /*
252
+ * CUDNN propagate Nan
253
+ */
254
+ typedef enum {
255
+ CUDNN_NOT_PROPAGATE_NAN CUDNN_DEPRECATED_ENUM = 0,
256
+ CUDNN_PROPAGATE_NAN CUDNN_DEPRECATED_ENUM = 1,
257
+ } cudnnNanPropagation_t;
258
+
259
+ /*
260
+ * Behavior for OOB samples. OOB samples are samples where L+R > T is encountered during the gradient calculation. If
261
+ * gradMode is set to CUDNN_CTC_SKIP_OOB_GRADIENTS, then the CTC loss function does not write to the gradient buffer for
262
+ * that sample. Instead, the current values, even not finite, are retained. If gradMode is set to
263
+ * CUDNN_CTC_ZERO_OOB_GRADIENTS, then the gradient for that sample is set to zero. This guarantees a finite gradient.
264
+ */
265
+ typedef enum {
266
+ CUDNN_CTC_ZERO_OOB_GRADIENTS = 0,
267
+ CUDNN_CTC_SKIP_OOB_GRADIENTS = 1,
268
+ } cudnnCTCGradMode_t;
269
+
270
+ typedef enum {
271
+ CUDNN_TENSOR_NCHW = 0, /* row major (wStride = 1, hStride = w) */
272
+ CUDNN_TENSOR_NHWC = 1, /* feature maps interleaved ( cStride = 1 )*/
273
+ CUDNN_TENSOR_NCHW_VECT_C = 2, /* each image point is vector of element of C, vector length in data type */
274
+ } cudnnTensorFormat_t;
275
+
276
+ /*
277
+ * CUDNN ReduceTensor op type
278
+ */
279
+ typedef enum {
280
+ CUDNN_REDUCE_TENSOR_ADD = 0,
281
+ CUDNN_REDUCE_TENSOR_MUL = 1,
282
+ CUDNN_REDUCE_TENSOR_MIN = 2,
283
+ CUDNN_REDUCE_TENSOR_MAX = 3,
284
+ CUDNN_REDUCE_TENSOR_AMAX = 4,
285
+ CUDNN_REDUCE_TENSOR_AVG = 5,
286
+ CUDNN_REDUCE_TENSOR_NORM1 = 6,
287
+ CUDNN_REDUCE_TENSOR_NORM2 = 7,
288
+ CUDNN_REDUCE_TENSOR_MUL_NO_ZEROS = 8,
289
+ } cudnnReduceTensorOp_t;
290
+
291
+ /*
292
+ * activation mode
293
+ */
294
+ typedef enum {
295
+ CUDNN_ACTIVATION_SIGMOID = 0,
296
+ CUDNN_ACTIVATION_RELU = 1,
297
+ CUDNN_ACTIVATION_TANH = 2,
298
+ CUDNN_ACTIVATION_CLIPPED_RELU = 3,
299
+ CUDNN_ACTIVATION_ELU = 4,
300
+ CUDNN_ACTIVATION_IDENTITY = 5,
301
+ CUDNN_ACTIVATION_SWISH = 6
302
+ } cudnnActivationMode_t CUDNN_DEPRECATED;
303
+
304
+ typedef enum {
305
+ CUDNN_SEV_FATAL = 0,
306
+ CUDNN_SEV_ERROR = 1,
307
+ CUDNN_SEV_WARNING = 2,
308
+ CUDNN_SEV_INFO = 3,
309
+ } cudnnSeverity_t;
310
+
311
+ /* Message masks to be used with cudnnSetCallback() */
312
+ #define CUDNN_SEV_ERROR_EN (1U << CUDNN_SEV_ERROR)
313
+ #define CUDNN_SEV_WARNING_EN (1U << CUDNN_SEV_WARNING)
314
+ #define CUDNN_SEV_INFO_EN (1U << CUDNN_SEV_INFO)
315
+
316
+ /* struct containing useful informaiton for each API call */
317
+ typedef struct cudnnDebugStruct {
318
+ unsigned cudnn_version;
319
+ cudnnStatus_t cudnnStatus;
320
+ unsigned time_sec; /* epoch time in seconds */
321
+ unsigned time_usec; /* microseconds part of epoch time */
322
+ unsigned time_delta; /* time since start in seconds */
323
+ cudnnHandle_t handle; /* cudnn handle */
324
+ cudaStream_t stream; /* cuda stream ID */
325
+ unsigned long long pid; /* process ID */
326
+ unsigned long long tid; /* thread ID */
327
+ int cudaDeviceId; /* CUDA device ID */
328
+ int reserved[15]; /* reserved for future use */
329
+ } cudnnDebug_t;
330
+
331
+ typedef void (*cudnnCallback_t)(cudnnSeverity_t sev, void *udata, const cudnnDebug_t *dbg, const char *msg);
332
+
333
+ cudnnStatus_t CUDNNWINAPI
334
+ cudnnSetCallback(unsigned mask, void *udata, cudnnCallback_t fptr);
335
+
336
+ cudnnStatus_t CUDNNWINAPI
337
+ cudnnGetCallback(unsigned *mask, void **udata, cudnnCallback_t *fptr);
338
+
339
+ /*
340
+ * \brief Cross-library version checker.
341
+ * This function is implemented differently in each sub-library. Each sublib
342
+ * checks whether its own version matches that of its dependencies.
343
+ * \returns CUDNN_STATUS_SUCCESS if the version check passes,
344
+ * CUDNN_STATUS_SUBLIBRARY_VERSION_MISMATCH if the versions are inconsistent.
345
+ */
346
+ cudnnStatus_t CUDNNWINAPI
347
+ cudnnGraphVersionCheck(void);
348
+
349
+ /* Maximum supported number of tensor dimensions */
350
+ #define CUDNN_DIM_MAX 8
351
+
352
+ /*
353
+ * convolution mode
354
+ */
355
+ typedef enum { CUDNN_CONVOLUTION = 0, CUDNN_CROSS_CORRELATION = 1 } cudnnConvolutionMode_t;
356
+
357
+ /*
358
+ * CUDNN Reorder
359
+ */
360
+ typedef enum {
361
+ CUDNN_DEFAULT_REORDER = 0,
362
+ CUDNN_NO_REORDER = 1,
363
+ } cudnnReorderType_t CUDNN_DEPRECATED;
364
+
365
+ typedef void *cudnnBackendDescriptor_t;
366
+
367
+ typedef struct cudnnFractionStruct {
368
+ int64_t numerator;
369
+ int64_t denominator;
370
+ } cudnnFraction_t;
371
+
372
+ typedef enum {
373
+ CUDNN_POINTWISE_ADD = 0,
374
+ CUDNN_POINTWISE_ADD_SQUARE = 5,
375
+ CUDNN_POINTWISE_DIV = 6,
376
+ CUDNN_POINTWISE_MAX = 3,
377
+ CUDNN_POINTWISE_MIN = 2,
378
+ CUDNN_POINTWISE_MOD = 7,
379
+ CUDNN_POINTWISE_MUL = 1,
380
+ CUDNN_POINTWISE_POW = 8,
381
+ CUDNN_POINTWISE_SUB = 9,
382
+
383
+ CUDNN_POINTWISE_ABS = 10,
384
+ CUDNN_POINTWISE_CEIL = 11,
385
+ CUDNN_POINTWISE_COS = 12,
386
+ CUDNN_POINTWISE_EXP = 13,
387
+ CUDNN_POINTWISE_FLOOR = 14,
388
+ CUDNN_POINTWISE_LOG = 15,
389
+ CUDNN_POINTWISE_NEG = 16,
390
+ CUDNN_POINTWISE_RSQRT = 17,
391
+ CUDNN_POINTWISE_SIN = 18,
392
+ CUDNN_POINTWISE_SQRT = 4,
393
+ CUDNN_POINTWISE_TAN = 19,
394
+ CUDNN_POINTWISE_ERF = 20,
395
+ CUDNN_POINTWISE_IDENTITY = 21,
396
+ CUDNN_POINTWISE_RECIPROCAL = 22,
397
+ CUDNN_POINTWISE_ATAN2 = 23,
398
+
399
+ CUDNN_POINTWISE_RELU_FWD = 100,
400
+ CUDNN_POINTWISE_TANH_FWD = 101,
401
+ CUDNN_POINTWISE_SIGMOID_FWD = 102,
402
+ CUDNN_POINTWISE_ELU_FWD = 103,
403
+ CUDNN_POINTWISE_GELU_FWD = 104,
404
+ CUDNN_POINTWISE_SOFTPLUS_FWD = 105,
405
+ CUDNN_POINTWISE_SWISH_FWD = 106,
406
+ CUDNN_POINTWISE_GELU_APPROX_TANH_FWD = 107,
407
+
408
+ CUDNN_POINTWISE_RELU_BWD = 200,
409
+ CUDNN_POINTWISE_TANH_BWD = 201,
410
+ CUDNN_POINTWISE_SIGMOID_BWD = 202,
411
+ CUDNN_POINTWISE_ELU_BWD = 203,
412
+ CUDNN_POINTWISE_GELU_BWD = 204,
413
+ CUDNN_POINTWISE_SOFTPLUS_BWD = 205,
414
+ CUDNN_POINTWISE_SWISH_BWD = 206,
415
+ CUDNN_POINTWISE_GELU_APPROX_TANH_BWD = 207,
416
+
417
+ CUDNN_POINTWISE_CMP_EQ = 300,
418
+ CUDNN_POINTWISE_CMP_NEQ = 301,
419
+ CUDNN_POINTWISE_CMP_GT = 302,
420
+ CUDNN_POINTWISE_CMP_GE = 303,
421
+ CUDNN_POINTWISE_CMP_LT = 304,
422
+ CUDNN_POINTWISE_CMP_LE = 305,
423
+
424
+ CUDNN_POINTWISE_LOGICAL_AND = 400,
425
+ CUDNN_POINTWISE_LOGICAL_OR = 401,
426
+ CUDNN_POINTWISE_LOGICAL_NOT = 402,
427
+
428
+ CUDNN_POINTWISE_GEN_INDEX = 501,
429
+
430
+ CUDNN_POINTWISE_BINARY_SELECT = 601,
431
+ } cudnnPointwiseMode_t;
432
+
433
+ typedef enum {
434
+ CUDNN_RESAMPLE_NEAREST = 0,
435
+ CUDNN_RESAMPLE_BILINEAR = 1,
436
+ CUDNN_RESAMPLE_AVGPOOL = 2,
437
+ CUDNN_RESAMPLE_AVGPOOL_INCLUDE_PADDING = 2,
438
+ CUDNN_RESAMPLE_AVGPOOL_EXCLUDE_PADDING = 4,
439
+ CUDNN_RESAMPLE_MAXPOOL = 3,
440
+ } cudnnResampleMode_t;
441
+
442
+ typedef enum {
443
+ CUDNN_SIGNAL_SET = 0,
444
+ CUDNN_SIGNAL_WAIT = 1,
445
+ } cudnnSignalMode_t;
446
+
447
+ typedef enum {
448
+ CUDNN_GENSTATS_SUM_SQSUM = 0,
449
+ } cudnnGenStatsMode_t;
450
+
451
+ typedef enum {
452
+ CUDNN_BN_FINALIZE_STATISTICS_TRAINING = 0,
453
+ CUDNN_BN_FINALIZE_STATISTICS_INFERENCE = 1,
454
+ } cudnnBnFinalizeStatsMode_t;
455
+
456
+ typedef enum {
457
+ CUDNN_RNG_DISTRIBUTION_BERNOULLI = 0,
458
+ CUDNN_RNG_DISTRIBUTION_UNIFORM = 1,
459
+ CUDNN_RNG_DISTRIBUTION_NORMAL = 2,
460
+ } cudnnRngDistribution_t;
461
+
462
+ typedef enum {
463
+ CUDNN_ATTR_POINTWISE_MODE = 0,
464
+ CUDNN_ATTR_POINTWISE_MATH_PREC = 1,
465
+ CUDNN_ATTR_POINTWISE_NAN_PROPAGATION CUDNN_DEPRECATED_ENUM = 2,
466
+ CUDNN_ATTR_POINTWISE_RELU_LOWER_CLIP = 3,
467
+ CUDNN_ATTR_POINTWISE_RELU_UPPER_CLIP = 4,
468
+ CUDNN_ATTR_POINTWISE_RELU_LOWER_CLIP_SLOPE = 5,
469
+ CUDNN_ATTR_POINTWISE_ELU_ALPHA = 6,
470
+ CUDNN_ATTR_POINTWISE_SOFTPLUS_BETA = 7,
471
+ CUDNN_ATTR_POINTWISE_SWISH_BETA = 8,
472
+ CUDNN_ATTR_POINTWISE_AXIS = 9,
473
+
474
+ CUDNN_ATTR_CONVOLUTION_COMP_TYPE = 100,
475
+ CUDNN_ATTR_CONVOLUTION_CONV_MODE = 101,
476
+ CUDNN_ATTR_CONVOLUTION_DILATIONS = 102,
477
+ CUDNN_ATTR_CONVOLUTION_FILTER_STRIDES = 103,
478
+ CUDNN_ATTR_CONVOLUTION_POST_PADDINGS = 104,
479
+ CUDNN_ATTR_CONVOLUTION_PRE_PADDINGS = 105,
480
+ CUDNN_ATTR_CONVOLUTION_SPATIAL_DIMS = 106,
481
+
482
+ CUDNN_ATTR_ENGINEHEUR_MODE = 200,
483
+ CUDNN_ATTR_ENGINEHEUR_OPERATION_GRAPH = 201,
484
+ CUDNN_ATTR_ENGINEHEUR_RESULTS = 202,
485
+ CUDNN_ATTR_ENGINEHEUR_SM_COUNT_TARGET = 203,
486
+ CUDNN_ATTR_ENGINEHEUR_DEVICEPROP = 204,
487
+
488
+ CUDNN_ATTR_ENGINECFG_ENGINE = 300,
489
+ CUDNN_ATTR_ENGINECFG_INTERMEDIATE_INFO = 301,
490
+ CUDNN_ATTR_ENGINECFG_KNOB_CHOICES = 302,
491
+ CUDNN_ATTR_ENGINECFG_WORKSPACE_SIZE = 303,
492
+ CUDNN_ATTR_ENGINECFG_SHARED_MEMORY_USED = 304,
493
+
494
+ CUDNN_ATTR_EXECUTION_PLAN_HANDLE CUDNN_DEPRECATED_ENUM = 400,
495
+ CUDNN_ATTR_EXECUTION_PLAN_ENGINE_CONFIG = 401,
496
+ CUDNN_ATTR_EXECUTION_PLAN_WORKSPACE_SIZE = 402,
497
+ CUDNN_ATTR_EXECUTION_PLAN_COMPUTED_INTERMEDIATE_UIDS = 403,
498
+ CUDNN_ATTR_EXECUTION_PLAN_RUN_ONLY_INTERMEDIATE_UIDS = 404,
499
+ CUDNN_ATTR_EXECUTION_PLAN_JSON_REPRESENTATION = 405,
500
+ CUDNN_ATTR_EXECUTION_PLAN_KERNEL_CACHE = 406,
501
+ CUDNN_ATTR_EXECUTION_PLAN_DEVICEPROP = 407,
502
+
503
+ CUDNN_ATTR_INTERMEDIATE_INFO_UNIQUE_ID = 500,
504
+ CUDNN_ATTR_INTERMEDIATE_INFO_SIZE = 501,
505
+ CUDNN_ATTR_INTERMEDIATE_INFO_DEPENDENT_DATA_UIDS = 502,
506
+ CUDNN_ATTR_INTERMEDIATE_INFO_DEPENDENT_ATTRIBUTES = 503,
507
+
508
+ CUDNN_ATTR_KNOB_CHOICE_KNOB_TYPE = 600,
509
+ CUDNN_ATTR_KNOB_CHOICE_KNOB_VALUE = 601,
510
+
511
+ CUDNN_ATTR_OPERATION_CONVOLUTION_FORWARD_ALPHA = 700,
512
+ CUDNN_ATTR_OPERATION_CONVOLUTION_FORWARD_BETA = 701,
513
+ CUDNN_ATTR_OPERATION_CONVOLUTION_FORWARD_CONV_DESC = 702,
514
+ CUDNN_ATTR_OPERATION_CONVOLUTION_FORWARD_W = 703,
515
+ CUDNN_ATTR_OPERATION_CONVOLUTION_FORWARD_X = 704,
516
+ CUDNN_ATTR_OPERATION_CONVOLUTION_FORWARD_Y = 705,
517
+ CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_DATA_ALPHA = 706,
518
+ CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_DATA_BETA = 707,
519
+ CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_DATA_CONV_DESC = 708,
520
+ CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_DATA_W = 709,
521
+ CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_DATA_DX = 710,
522
+ CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_DATA_DY = 711,
523
+ CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_FILTER_ALPHA = 712,
524
+ CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_FILTER_BETA = 713,
525
+ CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_FILTER_CONV_DESC = 714,
526
+ CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_FILTER_DW = 715,
527
+ CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_FILTER_X = 716,
528
+ CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_FILTER_DY = 717,
529
+
530
+ CUDNN_ATTR_OPERATION_POINTWISE_PW_DESCRIPTOR = 750,
531
+ CUDNN_ATTR_OPERATION_POINTWISE_XDESC = 751,
532
+ CUDNN_ATTR_OPERATION_POINTWISE_BDESC = 752,
533
+ CUDNN_ATTR_OPERATION_POINTWISE_YDESC = 753,
534
+ CUDNN_ATTR_OPERATION_POINTWISE_ALPHA1 = 754,
535
+ CUDNN_ATTR_OPERATION_POINTWISE_ALPHA2 = 755,
536
+ CUDNN_ATTR_OPERATION_POINTWISE_DXDESC = 756,
537
+ CUDNN_ATTR_OPERATION_POINTWISE_DYDESC = 757,
538
+ CUDNN_ATTR_OPERATION_POINTWISE_TDESC = 758,
539
+
540
+ CUDNN_ATTR_OPERATION_GENSTATS_MODE = 770,
541
+ CUDNN_ATTR_OPERATION_GENSTATS_MATH_PREC = 771,
542
+ CUDNN_ATTR_OPERATION_GENSTATS_XDESC = 772,
543
+ CUDNN_ATTR_OPERATION_GENSTATS_SUMDESC = 773,
544
+ CUDNN_ATTR_OPERATION_GENSTATS_SQSUMDESC = 774,
545
+
546
+ CUDNN_ATTR_OPERATION_BN_FINALIZE_STATS_MODE = 780,
547
+ CUDNN_ATTR_OPERATION_BN_FINALIZE_MATH_PREC = 781,
548
+ CUDNN_ATTR_OPERATION_BN_FINALIZE_Y_SUM_DESC = 782,
549
+ CUDNN_ATTR_OPERATION_BN_FINALIZE_Y_SQ_SUM_DESC = 783,
550
+ CUDNN_ATTR_OPERATION_BN_FINALIZE_SCALE_DESC = 784,
551
+ CUDNN_ATTR_OPERATION_BN_FINALIZE_BIAS_DESC = 785,
552
+ CUDNN_ATTR_OPERATION_BN_FINALIZE_PREV_RUNNING_MEAN_DESC = 786,
553
+ CUDNN_ATTR_OPERATION_BN_FINALIZE_PREV_RUNNING_VAR_DESC = 787,
554
+ CUDNN_ATTR_OPERATION_BN_FINALIZE_UPDATED_RUNNING_MEAN_DESC = 788,
555
+ CUDNN_ATTR_OPERATION_BN_FINALIZE_UPDATED_RUNNING_VAR_DESC = 789,
556
+ CUDNN_ATTR_OPERATION_BN_FINALIZE_SAVED_MEAN_DESC = 790,
557
+ CUDNN_ATTR_OPERATION_BN_FINALIZE_SAVED_INV_STD_DESC = 791,
558
+ CUDNN_ATTR_OPERATION_BN_FINALIZE_EQ_SCALE_DESC = 792,
559
+ CUDNN_ATTR_OPERATION_BN_FINALIZE_EQ_BIAS_DESC = 793,
560
+ CUDNN_ATTR_OPERATION_BN_FINALIZE_ACCUM_COUNT_DESC = 794,
561
+ CUDNN_ATTR_OPERATION_BN_FINALIZE_EPSILON_DESC = 795,
562
+ CUDNN_ATTR_OPERATION_BN_FINALIZE_EXP_AVERATE_FACTOR_DESC = 796,
563
+
564
+ CUDNN_ATTR_OPERATIONGRAPH_HANDLE CUDNN_DEPRECATED_ENUM = 800,
565
+ CUDNN_ATTR_OPERATIONGRAPH_OPS = 801,
566
+ CUDNN_ATTR_OPERATIONGRAPH_ENGINE_GLOBAL_COUNT = 802,
567
+ CUDNN_ATTR_OPERATIONGRAPH_IS_DYNAMIC_SHAPE_ENABLED = 803,
568
+ CUDNN_ATTR_OPERATIONGRAPH_IS_SAME_TOPOLOGY = 804,
569
+
570
+ CUDNN_ATTR_TENSOR_BYTE_ALIGNMENT = 900,
571
+ CUDNN_ATTR_TENSOR_DATA_TYPE = 901,
572
+ CUDNN_ATTR_TENSOR_DIMENSIONS = 902,
573
+ CUDNN_ATTR_TENSOR_STRIDES = 903,
574
+ CUDNN_ATTR_TENSOR_VECTOR_COUNT = 904,
575
+ CUDNN_ATTR_TENSOR_VECTORIZED_DIMENSION = 905,
576
+ CUDNN_ATTR_TENSOR_UNIQUE_ID = 906,
577
+ CUDNN_ATTR_TENSOR_IS_VIRTUAL = 907,
578
+ CUDNN_ATTR_TENSOR_IS_BY_VALUE = 908,
579
+ CUDNN_ATTR_TENSOR_REORDERING_MODE = 909,
580
+ CUDNN_ATTR_TENSOR_RAGGED_OFFSET_DESC = 913,
581
+
582
+ CUDNN_ATTR_VARIANT_PACK_UNIQUE_IDS = 1000,
583
+ CUDNN_ATTR_VARIANT_PACK_DATA_POINTERS = 1001,
584
+ CUDNN_ATTR_VARIANT_PACK_INTERMEDIATES = 1002,
585
+ CUDNN_ATTR_VARIANT_PACK_WORKSPACE = 1003,
586
+
587
+ CUDNN_ATTR_LAYOUT_INFO_TENSOR_UID = 1100,
588
+ CUDNN_ATTR_LAYOUT_INFO_TYPES = 1101,
589
+
590
+ CUDNN_ATTR_KNOB_INFO_TYPE = 1200,
591
+ CUDNN_ATTR_KNOB_INFO_MAXIMUM_VALUE = 1201,
592
+ CUDNN_ATTR_KNOB_INFO_MINIMUM_VALUE = 1202,
593
+ CUDNN_ATTR_KNOB_INFO_STRIDE = 1203,
594
+
595
+ CUDNN_ATTR_ENGINE_OPERATION_GRAPH = 1300,
596
+ CUDNN_ATTR_ENGINE_GLOBAL_INDEX = 1301,
597
+ CUDNN_ATTR_ENGINE_KNOB_INFO = 1302,
598
+ CUDNN_ATTR_ENGINE_NUMERICAL_NOTE = 1303,
599
+ CUDNN_ATTR_ENGINE_LAYOUT_INFO = 1304,
600
+ CUDNN_ATTR_ENGINE_BEHAVIOR_NOTE = 1305,
601
+ CUDNN_ATTR_ENGINE_SM_COUNT_TARGET = 1306,
602
+ CUDNN_ATTR_ENGINE_DEVICEPROP = 1307,
603
+
604
+ CUDNN_ATTR_MATMUL_COMP_TYPE = 1500,
605
+ CUDNN_ATTR_MATMUL_PADDING_VALUE = 1503,
606
+
607
+ CUDNN_ATTR_OPERATION_MATMUL_ADESC = 1520,
608
+ CUDNN_ATTR_OPERATION_MATMUL_BDESC = 1521,
609
+ CUDNN_ATTR_OPERATION_MATMUL_CDESC = 1522,
610
+ CUDNN_ATTR_OPERATION_MATMUL_DESC = 1523,
611
+ CUDNN_ATTR_OPERATION_MATMUL_IRREGULARLY_STRIDED_BATCH_COUNT CUDNN_DEPRECATED_ENUM = 1524,
612
+ CUDNN_ATTR_OPERATION_MATMUL_GEMM_M_OVERRIDE_DESC = 1525,
613
+ CUDNN_ATTR_OPERATION_MATMUL_GEMM_N_OVERRIDE_DESC = 1526,
614
+ CUDNN_ATTR_OPERATION_MATMUL_GEMM_K_OVERRIDE_DESC = 1527,
615
+
616
+ CUDNN_ATTR_REDUCTION_OPERATOR = 1600,
617
+ CUDNN_ATTR_REDUCTION_COMP_TYPE = 1601,
618
+
619
+ CUDNN_ATTR_OPERATION_REDUCTION_XDESC = 1610,
620
+ CUDNN_ATTR_OPERATION_REDUCTION_YDESC = 1611,
621
+ CUDNN_ATTR_OPERATION_REDUCTION_DESC = 1612,
622
+
623
+ CUDNN_ATTR_OPERATION_BN_BWD_WEIGHTS_MATH_PREC = 1620,
624
+ CUDNN_ATTR_OPERATION_BN_BWD_WEIGHTS_MEAN_DESC = 1621,
625
+ CUDNN_ATTR_OPERATION_BN_BWD_WEIGHTS_INVSTD_DESC = 1622,
626
+ CUDNN_ATTR_OPERATION_BN_BWD_WEIGHTS_BN_SCALE_DESC = 1623,
627
+ CUDNN_ATTR_OPERATION_BN_BWD_WEIGHTS_X_DESC = 1624,
628
+ CUDNN_ATTR_OPERATION_BN_BWD_WEIGHTS_DY_DESC = 1625,
629
+ CUDNN_ATTR_OPERATION_BN_BWD_WEIGHTS_DBN_SCALE_DESC = 1626,
630
+ CUDNN_ATTR_OPERATION_BN_BWD_WEIGHTS_DBN_BIAS_DESC = 1627,
631
+ CUDNN_ATTR_OPERATION_BN_BWD_WEIGHTS_EQ_DY_SCALE_DESC = 1628,
632
+ CUDNN_ATTR_OPERATION_BN_BWD_WEIGHTS_EQ_X_SCALE_DESC = 1629,
633
+ CUDNN_ATTR_OPERATION_BN_BWD_WEIGHTS_EQ_BIAS = 1630,
634
+
635
+ CUDNN_ATTR_RESAMPLE_MODE = 1700,
636
+ CUDNN_ATTR_RESAMPLE_COMP_TYPE = 1701,
637
+ CUDNN_ATTR_RESAMPLE_SPATIAL_DIMS = 1702,
638
+ CUDNN_ATTR_RESAMPLE_POST_PADDINGS = 1703,
639
+ CUDNN_ATTR_RESAMPLE_PRE_PADDINGS = 1704,
640
+ CUDNN_ATTR_RESAMPLE_STRIDES = 1705,
641
+ CUDNN_ATTR_RESAMPLE_WINDOW_DIMS = 1706,
642
+ CUDNN_ATTR_RESAMPLE_NAN_PROPAGATION = 1707,
643
+ CUDNN_ATTR_RESAMPLE_PADDING_MODE = 1708,
644
+
645
+ CUDNN_ATTR_OPERATION_RESAMPLE_FWD_XDESC = 1710,
646
+ CUDNN_ATTR_OPERATION_RESAMPLE_FWD_YDESC = 1711,
647
+ CUDNN_ATTR_OPERATION_RESAMPLE_FWD_IDXDESC = 1712,
648
+ CUDNN_ATTR_OPERATION_RESAMPLE_FWD_ALPHA CUDNN_DEPRECATED_ENUM = 1713,
649
+ CUDNN_ATTR_OPERATION_RESAMPLE_FWD_BETA CUDNN_DEPRECATED_ENUM = 1714,
650
+ CUDNN_ATTR_OPERATION_RESAMPLE_FWD_DESC = 1716,
651
+
652
+ CUDNN_ATTR_OPERATION_RESAMPLE_BWD_DXDESC = 1720,
653
+ CUDNN_ATTR_OPERATION_RESAMPLE_BWD_DYDESC = 1721,
654
+ CUDNN_ATTR_OPERATION_RESAMPLE_BWD_IDXDESC = 1722,
655
+ CUDNN_ATTR_OPERATION_RESAMPLE_BWD_ALPHA CUDNN_DEPRECATED_ENUM = 1723,
656
+ CUDNN_ATTR_OPERATION_RESAMPLE_BWD_BETA CUDNN_DEPRECATED_ENUM = 1724,
657
+ CUDNN_ATTR_OPERATION_RESAMPLE_BWD_DESC = 1725,
658
+ CUDNN_ATTR_OPERATION_RESAMPLE_BWD_XDESC = 1726,
659
+ CUDNN_ATTR_OPERATION_RESAMPLE_BWD_YDESC = 1727,
660
+
661
+ CUDNN_ATTR_OPERATION_CONCAT_AXIS = 1800,
662
+ CUDNN_ATTR_OPERATION_CONCAT_INPUT_DESCS = 1801,
663
+ CUDNN_ATTR_OPERATION_CONCAT_INPLACE_INDEX = 1802,
664
+ CUDNN_ATTR_OPERATION_CONCAT_OUTPUT_DESC = 1803,
665
+
666
+ CUDNN_ATTR_OPERATION_SIGNAL_MODE = 1900,
667
+ CUDNN_ATTR_OPERATION_SIGNAL_FLAGDESC = 1901,
668
+ CUDNN_ATTR_OPERATION_SIGNAL_VALUE = 1902,
669
+ CUDNN_ATTR_OPERATION_SIGNAL_XDESC = 1903,
670
+ CUDNN_ATTR_OPERATION_SIGNAL_YDESC = 1904,
671
+
672
+ CUDNN_ATTR_OPERATION_PAGED_CACHE_LOAD_CONTAINER_DESC = 1950,
673
+ CUDNN_ATTR_OPERATION_PAGED_CACHE_LOAD_YDESC = 1951,
674
+ CUDNN_ATTR_OPERATION_PAGED_CACHE_LOAD_SEQUENCE_DESC = 1952,
675
+ CUDNN_ATTR_OPERATION_PAGED_CACHE_LOAD_PAGE_TABLE_DESC = 1953,
676
+
677
+ CUDNN_ATTR_OPERATION_NORM_FWD_MODE = 2000,
678
+ CUDNN_ATTR_OPERATION_NORM_FWD_PHASE = 2001,
679
+ CUDNN_ATTR_OPERATION_NORM_FWD_XDESC = 2002,
680
+ CUDNN_ATTR_OPERATION_NORM_FWD_MEAN_DESC = 2003,
681
+ CUDNN_ATTR_OPERATION_NORM_FWD_INV_VARIANCE_DESC = 2004,
682
+ CUDNN_ATTR_OPERATION_NORM_FWD_SCALE_DESC = 2005,
683
+ CUDNN_ATTR_OPERATION_NORM_FWD_BIAS_DESC = 2006,
684
+ CUDNN_ATTR_OPERATION_NORM_FWD_EPSILON_DESC = 2007,
685
+ CUDNN_ATTR_OPERATION_NORM_FWD_EXP_AVG_FACTOR_DESC = 2008,
686
+ CUDNN_ATTR_OPERATION_NORM_FWD_INPUT_RUNNING_MEAN_DESC = 2009,
687
+ CUDNN_ATTR_OPERATION_NORM_FWD_INPUT_RUNNING_VAR_DESC = 2010,
688
+ CUDNN_ATTR_OPERATION_NORM_FWD_OUTPUT_RUNNING_MEAN_DESC = 2011,
689
+ CUDNN_ATTR_OPERATION_NORM_FWD_OUTPUT_RUNNING_VAR_DESC = 2012,
690
+ CUDNN_ATTR_OPERATION_NORM_FWD_YDESC = 2013,
691
+ CUDNN_ATTR_OPERATION_NORM_FWD_PEER_STAT_DESCS = 2014,
692
+
693
+ CUDNN_ATTR_OPERATION_NORM_BWD_MODE = 2100,
694
+ CUDNN_ATTR_OPERATION_NORM_BWD_XDESC = 2101,
695
+ CUDNN_ATTR_OPERATION_NORM_BWD_MEAN_DESC = 2102,
696
+ CUDNN_ATTR_OPERATION_NORM_BWD_INV_VARIANCE_DESC = 2103,
697
+ CUDNN_ATTR_OPERATION_NORM_BWD_DYDESC = 2104,
698
+ CUDNN_ATTR_OPERATION_NORM_BWD_SCALE_DESC = 2105,
699
+ CUDNN_ATTR_OPERATION_NORM_BWD_EPSILON_DESC = 2106,
700
+ CUDNN_ATTR_OPERATION_NORM_BWD_DSCALE_DESC = 2107,
701
+ CUDNN_ATTR_OPERATION_NORM_BWD_DBIAS_DESC = 2108,
702
+ CUDNN_ATTR_OPERATION_NORM_BWD_DXDESC = 2109,
703
+ CUDNN_ATTR_OPERATION_NORM_BWD_PEER_STAT_DESCS = 2110,
704
+
705
+ CUDNN_ATTR_OPERATION_RESHAPE_XDESC = 2200,
706
+ CUDNN_ATTR_OPERATION_RESHAPE_YDESC = 2201,
707
+
708
+ CUDNN_ATTR_OPERATION_EXPAND_BAND_MATRIX_XDESC = 2250,
709
+ CUDNN_ATTR_OPERATION_EXPAND_BAND_MATRIX_YDESC = 2251,
710
+ CUDNN_ATTR_OPERATION_EXPAND_BAND_MATRIX_LOWER_BANDWIDTH = 2252,
711
+ CUDNN_ATTR_OPERATION_EXPAND_BAND_MATRIX_UPPER_BANDWIDTH = 2253,
712
+ CUDNN_ATTR_OPERATION_EXPAND_BAND_MATRIX_AXIS = 2254,
713
+ CUDNN_ATTR_OPERATION_EXPAND_BAND_MATRIX_PAD_VALUE = 2255,
714
+ CUDNN_ATTR_OPERATION_EXPAND_BAND_MATRIX_KV_TOKEN_OFFSET_DESC = 2256,
715
+
716
+ CUDNN_ATTR_OPERATION_CONTRACT_BAND_MATRIX_XDESC = 2270,
717
+ CUDNN_ATTR_OPERATION_CONTRACT_BAND_MATRIX_YDESC = 2271,
718
+ CUDNN_ATTR_OPERATION_CONTRACT_BAND_MATRIX_LOWER_BANDWIDTH = 2272,
719
+ CUDNN_ATTR_OPERATION_CONTRACT_BAND_MATRIX_UPPER_BANDWIDTH = 2273,
720
+ CUDNN_ATTR_OPERATION_CONTRACT_BAND_MATRIX_AXIS = 2274,
721
+ CUDNN_ATTR_OPERATION_CONTRACT_BAND_MATRIX_PAD_VALUE = 2275,
722
+ CUDNN_ATTR_OPERATION_CONTRACT_BAND_MAX_TOKEN_VALUE = 2276,
723
+
724
+ CUDNN_ATTR_RNG_DISTRIBUTION = 2300,
725
+ CUDNN_ATTR_RNG_NORMAL_DIST_MEAN = 2301,
726
+ CUDNN_ATTR_RNG_NORMAL_DIST_STANDARD_DEVIATION = 2302,
727
+ CUDNN_ATTR_RNG_UNIFORM_DIST_MAXIMUM = 2303,
728
+ CUDNN_ATTR_RNG_UNIFORM_DIST_MINIMUM = 2304,
729
+ CUDNN_ATTR_RNG_BERNOULLI_DIST_PROBABILITY = 2305,
730
+
731
+ CUDNN_ATTR_OPERATION_RNG_YDESC = 2310,
732
+ CUDNN_ATTR_OPERATION_RNG_SEED = 2311,
733
+ CUDNN_ATTR_OPERATION_RNG_DESC = 2312,
734
+ CUDNN_ATTR_OPERATION_RNG_OFFSET_DESC = 2313,
735
+
736
+ CUDNN_ATTR_KERNEL_CACHE_OPERATION_GRAPH = 2400,
737
+ CUDNN_ATTR_KERNEL_CACHE_IS_ENGINECFG_KERNEL_CACHED = 2401,
738
+ CUDNN_ATTR_KERNEL_CACHE_JSON_REPRESENTATION = 2402,
739
+
740
+ CUDNN_ATTR_OPERATION_BLOCK_SCALE_QUANTIZE_XDESC = 2500,
741
+ CUDNN_ATTR_OPERATION_BLOCK_SCALE_QUANTIZE_YDESC = 2501,
742
+ CUDNN_ATTR_OPERATION_BLOCK_SCALE_QUANTIZE_SCALE_DESC = 2502,
743
+ CUDNN_ATTR_OPERATION_BLOCK_SCALE_QUANTIZE_MATH_PREC = 2503,
744
+ CUDNN_ATTR_OPERATION_BLOCK_SCALE_QUANTIZE_BLOCK_SIZE = 2504,
745
+
746
+ CUDNN_ATTR_OPERATION_BLOCK_SCALE_DEQUANTIZE_XDESC = 2600,
747
+ CUDNN_ATTR_OPERATION_BLOCK_SCALE_DEQUANTIZE_SCALE_DESC = 2601,
748
+ CUDNN_ATTR_OPERATION_BLOCK_SCALE_DEQUANTIZE_YDESC = 2602,
749
+ CUDNN_ATTR_OPERATION_BLOCK_SCALE_DEQUANTIZE_MATH_PREC = 2603,
750
+ CUDNN_ATTR_OPERATION_BLOCK_SCALE_DEQUANTIZE_BLOCK_SIZE = 2604,
751
+
752
+ CUDNN_ATTR_DEVICEPROP_DEVICE_ID = 2700,
753
+ CUDNN_ATTR_DEVICEPROP_HANDLE = 2701,
754
+ CUDNN_ATTR_DEVICEPROP_JSON_REPRESENTATION = 2702,
755
+ } cudnnBackendAttributeName_t;
756
+
757
+ typedef enum {
758
+ CUDNN_TYPE_HANDLE = 0,
759
+ CUDNN_TYPE_DATA_TYPE = 1,
760
+ CUDNN_TYPE_BOOLEAN = 2,
761
+ CUDNN_TYPE_INT64 = 3,
762
+ CUDNN_TYPE_FLOAT = 4,
763
+ CUDNN_TYPE_DOUBLE = 5,
764
+ CUDNN_TYPE_VOID_PTR = 6,
765
+ CUDNN_TYPE_CONVOLUTION_MODE = 7,
766
+ CUDNN_TYPE_HEUR_MODE = 8,
767
+ CUDNN_TYPE_KNOB_TYPE = 9,
768
+ CUDNN_TYPE_NAN_PROPOGATION CUDNN_DEPRECATED_ENUM = 10,
769
+ CUDNN_TYPE_NUMERICAL_NOTE = 11,
770
+ CUDNN_TYPE_LAYOUT_TYPE = 12,
771
+ CUDNN_TYPE_ATTRIB_NAME = 13,
772
+ CUDNN_TYPE_POINTWISE_MODE = 14,
773
+ CUDNN_TYPE_BACKEND_DESCRIPTOR = 15,
774
+ CUDNN_TYPE_GENSTATS_MODE = 16,
775
+ CUDNN_TYPE_BN_FINALIZE_STATS_MODE = 17,
776
+ CUDNN_TYPE_REDUCTION_OPERATOR_TYPE = 18,
777
+ CUDNN_TYPE_BEHAVIOR_NOTE = 19,
778
+ CUDNN_TYPE_TENSOR_REORDERING_MODE = 20,
779
+ CUDNN_TYPE_RESAMPLE_MODE = 21,
780
+ CUDNN_TYPE_PADDING_MODE = 22,
781
+ CUDNN_TYPE_INT32 = 23,
782
+ CUDNN_TYPE_CHAR = 24,
783
+ CUDNN_TYPE_SIGNAL_MODE = 25,
784
+ CUDNN_TYPE_FRACTION = 26,
785
+ CUDNN_TYPE_NORM_MODE = 27,
786
+ CUDNN_TYPE_NORM_FWD_PHASE = 28,
787
+ CUDNN_TYPE_RNG_DISTRIBUTION = 29,
788
+ } cudnnBackendAttributeType_t;
789
+
790
+ typedef enum {
791
+ CUDNN_BACKEND_POINTWISE_DESCRIPTOR = 0,
792
+ CUDNN_BACKEND_CONVOLUTION_DESCRIPTOR = 1,
793
+ CUDNN_BACKEND_ENGINE_DESCRIPTOR = 2,
794
+ CUDNN_BACKEND_ENGINECFG_DESCRIPTOR = 3,
795
+ CUDNN_BACKEND_ENGINEHEUR_DESCRIPTOR = 4,
796
+ CUDNN_BACKEND_EXECUTION_PLAN_DESCRIPTOR = 5,
797
+ CUDNN_BACKEND_INTERMEDIATE_INFO_DESCRIPTOR = 6,
798
+ CUDNN_BACKEND_KNOB_CHOICE_DESCRIPTOR = 7,
799
+ CUDNN_BACKEND_KNOB_INFO_DESCRIPTOR = 8,
800
+ CUDNN_BACKEND_LAYOUT_INFO_DESCRIPTOR = 9,
801
+ CUDNN_BACKEND_OPERATION_CONVOLUTION_FORWARD_DESCRIPTOR = 10,
802
+ CUDNN_BACKEND_OPERATION_CONVOLUTION_BACKWARD_FILTER_DESCRIPTOR = 11,
803
+ CUDNN_BACKEND_OPERATION_CONVOLUTION_BACKWARD_DATA_DESCRIPTOR = 12,
804
+ CUDNN_BACKEND_OPERATION_POINTWISE_DESCRIPTOR = 13,
805
+ CUDNN_BACKEND_OPERATION_GEN_STATS_DESCRIPTOR = 14,
806
+ CUDNN_BACKEND_OPERATIONGRAPH_DESCRIPTOR = 15,
807
+ CUDNN_BACKEND_VARIANT_PACK_DESCRIPTOR = 16,
808
+ CUDNN_BACKEND_TENSOR_DESCRIPTOR = 17,
809
+ CUDNN_BACKEND_MATMUL_DESCRIPTOR = 18,
810
+ CUDNN_BACKEND_OPERATION_MATMUL_DESCRIPTOR = 19,
811
+ CUDNN_BACKEND_OPERATION_BN_FINALIZE_STATISTICS_DESCRIPTOR = 20,
812
+ CUDNN_BACKEND_REDUCTION_DESCRIPTOR = 21,
813
+ CUDNN_BACKEND_OPERATION_REDUCTION_DESCRIPTOR = 22,
814
+ CUDNN_BACKEND_OPERATION_BN_BWD_WEIGHTS_DESCRIPTOR = 23,
815
+ CUDNN_BACKEND_RESAMPLE_DESCRIPTOR = 24,
816
+ CUDNN_BACKEND_OPERATION_RESAMPLE_FWD_DESCRIPTOR = 25,
817
+ CUDNN_BACKEND_OPERATION_RESAMPLE_BWD_DESCRIPTOR = 26,
818
+ CUDNN_BACKEND_OPERATION_CONCAT_DESCRIPTOR = 27,
819
+ CUDNN_BACKEND_OPERATION_SIGNAL_DESCRIPTOR = 28,
820
+ CUDNN_BACKEND_OPERATION_NORM_FORWARD_DESCRIPTOR = 29,
821
+ CUDNN_BACKEND_OPERATION_NORM_BACKWARD_DESCRIPTOR = 30,
822
+ CUDNN_BACKEND_OPERATION_RESHAPE_DESCRIPTOR = 31,
823
+ CUDNN_BACKEND_RNG_DESCRIPTOR = 32,
824
+ CUDNN_BACKEND_OPERATION_RNG_DESCRIPTOR = 33,
825
+ CUDNN_BACKEND_KERNEL_CACHE_DESCRIPTOR = 34,
826
+ CUDNN_BACKEND_OPERATION_PAGED_CACHE_LOAD_DESCRIPTOR = 35,
827
+ CUDNN_BACKEND_OPERATION_BLOCK_SCALE_QUANTIZE_DESCRIPTOR = 36,
828
+ CUDNN_BACKEND_OPERATION_BLOCK_SCALE_DEQUANTIZE_DESCRIPTOR = 37,
829
+ CUDNN_BACKEND_DEVICEPROP_DESCRIPTOR = 38,
830
+ CUDNN_BACKEND_OPERATION_EXPAND_BAND_MATRIX_DESCRIPTOR = 39,
831
+ CUDNN_BACKEND_OPERATION_CONTRACT_BAND_MATRIX_DESCRIPTOR = 40,
832
+ } cudnnBackendDescriptorType_t;
833
+
834
+ typedef enum {
835
+ CUDNN_NUMERICAL_NOTE_TENSOR_CORE = 0,
836
+ CUDNN_NUMERICAL_NOTE_DOWN_CONVERT_INPUTS = 1,
837
+ CUDNN_NUMERICAL_NOTE_REDUCED_PRECISION_REDUCTION = 2,
838
+ CUDNN_NUMERICAL_NOTE_FFT = 3,
839
+ CUDNN_NUMERICAL_NOTE_NONDETERMINISTIC = 4,
840
+ CUDNN_NUMERICAL_NOTE_WINOGRAD = 5,
841
+ CUDNN_NUMERICAL_NOTE_WINOGRAD_TILE_4x4 = 6,
842
+ CUDNN_NUMERICAL_NOTE_WINOGRAD_TILE_6x6 = 7,
843
+ CUDNN_NUMERICAL_NOTE_WINOGRAD_TILE_13x13 = 8,
844
+ CUDNN_NUMERICAL_NOTE_STRICT_NAN_PROP = 9,
845
+ CUDNN_NUMERICAL_NOTE_TYPE_COUNT = 10,
846
+ } cudnnBackendNumericalNote_t;
847
+
848
+ typedef enum {
849
+ CUDNN_BEHAVIOR_NOTE_RUNTIME_COMPILATION = 0,
850
+ CUDNN_BEHAVIOR_NOTE_REQUIRES_FILTER_INT8x32_REORDER = 1,
851
+ CUDNN_BEHAVIOR_NOTE_REQUIRES_BIAS_INT8x32_REORDER = 2,
852
+ CUDNN_BEHAVIOR_NOTE_SUPPORTS_CUDA_GRAPH_NATIVE_API = 3,
853
+ CUDNN_BEHAVIOR_NOTE_TYPE_COUNT = 4,
854
+ } cudnnBackendBehaviorNote_t;
855
+
856
+ typedef enum {
857
+ CUDNN_KNOB_TYPE_SPLIT_K CUDNN_DEPRECATED_ENUM = 0,
858
+ CUDNN_KNOB_TYPE_SWIZZLE = 1,
859
+ CUDNN_KNOB_TYPE_TILE_SIZE = 2,
860
+ CUDNN_KNOB_TYPE_USE_TEX CUDNN_DEPRECATED_ENUM = 3,
861
+ CUDNN_KNOB_TYPE_EDGE = 4,
862
+ CUDNN_KNOB_TYPE_KBLOCK CUDNN_DEPRECATED_ENUM = 5,
863
+ CUDNN_KNOB_TYPE_LDGA CUDNN_DEPRECATED_ENUM = 6,
864
+ CUDNN_KNOB_TYPE_LDGB CUDNN_DEPRECATED_ENUM = 7,
865
+ CUDNN_KNOB_TYPE_CHUNK_K CUDNN_DEPRECATED_ENUM = 8,
866
+ CUDNN_KNOB_TYPE_SPLIT_H CUDNN_DEPRECATED_ENUM = 9,
867
+ CUDNN_KNOB_TYPE_WINO_TILE CUDNN_DEPRECATED_ENUM = 10,
868
+ CUDNN_KNOB_TYPE_MULTIPLY = 11,
869
+ CUDNN_KNOB_TYPE_SPLIT_K_BUF = 12,
870
+ CUDNN_KNOB_TYPE_TILEK = 13,
871
+ CUDNN_KNOB_TYPE_STAGES = 14,
872
+ CUDNN_KNOB_TYPE_REDUCTION_MODE = 15,
873
+ CUDNN_KNOB_TYPE_CTA_SPLIT_K_MODE CUDNN_DEPRECATED_ENUM = 16,
874
+ CUDNN_KNOB_TYPE_SPLIT_K_SLC = 17,
875
+ CUDNN_KNOB_TYPE_IDX_MODE = 18,
876
+ CUDNN_KNOB_TYPE_SLICED CUDNN_DEPRECATED_ENUM = 19,
877
+ CUDNN_KNOB_TYPE_SPLIT_RS CUDNN_DEPRECATED_ENUM = 20,
878
+ CUDNN_KNOB_TYPE_SINGLEBUFFER CUDNN_DEPRECATED_ENUM = 21,
879
+ CUDNN_KNOB_TYPE_LDGC CUDNN_DEPRECATED_ENUM = 22,
880
+ CUDNN_KNOB_TYPE_SPECFILT = 23,
881
+ CUDNN_KNOB_TYPE_KERNEL_CFG = 24,
882
+ CUDNN_KNOB_TYPE_WORKSPACE = 25,
883
+ CUDNN_KNOB_TYPE_TILE_CGA CUDNN_DEPRECATED_ENUM = 26,
884
+ CUDNN_KNOB_TYPE_TILE_CGA_M = 27,
885
+ CUDNN_KNOB_TYPE_TILE_CGA_N = 28,
886
+ CUDNN_KNOB_TYPE_BLOCK_SIZE = 29,
887
+ CUDNN_KNOB_TYPE_OCCUPANCY = 30,
888
+ CUDNN_KNOB_TYPE_ARRAY_SIZE_PER_THREAD = 31,
889
+ CUDNN_KNOB_TYPE_NUM_C_PER_BLOCK CUDNN_DEPRECATED_ENUM = 32,
890
+ CUDNN_KNOB_TYPE_SPLIT_COLS = 33,
891
+ CUDNN_KNOB_TYPE_TILE_ROWS = 34,
892
+ CUDNN_KNOB_TYPE_TILE_COLS = 35,
893
+ CUDNN_KNOB_TYPE_LOAD_SIZE = 36,
894
+ CUDNN_KNOB_TYPE_CTA_COUNT = 37,
895
+ CUDNN_KNOB_TYPE_STREAM_K = 38,
896
+ CUDNN_KNOB_TYPE_SPLIT_P_SLC = 39,
897
+ CUDNN_KNOB_TYPE_TILE_M = 40,
898
+ CUDNN_KNOB_TYPE_TILE_N = 41,
899
+ CUDNN_KNOB_TYPE_WARP_SPEC_CFG = 42,
900
+ CUDNN_KNOB_TYPE_COUNTS = 43,
901
+ } cudnnBackendKnobType_t;
902
+
903
+ typedef enum {
904
+ CUDNN_LAYOUT_TYPE_PREFERRED_NCHW = 0,
905
+ CUDNN_LAYOUT_TYPE_PREFERRED_NHWC = 1,
906
+ CUDNN_LAYOUT_TYPE_PREFERRED_PAD4CK = 2,
907
+ CUDNN_LAYOUT_TYPE_PREFERRED_PAD8CK = 3,
908
+ CUDNN_LAYOUT_TYPE_COUNT = 4,
909
+ } cudnnBackendLayoutType_t;
910
+
911
+ typedef enum {
912
+ CUDNN_HEUR_MODE_INSTANT = 0,
913
+ CUDNN_HEUR_MODE_B = 1,
914
+ CUDNN_HEUR_MODE_FALLBACK = 2,
915
+ CUDNN_HEUR_MODE_A = 3,
916
+ CUDNN_HEUR_MODES_COUNT = 4,
917
+ } cudnnBackendHeurMode_t;
918
+
919
+ typedef enum {
920
+ CUDNN_TENSOR_REORDERING_NONE = 0,
921
+ CUDNN_TENSOR_REORDERING_INT8x32 = 1,
922
+ CUDNN_TENSOR_REORDERING_F16x16 = 2,
923
+ CUDNN_TENSOR_REORDERING_F8_128x4 = 3,
924
+ } cudnnBackendTensorReordering_t;
925
+
926
+ typedef enum {
927
+ CUDNN_ZERO_PAD = 0,
928
+ CUDNN_NEG_INF_PAD = 1,
929
+ CUDNN_EDGE_VAL_PAD = 2,
930
+ } cudnnPaddingMode_t;
931
+
932
+ typedef enum {
933
+ CUDNN_LAYER_NORM = 0,
934
+ CUDNN_INSTANCE_NORM = 1,
935
+ CUDNN_BATCH_NORM = 2,
936
+ CUDNN_GROUP_NORM = 3,
937
+ CUDNN_RMS_NORM = 4,
938
+ CUDNN_ADA_LAYER_NORM = 5,
939
+ } cudnnBackendNormMode_t;
940
+
941
+ typedef enum {
942
+ CUDNN_NORM_FWD_INFERENCE = 0,
943
+ CUDNN_NORM_FWD_TRAINING = 1,
944
+ } cudnnBackendNormFwdPhase_t;
945
+
946
+ cudnnStatus_t CUDNNWINAPI
947
+ cudnnBackendCreateDescriptor(cudnnBackendDescriptorType_t descriptorType, cudnnBackendDescriptor_t *descriptor);
948
+
949
+ cudnnStatus_t CUDNNWINAPI
950
+ cudnnBackendDestroyDescriptor(cudnnBackendDescriptor_t descriptor);
951
+
952
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
953
+ cudnnBackendInitialize(cudnnBackendDescriptor_t descriptor);
954
+
955
+ cudnnStatus_t CUDNNWINAPI
956
+ cudnnBackendFinalize(cudnnBackendDescriptor_t descriptor);
957
+
958
+ cudnnStatus_t CUDNNWINAPI
959
+ cudnnBackendSetAttribute(cudnnBackendDescriptor_t descriptor,
960
+ cudnnBackendAttributeName_t attributeName,
961
+ cudnnBackendAttributeType_t attributeType,
962
+ int64_t elementCount,
963
+ const void *arrayOfElements);
964
+
965
+ cudnnStatus_t CUDNNWINAPI
966
+ cudnnBackendGetAttribute(cudnnBackendDescriptor_t const descriptor,
967
+ cudnnBackendAttributeName_t attributeName,
968
+ cudnnBackendAttributeType_t attributeType,
969
+ int64_t requestedElementCount,
970
+ int64_t *elementCount,
971
+ void *arrayOfElements);
972
+
973
+ cudnnStatus_t CUDNNWINAPI
974
+ cudnnBackendExecute(cudnnHandle_t handle, cudnnBackendDescriptor_t executionPlan, cudnnBackendDescriptor_t variantPack);
975
+
976
+ cudnnStatus_t CUDNNWINAPI
977
+ cudnnBackendPopulateCudaGraph(cudnnHandle_t handle,
978
+ cudnnBackendDescriptor_t executionPlan,
979
+ cudnnBackendDescriptor_t variantPack,
980
+ cudaGraph_t graph);
981
+
982
+ cudnnStatus_t CUDNNWINAPI
983
+ cudnnBackendUpdateCudaGraph(cudnnHandle_t handle,
984
+ cudnnBackendDescriptor_t executionPlan,
985
+ cudnnBackendDescriptor_t variantPack,
986
+ cudaGraph_t graph);
987
+
988
+ #if defined(__cplusplus)
989
+ }
990
+ #endif
991
+
992
+ #endif /* CUDNN_GRAPH_H_ */
.venv/lib/python3.12/site-packages/nvidia/cudnn/include/cudnn_graph_v9.h ADDED
@@ -0,0 +1,992 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * Copyright 2014-2023 NVIDIA Corporation. All rights reserved.
3
+ *
4
+ * NOTICE TO LICENSEE:
5
+ *
6
+ * This source code and/or documentation ("Licensed Deliverables") are
7
+ * subject to NVIDIA intellectual property rights under U.S. and
8
+ * international Copyright laws.
9
+ *
10
+ * These Licensed Deliverables contained herein is PROPRIETARY and
11
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
12
+ * conditions of a form of NVIDIA software license agreement by and
13
+ * between NVIDIA and Licensee ("License Agreement") or electronically
14
+ * accepted by Licensee. Notwithstanding any terms or conditions to
15
+ * the contrary in the License Agreement, reproduction or disclosure
16
+ * of the Licensed Deliverables to any third party without the express
17
+ * written consent of NVIDIA is prohibited.
18
+ *
19
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
20
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
21
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
22
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
23
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
24
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
25
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
26
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
27
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
28
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
29
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
30
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
31
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
32
+ * OF THESE LICENSED DELIVERABLES.
33
+ *
34
+ * U.S. Government End Users. These Licensed Deliverables are a
35
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
36
+ * 1995), consisting of "commercial computer software" and "commercial
37
+ * computer software documentation" as such terms are used in 48
38
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
39
+ * only as a commercial end item. Consistent with 48 C.F.R.12.212 and
40
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
41
+ * U.S. Government End Users acquire the Licensed Deliverables with
42
+ * only those rights set forth herein.
43
+ *
44
+ * Any use of the Licensed Deliverables in individual and commercial
45
+ * software must include, in the user documentation and internal
46
+ * comments to the code, the above Disclaimer and U.S. Government End
47
+ * Users Notice.
48
+ */
49
+
50
+ /*
51
+ * cudnn_graph : cuDNN's basic definitions operations.
52
+ */
53
+
54
+ #if !defined(CUDNN_GRAPH_H_)
55
+ #define CUDNN_GRAPH_H_
56
+
57
+ #include <cuda_runtime_api.h>
58
+ #include <library_types.h>
59
+
60
+ #include <stdint.h>
61
+
62
+ #include "cudnn_version.h"
63
+
64
+ /* These version numbers are autogenerated, do not edit manually. */
65
+ #define CUDNN_GRAPH_MAJOR 9
66
+ #define CUDNN_GRAPH_MINOR 10
67
+ #define CUDNN_GRAPH_PATCH 2
68
+
69
+ #if (CUDNN_GRAPH_MAJOR != CUDNN_MAJOR) || (CUDNN_GRAPH_MINOR != CUDNN_MINOR) || (CUDNN_GRAPH_PATCH != CUDNN_PATCHLEVEL)
70
+ #error Version mismatch in cuDNN GRAPH!!!
71
+ #endif
72
+
73
+ #ifndef CUDNNWINAPI
74
+ #ifdef _WIN32
75
+ #define CUDNNWINAPI __stdcall
76
+ #else
77
+ #define CUDNNWINAPI
78
+ #endif
79
+ #endif
80
+
81
+ /* Warnings for deprecated API-s are enabled using the CUDNN_WARN_DEPRECATED macro */
82
+ #if defined(CUDNN_WARN_DEPRECATED) && (defined(__GNUC__) || defined(__clang__))
83
+ /* GCC, Intel C/C++, Cray C/C++, CLANG, IBM XL C/C++ little endian */
84
+ #define CUDNN_DEPRECATED __attribute__((deprecated))
85
+ #define CUDNN_DEPRECATED_ENUM __attribute__((deprecated))
86
+ #elif defined(CUDNN_WARN_DEPRECATED) && defined(_MSC_VER)
87
+ /* Microsoft Visual C++ */
88
+ #define CUDNN_DEPRECATED __declspec(deprecated)
89
+ #define CUDNN_DEPRECATED_ENUM __declspec(deprecated)
90
+ #elif defined(CUDNN_WARN_DEPRECATED) && (__cplusplus >= 201402L)
91
+ /* C++14 compilers */
92
+ #define CUDNN_DEPRECATED [[deprecated]]
93
+ #define CUDNN_DEPRECATED_ENUM [[deprecated]]
94
+ #else
95
+ /* No support for the deprecated attribute */
96
+ #define CUDNN_DEPRECATED
97
+ #define CUDNN_DEPRECATED_ENUM
98
+ #endif
99
+
100
+ #if defined(__cplusplus)
101
+ extern "C" {
102
+ #endif
103
+
104
+ struct cudnnContext;
105
+ typedef struct cudnnContext *cudnnHandle_t;
106
+
107
+ size_t CUDNNWINAPI
108
+ cudnnGetVersion(void);
109
+
110
+ size_t CUDNNWINAPI
111
+ cudnnGetMaxDeviceVersion(void);
112
+
113
+ /* Returns CUDA Runtime version statically linked against cudnn */
114
+ size_t CUDNNWINAPI
115
+ cudnnGetCudartVersion(void);
116
+
117
+ /*
118
+ * CUDNN return codes
119
+ */
120
+ typedef enum {
121
+ CUDNN_STATUS_SUCCESS = 0,
122
+
123
+ /* Uncategorized errors */
124
+ CUDNN_STATUS_NOT_INITIALIZED = 1001,
125
+ CUDNN_STATUS_SUBLIBRARY_VERSION_MISMATCH = 1002,
126
+ CUDNN_STATUS_SERIALIZATION_VERSION_MISMATCH = 1003,
127
+ CUDNN_STATUS_DEPRECATED = 1004,
128
+ CUDNN_STATUS_LICENSE_ERROR = 1005,
129
+ CUDNN_STATUS_RUNTIME_IN_PROGRESS = 1006,
130
+ CUDNN_STATUS_RUNTIME_FP_OVERFLOW = 1007,
131
+ CUDNN_STATUS_SUBLIBRARY_LOADING_FAILED = 1008,
132
+
133
+ CUDNN_STATUS_BAD_PARAM = 2000,
134
+ CUDNN_STATUS_BAD_PARAM_NULL_POINTER = 2002,
135
+ CUDNN_STATUS_BAD_PARAM_MISALIGNED_POINTER = 2003,
136
+ CUDNN_STATUS_BAD_PARAM_NOT_FINALIZED = 2004,
137
+ CUDNN_STATUS_BAD_PARAM_OUT_OF_BOUND = 2005,
138
+ CUDNN_STATUS_BAD_PARAM_SIZE_INSUFFICIENT = 2006,
139
+ CUDNN_STATUS_BAD_PARAM_STREAM_MISMATCH = 2007,
140
+ CUDNN_STATUS_BAD_PARAM_SHAPE_MISMATCH = 2008,
141
+ CUDNN_STATUS_BAD_PARAM_DUPLICATED_ENTRIES = 2009,
142
+ CUDNN_STATUS_BAD_PARAM_ATTRIBUTE_TYPE = 2010,
143
+ CUDNN_STATUS_BAD_PARAM_CUDA_GRAPH_MISMATCH = 2011,
144
+ CUDNN_STATUS_BAD_PARAM_DESCRIPTOR_TYPE = 2012,
145
+
146
+ CUDNN_STATUS_NOT_SUPPORTED = 3000,
147
+ CUDNN_STATUS_NOT_SUPPORTED_GRAPH_PATTERN = 3001,
148
+ CUDNN_STATUS_NOT_SUPPORTED_SHAPE = 3002,
149
+ CUDNN_STATUS_NOT_SUPPORTED_DATA_TYPE = 3003,
150
+ CUDNN_STATUS_NOT_SUPPORTED_LAYOUT = 3004,
151
+ CUDNN_STATUS_NOT_SUPPORTED_INCOMPATIBLE_CUDA_DRIVER = 3005,
152
+ CUDNN_STATUS_NOT_SUPPORTED_INCOMPATIBLE_CUDART = 3006,
153
+ CUDNN_STATUS_NOT_SUPPORTED_ARCH_MISMATCH = 3007,
154
+ CUDNN_STATUS_NOT_SUPPORTED_RUNTIME_PREREQUISITE_MISSING = 3008,
155
+ CUDNN_STATUS_NOT_SUPPORTED_SUBLIBRARY_UNAVAILABLE = 3009,
156
+ CUDNN_STATUS_NOT_SUPPORTED_SHARED_MEMORY_INSUFFICIENT = 3010,
157
+ CUDNN_STATUS_NOT_SUPPORTED_PADDING = 3011,
158
+ CUDNN_STATUS_NOT_SUPPORTED_BAD_LAUNCH_PARAM = 3012,
159
+ CUDNN_STATUS_NOT_SUPPORTED_CUDA_GRAPH_NATIVE_API = 3013,
160
+
161
+ CUDNN_STATUS_INTERNAL_ERROR = 4000,
162
+ CUDNN_STATUS_INTERNAL_ERROR_COMPILATION_FAILED = 4001,
163
+ CUDNN_STATUS_INTERNAL_ERROR_UNEXPECTED_VALUE = 4002,
164
+ CUDNN_STATUS_INTERNAL_ERROR_HOST_ALLOCATION_FAILED = 4003,
165
+ CUDNN_STATUS_INTERNAL_ERROR_DEVICE_ALLOCATION_FAILED = 4004,
166
+ CUDNN_STATUS_INTERNAL_ERROR_BAD_LAUNCH_PARAM = 4005,
167
+ CUDNN_STATUS_INTERNAL_ERROR_TEXTURE_CREATION_FAILED = 4006,
168
+
169
+ CUDNN_STATUS_EXECUTION_FAILED = 5000,
170
+ CUDNN_STATUS_EXECUTION_FAILED_CUDA_DRIVER = 5001,
171
+ CUDNN_STATUS_EXECUTION_FAILED_CUBLAS = 5002,
172
+ CUDNN_STATUS_EXECUTION_FAILED_CUDART = 5003,
173
+ CUDNN_STATUS_EXECUTION_FAILED_CURAND = 5004,
174
+
175
+ CUDNN_STATUS_ALLOC_FAILED CUDNN_DEPRECATED_ENUM = CUDNN_STATUS_INTERNAL_ERROR_HOST_ALLOCATION_FAILED,
176
+ CUDNN_STATUS_INVALID_VALUE CUDNN_DEPRECATED_ENUM = 2001 /* please transition to CUDNN_STATUS_BAD_PARAM instead */,
177
+ CUDNN_STATUS_ARCH_MISMATCH CUDNN_DEPRECATED_ENUM = CUDNN_STATUS_NOT_SUPPORTED_ARCH_MISMATCH,
178
+ CUDNN_STATUS_MAPPING_ERROR CUDNN_DEPRECATED_ENUM = CUDNN_STATUS_INTERNAL_ERROR_TEXTURE_CREATION_FAILED,
179
+ CUDNN_STATUS_RUNTIME_PREREQUISITE_MISSING CUDNN_DEPRECATED_ENUM =
180
+ CUDNN_STATUS_NOT_SUPPORTED_RUNTIME_PREREQUISITE_MISSING,
181
+ CUDNN_STATUS_VERSION_MISMATCH CUDNN_DEPRECATED_ENUM = CUDNN_STATUS_SUBLIBRARY_VERSION_MISMATCH,
182
+ } cudnnStatus_t;
183
+
184
+ #define CUDNN_STATUS_FULL_ERROR_CODE(category, specific_err) ((cudnnStatus_t)(0 + (category) + (specific_err)))
185
+ #define CUDNN_STATUS_CATEGORY(full_error_code) ((full_error_code) / 1000 * 1000)
186
+ #define CUDNN_STATUS_SPECIFIC_ERROR(full_error_code) ((full_error_code) % 1000)
187
+
188
+ /* human-readable error messages */
189
+ const char *CUDNNWINAPI
190
+ cudnnGetErrorString(cudnnStatus_t status);
191
+
192
+ void CUDNNWINAPI
193
+ cudnnGetLastErrorString(char *message, size_t max_size);
194
+
195
+ /* Forward definition in this version only */
196
+ typedef struct cudnnRuntimeTag_t cudnnRuntimeTag_t CUDNN_DEPRECATED;
197
+
198
+ typedef enum {
199
+ CUDNN_ERRQUERY_RAWCODE = 0,
200
+ CUDNN_ERRQUERY_NONBLOCKING = 1,
201
+ CUDNN_ERRQUERY_BLOCKING = 2,
202
+ } cudnnErrQueryMode_t;
203
+
204
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
205
+ cudnnQueryRuntimeError(cudnnHandle_t handle, cudnnStatus_t *rstatus, cudnnErrQueryMode_t mode, cudnnRuntimeTag_t *tag);
206
+
207
+ cudnnStatus_t CUDNNWINAPI
208
+ cudnnGetProperty(libraryPropertyType type, int *value);
209
+
210
+ cudnnStatus_t CUDNNWINAPI
211
+ cudnnCreate(cudnnHandle_t *handle);
212
+ cudnnStatus_t CUDNNWINAPI
213
+ cudnnDestroy(cudnnHandle_t handle);
214
+ cudnnStatus_t CUDNNWINAPI
215
+ cudnnSetStream(cudnnHandle_t handle, cudaStream_t streamId);
216
+ cudnnStatus_t CUDNNWINAPI
217
+ cudnnGetStream(cudnnHandle_t handle, cudaStream_t *streamId);
218
+ /*
219
+ * CUDNN data type
220
+ */
221
+ typedef enum {
222
+ CUDNN_DATA_FLOAT = 0,
223
+ CUDNN_DATA_DOUBLE = 1,
224
+ CUDNN_DATA_HALF = 2,
225
+ CUDNN_DATA_INT8 = 3,
226
+ CUDNN_DATA_INT32 = 4,
227
+ CUDNN_DATA_INT8x4 CUDNN_DEPRECATED_ENUM = 5,
228
+ CUDNN_DATA_UINT8 = 6,
229
+ CUDNN_DATA_UINT8x4 CUDNN_DEPRECATED_ENUM = 7,
230
+ CUDNN_DATA_INT8x32 CUDNN_DEPRECATED_ENUM = 8,
231
+ CUDNN_DATA_BFLOAT16 = 9,
232
+ CUDNN_DATA_INT64 = 10,
233
+ CUDNN_DATA_BOOLEAN = 11,
234
+ CUDNN_DATA_FP8_E4M3 = 12,
235
+ CUDNN_DATA_FP8_E5M2 = 13,
236
+ CUDNN_DATA_FAST_FLOAT_FOR_FP8 = 14,
237
+ CUDNN_DATA_FP8_E8M0 = 15,
238
+ CUDNN_DATA_FP4_E2M1 = 16,
239
+ } cudnnDataType_t;
240
+
241
+ /*
242
+ * CUDNN math type
243
+ */
244
+ typedef enum {
245
+ CUDNN_DEFAULT_MATH = 0,
246
+ CUDNN_TENSOR_OP_MATH = 1,
247
+ CUDNN_TENSOR_OP_MATH_ALLOW_CONVERSION = 2,
248
+ CUDNN_FMA_MATH = 3,
249
+ } cudnnMathType_t;
250
+
251
+ /*
252
+ * CUDNN propagate Nan
253
+ */
254
+ typedef enum {
255
+ CUDNN_NOT_PROPAGATE_NAN CUDNN_DEPRECATED_ENUM = 0,
256
+ CUDNN_PROPAGATE_NAN CUDNN_DEPRECATED_ENUM = 1,
257
+ } cudnnNanPropagation_t;
258
+
259
+ /*
260
+ * Behavior for OOB samples. OOB samples are samples where L+R > T is encountered during the gradient calculation. If
261
+ * gradMode is set to CUDNN_CTC_SKIP_OOB_GRADIENTS, then the CTC loss function does not write to the gradient buffer for
262
+ * that sample. Instead, the current values, even not finite, are retained. If gradMode is set to
263
+ * CUDNN_CTC_ZERO_OOB_GRADIENTS, then the gradient for that sample is set to zero. This guarantees a finite gradient.
264
+ */
265
+ typedef enum {
266
+ CUDNN_CTC_ZERO_OOB_GRADIENTS = 0,
267
+ CUDNN_CTC_SKIP_OOB_GRADIENTS = 1,
268
+ } cudnnCTCGradMode_t;
269
+
270
+ typedef enum {
271
+ CUDNN_TENSOR_NCHW = 0, /* row major (wStride = 1, hStride = w) */
272
+ CUDNN_TENSOR_NHWC = 1, /* feature maps interleaved ( cStride = 1 )*/
273
+ CUDNN_TENSOR_NCHW_VECT_C = 2, /* each image point is vector of element of C, vector length in data type */
274
+ } cudnnTensorFormat_t;
275
+
276
+ /*
277
+ * CUDNN ReduceTensor op type
278
+ */
279
+ typedef enum {
280
+ CUDNN_REDUCE_TENSOR_ADD = 0,
281
+ CUDNN_REDUCE_TENSOR_MUL = 1,
282
+ CUDNN_REDUCE_TENSOR_MIN = 2,
283
+ CUDNN_REDUCE_TENSOR_MAX = 3,
284
+ CUDNN_REDUCE_TENSOR_AMAX = 4,
285
+ CUDNN_REDUCE_TENSOR_AVG = 5,
286
+ CUDNN_REDUCE_TENSOR_NORM1 = 6,
287
+ CUDNN_REDUCE_TENSOR_NORM2 = 7,
288
+ CUDNN_REDUCE_TENSOR_MUL_NO_ZEROS = 8,
289
+ } cudnnReduceTensorOp_t;
290
+
291
+ /*
292
+ * activation mode
293
+ */
294
+ typedef enum {
295
+ CUDNN_ACTIVATION_SIGMOID = 0,
296
+ CUDNN_ACTIVATION_RELU = 1,
297
+ CUDNN_ACTIVATION_TANH = 2,
298
+ CUDNN_ACTIVATION_CLIPPED_RELU = 3,
299
+ CUDNN_ACTIVATION_ELU = 4,
300
+ CUDNN_ACTIVATION_IDENTITY = 5,
301
+ CUDNN_ACTIVATION_SWISH = 6
302
+ } cudnnActivationMode_t CUDNN_DEPRECATED;
303
+
304
+ typedef enum {
305
+ CUDNN_SEV_FATAL = 0,
306
+ CUDNN_SEV_ERROR = 1,
307
+ CUDNN_SEV_WARNING = 2,
308
+ CUDNN_SEV_INFO = 3,
309
+ } cudnnSeverity_t;
310
+
311
+ /* Message masks to be used with cudnnSetCallback() */
312
+ #define CUDNN_SEV_ERROR_EN (1U << CUDNN_SEV_ERROR)
313
+ #define CUDNN_SEV_WARNING_EN (1U << CUDNN_SEV_WARNING)
314
+ #define CUDNN_SEV_INFO_EN (1U << CUDNN_SEV_INFO)
315
+
316
+ /* struct containing useful informaiton for each API call */
317
+ typedef struct cudnnDebugStruct {
318
+ unsigned cudnn_version;
319
+ cudnnStatus_t cudnnStatus;
320
+ unsigned time_sec; /* epoch time in seconds */
321
+ unsigned time_usec; /* microseconds part of epoch time */
322
+ unsigned time_delta; /* time since start in seconds */
323
+ cudnnHandle_t handle; /* cudnn handle */
324
+ cudaStream_t stream; /* cuda stream ID */
325
+ unsigned long long pid; /* process ID */
326
+ unsigned long long tid; /* thread ID */
327
+ int cudaDeviceId; /* CUDA device ID */
328
+ int reserved[15]; /* reserved for future use */
329
+ } cudnnDebug_t;
330
+
331
+ typedef void (*cudnnCallback_t)(cudnnSeverity_t sev, void *udata, const cudnnDebug_t *dbg, const char *msg);
332
+
333
+ cudnnStatus_t CUDNNWINAPI
334
+ cudnnSetCallback(unsigned mask, void *udata, cudnnCallback_t fptr);
335
+
336
+ cudnnStatus_t CUDNNWINAPI
337
+ cudnnGetCallback(unsigned *mask, void **udata, cudnnCallback_t *fptr);
338
+
339
+ /*
340
+ * \brief Cross-library version checker.
341
+ * This function is implemented differently in each sub-library. Each sublib
342
+ * checks whether its own version matches that of its dependencies.
343
+ * \returns CUDNN_STATUS_SUCCESS if the version check passes,
344
+ * CUDNN_STATUS_SUBLIBRARY_VERSION_MISMATCH if the versions are inconsistent.
345
+ */
346
+ cudnnStatus_t CUDNNWINAPI
347
+ cudnnGraphVersionCheck(void);
348
+
349
+ /* Maximum supported number of tensor dimensions */
350
+ #define CUDNN_DIM_MAX 8
351
+
352
+ /*
353
+ * convolution mode
354
+ */
355
+ typedef enum { CUDNN_CONVOLUTION = 0, CUDNN_CROSS_CORRELATION = 1 } cudnnConvolutionMode_t;
356
+
357
+ /*
358
+ * CUDNN Reorder
359
+ */
360
+ typedef enum {
361
+ CUDNN_DEFAULT_REORDER = 0,
362
+ CUDNN_NO_REORDER = 1,
363
+ } cudnnReorderType_t CUDNN_DEPRECATED;
364
+
365
+ typedef void *cudnnBackendDescriptor_t;
366
+
367
+ typedef struct cudnnFractionStruct {
368
+ int64_t numerator;
369
+ int64_t denominator;
370
+ } cudnnFraction_t;
371
+
372
+ typedef enum {
373
+ CUDNN_POINTWISE_ADD = 0,
374
+ CUDNN_POINTWISE_ADD_SQUARE = 5,
375
+ CUDNN_POINTWISE_DIV = 6,
376
+ CUDNN_POINTWISE_MAX = 3,
377
+ CUDNN_POINTWISE_MIN = 2,
378
+ CUDNN_POINTWISE_MOD = 7,
379
+ CUDNN_POINTWISE_MUL = 1,
380
+ CUDNN_POINTWISE_POW = 8,
381
+ CUDNN_POINTWISE_SUB = 9,
382
+
383
+ CUDNN_POINTWISE_ABS = 10,
384
+ CUDNN_POINTWISE_CEIL = 11,
385
+ CUDNN_POINTWISE_COS = 12,
386
+ CUDNN_POINTWISE_EXP = 13,
387
+ CUDNN_POINTWISE_FLOOR = 14,
388
+ CUDNN_POINTWISE_LOG = 15,
389
+ CUDNN_POINTWISE_NEG = 16,
390
+ CUDNN_POINTWISE_RSQRT = 17,
391
+ CUDNN_POINTWISE_SIN = 18,
392
+ CUDNN_POINTWISE_SQRT = 4,
393
+ CUDNN_POINTWISE_TAN = 19,
394
+ CUDNN_POINTWISE_ERF = 20,
395
+ CUDNN_POINTWISE_IDENTITY = 21,
396
+ CUDNN_POINTWISE_RECIPROCAL = 22,
397
+ CUDNN_POINTWISE_ATAN2 = 23,
398
+
399
+ CUDNN_POINTWISE_RELU_FWD = 100,
400
+ CUDNN_POINTWISE_TANH_FWD = 101,
401
+ CUDNN_POINTWISE_SIGMOID_FWD = 102,
402
+ CUDNN_POINTWISE_ELU_FWD = 103,
403
+ CUDNN_POINTWISE_GELU_FWD = 104,
404
+ CUDNN_POINTWISE_SOFTPLUS_FWD = 105,
405
+ CUDNN_POINTWISE_SWISH_FWD = 106,
406
+ CUDNN_POINTWISE_GELU_APPROX_TANH_FWD = 107,
407
+
408
+ CUDNN_POINTWISE_RELU_BWD = 200,
409
+ CUDNN_POINTWISE_TANH_BWD = 201,
410
+ CUDNN_POINTWISE_SIGMOID_BWD = 202,
411
+ CUDNN_POINTWISE_ELU_BWD = 203,
412
+ CUDNN_POINTWISE_GELU_BWD = 204,
413
+ CUDNN_POINTWISE_SOFTPLUS_BWD = 205,
414
+ CUDNN_POINTWISE_SWISH_BWD = 206,
415
+ CUDNN_POINTWISE_GELU_APPROX_TANH_BWD = 207,
416
+
417
+ CUDNN_POINTWISE_CMP_EQ = 300,
418
+ CUDNN_POINTWISE_CMP_NEQ = 301,
419
+ CUDNN_POINTWISE_CMP_GT = 302,
420
+ CUDNN_POINTWISE_CMP_GE = 303,
421
+ CUDNN_POINTWISE_CMP_LT = 304,
422
+ CUDNN_POINTWISE_CMP_LE = 305,
423
+
424
+ CUDNN_POINTWISE_LOGICAL_AND = 400,
425
+ CUDNN_POINTWISE_LOGICAL_OR = 401,
426
+ CUDNN_POINTWISE_LOGICAL_NOT = 402,
427
+
428
+ CUDNN_POINTWISE_GEN_INDEX = 501,
429
+
430
+ CUDNN_POINTWISE_BINARY_SELECT = 601,
431
+ } cudnnPointwiseMode_t;
432
+
433
+ typedef enum {
434
+ CUDNN_RESAMPLE_NEAREST = 0,
435
+ CUDNN_RESAMPLE_BILINEAR = 1,
436
+ CUDNN_RESAMPLE_AVGPOOL = 2,
437
+ CUDNN_RESAMPLE_AVGPOOL_INCLUDE_PADDING = 2,
438
+ CUDNN_RESAMPLE_AVGPOOL_EXCLUDE_PADDING = 4,
439
+ CUDNN_RESAMPLE_MAXPOOL = 3,
440
+ } cudnnResampleMode_t;
441
+
442
+ typedef enum {
443
+ CUDNN_SIGNAL_SET = 0,
444
+ CUDNN_SIGNAL_WAIT = 1,
445
+ } cudnnSignalMode_t;
446
+
447
+ typedef enum {
448
+ CUDNN_GENSTATS_SUM_SQSUM = 0,
449
+ } cudnnGenStatsMode_t;
450
+
451
+ typedef enum {
452
+ CUDNN_BN_FINALIZE_STATISTICS_TRAINING = 0,
453
+ CUDNN_BN_FINALIZE_STATISTICS_INFERENCE = 1,
454
+ } cudnnBnFinalizeStatsMode_t;
455
+
456
+ typedef enum {
457
+ CUDNN_RNG_DISTRIBUTION_BERNOULLI = 0,
458
+ CUDNN_RNG_DISTRIBUTION_UNIFORM = 1,
459
+ CUDNN_RNG_DISTRIBUTION_NORMAL = 2,
460
+ } cudnnRngDistribution_t;
461
+
462
+ typedef enum {
463
+ CUDNN_ATTR_POINTWISE_MODE = 0,
464
+ CUDNN_ATTR_POINTWISE_MATH_PREC = 1,
465
+ CUDNN_ATTR_POINTWISE_NAN_PROPAGATION CUDNN_DEPRECATED_ENUM = 2,
466
+ CUDNN_ATTR_POINTWISE_RELU_LOWER_CLIP = 3,
467
+ CUDNN_ATTR_POINTWISE_RELU_UPPER_CLIP = 4,
468
+ CUDNN_ATTR_POINTWISE_RELU_LOWER_CLIP_SLOPE = 5,
469
+ CUDNN_ATTR_POINTWISE_ELU_ALPHA = 6,
470
+ CUDNN_ATTR_POINTWISE_SOFTPLUS_BETA = 7,
471
+ CUDNN_ATTR_POINTWISE_SWISH_BETA = 8,
472
+ CUDNN_ATTR_POINTWISE_AXIS = 9,
473
+
474
+ CUDNN_ATTR_CONVOLUTION_COMP_TYPE = 100,
475
+ CUDNN_ATTR_CONVOLUTION_CONV_MODE = 101,
476
+ CUDNN_ATTR_CONVOLUTION_DILATIONS = 102,
477
+ CUDNN_ATTR_CONVOLUTION_FILTER_STRIDES = 103,
478
+ CUDNN_ATTR_CONVOLUTION_POST_PADDINGS = 104,
479
+ CUDNN_ATTR_CONVOLUTION_PRE_PADDINGS = 105,
480
+ CUDNN_ATTR_CONVOLUTION_SPATIAL_DIMS = 106,
481
+
482
+ CUDNN_ATTR_ENGINEHEUR_MODE = 200,
483
+ CUDNN_ATTR_ENGINEHEUR_OPERATION_GRAPH = 201,
484
+ CUDNN_ATTR_ENGINEHEUR_RESULTS = 202,
485
+ CUDNN_ATTR_ENGINEHEUR_SM_COUNT_TARGET = 203,
486
+ CUDNN_ATTR_ENGINEHEUR_DEVICEPROP = 204,
487
+
488
+ CUDNN_ATTR_ENGINECFG_ENGINE = 300,
489
+ CUDNN_ATTR_ENGINECFG_INTERMEDIATE_INFO = 301,
490
+ CUDNN_ATTR_ENGINECFG_KNOB_CHOICES = 302,
491
+ CUDNN_ATTR_ENGINECFG_WORKSPACE_SIZE = 303,
492
+ CUDNN_ATTR_ENGINECFG_SHARED_MEMORY_USED = 304,
493
+
494
+ CUDNN_ATTR_EXECUTION_PLAN_HANDLE CUDNN_DEPRECATED_ENUM = 400,
495
+ CUDNN_ATTR_EXECUTION_PLAN_ENGINE_CONFIG = 401,
496
+ CUDNN_ATTR_EXECUTION_PLAN_WORKSPACE_SIZE = 402,
497
+ CUDNN_ATTR_EXECUTION_PLAN_COMPUTED_INTERMEDIATE_UIDS = 403,
498
+ CUDNN_ATTR_EXECUTION_PLAN_RUN_ONLY_INTERMEDIATE_UIDS = 404,
499
+ CUDNN_ATTR_EXECUTION_PLAN_JSON_REPRESENTATION = 405,
500
+ CUDNN_ATTR_EXECUTION_PLAN_KERNEL_CACHE = 406,
501
+ CUDNN_ATTR_EXECUTION_PLAN_DEVICEPROP = 407,
502
+
503
+ CUDNN_ATTR_INTERMEDIATE_INFO_UNIQUE_ID = 500,
504
+ CUDNN_ATTR_INTERMEDIATE_INFO_SIZE = 501,
505
+ CUDNN_ATTR_INTERMEDIATE_INFO_DEPENDENT_DATA_UIDS = 502,
506
+ CUDNN_ATTR_INTERMEDIATE_INFO_DEPENDENT_ATTRIBUTES = 503,
507
+
508
+ CUDNN_ATTR_KNOB_CHOICE_KNOB_TYPE = 600,
509
+ CUDNN_ATTR_KNOB_CHOICE_KNOB_VALUE = 601,
510
+
511
+ CUDNN_ATTR_OPERATION_CONVOLUTION_FORWARD_ALPHA = 700,
512
+ CUDNN_ATTR_OPERATION_CONVOLUTION_FORWARD_BETA = 701,
513
+ CUDNN_ATTR_OPERATION_CONVOLUTION_FORWARD_CONV_DESC = 702,
514
+ CUDNN_ATTR_OPERATION_CONVOLUTION_FORWARD_W = 703,
515
+ CUDNN_ATTR_OPERATION_CONVOLUTION_FORWARD_X = 704,
516
+ CUDNN_ATTR_OPERATION_CONVOLUTION_FORWARD_Y = 705,
517
+ CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_DATA_ALPHA = 706,
518
+ CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_DATA_BETA = 707,
519
+ CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_DATA_CONV_DESC = 708,
520
+ CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_DATA_W = 709,
521
+ CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_DATA_DX = 710,
522
+ CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_DATA_DY = 711,
523
+ CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_FILTER_ALPHA = 712,
524
+ CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_FILTER_BETA = 713,
525
+ CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_FILTER_CONV_DESC = 714,
526
+ CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_FILTER_DW = 715,
527
+ CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_FILTER_X = 716,
528
+ CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_FILTER_DY = 717,
529
+
530
+ CUDNN_ATTR_OPERATION_POINTWISE_PW_DESCRIPTOR = 750,
531
+ CUDNN_ATTR_OPERATION_POINTWISE_XDESC = 751,
532
+ CUDNN_ATTR_OPERATION_POINTWISE_BDESC = 752,
533
+ CUDNN_ATTR_OPERATION_POINTWISE_YDESC = 753,
534
+ CUDNN_ATTR_OPERATION_POINTWISE_ALPHA1 = 754,
535
+ CUDNN_ATTR_OPERATION_POINTWISE_ALPHA2 = 755,
536
+ CUDNN_ATTR_OPERATION_POINTWISE_DXDESC = 756,
537
+ CUDNN_ATTR_OPERATION_POINTWISE_DYDESC = 757,
538
+ CUDNN_ATTR_OPERATION_POINTWISE_TDESC = 758,
539
+
540
+ CUDNN_ATTR_OPERATION_GENSTATS_MODE = 770,
541
+ CUDNN_ATTR_OPERATION_GENSTATS_MATH_PREC = 771,
542
+ CUDNN_ATTR_OPERATION_GENSTATS_XDESC = 772,
543
+ CUDNN_ATTR_OPERATION_GENSTATS_SUMDESC = 773,
544
+ CUDNN_ATTR_OPERATION_GENSTATS_SQSUMDESC = 774,
545
+
546
+ CUDNN_ATTR_OPERATION_BN_FINALIZE_STATS_MODE = 780,
547
+ CUDNN_ATTR_OPERATION_BN_FINALIZE_MATH_PREC = 781,
548
+ CUDNN_ATTR_OPERATION_BN_FINALIZE_Y_SUM_DESC = 782,
549
+ CUDNN_ATTR_OPERATION_BN_FINALIZE_Y_SQ_SUM_DESC = 783,
550
+ CUDNN_ATTR_OPERATION_BN_FINALIZE_SCALE_DESC = 784,
551
+ CUDNN_ATTR_OPERATION_BN_FINALIZE_BIAS_DESC = 785,
552
+ CUDNN_ATTR_OPERATION_BN_FINALIZE_PREV_RUNNING_MEAN_DESC = 786,
553
+ CUDNN_ATTR_OPERATION_BN_FINALIZE_PREV_RUNNING_VAR_DESC = 787,
554
+ CUDNN_ATTR_OPERATION_BN_FINALIZE_UPDATED_RUNNING_MEAN_DESC = 788,
555
+ CUDNN_ATTR_OPERATION_BN_FINALIZE_UPDATED_RUNNING_VAR_DESC = 789,
556
+ CUDNN_ATTR_OPERATION_BN_FINALIZE_SAVED_MEAN_DESC = 790,
557
+ CUDNN_ATTR_OPERATION_BN_FINALIZE_SAVED_INV_STD_DESC = 791,
558
+ CUDNN_ATTR_OPERATION_BN_FINALIZE_EQ_SCALE_DESC = 792,
559
+ CUDNN_ATTR_OPERATION_BN_FINALIZE_EQ_BIAS_DESC = 793,
560
+ CUDNN_ATTR_OPERATION_BN_FINALIZE_ACCUM_COUNT_DESC = 794,
561
+ CUDNN_ATTR_OPERATION_BN_FINALIZE_EPSILON_DESC = 795,
562
+ CUDNN_ATTR_OPERATION_BN_FINALIZE_EXP_AVERATE_FACTOR_DESC = 796,
563
+
564
+ CUDNN_ATTR_OPERATIONGRAPH_HANDLE CUDNN_DEPRECATED_ENUM = 800,
565
+ CUDNN_ATTR_OPERATIONGRAPH_OPS = 801,
566
+ CUDNN_ATTR_OPERATIONGRAPH_ENGINE_GLOBAL_COUNT = 802,
567
+ CUDNN_ATTR_OPERATIONGRAPH_IS_DYNAMIC_SHAPE_ENABLED = 803,
568
+ CUDNN_ATTR_OPERATIONGRAPH_IS_SAME_TOPOLOGY = 804,
569
+
570
+ CUDNN_ATTR_TENSOR_BYTE_ALIGNMENT = 900,
571
+ CUDNN_ATTR_TENSOR_DATA_TYPE = 901,
572
+ CUDNN_ATTR_TENSOR_DIMENSIONS = 902,
573
+ CUDNN_ATTR_TENSOR_STRIDES = 903,
574
+ CUDNN_ATTR_TENSOR_VECTOR_COUNT = 904,
575
+ CUDNN_ATTR_TENSOR_VECTORIZED_DIMENSION = 905,
576
+ CUDNN_ATTR_TENSOR_UNIQUE_ID = 906,
577
+ CUDNN_ATTR_TENSOR_IS_VIRTUAL = 907,
578
+ CUDNN_ATTR_TENSOR_IS_BY_VALUE = 908,
579
+ CUDNN_ATTR_TENSOR_REORDERING_MODE = 909,
580
+ CUDNN_ATTR_TENSOR_RAGGED_OFFSET_DESC = 913,
581
+
582
+ CUDNN_ATTR_VARIANT_PACK_UNIQUE_IDS = 1000,
583
+ CUDNN_ATTR_VARIANT_PACK_DATA_POINTERS = 1001,
584
+ CUDNN_ATTR_VARIANT_PACK_INTERMEDIATES = 1002,
585
+ CUDNN_ATTR_VARIANT_PACK_WORKSPACE = 1003,
586
+
587
+ CUDNN_ATTR_LAYOUT_INFO_TENSOR_UID = 1100,
588
+ CUDNN_ATTR_LAYOUT_INFO_TYPES = 1101,
589
+
590
+ CUDNN_ATTR_KNOB_INFO_TYPE = 1200,
591
+ CUDNN_ATTR_KNOB_INFO_MAXIMUM_VALUE = 1201,
592
+ CUDNN_ATTR_KNOB_INFO_MINIMUM_VALUE = 1202,
593
+ CUDNN_ATTR_KNOB_INFO_STRIDE = 1203,
594
+
595
+ CUDNN_ATTR_ENGINE_OPERATION_GRAPH = 1300,
596
+ CUDNN_ATTR_ENGINE_GLOBAL_INDEX = 1301,
597
+ CUDNN_ATTR_ENGINE_KNOB_INFO = 1302,
598
+ CUDNN_ATTR_ENGINE_NUMERICAL_NOTE = 1303,
599
+ CUDNN_ATTR_ENGINE_LAYOUT_INFO = 1304,
600
+ CUDNN_ATTR_ENGINE_BEHAVIOR_NOTE = 1305,
601
+ CUDNN_ATTR_ENGINE_SM_COUNT_TARGET = 1306,
602
+ CUDNN_ATTR_ENGINE_DEVICEPROP = 1307,
603
+
604
+ CUDNN_ATTR_MATMUL_COMP_TYPE = 1500,
605
+ CUDNN_ATTR_MATMUL_PADDING_VALUE = 1503,
606
+
607
+ CUDNN_ATTR_OPERATION_MATMUL_ADESC = 1520,
608
+ CUDNN_ATTR_OPERATION_MATMUL_BDESC = 1521,
609
+ CUDNN_ATTR_OPERATION_MATMUL_CDESC = 1522,
610
+ CUDNN_ATTR_OPERATION_MATMUL_DESC = 1523,
611
+ CUDNN_ATTR_OPERATION_MATMUL_IRREGULARLY_STRIDED_BATCH_COUNT CUDNN_DEPRECATED_ENUM = 1524,
612
+ CUDNN_ATTR_OPERATION_MATMUL_GEMM_M_OVERRIDE_DESC = 1525,
613
+ CUDNN_ATTR_OPERATION_MATMUL_GEMM_N_OVERRIDE_DESC = 1526,
614
+ CUDNN_ATTR_OPERATION_MATMUL_GEMM_K_OVERRIDE_DESC = 1527,
615
+
616
+ CUDNN_ATTR_REDUCTION_OPERATOR = 1600,
617
+ CUDNN_ATTR_REDUCTION_COMP_TYPE = 1601,
618
+
619
+ CUDNN_ATTR_OPERATION_REDUCTION_XDESC = 1610,
620
+ CUDNN_ATTR_OPERATION_REDUCTION_YDESC = 1611,
621
+ CUDNN_ATTR_OPERATION_REDUCTION_DESC = 1612,
622
+
623
+ CUDNN_ATTR_OPERATION_BN_BWD_WEIGHTS_MATH_PREC = 1620,
624
+ CUDNN_ATTR_OPERATION_BN_BWD_WEIGHTS_MEAN_DESC = 1621,
625
+ CUDNN_ATTR_OPERATION_BN_BWD_WEIGHTS_INVSTD_DESC = 1622,
626
+ CUDNN_ATTR_OPERATION_BN_BWD_WEIGHTS_BN_SCALE_DESC = 1623,
627
+ CUDNN_ATTR_OPERATION_BN_BWD_WEIGHTS_X_DESC = 1624,
628
+ CUDNN_ATTR_OPERATION_BN_BWD_WEIGHTS_DY_DESC = 1625,
629
+ CUDNN_ATTR_OPERATION_BN_BWD_WEIGHTS_DBN_SCALE_DESC = 1626,
630
+ CUDNN_ATTR_OPERATION_BN_BWD_WEIGHTS_DBN_BIAS_DESC = 1627,
631
+ CUDNN_ATTR_OPERATION_BN_BWD_WEIGHTS_EQ_DY_SCALE_DESC = 1628,
632
+ CUDNN_ATTR_OPERATION_BN_BWD_WEIGHTS_EQ_X_SCALE_DESC = 1629,
633
+ CUDNN_ATTR_OPERATION_BN_BWD_WEIGHTS_EQ_BIAS = 1630,
634
+
635
+ CUDNN_ATTR_RESAMPLE_MODE = 1700,
636
+ CUDNN_ATTR_RESAMPLE_COMP_TYPE = 1701,
637
+ CUDNN_ATTR_RESAMPLE_SPATIAL_DIMS = 1702,
638
+ CUDNN_ATTR_RESAMPLE_POST_PADDINGS = 1703,
639
+ CUDNN_ATTR_RESAMPLE_PRE_PADDINGS = 1704,
640
+ CUDNN_ATTR_RESAMPLE_STRIDES = 1705,
641
+ CUDNN_ATTR_RESAMPLE_WINDOW_DIMS = 1706,
642
+ CUDNN_ATTR_RESAMPLE_NAN_PROPAGATION = 1707,
643
+ CUDNN_ATTR_RESAMPLE_PADDING_MODE = 1708,
644
+
645
+ CUDNN_ATTR_OPERATION_RESAMPLE_FWD_XDESC = 1710,
646
+ CUDNN_ATTR_OPERATION_RESAMPLE_FWD_YDESC = 1711,
647
+ CUDNN_ATTR_OPERATION_RESAMPLE_FWD_IDXDESC = 1712,
648
+ CUDNN_ATTR_OPERATION_RESAMPLE_FWD_ALPHA CUDNN_DEPRECATED_ENUM = 1713,
649
+ CUDNN_ATTR_OPERATION_RESAMPLE_FWD_BETA CUDNN_DEPRECATED_ENUM = 1714,
650
+ CUDNN_ATTR_OPERATION_RESAMPLE_FWD_DESC = 1716,
651
+
652
+ CUDNN_ATTR_OPERATION_RESAMPLE_BWD_DXDESC = 1720,
653
+ CUDNN_ATTR_OPERATION_RESAMPLE_BWD_DYDESC = 1721,
654
+ CUDNN_ATTR_OPERATION_RESAMPLE_BWD_IDXDESC = 1722,
655
+ CUDNN_ATTR_OPERATION_RESAMPLE_BWD_ALPHA CUDNN_DEPRECATED_ENUM = 1723,
656
+ CUDNN_ATTR_OPERATION_RESAMPLE_BWD_BETA CUDNN_DEPRECATED_ENUM = 1724,
657
+ CUDNN_ATTR_OPERATION_RESAMPLE_BWD_DESC = 1725,
658
+ CUDNN_ATTR_OPERATION_RESAMPLE_BWD_XDESC = 1726,
659
+ CUDNN_ATTR_OPERATION_RESAMPLE_BWD_YDESC = 1727,
660
+
661
+ CUDNN_ATTR_OPERATION_CONCAT_AXIS = 1800,
662
+ CUDNN_ATTR_OPERATION_CONCAT_INPUT_DESCS = 1801,
663
+ CUDNN_ATTR_OPERATION_CONCAT_INPLACE_INDEX = 1802,
664
+ CUDNN_ATTR_OPERATION_CONCAT_OUTPUT_DESC = 1803,
665
+
666
+ CUDNN_ATTR_OPERATION_SIGNAL_MODE = 1900,
667
+ CUDNN_ATTR_OPERATION_SIGNAL_FLAGDESC = 1901,
668
+ CUDNN_ATTR_OPERATION_SIGNAL_VALUE = 1902,
669
+ CUDNN_ATTR_OPERATION_SIGNAL_XDESC = 1903,
670
+ CUDNN_ATTR_OPERATION_SIGNAL_YDESC = 1904,
671
+
672
+ CUDNN_ATTR_OPERATION_PAGED_CACHE_LOAD_CONTAINER_DESC = 1950,
673
+ CUDNN_ATTR_OPERATION_PAGED_CACHE_LOAD_YDESC = 1951,
674
+ CUDNN_ATTR_OPERATION_PAGED_CACHE_LOAD_SEQUENCE_DESC = 1952,
675
+ CUDNN_ATTR_OPERATION_PAGED_CACHE_LOAD_PAGE_TABLE_DESC = 1953,
676
+
677
+ CUDNN_ATTR_OPERATION_NORM_FWD_MODE = 2000,
678
+ CUDNN_ATTR_OPERATION_NORM_FWD_PHASE = 2001,
679
+ CUDNN_ATTR_OPERATION_NORM_FWD_XDESC = 2002,
680
+ CUDNN_ATTR_OPERATION_NORM_FWD_MEAN_DESC = 2003,
681
+ CUDNN_ATTR_OPERATION_NORM_FWD_INV_VARIANCE_DESC = 2004,
682
+ CUDNN_ATTR_OPERATION_NORM_FWD_SCALE_DESC = 2005,
683
+ CUDNN_ATTR_OPERATION_NORM_FWD_BIAS_DESC = 2006,
684
+ CUDNN_ATTR_OPERATION_NORM_FWD_EPSILON_DESC = 2007,
685
+ CUDNN_ATTR_OPERATION_NORM_FWD_EXP_AVG_FACTOR_DESC = 2008,
686
+ CUDNN_ATTR_OPERATION_NORM_FWD_INPUT_RUNNING_MEAN_DESC = 2009,
687
+ CUDNN_ATTR_OPERATION_NORM_FWD_INPUT_RUNNING_VAR_DESC = 2010,
688
+ CUDNN_ATTR_OPERATION_NORM_FWD_OUTPUT_RUNNING_MEAN_DESC = 2011,
689
+ CUDNN_ATTR_OPERATION_NORM_FWD_OUTPUT_RUNNING_VAR_DESC = 2012,
690
+ CUDNN_ATTR_OPERATION_NORM_FWD_YDESC = 2013,
691
+ CUDNN_ATTR_OPERATION_NORM_FWD_PEER_STAT_DESCS = 2014,
692
+
693
+ CUDNN_ATTR_OPERATION_NORM_BWD_MODE = 2100,
694
+ CUDNN_ATTR_OPERATION_NORM_BWD_XDESC = 2101,
695
+ CUDNN_ATTR_OPERATION_NORM_BWD_MEAN_DESC = 2102,
696
+ CUDNN_ATTR_OPERATION_NORM_BWD_INV_VARIANCE_DESC = 2103,
697
+ CUDNN_ATTR_OPERATION_NORM_BWD_DYDESC = 2104,
698
+ CUDNN_ATTR_OPERATION_NORM_BWD_SCALE_DESC = 2105,
699
+ CUDNN_ATTR_OPERATION_NORM_BWD_EPSILON_DESC = 2106,
700
+ CUDNN_ATTR_OPERATION_NORM_BWD_DSCALE_DESC = 2107,
701
+ CUDNN_ATTR_OPERATION_NORM_BWD_DBIAS_DESC = 2108,
702
+ CUDNN_ATTR_OPERATION_NORM_BWD_DXDESC = 2109,
703
+ CUDNN_ATTR_OPERATION_NORM_BWD_PEER_STAT_DESCS = 2110,
704
+
705
+ CUDNN_ATTR_OPERATION_RESHAPE_XDESC = 2200,
706
+ CUDNN_ATTR_OPERATION_RESHAPE_YDESC = 2201,
707
+
708
+ CUDNN_ATTR_OPERATION_EXPAND_BAND_MATRIX_XDESC = 2250,
709
+ CUDNN_ATTR_OPERATION_EXPAND_BAND_MATRIX_YDESC = 2251,
710
+ CUDNN_ATTR_OPERATION_EXPAND_BAND_MATRIX_LOWER_BANDWIDTH = 2252,
711
+ CUDNN_ATTR_OPERATION_EXPAND_BAND_MATRIX_UPPER_BANDWIDTH = 2253,
712
+ CUDNN_ATTR_OPERATION_EXPAND_BAND_MATRIX_AXIS = 2254,
713
+ CUDNN_ATTR_OPERATION_EXPAND_BAND_MATRIX_PAD_VALUE = 2255,
714
+ CUDNN_ATTR_OPERATION_EXPAND_BAND_MATRIX_KV_TOKEN_OFFSET_DESC = 2256,
715
+
716
+ CUDNN_ATTR_OPERATION_CONTRACT_BAND_MATRIX_XDESC = 2270,
717
+ CUDNN_ATTR_OPERATION_CONTRACT_BAND_MATRIX_YDESC = 2271,
718
+ CUDNN_ATTR_OPERATION_CONTRACT_BAND_MATRIX_LOWER_BANDWIDTH = 2272,
719
+ CUDNN_ATTR_OPERATION_CONTRACT_BAND_MATRIX_UPPER_BANDWIDTH = 2273,
720
+ CUDNN_ATTR_OPERATION_CONTRACT_BAND_MATRIX_AXIS = 2274,
721
+ CUDNN_ATTR_OPERATION_CONTRACT_BAND_MATRIX_PAD_VALUE = 2275,
722
+ CUDNN_ATTR_OPERATION_CONTRACT_BAND_MAX_TOKEN_VALUE = 2276,
723
+
724
+ CUDNN_ATTR_RNG_DISTRIBUTION = 2300,
725
+ CUDNN_ATTR_RNG_NORMAL_DIST_MEAN = 2301,
726
+ CUDNN_ATTR_RNG_NORMAL_DIST_STANDARD_DEVIATION = 2302,
727
+ CUDNN_ATTR_RNG_UNIFORM_DIST_MAXIMUM = 2303,
728
+ CUDNN_ATTR_RNG_UNIFORM_DIST_MINIMUM = 2304,
729
+ CUDNN_ATTR_RNG_BERNOULLI_DIST_PROBABILITY = 2305,
730
+
731
+ CUDNN_ATTR_OPERATION_RNG_YDESC = 2310,
732
+ CUDNN_ATTR_OPERATION_RNG_SEED = 2311,
733
+ CUDNN_ATTR_OPERATION_RNG_DESC = 2312,
734
+ CUDNN_ATTR_OPERATION_RNG_OFFSET_DESC = 2313,
735
+
736
+ CUDNN_ATTR_KERNEL_CACHE_OPERATION_GRAPH = 2400,
737
+ CUDNN_ATTR_KERNEL_CACHE_IS_ENGINECFG_KERNEL_CACHED = 2401,
738
+ CUDNN_ATTR_KERNEL_CACHE_JSON_REPRESENTATION = 2402,
739
+
740
+ CUDNN_ATTR_OPERATION_BLOCK_SCALE_QUANTIZE_XDESC = 2500,
741
+ CUDNN_ATTR_OPERATION_BLOCK_SCALE_QUANTIZE_YDESC = 2501,
742
+ CUDNN_ATTR_OPERATION_BLOCK_SCALE_QUANTIZE_SCALE_DESC = 2502,
743
+ CUDNN_ATTR_OPERATION_BLOCK_SCALE_QUANTIZE_MATH_PREC = 2503,
744
+ CUDNN_ATTR_OPERATION_BLOCK_SCALE_QUANTIZE_BLOCK_SIZE = 2504,
745
+
746
+ CUDNN_ATTR_OPERATION_BLOCK_SCALE_DEQUANTIZE_XDESC = 2600,
747
+ CUDNN_ATTR_OPERATION_BLOCK_SCALE_DEQUANTIZE_SCALE_DESC = 2601,
748
+ CUDNN_ATTR_OPERATION_BLOCK_SCALE_DEQUANTIZE_YDESC = 2602,
749
+ CUDNN_ATTR_OPERATION_BLOCK_SCALE_DEQUANTIZE_MATH_PREC = 2603,
750
+ CUDNN_ATTR_OPERATION_BLOCK_SCALE_DEQUANTIZE_BLOCK_SIZE = 2604,
751
+
752
+ CUDNN_ATTR_DEVICEPROP_DEVICE_ID = 2700,
753
+ CUDNN_ATTR_DEVICEPROP_HANDLE = 2701,
754
+ CUDNN_ATTR_DEVICEPROP_JSON_REPRESENTATION = 2702,
755
+ } cudnnBackendAttributeName_t;
756
+
757
+ typedef enum {
758
+ CUDNN_TYPE_HANDLE = 0,
759
+ CUDNN_TYPE_DATA_TYPE = 1,
760
+ CUDNN_TYPE_BOOLEAN = 2,
761
+ CUDNN_TYPE_INT64 = 3,
762
+ CUDNN_TYPE_FLOAT = 4,
763
+ CUDNN_TYPE_DOUBLE = 5,
764
+ CUDNN_TYPE_VOID_PTR = 6,
765
+ CUDNN_TYPE_CONVOLUTION_MODE = 7,
766
+ CUDNN_TYPE_HEUR_MODE = 8,
767
+ CUDNN_TYPE_KNOB_TYPE = 9,
768
+ CUDNN_TYPE_NAN_PROPOGATION CUDNN_DEPRECATED_ENUM = 10,
769
+ CUDNN_TYPE_NUMERICAL_NOTE = 11,
770
+ CUDNN_TYPE_LAYOUT_TYPE = 12,
771
+ CUDNN_TYPE_ATTRIB_NAME = 13,
772
+ CUDNN_TYPE_POINTWISE_MODE = 14,
773
+ CUDNN_TYPE_BACKEND_DESCRIPTOR = 15,
774
+ CUDNN_TYPE_GENSTATS_MODE = 16,
775
+ CUDNN_TYPE_BN_FINALIZE_STATS_MODE = 17,
776
+ CUDNN_TYPE_REDUCTION_OPERATOR_TYPE = 18,
777
+ CUDNN_TYPE_BEHAVIOR_NOTE = 19,
778
+ CUDNN_TYPE_TENSOR_REORDERING_MODE = 20,
779
+ CUDNN_TYPE_RESAMPLE_MODE = 21,
780
+ CUDNN_TYPE_PADDING_MODE = 22,
781
+ CUDNN_TYPE_INT32 = 23,
782
+ CUDNN_TYPE_CHAR = 24,
783
+ CUDNN_TYPE_SIGNAL_MODE = 25,
784
+ CUDNN_TYPE_FRACTION = 26,
785
+ CUDNN_TYPE_NORM_MODE = 27,
786
+ CUDNN_TYPE_NORM_FWD_PHASE = 28,
787
+ CUDNN_TYPE_RNG_DISTRIBUTION = 29,
788
+ } cudnnBackendAttributeType_t;
789
+
790
+ typedef enum {
791
+ CUDNN_BACKEND_POINTWISE_DESCRIPTOR = 0,
792
+ CUDNN_BACKEND_CONVOLUTION_DESCRIPTOR = 1,
793
+ CUDNN_BACKEND_ENGINE_DESCRIPTOR = 2,
794
+ CUDNN_BACKEND_ENGINECFG_DESCRIPTOR = 3,
795
+ CUDNN_BACKEND_ENGINEHEUR_DESCRIPTOR = 4,
796
+ CUDNN_BACKEND_EXECUTION_PLAN_DESCRIPTOR = 5,
797
+ CUDNN_BACKEND_INTERMEDIATE_INFO_DESCRIPTOR = 6,
798
+ CUDNN_BACKEND_KNOB_CHOICE_DESCRIPTOR = 7,
799
+ CUDNN_BACKEND_KNOB_INFO_DESCRIPTOR = 8,
800
+ CUDNN_BACKEND_LAYOUT_INFO_DESCRIPTOR = 9,
801
+ CUDNN_BACKEND_OPERATION_CONVOLUTION_FORWARD_DESCRIPTOR = 10,
802
+ CUDNN_BACKEND_OPERATION_CONVOLUTION_BACKWARD_FILTER_DESCRIPTOR = 11,
803
+ CUDNN_BACKEND_OPERATION_CONVOLUTION_BACKWARD_DATA_DESCRIPTOR = 12,
804
+ CUDNN_BACKEND_OPERATION_POINTWISE_DESCRIPTOR = 13,
805
+ CUDNN_BACKEND_OPERATION_GEN_STATS_DESCRIPTOR = 14,
806
+ CUDNN_BACKEND_OPERATIONGRAPH_DESCRIPTOR = 15,
807
+ CUDNN_BACKEND_VARIANT_PACK_DESCRIPTOR = 16,
808
+ CUDNN_BACKEND_TENSOR_DESCRIPTOR = 17,
809
+ CUDNN_BACKEND_MATMUL_DESCRIPTOR = 18,
810
+ CUDNN_BACKEND_OPERATION_MATMUL_DESCRIPTOR = 19,
811
+ CUDNN_BACKEND_OPERATION_BN_FINALIZE_STATISTICS_DESCRIPTOR = 20,
812
+ CUDNN_BACKEND_REDUCTION_DESCRIPTOR = 21,
813
+ CUDNN_BACKEND_OPERATION_REDUCTION_DESCRIPTOR = 22,
814
+ CUDNN_BACKEND_OPERATION_BN_BWD_WEIGHTS_DESCRIPTOR = 23,
815
+ CUDNN_BACKEND_RESAMPLE_DESCRIPTOR = 24,
816
+ CUDNN_BACKEND_OPERATION_RESAMPLE_FWD_DESCRIPTOR = 25,
817
+ CUDNN_BACKEND_OPERATION_RESAMPLE_BWD_DESCRIPTOR = 26,
818
+ CUDNN_BACKEND_OPERATION_CONCAT_DESCRIPTOR = 27,
819
+ CUDNN_BACKEND_OPERATION_SIGNAL_DESCRIPTOR = 28,
820
+ CUDNN_BACKEND_OPERATION_NORM_FORWARD_DESCRIPTOR = 29,
821
+ CUDNN_BACKEND_OPERATION_NORM_BACKWARD_DESCRIPTOR = 30,
822
+ CUDNN_BACKEND_OPERATION_RESHAPE_DESCRIPTOR = 31,
823
+ CUDNN_BACKEND_RNG_DESCRIPTOR = 32,
824
+ CUDNN_BACKEND_OPERATION_RNG_DESCRIPTOR = 33,
825
+ CUDNN_BACKEND_KERNEL_CACHE_DESCRIPTOR = 34,
826
+ CUDNN_BACKEND_OPERATION_PAGED_CACHE_LOAD_DESCRIPTOR = 35,
827
+ CUDNN_BACKEND_OPERATION_BLOCK_SCALE_QUANTIZE_DESCRIPTOR = 36,
828
+ CUDNN_BACKEND_OPERATION_BLOCK_SCALE_DEQUANTIZE_DESCRIPTOR = 37,
829
+ CUDNN_BACKEND_DEVICEPROP_DESCRIPTOR = 38,
830
+ CUDNN_BACKEND_OPERATION_EXPAND_BAND_MATRIX_DESCRIPTOR = 39,
831
+ CUDNN_BACKEND_OPERATION_CONTRACT_BAND_MATRIX_DESCRIPTOR = 40,
832
+ } cudnnBackendDescriptorType_t;
833
+
834
+ typedef enum {
835
+ CUDNN_NUMERICAL_NOTE_TENSOR_CORE = 0,
836
+ CUDNN_NUMERICAL_NOTE_DOWN_CONVERT_INPUTS = 1,
837
+ CUDNN_NUMERICAL_NOTE_REDUCED_PRECISION_REDUCTION = 2,
838
+ CUDNN_NUMERICAL_NOTE_FFT = 3,
839
+ CUDNN_NUMERICAL_NOTE_NONDETERMINISTIC = 4,
840
+ CUDNN_NUMERICAL_NOTE_WINOGRAD = 5,
841
+ CUDNN_NUMERICAL_NOTE_WINOGRAD_TILE_4x4 = 6,
842
+ CUDNN_NUMERICAL_NOTE_WINOGRAD_TILE_6x6 = 7,
843
+ CUDNN_NUMERICAL_NOTE_WINOGRAD_TILE_13x13 = 8,
844
+ CUDNN_NUMERICAL_NOTE_STRICT_NAN_PROP = 9,
845
+ CUDNN_NUMERICAL_NOTE_TYPE_COUNT = 10,
846
+ } cudnnBackendNumericalNote_t;
847
+
848
+ typedef enum {
849
+ CUDNN_BEHAVIOR_NOTE_RUNTIME_COMPILATION = 0,
850
+ CUDNN_BEHAVIOR_NOTE_REQUIRES_FILTER_INT8x32_REORDER = 1,
851
+ CUDNN_BEHAVIOR_NOTE_REQUIRES_BIAS_INT8x32_REORDER = 2,
852
+ CUDNN_BEHAVIOR_NOTE_SUPPORTS_CUDA_GRAPH_NATIVE_API = 3,
853
+ CUDNN_BEHAVIOR_NOTE_TYPE_COUNT = 4,
854
+ } cudnnBackendBehaviorNote_t;
855
+
856
+ typedef enum {
857
+ CUDNN_KNOB_TYPE_SPLIT_K CUDNN_DEPRECATED_ENUM = 0,
858
+ CUDNN_KNOB_TYPE_SWIZZLE = 1,
859
+ CUDNN_KNOB_TYPE_TILE_SIZE = 2,
860
+ CUDNN_KNOB_TYPE_USE_TEX CUDNN_DEPRECATED_ENUM = 3,
861
+ CUDNN_KNOB_TYPE_EDGE = 4,
862
+ CUDNN_KNOB_TYPE_KBLOCK CUDNN_DEPRECATED_ENUM = 5,
863
+ CUDNN_KNOB_TYPE_LDGA CUDNN_DEPRECATED_ENUM = 6,
864
+ CUDNN_KNOB_TYPE_LDGB CUDNN_DEPRECATED_ENUM = 7,
865
+ CUDNN_KNOB_TYPE_CHUNK_K CUDNN_DEPRECATED_ENUM = 8,
866
+ CUDNN_KNOB_TYPE_SPLIT_H CUDNN_DEPRECATED_ENUM = 9,
867
+ CUDNN_KNOB_TYPE_WINO_TILE CUDNN_DEPRECATED_ENUM = 10,
868
+ CUDNN_KNOB_TYPE_MULTIPLY = 11,
869
+ CUDNN_KNOB_TYPE_SPLIT_K_BUF = 12,
870
+ CUDNN_KNOB_TYPE_TILEK = 13,
871
+ CUDNN_KNOB_TYPE_STAGES = 14,
872
+ CUDNN_KNOB_TYPE_REDUCTION_MODE = 15,
873
+ CUDNN_KNOB_TYPE_CTA_SPLIT_K_MODE CUDNN_DEPRECATED_ENUM = 16,
874
+ CUDNN_KNOB_TYPE_SPLIT_K_SLC = 17,
875
+ CUDNN_KNOB_TYPE_IDX_MODE = 18,
876
+ CUDNN_KNOB_TYPE_SLICED CUDNN_DEPRECATED_ENUM = 19,
877
+ CUDNN_KNOB_TYPE_SPLIT_RS CUDNN_DEPRECATED_ENUM = 20,
878
+ CUDNN_KNOB_TYPE_SINGLEBUFFER CUDNN_DEPRECATED_ENUM = 21,
879
+ CUDNN_KNOB_TYPE_LDGC CUDNN_DEPRECATED_ENUM = 22,
880
+ CUDNN_KNOB_TYPE_SPECFILT = 23,
881
+ CUDNN_KNOB_TYPE_KERNEL_CFG = 24,
882
+ CUDNN_KNOB_TYPE_WORKSPACE = 25,
883
+ CUDNN_KNOB_TYPE_TILE_CGA CUDNN_DEPRECATED_ENUM = 26,
884
+ CUDNN_KNOB_TYPE_TILE_CGA_M = 27,
885
+ CUDNN_KNOB_TYPE_TILE_CGA_N = 28,
886
+ CUDNN_KNOB_TYPE_BLOCK_SIZE = 29,
887
+ CUDNN_KNOB_TYPE_OCCUPANCY = 30,
888
+ CUDNN_KNOB_TYPE_ARRAY_SIZE_PER_THREAD = 31,
889
+ CUDNN_KNOB_TYPE_NUM_C_PER_BLOCK CUDNN_DEPRECATED_ENUM = 32,
890
+ CUDNN_KNOB_TYPE_SPLIT_COLS = 33,
891
+ CUDNN_KNOB_TYPE_TILE_ROWS = 34,
892
+ CUDNN_KNOB_TYPE_TILE_COLS = 35,
893
+ CUDNN_KNOB_TYPE_LOAD_SIZE = 36,
894
+ CUDNN_KNOB_TYPE_CTA_COUNT = 37,
895
+ CUDNN_KNOB_TYPE_STREAM_K = 38,
896
+ CUDNN_KNOB_TYPE_SPLIT_P_SLC = 39,
897
+ CUDNN_KNOB_TYPE_TILE_M = 40,
898
+ CUDNN_KNOB_TYPE_TILE_N = 41,
899
+ CUDNN_KNOB_TYPE_WARP_SPEC_CFG = 42,
900
+ CUDNN_KNOB_TYPE_COUNTS = 43,
901
+ } cudnnBackendKnobType_t;
902
+
903
+ typedef enum {
904
+ CUDNN_LAYOUT_TYPE_PREFERRED_NCHW = 0,
905
+ CUDNN_LAYOUT_TYPE_PREFERRED_NHWC = 1,
906
+ CUDNN_LAYOUT_TYPE_PREFERRED_PAD4CK = 2,
907
+ CUDNN_LAYOUT_TYPE_PREFERRED_PAD8CK = 3,
908
+ CUDNN_LAYOUT_TYPE_COUNT = 4,
909
+ } cudnnBackendLayoutType_t;
910
+
911
+ typedef enum {
912
+ CUDNN_HEUR_MODE_INSTANT = 0,
913
+ CUDNN_HEUR_MODE_B = 1,
914
+ CUDNN_HEUR_MODE_FALLBACK = 2,
915
+ CUDNN_HEUR_MODE_A = 3,
916
+ CUDNN_HEUR_MODES_COUNT = 4,
917
+ } cudnnBackendHeurMode_t;
918
+
919
+ typedef enum {
920
+ CUDNN_TENSOR_REORDERING_NONE = 0,
921
+ CUDNN_TENSOR_REORDERING_INT8x32 = 1,
922
+ CUDNN_TENSOR_REORDERING_F16x16 = 2,
923
+ CUDNN_TENSOR_REORDERING_F8_128x4 = 3,
924
+ } cudnnBackendTensorReordering_t;
925
+
926
+ typedef enum {
927
+ CUDNN_ZERO_PAD = 0,
928
+ CUDNN_NEG_INF_PAD = 1,
929
+ CUDNN_EDGE_VAL_PAD = 2,
930
+ } cudnnPaddingMode_t;
931
+
932
+ typedef enum {
933
+ CUDNN_LAYER_NORM = 0,
934
+ CUDNN_INSTANCE_NORM = 1,
935
+ CUDNN_BATCH_NORM = 2,
936
+ CUDNN_GROUP_NORM = 3,
937
+ CUDNN_RMS_NORM = 4,
938
+ CUDNN_ADA_LAYER_NORM = 5,
939
+ } cudnnBackendNormMode_t;
940
+
941
+ typedef enum {
942
+ CUDNN_NORM_FWD_INFERENCE = 0,
943
+ CUDNN_NORM_FWD_TRAINING = 1,
944
+ } cudnnBackendNormFwdPhase_t;
945
+
946
+ cudnnStatus_t CUDNNWINAPI
947
+ cudnnBackendCreateDescriptor(cudnnBackendDescriptorType_t descriptorType, cudnnBackendDescriptor_t *descriptor);
948
+
949
+ cudnnStatus_t CUDNNWINAPI
950
+ cudnnBackendDestroyDescriptor(cudnnBackendDescriptor_t descriptor);
951
+
952
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
953
+ cudnnBackendInitialize(cudnnBackendDescriptor_t descriptor);
954
+
955
+ cudnnStatus_t CUDNNWINAPI
956
+ cudnnBackendFinalize(cudnnBackendDescriptor_t descriptor);
957
+
958
+ cudnnStatus_t CUDNNWINAPI
959
+ cudnnBackendSetAttribute(cudnnBackendDescriptor_t descriptor,
960
+ cudnnBackendAttributeName_t attributeName,
961
+ cudnnBackendAttributeType_t attributeType,
962
+ int64_t elementCount,
963
+ const void *arrayOfElements);
964
+
965
+ cudnnStatus_t CUDNNWINAPI
966
+ cudnnBackendGetAttribute(cudnnBackendDescriptor_t const descriptor,
967
+ cudnnBackendAttributeName_t attributeName,
968
+ cudnnBackendAttributeType_t attributeType,
969
+ int64_t requestedElementCount,
970
+ int64_t *elementCount,
971
+ void *arrayOfElements);
972
+
973
+ cudnnStatus_t CUDNNWINAPI
974
+ cudnnBackendExecute(cudnnHandle_t handle, cudnnBackendDescriptor_t executionPlan, cudnnBackendDescriptor_t variantPack);
975
+
976
+ cudnnStatus_t CUDNNWINAPI
977
+ cudnnBackendPopulateCudaGraph(cudnnHandle_t handle,
978
+ cudnnBackendDescriptor_t executionPlan,
979
+ cudnnBackendDescriptor_t variantPack,
980
+ cudaGraph_t graph);
981
+
982
+ cudnnStatus_t CUDNNWINAPI
983
+ cudnnBackendUpdateCudaGraph(cudnnHandle_t handle,
984
+ cudnnBackendDescriptor_t executionPlan,
985
+ cudnnBackendDescriptor_t variantPack,
986
+ cudaGraph_t graph);
987
+
988
+ #if defined(__cplusplus)
989
+ }
990
+ #endif
991
+
992
+ #endif /* CUDNN_GRAPH_H_ */
.venv/lib/python3.12/site-packages/nvidia/cudnn/include/cudnn_ops.h ADDED
@@ -0,0 +1,1316 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * Copyright 2014-2023 NVIDIA Corporation. All rights reserved.
3
+ *
4
+ * NOTICE TO LICENSEE:
5
+ *
6
+ * This source code and/or documentation ("Licensed Deliverables") are
7
+ * subject to NVIDIA intellectual property rights under U.S. and
8
+ * international Copyright laws.
9
+ *
10
+ * These Licensed Deliverables contained herein is PROPRIETARY and
11
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
12
+ * conditions of a form of NVIDIA software license agreement by and
13
+ * between NVIDIA and Licensee ("License Agreement") or electronically
14
+ * accepted by Licensee. Notwithstanding any terms or conditions to
15
+ * the contrary in the License Agreement, reproduction or disclosure
16
+ * of the Licensed Deliverables to any third party without the express
17
+ * written consent of NVIDIA is prohibited.
18
+ *
19
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
20
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
21
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
22
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
23
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
24
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
25
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
26
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
27
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
28
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
29
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
30
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
31
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
32
+ * OF THESE LICENSED DELIVERABLES.
33
+ *
34
+ * U.S. Government End Users. These Licensed Deliverables are a
35
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
36
+ * 1995), consisting of "commercial computer software" and "commercial
37
+ * computer software documentation" as such terms are used in 48
38
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
39
+ * only as a commercial end item. Consistent with 48 C.F.R.12.212 and
40
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
41
+ * U.S. Government End Users acquire the Licensed Deliverables with
42
+ * only those rights set forth herein.
43
+ *
44
+ * Any use of the Licensed Deliverables in individual and commercial
45
+ * software must include, in the user documentation and internal
46
+ * comments to the code, the above Disclaimer and U.S. Government End
47
+ * Users Notice.
48
+ */
49
+
50
+ /*
51
+ * cudnn_ops : cuDNN's basic definitions and basic operations.
52
+ */
53
+
54
+ #if !defined(CUDNN_OPS_H_)
55
+ #define CUDNN_OPS_H_
56
+
57
+ #include <stdint.h>
58
+
59
+ #include "cudnn_version.h"
60
+ #include "cudnn_graph.h"
61
+
62
+ /* These version numbers are autogenerated, do not edit manually. */
63
+ #define CUDNN_OPS_MAJOR 9
64
+ #define CUDNN_OPS_MINOR 10
65
+ #define CUDNN_OPS_PATCH 2
66
+
67
+ #if (CUDNN_OPS_MAJOR != CUDNN_MAJOR) || (CUDNN_OPS_MINOR != CUDNN_MINOR) || (CUDNN_OPS_PATCH != CUDNN_PATCHLEVEL)
68
+ #error Version mismatch in cuDNN OPS INFER!!!
69
+ #endif
70
+
71
+ #if defined(__cplusplus)
72
+ extern "C" {
73
+ #endif
74
+
75
+ /* Data structures to represent Image/Filter and the Neural Network Layer */
76
+ typedef struct cudnnTensorStruct *cudnnTensorDescriptor_t;
77
+ typedef struct cudnnPoolingStruct *cudnnPoolingDescriptor_t CUDNN_DEPRECATED;
78
+ typedef struct cudnnFilterStruct *cudnnFilterDescriptor_t CUDNN_DEPRECATED;
79
+ typedef struct cudnnLRNStruct *cudnnLRNDescriptor_t;
80
+ typedef struct cudnnActivationStruct *cudnnActivationDescriptor_t CUDNN_DEPRECATED;
81
+ typedef struct cudnnSpatialTransformerStruct *cudnnSpatialTransformerDescriptor_t;
82
+ typedef struct cudnnOpTensorStruct *cudnnOpTensorDescriptor_t CUDNN_DEPRECATED;
83
+ typedef struct cudnnReduceTensorStruct *cudnnReduceTensorDescriptor_t CUDNN_DEPRECATED;
84
+ typedef struct cudnnCTCLossStruct *cudnnCTCLossDescriptor_t;
85
+ typedef struct cudnnTensorTransformStruct *cudnnTensorTransformDescriptor_t CUDNN_DEPRECATED;
86
+ /*
87
+ * CUDNN Determinism
88
+ */
89
+ typedef enum {
90
+ CUDNN_NON_DETERMINISTIC = 0,
91
+ CUDNN_DETERMINISTIC = 1,
92
+ } cudnnDeterminism_t;
93
+
94
+ /* Create an instance of a generic Tensor descriptor */
95
+ cudnnStatus_t CUDNNWINAPI
96
+ cudnnCreateTensorDescriptor(cudnnTensorDescriptor_t *tensorDesc);
97
+
98
+ cudnnStatus_t CUDNNWINAPI
99
+ cudnnSetTensor4dDescriptor(cudnnTensorDescriptor_t tensorDesc,
100
+ cudnnTensorFormat_t format,
101
+ cudnnDataType_t dataType, /* image data type */
102
+ int n, /* number of inputs (batch size) */
103
+ int c, /* number of input feature maps */
104
+ int h, /* height of input section */
105
+ int w); /* width of input section */
106
+
107
+ cudnnStatus_t CUDNNWINAPI
108
+ cudnnSetTensor4dDescriptorEx(cudnnTensorDescriptor_t tensorDesc,
109
+ cudnnDataType_t dataType, /* image data type */
110
+ int n, /* number of inputs (batch size) */
111
+ int c, /* number of input feature maps */
112
+ int h, /* height of input section */
113
+ int w, /* width of input section */
114
+ int nStride,
115
+ int cStride,
116
+ int hStride,
117
+ int wStride);
118
+
119
+ cudnnStatus_t CUDNNWINAPI
120
+ cudnnGetTensor4dDescriptor(const cudnnTensorDescriptor_t tensorDesc,
121
+ cudnnDataType_t *dataType, /* image data type */
122
+ int *n, /* number of inputs (batch size) */
123
+ int *c, /* number of input feature maps */
124
+ int *h, /* height of input section */
125
+ int *w, /* width of input section */
126
+ int *nStride,
127
+ int *cStride,
128
+ int *hStride,
129
+ int *wStride);
130
+
131
+ cudnnStatus_t CUDNNWINAPI
132
+ cudnnSetTensorNdDescriptor(cudnnTensorDescriptor_t tensorDesc,
133
+ cudnnDataType_t dataType,
134
+ int nbDims,
135
+ const int dimA[],
136
+ const int strideA[]);
137
+
138
+ cudnnStatus_t CUDNNWINAPI
139
+ cudnnSetTensorNdDescriptorEx(cudnnTensorDescriptor_t tensorDesc,
140
+ cudnnTensorFormat_t format,
141
+ cudnnDataType_t dataType,
142
+ int nbDims,
143
+ const int dimA[]);
144
+
145
+ cudnnStatus_t CUDNNWINAPI
146
+ cudnnGetTensorNdDescriptor(const cudnnTensorDescriptor_t tensorDesc,
147
+ int nbDimsRequested,
148
+ cudnnDataType_t *dataType,
149
+ int *nbDims,
150
+ int dimA[],
151
+ int strideA[]);
152
+
153
+ cudnnStatus_t CUDNNWINAPI
154
+ cudnnGetTensorSizeInBytes(const cudnnTensorDescriptor_t tensorDesc, size_t *size);
155
+
156
+ /* PixelOffset( n, c, h, w ) = n *input_stride + c * feature_stride + h * h_stride + w * w_stride
157
+
158
+ 1)Example of all images in row major order one batch of features after the other (with an optional padding on row)
159
+ input_stride : c x h x h_stride
160
+ feature_stride : h x h_stride
161
+ h_stride : >= w ( h_stride = w if no padding)
162
+ w_stride : 1
163
+
164
+
165
+ 2)Example of all images in row major with features maps interleaved
166
+ input_stride : c x h x h_stride
167
+ feature_stride : 1
168
+ h_stride : w x c
169
+ w_stride : c
170
+
171
+ 3)Example of all images in column major order one batch of features after the other (with optional padding on column)
172
+ input_stride : c x w x w_stride
173
+ feature_stride : w x w_stride
174
+ h_stride : 1
175
+ w_stride : >= h
176
+
177
+ */
178
+
179
+ /* Destroy an instance of Tensor4d descriptor */
180
+ cudnnStatus_t CUDNNWINAPI
181
+ cudnnDestroyTensorDescriptor(cudnnTensorDescriptor_t tensorDesc);
182
+
183
+ /* Fold/unfold transforms */
184
+ typedef enum {
185
+ CUDNN_TRANSFORM_FOLD = 0U,
186
+ CUDNN_TRANSFORM_UNFOLD = 1U,
187
+ } cudnnFoldingDirection_t;
188
+
189
+ /** Create a destination descriptor for cudnnTransformTensor */
190
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
191
+ cudnnInitTransformDest(const cudnnTensorTransformDescriptor_t transformDesc,
192
+ const cudnnTensorDescriptor_t srcDesc,
193
+ cudnnTensorDescriptor_t destDesc,
194
+ size_t *destSizeInBytes);
195
+
196
+ /** Create an empty tensor transform descriptor */
197
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
198
+ cudnnCreateTensorTransformDescriptor(cudnnTensorTransformDescriptor_t *transformDesc);
199
+
200
+ /** Initialize a previously created tensor transform descriptor. */
201
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
202
+ cudnnSetTensorTransformDescriptor(cudnnTensorTransformDescriptor_t transformDesc,
203
+ const uint32_t nbDims,
204
+ const cudnnTensorFormat_t destFormat,
205
+ const int32_t padBeforeA[],
206
+ const int32_t padAfterA[],
207
+ const uint32_t foldA[],
208
+ const cudnnFoldingDirection_t direction);
209
+
210
+ /**
211
+ * Retrieves the values stored in a previously initialized tensor transform
212
+ * descriptor.
213
+ */
214
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
215
+ cudnnGetTensorTransformDescriptor(cudnnTensorTransformDescriptor_t transformDesc,
216
+ uint32_t nbDimsRequested,
217
+ cudnnTensorFormat_t *destFormat,
218
+ int32_t padBeforeA[],
219
+ int32_t padAfterA[],
220
+ uint32_t foldA[],
221
+ cudnnFoldingDirection_t *direction);
222
+
223
+ /**
224
+ * Destroys a previously created tensor transform descriptor.
225
+ */
226
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
227
+ cudnnDestroyTensorTransformDescriptor(cudnnTensorTransformDescriptor_t transformDesc);
228
+
229
+ /* Tensor layout conversion helper (y = alpha * x + beta * y) */
230
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
231
+ cudnnTransformTensor(cudnnHandle_t handle,
232
+ const void *alpha,
233
+ const cudnnTensorDescriptor_t xDesc,
234
+ const void *x,
235
+ const void *beta,
236
+ const cudnnTensorDescriptor_t yDesc,
237
+ void *y);
238
+
239
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
240
+ cudnnTransformTensorEx(cudnnHandle_t handle,
241
+ const cudnnTensorTransformDescriptor_t transDesc,
242
+ const void *alpha,
243
+ const cudnnTensorDescriptor_t srcDesc,
244
+ const void *srcData,
245
+ const void *beta,
246
+ const cudnnTensorDescriptor_t destDesc,
247
+ void *destData);
248
+
249
+ /* Tensor Bias addition : C = alpha * A + beta * C */
250
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
251
+ cudnnAddTensor(cudnnHandle_t handle,
252
+ const void *alpha,
253
+ const cudnnTensorDescriptor_t aDesc,
254
+ const void *A,
255
+ const void *beta,
256
+ const cudnnTensorDescriptor_t cDesc,
257
+ void *C);
258
+
259
+ /*
260
+ * CUDNN OpTensor op type
261
+ */
262
+ typedef enum {
263
+ CUDNN_OP_TENSOR_ADD = 0,
264
+ CUDNN_OP_TENSOR_MUL = 1,
265
+ CUDNN_OP_TENSOR_MIN = 2,
266
+ CUDNN_OP_TENSOR_MAX = 3,
267
+ CUDNN_OP_TENSOR_SQRT = 4,
268
+ CUDNN_OP_TENSOR_NOT = 5,
269
+ } cudnnOpTensorOp_t;
270
+
271
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
272
+ cudnnCreateOpTensorDescriptor(cudnnOpTensorDescriptor_t *opTensorDesc);
273
+
274
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
275
+ cudnnSetOpTensorDescriptor(cudnnOpTensorDescriptor_t opTensorDesc,
276
+ cudnnOpTensorOp_t opTensorOp,
277
+ cudnnDataType_t opTensorCompType,
278
+ cudnnNanPropagation_t opTensorNanOpt);
279
+
280
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
281
+ cudnnGetOpTensorDescriptor(const cudnnOpTensorDescriptor_t opTensorDesc,
282
+ cudnnOpTensorOp_t *opTensorOp,
283
+ cudnnDataType_t *opTensorCompType,
284
+ cudnnNanPropagation_t *opTensorNanOpt);
285
+
286
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
287
+ cudnnDestroyOpTensorDescriptor(cudnnOpTensorDescriptor_t opTensorDesc);
288
+
289
+ /* Tensor operation : C = op( alpha1 * A, alpha2 * B ) + beta * C */
290
+ /* B tensor is ignored for CUDNN_OP_TENSOR_SQRT, CUDNN_OP_TENSOR_NOT. */
291
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
292
+ cudnnOpTensor(cudnnHandle_t handle,
293
+ const cudnnOpTensorDescriptor_t opTensorDesc,
294
+ const void *alpha1,
295
+ const cudnnTensorDescriptor_t aDesc,
296
+ const void *A,
297
+ const void *alpha2,
298
+ const cudnnTensorDescriptor_t bDesc,
299
+ const void *B,
300
+ const void *beta,
301
+ const cudnnTensorDescriptor_t cDesc,
302
+ void *C);
303
+
304
+ /*
305
+ * CUDNN ReduceTensor indices type
306
+ */
307
+ typedef enum {
308
+ CUDNN_REDUCE_TENSOR_NO_INDICES = 0,
309
+ CUDNN_REDUCE_TENSOR_FLATTENED_INDICES = 1,
310
+ } cudnnReduceTensorIndices_t CUDNN_DEPRECATED;
311
+
312
+ /*
313
+ * CUDNN tensor indices type size (all unsigned)
314
+ * Currently not supported, default is 32 bit unsigned.
315
+ */
316
+ typedef enum {
317
+ CUDNN_32BIT_INDICES = 0,
318
+ CUDNN_64BIT_INDICES = 1,
319
+ CUDNN_16BIT_INDICES = 2,
320
+ CUDNN_8BIT_INDICES = 3,
321
+ } cudnnIndicesType_t CUDNN_DEPRECATED;
322
+
323
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
324
+ cudnnCreateReduceTensorDescriptor(cudnnReduceTensorDescriptor_t *reduceTensorDesc);
325
+
326
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
327
+ cudnnSetReduceTensorDescriptor(cudnnReduceTensorDescriptor_t reduceTensorDesc,
328
+ cudnnReduceTensorOp_t reduceTensorOp,
329
+ cudnnDataType_t reduceTensorCompType,
330
+ cudnnNanPropagation_t reduceTensorNanOpt,
331
+ cudnnReduceTensorIndices_t reduceTensorIndices,
332
+ cudnnIndicesType_t reduceTensorIndicesType);
333
+
334
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
335
+ cudnnGetReduceTensorDescriptor(const cudnnReduceTensorDescriptor_t reduceTensorDesc,
336
+ cudnnReduceTensorOp_t *reduceTensorOp,
337
+ cudnnDataType_t *reduceTensorCompType,
338
+ cudnnNanPropagation_t *reduceTensorNanOpt,
339
+ cudnnReduceTensorIndices_t *reduceTensorIndices,
340
+ cudnnIndicesType_t *reduceTensorIndicesType);
341
+
342
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
343
+ cudnnDestroyReduceTensorDescriptor(cudnnReduceTensorDescriptor_t reduceTensorDesc);
344
+
345
+ /* Helper function to return the minimum size of the index space to be passed to the reduction given the input and
346
+ * output tensors */
347
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
348
+ cudnnGetReductionIndicesSize(cudnnHandle_t handle,
349
+ const cudnnReduceTensorDescriptor_t reduceTensorDesc,
350
+ const cudnnTensorDescriptor_t aDesc,
351
+ const cudnnTensorDescriptor_t cDesc,
352
+ size_t *sizeInBytes);
353
+
354
+ /* Helper function to return the minimum size of the workspace to be passed to the reduction given the input and output
355
+ * tensors */
356
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
357
+ cudnnGetReductionWorkspaceSize(cudnnHandle_t handle,
358
+ const cudnnReduceTensorDescriptor_t reduceTensorDesc,
359
+ const cudnnTensorDescriptor_t aDesc,
360
+ const cudnnTensorDescriptor_t cDesc,
361
+ size_t *sizeInBytes);
362
+
363
+ /* Tensor operation : C = reduce op( alpha * A ) + beta * C */
364
+ /* The NaN propagation enum applies to only the min and max reduce ops; the other reduce ops propagate NaN as usual. */
365
+ /* The indices space is ignored for reduce ops other than min or max. */
366
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
367
+ cudnnReduceTensor(cudnnHandle_t handle,
368
+ const cudnnReduceTensorDescriptor_t reduceTensorDesc,
369
+ void *indices,
370
+ size_t indicesSizeInBytes,
371
+ void *workspace,
372
+ size_t workspaceSizeInBytes,
373
+ const void *alpha,
374
+ const cudnnTensorDescriptor_t aDesc,
375
+ const void *A,
376
+ const void *beta,
377
+ const cudnnTensorDescriptor_t cDesc,
378
+ void *C);
379
+
380
+ /* Set all values of a tensor to a given value : y[i] = value[0] */
381
+ cudnnStatus_t CUDNNWINAPI
382
+ cudnnSetTensor(cudnnHandle_t handle, const cudnnTensorDescriptor_t yDesc, void *y, const void *valuePtr);
383
+
384
+ /* Scale all values of a tensor by a given factor : y[i] = alpha * y[i] */
385
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
386
+ cudnnScaleTensor(cudnnHandle_t handle, const cudnnTensorDescriptor_t yDesc, void *y, const void *alpha);
387
+
388
+ /* Create an instance of FilterStruct */
389
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
390
+ cudnnCreateFilterDescriptor(cudnnFilterDescriptor_t *filterDesc);
391
+
392
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
393
+ cudnnSetFilter4dDescriptor(cudnnFilterDescriptor_t filterDesc,
394
+ cudnnDataType_t dataType, /* image data type */
395
+ cudnnTensorFormat_t format,
396
+ int k, /* number of output feature maps */
397
+ int c, /* number of input feature maps */
398
+ int h, /* height of each input filter */
399
+ int w); /* width of each input filter */
400
+
401
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
402
+ cudnnGetFilter4dDescriptor(const cudnnFilterDescriptor_t filterDesc,
403
+ cudnnDataType_t *dataType, /* image data type */
404
+ cudnnTensorFormat_t *format,
405
+ int *k, /* number of output feature maps */
406
+ int *c, /* number of input feature maps */
407
+ int *h, /* height of each input filter */
408
+ int *w); /* width of each input filter */
409
+
410
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
411
+ cudnnSetFilterNdDescriptor(cudnnFilterDescriptor_t filterDesc,
412
+ cudnnDataType_t dataType, /* image data type */
413
+ cudnnTensorFormat_t format,
414
+ int nbDims,
415
+ const int filterDimA[]);
416
+
417
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
418
+ cudnnGetFilterNdDescriptor(const cudnnFilterDescriptor_t filterDesc,
419
+ int nbDimsRequested,
420
+ cudnnDataType_t *dataType, /* image data type */
421
+ cudnnTensorFormat_t *format,
422
+ int *nbDims,
423
+ int filterDimA[]);
424
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
425
+ cudnnGetFilterSizeInBytes(const cudnnFilterDescriptor_t filterDesc, size_t *size);
426
+
427
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
428
+ cudnnTransformFilter(cudnnHandle_t handle,
429
+ const cudnnTensorTransformDescriptor_t transDesc,
430
+ const void *alpha,
431
+ const cudnnFilterDescriptor_t srcDesc,
432
+ const void *srcData,
433
+ const void *beta,
434
+ const cudnnFilterDescriptor_t destDesc,
435
+ void *destData);
436
+
437
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
438
+ cudnnDestroyFilterDescriptor(cudnnFilterDescriptor_t filterDesc);
439
+
440
+ /*
441
+ * softmax algorithm
442
+ */
443
+ typedef enum {
444
+ CUDNN_SOFTMAX_FAST = 0, /* straightforward implementation */
445
+ CUDNN_SOFTMAX_ACCURATE = 1, /* subtract max from every point to avoid overflow */
446
+ CUDNN_SOFTMAX_LOG = 2
447
+ } cudnnSoftmaxAlgorithm_t;
448
+
449
+ typedef enum {
450
+ CUDNN_SOFTMAX_MODE_INSTANCE = 0, /* compute the softmax over all C, H, W for each N */
451
+ CUDNN_SOFTMAX_MODE_CHANNEL = 1 /* compute the softmax over all C for each H, W, N */
452
+ } cudnnSoftmaxMode_t;
453
+
454
+ /* Softmax functions: All of the form "output = alpha * Op(inputs) + beta * output" */
455
+
456
+ /* Function to perform forward softmax */
457
+ cudnnStatus_t CUDNNWINAPI
458
+ cudnnSoftmaxForward(cudnnHandle_t handle,
459
+ cudnnSoftmaxAlgorithm_t algo,
460
+ cudnnSoftmaxMode_t mode,
461
+ const void *alpha,
462
+ const cudnnTensorDescriptor_t xDesc,
463
+ const void *x,
464
+ const void *beta,
465
+ const cudnnTensorDescriptor_t yDesc,
466
+ void *y);
467
+
468
+ /*
469
+ * pooling mode
470
+ */
471
+ typedef enum {
472
+ CUDNN_POOLING_MAX = 0,
473
+ CUDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING = 1, /* count for average includes padded values */
474
+ CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING = 2, /* count for average does not include padded values */
475
+ CUDNN_POOLING_MAX_DETERMINISTIC = 3
476
+ } cudnnPoolingMode_t CUDNN_DEPRECATED;
477
+
478
+ /* Create an instance of pooling descriptor */
479
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
480
+ cudnnCreatePoolingDescriptor(cudnnPoolingDescriptor_t *poolingDesc);
481
+
482
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
483
+ cudnnSetPooling2dDescriptor(cudnnPoolingDescriptor_t poolingDesc,
484
+ cudnnPoolingMode_t mode,
485
+ cudnnNanPropagation_t maxpoolingNanOpt,
486
+ int windowHeight,
487
+ int windowWidth,
488
+ int verticalPadding,
489
+ int horizontalPadding,
490
+ int verticalStride,
491
+ int horizontalStride);
492
+
493
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
494
+ cudnnGetPooling2dDescriptor(const cudnnPoolingDescriptor_t poolingDesc,
495
+ cudnnPoolingMode_t *mode,
496
+ cudnnNanPropagation_t *maxpoolingNanOpt,
497
+ int *windowHeight,
498
+ int *windowWidth,
499
+ int *verticalPadding,
500
+ int *horizontalPadding,
501
+ int *verticalStride,
502
+ int *horizontalStride);
503
+
504
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
505
+ cudnnSetPoolingNdDescriptor(cudnnPoolingDescriptor_t poolingDesc,
506
+ const cudnnPoolingMode_t mode,
507
+ const cudnnNanPropagation_t maxpoolingNanOpt,
508
+ int nbDims,
509
+ const int windowDimA[],
510
+ const int paddingA[],
511
+ const int strideA[]);
512
+
513
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
514
+ cudnnGetPoolingNdDescriptor(const cudnnPoolingDescriptor_t poolingDesc,
515
+ int nbDimsRequested,
516
+ cudnnPoolingMode_t *mode,
517
+ cudnnNanPropagation_t *maxpoolingNanOpt,
518
+ int *nbDims,
519
+ int windowDimA[],
520
+ int paddingA[],
521
+ int strideA[]);
522
+
523
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
524
+ cudnnGetPoolingNdForwardOutputDim(const cudnnPoolingDescriptor_t poolingDesc,
525
+ const cudnnTensorDescriptor_t inputTensorDesc,
526
+ int nbDims,
527
+ int outputTensorDimA[]);
528
+
529
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
530
+ cudnnGetPooling2dForwardOutputDim(const cudnnPoolingDescriptor_t poolingDesc,
531
+ const cudnnTensorDescriptor_t inputTensorDesc,
532
+ int *n,
533
+ int *c,
534
+ int *h,
535
+ int *w);
536
+
537
+ /* Destroy an instance of pooling descriptor */
538
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
539
+ cudnnDestroyPoolingDescriptor(cudnnPoolingDescriptor_t poolingDesc);
540
+
541
+ /* Pooling functions: All of the form "output = alpha * Op(inputs) + beta * output" */
542
+
543
+ /* Function to perform forward pooling */
544
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
545
+ cudnnPoolingForward(cudnnHandle_t handle,
546
+ const cudnnPoolingDescriptor_t poolingDesc,
547
+ const void *alpha,
548
+ const cudnnTensorDescriptor_t xDesc,
549
+ const void *x,
550
+ const void *beta,
551
+ const cudnnTensorDescriptor_t yDesc,
552
+ void *y);
553
+
554
+ /* Activation functions: All of the form "output = alpha * Op(inputs) + beta * output" */
555
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
556
+ cudnnCreateActivationDescriptor(cudnnActivationDescriptor_t *activationDesc);
557
+
558
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
559
+ cudnnSetActivationDescriptor(cudnnActivationDescriptor_t activationDesc,
560
+ cudnnActivationMode_t mode,
561
+ cudnnNanPropagation_t reluNanOpt,
562
+ double coef); /* ceiling for clipped RELU, alpha for ELU */
563
+
564
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
565
+ cudnnGetActivationDescriptor(const cudnnActivationDescriptor_t activationDesc,
566
+ cudnnActivationMode_t *mode,
567
+ cudnnNanPropagation_t *reluNanOpt,
568
+ double *coef); /* ceiling for clipped RELU, alpha for ELU */
569
+
570
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
571
+ cudnnSetActivationDescriptorSwishBeta(cudnnActivationDescriptor_t activationDesc, double swish_beta);
572
+
573
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
574
+ cudnnGetActivationDescriptorSwishBeta(cudnnActivationDescriptor_t activationDesc, double *swish_beta);
575
+
576
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
577
+ cudnnDestroyActivationDescriptor(cudnnActivationDescriptor_t activationDesc);
578
+
579
+ /* Function to perform forward activation */
580
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
581
+ cudnnActivationForward(cudnnHandle_t handle,
582
+ cudnnActivationDescriptor_t activationDesc,
583
+ const void *alpha,
584
+ const cudnnTensorDescriptor_t xDesc,
585
+ const void *x,
586
+ const void *beta,
587
+ const cudnnTensorDescriptor_t yDesc,
588
+ void *y);
589
+
590
+ /*
591
+ * Create an instance of LRN (Local Response Normalization) descriptor
592
+ * Uses lrnN=5, lrnAlpha=1e-4, lrnBeta=0.75, lrnK=2.0 as defaults from Krizhevsky'12 ImageNet paper
593
+ */
594
+ cudnnStatus_t CUDNNWINAPI
595
+ cudnnCreateLRNDescriptor(cudnnLRNDescriptor_t *normDesc);
596
+
597
+ #define CUDNN_LRN_MIN_N 1 /* minimum allowed lrnN */
598
+ #define CUDNN_LRN_MAX_N 16 /* maximum allowed lrnN */
599
+ #define CUDNN_LRN_MIN_K 1e-5 /* minimum allowed lrnK */
600
+ #define CUDNN_LRN_MIN_BETA 0.01 /* minimum allowed lrnBeta */
601
+
602
+ /* LRN layer mode */
603
+ typedef enum {
604
+ CUDNN_LRN_CROSS_CHANNEL_DIM1 = 0, /* Normalize across tensor's dimA[1] dimension */
605
+ } cudnnLRNMode_t;
606
+
607
+ /*
608
+ * Uses a window [center-lookBehind, center+lookAhead], where
609
+ * lookBehind = floor( (lrnN-1)/2 ), lookAhead = lrnN-lookBehind-1.
610
+ * Values of double parameters cast to tensor data type.
611
+ */
612
+ cudnnStatus_t CUDNNWINAPI
613
+ cudnnSetLRNDescriptor(cudnnLRNDescriptor_t normDesc, unsigned lrnN, double lrnAlpha, double lrnBeta, double lrnK);
614
+ /*
615
+ * Retrieve the settings currently stored in an LRN layer descriptor
616
+ * Any of the provided pointers can be NULL (no corresponding value will be returned)
617
+ */
618
+ cudnnStatus_t CUDNNWINAPI
619
+ cudnnGetLRNDescriptor(cudnnLRNDescriptor_t normDesc, unsigned *lrnN, double *lrnAlpha, double *lrnBeta, double *lrnK);
620
+
621
+ /* Destroy an instance of LRN descriptor */
622
+ cudnnStatus_t CUDNNWINAPI
623
+ cudnnDestroyLRNDescriptor(cudnnLRNDescriptor_t lrnDesc);
624
+
625
+ /* LRN functions: output = alpha * normalize(x) + beta * old_y */
626
+
627
+ /* LRN cross-channel forward computation. Double parameters cast to tensor data type */
628
+ cudnnStatus_t CUDNNWINAPI
629
+ cudnnLRNCrossChannelForward(cudnnHandle_t handle,
630
+ cudnnLRNDescriptor_t normDesc,
631
+ cudnnLRNMode_t lrnMode,
632
+ const void *alpha,
633
+ const cudnnTensorDescriptor_t xDesc,
634
+ const void *x,
635
+ const void *beta,
636
+ const cudnnTensorDescriptor_t yDesc,
637
+ void *y);
638
+
639
+ typedef enum {
640
+ CUDNN_DIVNORM_PRECOMPUTED_MEANS = 0,
641
+ } cudnnDivNormMode_t;
642
+
643
+ /* LCN/divisive normalization functions: y = alpha * normalize(x) + beta * y */
644
+ cudnnStatus_t CUDNNWINAPI
645
+ cudnnDivisiveNormalizationForward(cudnnHandle_t handle,
646
+ cudnnLRNDescriptor_t normDesc,
647
+ cudnnDivNormMode_t mode,
648
+ const void *alpha,
649
+ const cudnnTensorDescriptor_t xDesc, /* same desc for means, temp, temp2 */
650
+ const void *x,
651
+ const void *means, /* if NULL, means are assumed to be zero */
652
+ void *temp,
653
+ void *temp2,
654
+ const void *beta,
655
+ const cudnnTensorDescriptor_t yDesc,
656
+ void *y);
657
+
658
+ typedef enum {
659
+ /* bnScale, bnBias tensor dims are 1xCxHxWx.. (one value per CHW...-slice, normalized over N slice) */
660
+ CUDNN_BATCHNORM_PER_ACTIVATION = 0,
661
+
662
+ /* bnScale, bnBias tensor dims are 1xCx1x1 (one value per C-dim normalized over Nx1xHxW subtensors) */
663
+ CUDNN_BATCHNORM_SPATIAL = 1,
664
+
665
+ /*
666
+ * bnScale, bnBias tensor dims are 1xCx1x1 (one value per C-dim normalized over Nx1xHxW subtensors).
667
+ * May be faster than CUDNN_BATCHNORM_SPATIAL but imposes some limits on the range of values
668
+ */
669
+ CUDNN_BATCHNORM_SPATIAL_PERSISTENT = 2,
670
+ } cudnnBatchNormMode_t CUDNN_DEPRECATED;
671
+
672
+ #define CUDNN_BN_MIN_EPSILON 0.0 /* Minimum epsilon allowed to be used in the Batch Normalization formula */
673
+
674
+ /*
675
+ * Derives a tensor descriptor from layer data descriptor for BatchNormalization
676
+ * scale, invVariance, bnBias, bnScale tensors. Use this tensor desc for
677
+ * bnScaleBiasMeanVarDesc and bnScaleBiasDiffDesc in Batch Normalization forward and backward functions.
678
+ */
679
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
680
+ cudnnDeriveBNTensorDescriptor(cudnnTensorDescriptor_t derivedBnDesc,
681
+ const cudnnTensorDescriptor_t xDesc,
682
+ cudnnBatchNormMode_t mode);
683
+
684
+ typedef enum {
685
+ CUDNN_BATCHNORM_OPS_BN = 0, /* do batch normalization only */
686
+ CUDNN_BATCHNORM_OPS_BN_ACTIVATION = 1, /* do batchNorm, then activation */
687
+ CUDNN_BATCHNORM_OPS_BN_ADD_ACTIVATION = 2, /* do batchNorm, then elemWiseAdd, then activation */
688
+ } cudnnBatchNormOps_t CUDNN_DEPRECATED;
689
+
690
+ /*
691
+ * Performs Batch Normalization during Inference:
692
+ * y[i] = bnScale[k]*(x[i]-estimatedMean[k])/sqrt(epsilon+estimatedVariance[k]) + bnBias[k]
693
+ * with bnScale, bnBias, runningMean, runningInvVariance tensors indexed
694
+ * according to spatial or per-activation mode. Refer to cudnnBatchNormalizationForwardTraining
695
+ * above for notes on function arguments.
696
+ */
697
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
698
+ cudnnBatchNormalizationForwardInference(cudnnHandle_t handle,
699
+ cudnnBatchNormMode_t mode,
700
+ const void *alpha, /* alpha[0] = result blend factor */
701
+ const void *beta, /* beta[0] = dest layer blend factor */
702
+ const cudnnTensorDescriptor_t xDesc,
703
+ const void *x, /* NxCxHxW */
704
+ const cudnnTensorDescriptor_t yDesc,
705
+ void *y, /* NxCxHxW */
706
+ const cudnnTensorDescriptor_t bnScaleBiasMeanVarDesc,
707
+ const void *bnScale,
708
+ const void *bnBias,
709
+ const void *estimatedMean,
710
+ const void *estimatedVariance,
711
+ double epsilon);
712
+
713
+ typedef enum {
714
+ /* bnScale, bnBias tensor dims are 1xCxHxWx.. (one value per CHW...-slice, normalized over N slice) */
715
+ CUDNN_NORM_PER_ACTIVATION = 0,
716
+
717
+ /* bnScale, bnBias tensor dims are 1xCx1x1 (one value per C-dim normalized over Nx1xHxW subtensors) */
718
+ CUDNN_NORM_PER_CHANNEL = 1,
719
+ } cudnnNormMode_t CUDNN_DEPRECATED;
720
+
721
+ typedef enum { CUDNN_NORM_ALGO_STANDARD = 0, CUDNN_NORM_ALGO_PERSIST = 1 } cudnnNormAlgo_t CUDNN_DEPRECATED;
722
+
723
+ /*
724
+ * Derives a tensor descriptor from layer data descriptor for Normalization
725
+ * scale, invVariance, bnBias, bnScale tensors. Use this tensor desc for
726
+ * normScaleBiasMeanVarDesc and normScaleBiasDiffDesc in Normalization forward and backward functions.
727
+ */
728
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
729
+ cudnnDeriveNormTensorDescriptor(cudnnTensorDescriptor_t derivedNormScaleBiasDesc,
730
+ cudnnTensorDescriptor_t derivedNormMeanVarDesc,
731
+ const cudnnTensorDescriptor_t xDesc,
732
+ cudnnNormMode_t mode,
733
+ int groupCnt); /* Place hold for future work, should be set to 1 now*/
734
+
735
+ typedef enum {
736
+ CUDNN_NORM_OPS_NORM = 0, /* do normalization only */
737
+ CUDNN_NORM_OPS_NORM_ACTIVATION = 1, /* do Norm, then activation */
738
+ CUDNN_NORM_OPS_NORM_ADD_ACTIVATION = 2, /* do Norm, then elemWiseAdd, then activation */
739
+ } cudnnNormOps_t CUDNN_DEPRECATED;
740
+
741
+ /*
742
+ * Performs Normalization during Inference:
743
+ * y[i] = normScale[k]*(x[i]-estimatedMean[k])/sqrt(epsilon+estimatedVariance[k]) + normBias[k]
744
+ * with normScale, normBias, runningMean, runningInvVariance tensors indexed
745
+ * according to per-channel or per-activation mode. Refer to cudnnNormalizationForwardTraining
746
+ * above for notes on function arguments.
747
+ */
748
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
749
+ cudnnNormalizationForwardInference(cudnnHandle_t handle,
750
+ cudnnNormMode_t mode,
751
+ cudnnNormOps_t normOps,
752
+ cudnnNormAlgo_t algo,
753
+ const void *alpha, /* alpha[0] = result blend factor */
754
+ const void *beta, /* beta[0] = dest layer blend factor */
755
+ const cudnnTensorDescriptor_t xDesc,
756
+ const void *x, /* NxCxHxW */
757
+ const cudnnTensorDescriptor_t normScaleBiasDesc,
758
+ const void *normScale,
759
+ const void *normBias,
760
+ const cudnnTensorDescriptor_t normMeanVarDesc,
761
+ const void *estimatedMean,
762
+ const void *estimatedVariance,
763
+ const cudnnTensorDescriptor_t zDesc,
764
+ const void *z,
765
+ cudnnActivationDescriptor_t activationDesc,
766
+ const cudnnTensorDescriptor_t yDesc,
767
+ void *y, /* NxCxHxW */
768
+ double epsilon,
769
+ int groupCnt); /* Place hold for future work*/
770
+
771
+ /* APIs for spatial transformer network*/
772
+ typedef enum {
773
+ CUDNN_SAMPLER_BILINEAR = 0,
774
+ } cudnnSamplerType_t;
775
+
776
+ cudnnStatus_t CUDNNWINAPI
777
+ cudnnCreateSpatialTransformerDescriptor(cudnnSpatialTransformerDescriptor_t *stDesc);
778
+
779
+ cudnnStatus_t CUDNNWINAPI
780
+ cudnnSetSpatialTransformerNdDescriptor(cudnnSpatialTransformerDescriptor_t stDesc,
781
+ cudnnSamplerType_t samplerType,
782
+ cudnnDataType_t dataType,
783
+ const int nbDims,
784
+ const int dimA[]);
785
+
786
+ cudnnStatus_t CUDNNWINAPI
787
+ cudnnDestroySpatialTransformerDescriptor(cudnnSpatialTransformerDescriptor_t stDesc);
788
+
789
+ cudnnStatus_t CUDNNWINAPI
790
+ cudnnSpatialTfGridGeneratorForward(cudnnHandle_t handle,
791
+ const cudnnSpatialTransformerDescriptor_t stDesc,
792
+ const void *theta,
793
+ void *grid);
794
+
795
+ cudnnStatus_t CUDNNWINAPI
796
+ cudnnSpatialTfSamplerForward(cudnnHandle_t handle,
797
+ cudnnSpatialTransformerDescriptor_t stDesc,
798
+ const void *alpha,
799
+ const cudnnTensorDescriptor_t xDesc,
800
+ const void *x,
801
+ const void *grid,
802
+ const void *beta,
803
+ cudnnTensorDescriptor_t yDesc,
804
+ void *y);
805
+
806
+ typedef struct cudnnDropoutStruct *cudnnDropoutDescriptor_t;
807
+
808
+ cudnnStatus_t CUDNNWINAPI
809
+ cudnnCreateDropoutDescriptor(cudnnDropoutDescriptor_t *dropoutDesc);
810
+
811
+ cudnnStatus_t CUDNNWINAPI
812
+ cudnnDestroyDropoutDescriptor(cudnnDropoutDescriptor_t dropoutDesc);
813
+
814
+ /*helper function to determine size of the states to be passed to cudnnSetDropoutDescriptor */
815
+ cudnnStatus_t CUDNNWINAPI
816
+ cudnnDropoutGetStatesSize(cudnnHandle_t handle, size_t *sizeInBytes);
817
+
818
+ /*helper function to determine size of the reserve space to be passed to dropout forward/backward calls */
819
+ cudnnStatus_t CUDNNWINAPI
820
+ cudnnDropoutGetReserveSpaceSize(cudnnTensorDescriptor_t xdesc, size_t *sizeInBytes);
821
+
822
+ cudnnStatus_t CUDNNWINAPI
823
+ cudnnSetDropoutDescriptor(cudnnDropoutDescriptor_t dropoutDesc,
824
+ cudnnHandle_t handle,
825
+ float dropout,
826
+ void *states,
827
+ size_t stateSizeInBytes,
828
+ unsigned long long seed);
829
+
830
+ /* Restores the dropout descriptor to a previously saved-off state */
831
+ cudnnStatus_t CUDNNWINAPI
832
+ cudnnRestoreDropoutDescriptor(cudnnDropoutDescriptor_t dropoutDesc,
833
+ cudnnHandle_t handle,
834
+ float dropout,
835
+ void *states,
836
+ size_t stateSizeInBytes,
837
+ unsigned long long seed);
838
+
839
+ cudnnStatus_t CUDNNWINAPI
840
+ cudnnGetDropoutDescriptor(cudnnDropoutDescriptor_t dropoutDesc,
841
+ cudnnHandle_t handle,
842
+ float *dropout,
843
+ void **states,
844
+ unsigned long long *seed);
845
+
846
+ cudnnStatus_t CUDNNWINAPI
847
+ cudnnDropoutForward(cudnnHandle_t handle,
848
+ const cudnnDropoutDescriptor_t dropoutDesc,
849
+ const cudnnTensorDescriptor_t xdesc,
850
+ const void *x,
851
+ const cudnnTensorDescriptor_t ydesc,
852
+ void *y,
853
+ void *reserveSpace,
854
+ size_t reserveSpaceSizeInBytes);
855
+
856
+ /* TODO: move these enums out to the appropriate submodule */
857
+ typedef enum {
858
+ CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM = 0,
859
+ CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM = 1,
860
+ CUDNN_CONVOLUTION_FWD_ALGO_GEMM = 2,
861
+ CUDNN_CONVOLUTION_FWD_ALGO_DIRECT = 3,
862
+ CUDNN_CONVOLUTION_FWD_ALGO_FFT = 4,
863
+ CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING = 5,
864
+ CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD = 6,
865
+ CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD_NONFUSED = 7,
866
+ CUDNN_CONVOLUTION_FWD_ALGO_COUNT = 8
867
+ } cudnnConvolutionFwdAlgo_t;
868
+
869
+ typedef enum {
870
+ CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0 = 0, /* non-deterministic */
871
+ CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1 = 1,
872
+ CUDNN_CONVOLUTION_BWD_FILTER_ALGO_FFT = 2,
873
+ CUDNN_CONVOLUTION_BWD_FILTER_ALGO_3 = 3, /* non-deterministic */
874
+ CUDNN_CONVOLUTION_BWD_FILTER_ALGO_WINOGRAD = 4, /* not implemented */
875
+ CUDNN_CONVOLUTION_BWD_FILTER_ALGO_WINOGRAD_NONFUSED = 5,
876
+ CUDNN_CONVOLUTION_BWD_FILTER_ALGO_FFT_TILING = 6,
877
+ CUDNN_CONVOLUTION_BWD_FILTER_ALGO_COUNT = 7
878
+ } cudnnConvolutionBwdFilterAlgo_t;
879
+
880
+ typedef enum {
881
+ CUDNN_CONVOLUTION_BWD_DATA_ALGO_0 = 0, /* non-deterministic */
882
+ CUDNN_CONVOLUTION_BWD_DATA_ALGO_1 = 1,
883
+ CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT = 2,
884
+ CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT_TILING = 3,
885
+ CUDNN_CONVOLUTION_BWD_DATA_ALGO_WINOGRAD = 4,
886
+ CUDNN_CONVOLUTION_BWD_DATA_ALGO_WINOGRAD_NONFUSED = 5,
887
+ CUDNN_CONVOLUTION_BWD_DATA_ALGO_COUNT = 6
888
+ } cudnnConvolutionBwdDataAlgo_t;
889
+
890
+ typedef enum { CUDNN_CTC_LOSS_ALGO_DETERMINISTIC = 0, CUDNN_CTC_LOSS_ALGO_NON_DETERMINISTIC = 1 } cudnnCTCLossAlgo_t;
891
+
892
+ /*
893
+ * \brief Cross-library version checker.
894
+ * This function is implemented differently in each sub-library. Each sublib
895
+ * checks whether its own version matches that of its dependencies.
896
+ * \returns CUDNN_STATUS_SUCCESS if the version check passes,
897
+ * CUDNN_STATUS_SUBLIBRARY_VERSION_MISMATCH if the versions are inconsistent.
898
+ */
899
+ cudnnStatus_t CUDNNWINAPI
900
+ cudnnOpsVersionCheck(void);
901
+
902
+ /* Function to perform backward softmax */
903
+ cudnnStatus_t CUDNNWINAPI
904
+ cudnnSoftmaxBackward(cudnnHandle_t handle,
905
+ cudnnSoftmaxAlgorithm_t algo,
906
+ cudnnSoftmaxMode_t mode,
907
+ const void *alpha,
908
+ const cudnnTensorDescriptor_t yDesc,
909
+ const void *y,
910
+ const cudnnTensorDescriptor_t dyDesc,
911
+ const void *dy,
912
+ const void *beta,
913
+ const cudnnTensorDescriptor_t dxDesc,
914
+ void *dx);
915
+
916
+ /* Function to perform backward pooling */
917
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
918
+ cudnnPoolingBackward(cudnnHandle_t handle,
919
+ const cudnnPoolingDescriptor_t poolingDesc,
920
+ const void *alpha,
921
+ const cudnnTensorDescriptor_t yDesc,
922
+ const void *y,
923
+ const cudnnTensorDescriptor_t dyDesc,
924
+ const void *dy,
925
+ const cudnnTensorDescriptor_t xDesc,
926
+ const void *x,
927
+ const void *beta,
928
+ const cudnnTensorDescriptor_t dxDesc,
929
+ void *dx);
930
+
931
+ /* Function to perform backward activation */
932
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
933
+ cudnnActivationBackward(cudnnHandle_t handle,
934
+ cudnnActivationDescriptor_t activationDesc,
935
+ const void *alpha,
936
+ const cudnnTensorDescriptor_t yDesc,
937
+ const void *y,
938
+ const cudnnTensorDescriptor_t dyDesc,
939
+ const void *dy,
940
+ const cudnnTensorDescriptor_t xDesc,
941
+ const void *x,
942
+ const void *beta,
943
+ const cudnnTensorDescriptor_t dxDesc,
944
+ void *dx);
945
+
946
+ /* LRN cross-channel backward computation. Double parameters cast to tensor data type */
947
+ cudnnStatus_t CUDNNWINAPI
948
+ cudnnLRNCrossChannelBackward(cudnnHandle_t handle,
949
+ cudnnLRNDescriptor_t normDesc,
950
+ cudnnLRNMode_t lrnMode,
951
+ const void *alpha,
952
+ const cudnnTensorDescriptor_t yDesc,
953
+ const void *y,
954
+ const cudnnTensorDescriptor_t dyDesc,
955
+ const void *dy,
956
+ const cudnnTensorDescriptor_t xDesc,
957
+ const void *x,
958
+ const void *beta,
959
+ const cudnnTensorDescriptor_t dxDesc,
960
+ void *dx);
961
+
962
+ cudnnStatus_t CUDNNWINAPI
963
+ cudnnDivisiveNormalizationBackward(cudnnHandle_t handle,
964
+ cudnnLRNDescriptor_t normDesc,
965
+ cudnnDivNormMode_t mode,
966
+ const void *alpha,
967
+ const cudnnTensorDescriptor_t xDesc, /* same desc for x, means, dy, temp, temp2 */
968
+ const void *x,
969
+ const void *means, /* if NULL, means are assumed to be zero */
970
+ const void *dy,
971
+ void *temp,
972
+ void *temp2,
973
+ const void *beta,
974
+ const cudnnTensorDescriptor_t dXdMeansDesc, /* same desc for dx, dMeans */
975
+ void *dx, /* output x differential */
976
+ void *dMeans); /* output means differential, can be NULL */
977
+
978
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
979
+ cudnnGetBatchNormalizationForwardTrainingExWorkspaceSize(cudnnHandle_t handle,
980
+ cudnnBatchNormMode_t mode,
981
+ cudnnBatchNormOps_t bnOps,
982
+ const cudnnTensorDescriptor_t xDesc,
983
+ const cudnnTensorDescriptor_t zDesc,
984
+ const cudnnTensorDescriptor_t yDesc,
985
+ const cudnnTensorDescriptor_t bnScaleBiasMeanVarDesc,
986
+ const cudnnActivationDescriptor_t activationDesc,
987
+ size_t *sizeInBytes);
988
+
989
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
990
+ cudnnGetBatchNormalizationBackwardExWorkspaceSize(cudnnHandle_t handle,
991
+ cudnnBatchNormMode_t mode,
992
+ cudnnBatchNormOps_t bnOps,
993
+ const cudnnTensorDescriptor_t xDesc,
994
+ const cudnnTensorDescriptor_t yDesc,
995
+ const cudnnTensorDescriptor_t dyDesc,
996
+ const cudnnTensorDescriptor_t dzDesc,
997
+ const cudnnTensorDescriptor_t dxDesc,
998
+ const cudnnTensorDescriptor_t dBnScaleBiasDesc,
999
+ const cudnnActivationDescriptor_t activationDesc,
1000
+ size_t *sizeInBytes);
1001
+
1002
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
1003
+ cudnnGetBatchNormalizationTrainingExReserveSpaceSize(cudnnHandle_t handle,
1004
+ cudnnBatchNormMode_t mode,
1005
+ cudnnBatchNormOps_t bnOps,
1006
+ const cudnnActivationDescriptor_t activationDesc,
1007
+ const cudnnTensorDescriptor_t xDesc,
1008
+ size_t *sizeInBytes);
1009
+
1010
+ /* Computes y = BN(x). Also accumulates moving averages of mean and inverse variances */
1011
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
1012
+ cudnnBatchNormalizationForwardTraining(
1013
+ cudnnHandle_t handle,
1014
+ cudnnBatchNormMode_t mode,
1015
+
1016
+ const void *alpha, /* alpha[0] = result blend factor */
1017
+ const void *beta, /* beta[0] = dest layer blend factor */
1018
+
1019
+ const cudnnTensorDescriptor_t xDesc,
1020
+ const void *x, /* NxCxHxW */
1021
+ const cudnnTensorDescriptor_t yDesc,
1022
+ void *y, /* NxCxHxW */
1023
+
1024
+ /* Shared desc for the next 6 tensors in the argument list.
1025
+ Data type to be set as follows:
1026
+ type = (typeOf(x) == double) ? double : float
1027
+ Dimensions for this descriptor depend on normalization mode
1028
+ - Spatial Normalization : tensors are expected to have dims 1xCx1x1
1029
+ (normalization is performed across NxHxW)
1030
+ - Per-Activation Normalization : tensors are expected to have dims of 1xCxHxW
1031
+ (normalization is performed across N) */
1032
+ const cudnnTensorDescriptor_t bnScaleBiasMeanVarDesc,
1033
+
1034
+ /* 'Gamma' and 'Beta' respectively in Ioffe and Szegedy's paper's notation */
1035
+ const void *bnScale,
1036
+ const void *bnBias,
1037
+
1038
+ /* MUST use factor=1 in the very first call of a complete training cycle.
1039
+ Use a factor=1/(1+n) at N-th call to the function to get
1040
+ Cumulative Moving Average (CMA) behavior
1041
+ CMA[n] = (x[1]+...+x[n])/n
1042
+ Since CMA[n+1] = (n*CMA[n]+x[n+1])/(n+1) =
1043
+ ((n+1)*CMA[n]-CMA[n])/(n+1) + x[n+1]/(n+1) =
1044
+ CMA[n]*(1-1/(n+1)) + x[n+1]*1/(n+1) */
1045
+ double exponentialAverageFactor,
1046
+
1047
+ /* Used in Training phase only.
1048
+ runningMean = newMean*factor + runningMean*(1-factor) */
1049
+ void *resultRunningMean,
1050
+ /* Output in training mode, input in inference. Is the moving average
1051
+ of variance[x] (factor is applied in the same way as for runningMean) */
1052
+ void *resultRunningVariance,
1053
+
1054
+ /* Has to be >= CUDNN_BN_MIN_EPSILON. Should be the same in forward and backward functions. */
1055
+ double epsilon,
1056
+
1057
+ /* Optionally save intermediate results from the forward pass here
1058
+ - can be reused to speed up backward pass. NULL if unused */
1059
+ void *resultSaveMean,
1060
+ void *resultSaveInvVariance);
1061
+
1062
+ /* Computes y = relu(BN(x) + z). Also accumulates moving averages of mean and inverse variances */
1063
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
1064
+ cudnnBatchNormalizationForwardTrainingEx(
1065
+ cudnnHandle_t handle,
1066
+ cudnnBatchNormMode_t mode,
1067
+ cudnnBatchNormOps_t bnOps,
1068
+
1069
+ const void *alpha, /* alpha[0] = result blend factor */
1070
+ const void *beta, /* beta[0] = dest layer blend factor */
1071
+
1072
+ const cudnnTensorDescriptor_t xDesc,
1073
+ const void *xData,
1074
+ const cudnnTensorDescriptor_t zDesc,
1075
+ const void *zData,
1076
+ const cudnnTensorDescriptor_t yDesc,
1077
+ void *yData,
1078
+
1079
+ const cudnnTensorDescriptor_t bnScaleBiasMeanVarDesc,
1080
+ const void *bnScale,
1081
+ const void *bnBias,
1082
+
1083
+ double exponentialAverageFactor,
1084
+ void *resultRunningMean,
1085
+ void *resultRunningVariance,
1086
+
1087
+ /* Has to be >= CUDNN_BN_MIN_EPSILON. Should be the same in forward and backward functions. */
1088
+ double epsilon,
1089
+
1090
+ /* Optionally save intermediate results from the forward pass here
1091
+ - can be reused to speed up backward pass. NULL if unused */
1092
+ void *resultSaveMean,
1093
+ void *resultSaveInvVariance,
1094
+
1095
+ cudnnActivationDescriptor_t activationDesc,
1096
+ void *workspace,
1097
+ size_t workSpaceSizeInBytes,
1098
+ void *reserveSpace,
1099
+ size_t reserveSpaceSizeInBytes);
1100
+
1101
+ /* Performs backward pass of Batch Normalization layer. Returns x gradient,
1102
+ * bnScale gradient and bnBias gradient */
1103
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
1104
+ cudnnBatchNormalizationBackward(cudnnHandle_t handle,
1105
+ cudnnBatchNormMode_t mode,
1106
+ const void *alphaDataDiff,
1107
+ const void *betaDataDiff,
1108
+ const void *alphaParamDiff,
1109
+ const void *betaParamDiff,
1110
+ const cudnnTensorDescriptor_t xDesc, /* same desc for x, dx, dy */
1111
+ const void *x,
1112
+ const cudnnTensorDescriptor_t dyDesc,
1113
+ const void *dy,
1114
+ const cudnnTensorDescriptor_t dxDesc,
1115
+ void *dx,
1116
+ /* Shared tensor desc for the 4 tensors below */
1117
+ const cudnnTensorDescriptor_t dBnScaleBiasDesc,
1118
+ const void *bnScale, /* bnBias doesn't affect backpropagation */
1119
+ /* scale and bias diff are not backpropagated below this layer */
1120
+ void *dBnScaleResult,
1121
+ void *dBnBiasResult,
1122
+ /* Same epsilon as forward pass */
1123
+ double epsilon,
1124
+
1125
+ /* Optionally cached intermediate results from
1126
+ forward pass */
1127
+ const void *savedMean,
1128
+ const void *savedInvVariance);
1129
+
1130
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
1131
+ cudnnBatchNormalizationBackwardEx(cudnnHandle_t handle,
1132
+ cudnnBatchNormMode_t mode,
1133
+ cudnnBatchNormOps_t bnOps,
1134
+
1135
+ const void *alphaDataDiff,
1136
+ const void *betaDataDiff,
1137
+ const void *alphaParamDiff,
1138
+ const void *betaParamDiff,
1139
+ const cudnnTensorDescriptor_t xDesc,
1140
+ const void *xData,
1141
+ const cudnnTensorDescriptor_t yDesc,
1142
+ const void *yData,
1143
+ const cudnnTensorDescriptor_t dyDesc,
1144
+ const void *dyData,
1145
+ const cudnnTensorDescriptor_t dzDesc,
1146
+ void *dzData,
1147
+ const cudnnTensorDescriptor_t dxDesc,
1148
+ void *dxData,
1149
+
1150
+ /* Shared tensor desc for the 4 tensors below */
1151
+ const cudnnTensorDescriptor_t dBnScaleBiasDesc,
1152
+ const void *bnScaleData,
1153
+ const void *bnBiasData, /* needed if there is activation */
1154
+ void *dBnScaleData,
1155
+ void *dBnBiasData,
1156
+ double epsilon, /* Same epsilon as forward pass */
1157
+
1158
+ /* Optionally cached intermediate results from
1159
+ forward pass */
1160
+ const void *savedMean,
1161
+ const void *savedInvVariance,
1162
+ cudnnActivationDescriptor_t activationDesc,
1163
+ void *workSpace,
1164
+ size_t workSpaceSizeInBytes,
1165
+ void *reserveSpace,
1166
+ size_t reserveSpaceSizeInBytes);
1167
+
1168
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
1169
+ cudnnGetNormalizationForwardTrainingWorkspaceSize(cudnnHandle_t handle,
1170
+ cudnnNormMode_t mode,
1171
+ cudnnNormOps_t normOps,
1172
+ cudnnNormAlgo_t algo,
1173
+ const cudnnTensorDescriptor_t xDesc,
1174
+ const cudnnTensorDescriptor_t zDesc,
1175
+ const cudnnTensorDescriptor_t yDesc,
1176
+ const cudnnTensorDescriptor_t normScaleBiasDesc,
1177
+ const cudnnActivationDescriptor_t activationDesc,
1178
+ const cudnnTensorDescriptor_t normMeanVarDesc,
1179
+ size_t *sizeInBytes,
1180
+ int groupCnt); /* Place hold for future work, should be set to 1 now*/
1181
+
1182
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
1183
+ cudnnGetNormalizationBackwardWorkspaceSize(cudnnHandle_t handle,
1184
+ cudnnNormMode_t mode,
1185
+ cudnnNormOps_t normOps,
1186
+ cudnnNormAlgo_t algo,
1187
+ const cudnnTensorDescriptor_t xDesc,
1188
+ const cudnnTensorDescriptor_t yDesc,
1189
+ const cudnnTensorDescriptor_t dyDesc,
1190
+ const cudnnTensorDescriptor_t dzDesc,
1191
+ const cudnnTensorDescriptor_t dxDesc,
1192
+ const cudnnTensorDescriptor_t dNormScaleBiasDesc,
1193
+ const cudnnActivationDescriptor_t activationDesc,
1194
+ const cudnnTensorDescriptor_t normMeanVarDesc,
1195
+ size_t *sizeInBytes,
1196
+ int groupCnt); /* Place hold for future work, should be set to 1 now*/
1197
+
1198
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
1199
+ cudnnGetNormalizationTrainingReserveSpaceSize(cudnnHandle_t handle,
1200
+ cudnnNormMode_t mode,
1201
+ cudnnNormOps_t normOps,
1202
+ cudnnNormAlgo_t algo,
1203
+ const cudnnActivationDescriptor_t activationDesc,
1204
+ const cudnnTensorDescriptor_t xDesc,
1205
+ size_t *sizeInBytes,
1206
+ int groupCnt); /* Place hold for future work, should be set to 1 now*/
1207
+
1208
+ /* Computes y = relu(Norm(x) + z). Also accumulates moving averages of mean and inverse variances */
1209
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
1210
+ cudnnNormalizationForwardTraining(cudnnHandle_t handle,
1211
+ cudnnNormMode_t mode,
1212
+ cudnnNormOps_t normOps,
1213
+ cudnnNormAlgo_t algo,
1214
+ const void *alpha, /* alpha[0] = result blend factor */
1215
+ const void *beta, /* beta[0] = dest layer blend factor */
1216
+ const cudnnTensorDescriptor_t xDesc,
1217
+ const void *xData,
1218
+ const cudnnTensorDescriptor_t normScaleBiasDesc,
1219
+ const void *normScale,
1220
+ const void *normBias,
1221
+ double exponentialAverageFactor,
1222
+ const cudnnTensorDescriptor_t normMeanVarDesc,
1223
+ void *resultRunningMean,
1224
+ void *resultRunningVariance,
1225
+ /* Has to be >= 0. Should be the same in forward and backward functions. */
1226
+ double epsilon,
1227
+ /* Optionally save intermediate results from the forward pass here
1228
+ - can be reused to speed up backward pass. NULL if unused */
1229
+ void *resultSaveMean,
1230
+ void *resultSaveInvVariance,
1231
+ cudnnActivationDescriptor_t activationDesc,
1232
+ const cudnnTensorDescriptor_t zDesc,
1233
+ const void *zData,
1234
+ const cudnnTensorDescriptor_t yDesc,
1235
+ void *yData,
1236
+ void *workspace,
1237
+ size_t workSpaceSizeInBytes,
1238
+ void *reserveSpace,
1239
+ size_t reserveSpaceSizeInBytes,
1240
+ int groupCnt); /* Place hold for future work, should be set to 1 now*/
1241
+
1242
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
1243
+ cudnnNormalizationBackward(cudnnHandle_t handle,
1244
+ cudnnNormMode_t mode,
1245
+ cudnnNormOps_t normOps,
1246
+ cudnnNormAlgo_t algo,
1247
+ const void *alphaDataDiff,
1248
+ const void *betaDataDiff,
1249
+ const void *alphaParamDiff,
1250
+ const void *betaParamDiff,
1251
+ const cudnnTensorDescriptor_t xDesc,
1252
+ const void *xData,
1253
+ const cudnnTensorDescriptor_t yDesc,
1254
+ const void *yData,
1255
+ const cudnnTensorDescriptor_t dyDesc,
1256
+ const void *dyData,
1257
+ const cudnnTensorDescriptor_t dzDesc,
1258
+ void *dzData,
1259
+ const cudnnTensorDescriptor_t dxDesc,
1260
+ void *dxData,
1261
+ /* Shared tensor desc for the 4 tensors below */
1262
+ const cudnnTensorDescriptor_t dNormScaleBiasDesc,
1263
+ const void *normScaleData,
1264
+ const void *normBiasData, /* needed if there is activation */
1265
+ void *dNormScaleData,
1266
+ void *dNormBiasData,
1267
+ double epsilon, /* Same epsilon as forward pass */
1268
+ const cudnnTensorDescriptor_t normMeanVarDesc,
1269
+ /* Optionally cached intermediate results from
1270
+ forward pass */
1271
+ const void *savedMean,
1272
+ const void *savedInvVariance,
1273
+ cudnnActivationDescriptor_t activationDesc,
1274
+ void *workSpace,
1275
+ size_t workSpaceSizeInBytes,
1276
+ void *reserveSpace,
1277
+ size_t reserveSpaceSizeInBytes,
1278
+ int groupCnt); /* Place hold for future work, should be set to 1 now*/
1279
+
1280
+ cudnnStatus_t CUDNNWINAPI
1281
+ cudnnSpatialTfGridGeneratorBackward(cudnnHandle_t handle,
1282
+ const cudnnSpatialTransformerDescriptor_t stDesc,
1283
+ const void *dgrid,
1284
+ void *dtheta);
1285
+
1286
+ cudnnStatus_t CUDNNWINAPI
1287
+ cudnnSpatialTfSamplerBackward(cudnnHandle_t handle,
1288
+ cudnnSpatialTransformerDescriptor_t stDesc,
1289
+ const void *alpha,
1290
+ const cudnnTensorDescriptor_t xDesc,
1291
+ const void *x,
1292
+ const void *beta,
1293
+ const cudnnTensorDescriptor_t dxDesc,
1294
+ void *dx,
1295
+ const void *alphaDgrid,
1296
+ const cudnnTensorDescriptor_t dyDesc,
1297
+ const void *dy,
1298
+ const void *grid,
1299
+ const void *betaDgrid,
1300
+ void *dgrid);
1301
+
1302
+ cudnnStatus_t CUDNNWINAPI
1303
+ cudnnDropoutBackward(cudnnHandle_t handle,
1304
+ const cudnnDropoutDescriptor_t dropoutDesc,
1305
+ const cudnnTensorDescriptor_t dydesc,
1306
+ const void *dy,
1307
+ const cudnnTensorDescriptor_t dxdesc,
1308
+ void *dx,
1309
+ void *reserveSpace,
1310
+ size_t reserveSpaceSizeInBytes);
1311
+
1312
+ #if defined(__cplusplus)
1313
+ }
1314
+ #endif
1315
+
1316
+ #endif /* CUDNN_OPS_H_ */
.venv/lib/python3.12/site-packages/nvidia/cudnn/include/cudnn_ops_v9.h ADDED
@@ -0,0 +1,1316 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * Copyright 2014-2023 NVIDIA Corporation. All rights reserved.
3
+ *
4
+ * NOTICE TO LICENSEE:
5
+ *
6
+ * This source code and/or documentation ("Licensed Deliverables") are
7
+ * subject to NVIDIA intellectual property rights under U.S. and
8
+ * international Copyright laws.
9
+ *
10
+ * These Licensed Deliverables contained herein is PROPRIETARY and
11
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
12
+ * conditions of a form of NVIDIA software license agreement by and
13
+ * between NVIDIA and Licensee ("License Agreement") or electronically
14
+ * accepted by Licensee. Notwithstanding any terms or conditions to
15
+ * the contrary in the License Agreement, reproduction or disclosure
16
+ * of the Licensed Deliverables to any third party without the express
17
+ * written consent of NVIDIA is prohibited.
18
+ *
19
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
20
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
21
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
22
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
23
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
24
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
25
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
26
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
27
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
28
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
29
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
30
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
31
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
32
+ * OF THESE LICENSED DELIVERABLES.
33
+ *
34
+ * U.S. Government End Users. These Licensed Deliverables are a
35
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
36
+ * 1995), consisting of "commercial computer software" and "commercial
37
+ * computer software documentation" as such terms are used in 48
38
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
39
+ * only as a commercial end item. Consistent with 48 C.F.R.12.212 and
40
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
41
+ * U.S. Government End Users acquire the Licensed Deliverables with
42
+ * only those rights set forth herein.
43
+ *
44
+ * Any use of the Licensed Deliverables in individual and commercial
45
+ * software must include, in the user documentation and internal
46
+ * comments to the code, the above Disclaimer and U.S. Government End
47
+ * Users Notice.
48
+ */
49
+
50
+ /*
51
+ * cudnn_ops : cuDNN's basic definitions and basic operations.
52
+ */
53
+
54
+ #if !defined(CUDNN_OPS_H_)
55
+ #define CUDNN_OPS_H_
56
+
57
+ #include <stdint.h>
58
+
59
+ #include "cudnn_version.h"
60
+ #include "cudnn_graph.h"
61
+
62
+ /* These version numbers are autogenerated, do not edit manually. */
63
+ #define CUDNN_OPS_MAJOR 9
64
+ #define CUDNN_OPS_MINOR 10
65
+ #define CUDNN_OPS_PATCH 2
66
+
67
+ #if (CUDNN_OPS_MAJOR != CUDNN_MAJOR) || (CUDNN_OPS_MINOR != CUDNN_MINOR) || (CUDNN_OPS_PATCH != CUDNN_PATCHLEVEL)
68
+ #error Version mismatch in cuDNN OPS INFER!!!
69
+ #endif
70
+
71
+ #if defined(__cplusplus)
72
+ extern "C" {
73
+ #endif
74
+
75
+ /* Data structures to represent Image/Filter and the Neural Network Layer */
76
+ typedef struct cudnnTensorStruct *cudnnTensorDescriptor_t;
77
+ typedef struct cudnnPoolingStruct *cudnnPoolingDescriptor_t CUDNN_DEPRECATED;
78
+ typedef struct cudnnFilterStruct *cudnnFilterDescriptor_t CUDNN_DEPRECATED;
79
+ typedef struct cudnnLRNStruct *cudnnLRNDescriptor_t;
80
+ typedef struct cudnnActivationStruct *cudnnActivationDescriptor_t CUDNN_DEPRECATED;
81
+ typedef struct cudnnSpatialTransformerStruct *cudnnSpatialTransformerDescriptor_t;
82
+ typedef struct cudnnOpTensorStruct *cudnnOpTensorDescriptor_t CUDNN_DEPRECATED;
83
+ typedef struct cudnnReduceTensorStruct *cudnnReduceTensorDescriptor_t CUDNN_DEPRECATED;
84
+ typedef struct cudnnCTCLossStruct *cudnnCTCLossDescriptor_t;
85
+ typedef struct cudnnTensorTransformStruct *cudnnTensorTransformDescriptor_t CUDNN_DEPRECATED;
86
+ /*
87
+ * CUDNN Determinism
88
+ */
89
+ typedef enum {
90
+ CUDNN_NON_DETERMINISTIC = 0,
91
+ CUDNN_DETERMINISTIC = 1,
92
+ } cudnnDeterminism_t;
93
+
94
+ /* Create an instance of a generic Tensor descriptor */
95
+ cudnnStatus_t CUDNNWINAPI
96
+ cudnnCreateTensorDescriptor(cudnnTensorDescriptor_t *tensorDesc);
97
+
98
+ cudnnStatus_t CUDNNWINAPI
99
+ cudnnSetTensor4dDescriptor(cudnnTensorDescriptor_t tensorDesc,
100
+ cudnnTensorFormat_t format,
101
+ cudnnDataType_t dataType, /* image data type */
102
+ int n, /* number of inputs (batch size) */
103
+ int c, /* number of input feature maps */
104
+ int h, /* height of input section */
105
+ int w); /* width of input section */
106
+
107
+ cudnnStatus_t CUDNNWINAPI
108
+ cudnnSetTensor4dDescriptorEx(cudnnTensorDescriptor_t tensorDesc,
109
+ cudnnDataType_t dataType, /* image data type */
110
+ int n, /* number of inputs (batch size) */
111
+ int c, /* number of input feature maps */
112
+ int h, /* height of input section */
113
+ int w, /* width of input section */
114
+ int nStride,
115
+ int cStride,
116
+ int hStride,
117
+ int wStride);
118
+
119
+ cudnnStatus_t CUDNNWINAPI
120
+ cudnnGetTensor4dDescriptor(const cudnnTensorDescriptor_t tensorDesc,
121
+ cudnnDataType_t *dataType, /* image data type */
122
+ int *n, /* number of inputs (batch size) */
123
+ int *c, /* number of input feature maps */
124
+ int *h, /* height of input section */
125
+ int *w, /* width of input section */
126
+ int *nStride,
127
+ int *cStride,
128
+ int *hStride,
129
+ int *wStride);
130
+
131
+ cudnnStatus_t CUDNNWINAPI
132
+ cudnnSetTensorNdDescriptor(cudnnTensorDescriptor_t tensorDesc,
133
+ cudnnDataType_t dataType,
134
+ int nbDims,
135
+ const int dimA[],
136
+ const int strideA[]);
137
+
138
+ cudnnStatus_t CUDNNWINAPI
139
+ cudnnSetTensorNdDescriptorEx(cudnnTensorDescriptor_t tensorDesc,
140
+ cudnnTensorFormat_t format,
141
+ cudnnDataType_t dataType,
142
+ int nbDims,
143
+ const int dimA[]);
144
+
145
+ cudnnStatus_t CUDNNWINAPI
146
+ cudnnGetTensorNdDescriptor(const cudnnTensorDescriptor_t tensorDesc,
147
+ int nbDimsRequested,
148
+ cudnnDataType_t *dataType,
149
+ int *nbDims,
150
+ int dimA[],
151
+ int strideA[]);
152
+
153
+ cudnnStatus_t CUDNNWINAPI
154
+ cudnnGetTensorSizeInBytes(const cudnnTensorDescriptor_t tensorDesc, size_t *size);
155
+
156
+ /* PixelOffset( n, c, h, w ) = n *input_stride + c * feature_stride + h * h_stride + w * w_stride
157
+
158
+ 1)Example of all images in row major order one batch of features after the other (with an optional padding on row)
159
+ input_stride : c x h x h_stride
160
+ feature_stride : h x h_stride
161
+ h_stride : >= w ( h_stride = w if no padding)
162
+ w_stride : 1
163
+
164
+
165
+ 2)Example of all images in row major with features maps interleaved
166
+ input_stride : c x h x h_stride
167
+ feature_stride : 1
168
+ h_stride : w x c
169
+ w_stride : c
170
+
171
+ 3)Example of all images in column major order one batch of features after the other (with optional padding on column)
172
+ input_stride : c x w x w_stride
173
+ feature_stride : w x w_stride
174
+ h_stride : 1
175
+ w_stride : >= h
176
+
177
+ */
178
+
179
+ /* Destroy an instance of Tensor4d descriptor */
180
+ cudnnStatus_t CUDNNWINAPI
181
+ cudnnDestroyTensorDescriptor(cudnnTensorDescriptor_t tensorDesc);
182
+
183
+ /* Fold/unfold transforms */
184
+ typedef enum {
185
+ CUDNN_TRANSFORM_FOLD = 0U,
186
+ CUDNN_TRANSFORM_UNFOLD = 1U,
187
+ } cudnnFoldingDirection_t;
188
+
189
+ /** Create a destination descriptor for cudnnTransformTensor */
190
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
191
+ cudnnInitTransformDest(const cudnnTensorTransformDescriptor_t transformDesc,
192
+ const cudnnTensorDescriptor_t srcDesc,
193
+ cudnnTensorDescriptor_t destDesc,
194
+ size_t *destSizeInBytes);
195
+
196
+ /** Create an empty tensor transform descriptor */
197
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
198
+ cudnnCreateTensorTransformDescriptor(cudnnTensorTransformDescriptor_t *transformDesc);
199
+
200
+ /** Initialize a previously created tensor transform descriptor. */
201
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
202
+ cudnnSetTensorTransformDescriptor(cudnnTensorTransformDescriptor_t transformDesc,
203
+ const uint32_t nbDims,
204
+ const cudnnTensorFormat_t destFormat,
205
+ const int32_t padBeforeA[],
206
+ const int32_t padAfterA[],
207
+ const uint32_t foldA[],
208
+ const cudnnFoldingDirection_t direction);
209
+
210
+ /**
211
+ * Retrieves the values stored in a previously initialized tensor transform
212
+ * descriptor.
213
+ */
214
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
215
+ cudnnGetTensorTransformDescriptor(cudnnTensorTransformDescriptor_t transformDesc,
216
+ uint32_t nbDimsRequested,
217
+ cudnnTensorFormat_t *destFormat,
218
+ int32_t padBeforeA[],
219
+ int32_t padAfterA[],
220
+ uint32_t foldA[],
221
+ cudnnFoldingDirection_t *direction);
222
+
223
+ /**
224
+ * Destroys a previously created tensor transform descriptor.
225
+ */
226
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
227
+ cudnnDestroyTensorTransformDescriptor(cudnnTensorTransformDescriptor_t transformDesc);
228
+
229
+ /* Tensor layout conversion helper (y = alpha * x + beta * y) */
230
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
231
+ cudnnTransformTensor(cudnnHandle_t handle,
232
+ const void *alpha,
233
+ const cudnnTensorDescriptor_t xDesc,
234
+ const void *x,
235
+ const void *beta,
236
+ const cudnnTensorDescriptor_t yDesc,
237
+ void *y);
238
+
239
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
240
+ cudnnTransformTensorEx(cudnnHandle_t handle,
241
+ const cudnnTensorTransformDescriptor_t transDesc,
242
+ const void *alpha,
243
+ const cudnnTensorDescriptor_t srcDesc,
244
+ const void *srcData,
245
+ const void *beta,
246
+ const cudnnTensorDescriptor_t destDesc,
247
+ void *destData);
248
+
249
+ /* Tensor Bias addition : C = alpha * A + beta * C */
250
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
251
+ cudnnAddTensor(cudnnHandle_t handle,
252
+ const void *alpha,
253
+ const cudnnTensorDescriptor_t aDesc,
254
+ const void *A,
255
+ const void *beta,
256
+ const cudnnTensorDescriptor_t cDesc,
257
+ void *C);
258
+
259
+ /*
260
+ * CUDNN OpTensor op type
261
+ */
262
+ typedef enum {
263
+ CUDNN_OP_TENSOR_ADD = 0,
264
+ CUDNN_OP_TENSOR_MUL = 1,
265
+ CUDNN_OP_TENSOR_MIN = 2,
266
+ CUDNN_OP_TENSOR_MAX = 3,
267
+ CUDNN_OP_TENSOR_SQRT = 4,
268
+ CUDNN_OP_TENSOR_NOT = 5,
269
+ } cudnnOpTensorOp_t;
270
+
271
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
272
+ cudnnCreateOpTensorDescriptor(cudnnOpTensorDescriptor_t *opTensorDesc);
273
+
274
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
275
+ cudnnSetOpTensorDescriptor(cudnnOpTensorDescriptor_t opTensorDesc,
276
+ cudnnOpTensorOp_t opTensorOp,
277
+ cudnnDataType_t opTensorCompType,
278
+ cudnnNanPropagation_t opTensorNanOpt);
279
+
280
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
281
+ cudnnGetOpTensorDescriptor(const cudnnOpTensorDescriptor_t opTensorDesc,
282
+ cudnnOpTensorOp_t *opTensorOp,
283
+ cudnnDataType_t *opTensorCompType,
284
+ cudnnNanPropagation_t *opTensorNanOpt);
285
+
286
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
287
+ cudnnDestroyOpTensorDescriptor(cudnnOpTensorDescriptor_t opTensorDesc);
288
+
289
+ /* Tensor operation : C = op( alpha1 * A, alpha2 * B ) + beta * C */
290
+ /* B tensor is ignored for CUDNN_OP_TENSOR_SQRT, CUDNN_OP_TENSOR_NOT. */
291
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
292
+ cudnnOpTensor(cudnnHandle_t handle,
293
+ const cudnnOpTensorDescriptor_t opTensorDesc,
294
+ const void *alpha1,
295
+ const cudnnTensorDescriptor_t aDesc,
296
+ const void *A,
297
+ const void *alpha2,
298
+ const cudnnTensorDescriptor_t bDesc,
299
+ const void *B,
300
+ const void *beta,
301
+ const cudnnTensorDescriptor_t cDesc,
302
+ void *C);
303
+
304
+ /*
305
+ * CUDNN ReduceTensor indices type
306
+ */
307
+ typedef enum {
308
+ CUDNN_REDUCE_TENSOR_NO_INDICES = 0,
309
+ CUDNN_REDUCE_TENSOR_FLATTENED_INDICES = 1,
310
+ } cudnnReduceTensorIndices_t CUDNN_DEPRECATED;
311
+
312
+ /*
313
+ * CUDNN tensor indices type size (all unsigned)
314
+ * Currently not supported, default is 32 bit unsigned.
315
+ */
316
+ typedef enum {
317
+ CUDNN_32BIT_INDICES = 0,
318
+ CUDNN_64BIT_INDICES = 1,
319
+ CUDNN_16BIT_INDICES = 2,
320
+ CUDNN_8BIT_INDICES = 3,
321
+ } cudnnIndicesType_t CUDNN_DEPRECATED;
322
+
323
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
324
+ cudnnCreateReduceTensorDescriptor(cudnnReduceTensorDescriptor_t *reduceTensorDesc);
325
+
326
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
327
+ cudnnSetReduceTensorDescriptor(cudnnReduceTensorDescriptor_t reduceTensorDesc,
328
+ cudnnReduceTensorOp_t reduceTensorOp,
329
+ cudnnDataType_t reduceTensorCompType,
330
+ cudnnNanPropagation_t reduceTensorNanOpt,
331
+ cudnnReduceTensorIndices_t reduceTensorIndices,
332
+ cudnnIndicesType_t reduceTensorIndicesType);
333
+
334
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
335
+ cudnnGetReduceTensorDescriptor(const cudnnReduceTensorDescriptor_t reduceTensorDesc,
336
+ cudnnReduceTensorOp_t *reduceTensorOp,
337
+ cudnnDataType_t *reduceTensorCompType,
338
+ cudnnNanPropagation_t *reduceTensorNanOpt,
339
+ cudnnReduceTensorIndices_t *reduceTensorIndices,
340
+ cudnnIndicesType_t *reduceTensorIndicesType);
341
+
342
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
343
+ cudnnDestroyReduceTensorDescriptor(cudnnReduceTensorDescriptor_t reduceTensorDesc);
344
+
345
+ /* Helper function to return the minimum size of the index space to be passed to the reduction given the input and
346
+ * output tensors */
347
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
348
+ cudnnGetReductionIndicesSize(cudnnHandle_t handle,
349
+ const cudnnReduceTensorDescriptor_t reduceTensorDesc,
350
+ const cudnnTensorDescriptor_t aDesc,
351
+ const cudnnTensorDescriptor_t cDesc,
352
+ size_t *sizeInBytes);
353
+
354
+ /* Helper function to return the minimum size of the workspace to be passed to the reduction given the input and output
355
+ * tensors */
356
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
357
+ cudnnGetReductionWorkspaceSize(cudnnHandle_t handle,
358
+ const cudnnReduceTensorDescriptor_t reduceTensorDesc,
359
+ const cudnnTensorDescriptor_t aDesc,
360
+ const cudnnTensorDescriptor_t cDesc,
361
+ size_t *sizeInBytes);
362
+
363
+ /* Tensor operation : C = reduce op( alpha * A ) + beta * C */
364
+ /* The NaN propagation enum applies to only the min and max reduce ops; the other reduce ops propagate NaN as usual. */
365
+ /* The indices space is ignored for reduce ops other than min or max. */
366
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
367
+ cudnnReduceTensor(cudnnHandle_t handle,
368
+ const cudnnReduceTensorDescriptor_t reduceTensorDesc,
369
+ void *indices,
370
+ size_t indicesSizeInBytes,
371
+ void *workspace,
372
+ size_t workspaceSizeInBytes,
373
+ const void *alpha,
374
+ const cudnnTensorDescriptor_t aDesc,
375
+ const void *A,
376
+ const void *beta,
377
+ const cudnnTensorDescriptor_t cDesc,
378
+ void *C);
379
+
380
+ /* Set all values of a tensor to a given value : y[i] = value[0] */
381
+ cudnnStatus_t CUDNNWINAPI
382
+ cudnnSetTensor(cudnnHandle_t handle, const cudnnTensorDescriptor_t yDesc, void *y, const void *valuePtr);
383
+
384
+ /* Scale all values of a tensor by a given factor : y[i] = alpha * y[i] */
385
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
386
+ cudnnScaleTensor(cudnnHandle_t handle, const cudnnTensorDescriptor_t yDesc, void *y, const void *alpha);
387
+
388
+ /* Create an instance of FilterStruct */
389
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
390
+ cudnnCreateFilterDescriptor(cudnnFilterDescriptor_t *filterDesc);
391
+
392
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
393
+ cudnnSetFilter4dDescriptor(cudnnFilterDescriptor_t filterDesc,
394
+ cudnnDataType_t dataType, /* image data type */
395
+ cudnnTensorFormat_t format,
396
+ int k, /* number of output feature maps */
397
+ int c, /* number of input feature maps */
398
+ int h, /* height of each input filter */
399
+ int w); /* width of each input filter */
400
+
401
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
402
+ cudnnGetFilter4dDescriptor(const cudnnFilterDescriptor_t filterDesc,
403
+ cudnnDataType_t *dataType, /* image data type */
404
+ cudnnTensorFormat_t *format,
405
+ int *k, /* number of output feature maps */
406
+ int *c, /* number of input feature maps */
407
+ int *h, /* height of each input filter */
408
+ int *w); /* width of each input filter */
409
+
410
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
411
+ cudnnSetFilterNdDescriptor(cudnnFilterDescriptor_t filterDesc,
412
+ cudnnDataType_t dataType, /* image data type */
413
+ cudnnTensorFormat_t format,
414
+ int nbDims,
415
+ const int filterDimA[]);
416
+
417
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
418
+ cudnnGetFilterNdDescriptor(const cudnnFilterDescriptor_t filterDesc,
419
+ int nbDimsRequested,
420
+ cudnnDataType_t *dataType, /* image data type */
421
+ cudnnTensorFormat_t *format,
422
+ int *nbDims,
423
+ int filterDimA[]);
424
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
425
+ cudnnGetFilterSizeInBytes(const cudnnFilterDescriptor_t filterDesc, size_t *size);
426
+
427
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
428
+ cudnnTransformFilter(cudnnHandle_t handle,
429
+ const cudnnTensorTransformDescriptor_t transDesc,
430
+ const void *alpha,
431
+ const cudnnFilterDescriptor_t srcDesc,
432
+ const void *srcData,
433
+ const void *beta,
434
+ const cudnnFilterDescriptor_t destDesc,
435
+ void *destData);
436
+
437
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
438
+ cudnnDestroyFilterDescriptor(cudnnFilterDescriptor_t filterDesc);
439
+
440
+ /*
441
+ * softmax algorithm
442
+ */
443
+ typedef enum {
444
+ CUDNN_SOFTMAX_FAST = 0, /* straightforward implementation */
445
+ CUDNN_SOFTMAX_ACCURATE = 1, /* subtract max from every point to avoid overflow */
446
+ CUDNN_SOFTMAX_LOG = 2
447
+ } cudnnSoftmaxAlgorithm_t;
448
+
449
+ typedef enum {
450
+ CUDNN_SOFTMAX_MODE_INSTANCE = 0, /* compute the softmax over all C, H, W for each N */
451
+ CUDNN_SOFTMAX_MODE_CHANNEL = 1 /* compute the softmax over all C for each H, W, N */
452
+ } cudnnSoftmaxMode_t;
453
+
454
+ /* Softmax functions: All of the form "output = alpha * Op(inputs) + beta * output" */
455
+
456
+ /* Function to perform forward softmax */
457
+ cudnnStatus_t CUDNNWINAPI
458
+ cudnnSoftmaxForward(cudnnHandle_t handle,
459
+ cudnnSoftmaxAlgorithm_t algo,
460
+ cudnnSoftmaxMode_t mode,
461
+ const void *alpha,
462
+ const cudnnTensorDescriptor_t xDesc,
463
+ const void *x,
464
+ const void *beta,
465
+ const cudnnTensorDescriptor_t yDesc,
466
+ void *y);
467
+
468
+ /*
469
+ * pooling mode
470
+ */
471
+ typedef enum {
472
+ CUDNN_POOLING_MAX = 0,
473
+ CUDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING = 1, /* count for average includes padded values */
474
+ CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING = 2, /* count for average does not include padded values */
475
+ CUDNN_POOLING_MAX_DETERMINISTIC = 3
476
+ } cudnnPoolingMode_t CUDNN_DEPRECATED;
477
+
478
+ /* Create an instance of pooling descriptor */
479
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
480
+ cudnnCreatePoolingDescriptor(cudnnPoolingDescriptor_t *poolingDesc);
481
+
482
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
483
+ cudnnSetPooling2dDescriptor(cudnnPoolingDescriptor_t poolingDesc,
484
+ cudnnPoolingMode_t mode,
485
+ cudnnNanPropagation_t maxpoolingNanOpt,
486
+ int windowHeight,
487
+ int windowWidth,
488
+ int verticalPadding,
489
+ int horizontalPadding,
490
+ int verticalStride,
491
+ int horizontalStride);
492
+
493
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
494
+ cudnnGetPooling2dDescriptor(const cudnnPoolingDescriptor_t poolingDesc,
495
+ cudnnPoolingMode_t *mode,
496
+ cudnnNanPropagation_t *maxpoolingNanOpt,
497
+ int *windowHeight,
498
+ int *windowWidth,
499
+ int *verticalPadding,
500
+ int *horizontalPadding,
501
+ int *verticalStride,
502
+ int *horizontalStride);
503
+
504
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
505
+ cudnnSetPoolingNdDescriptor(cudnnPoolingDescriptor_t poolingDesc,
506
+ const cudnnPoolingMode_t mode,
507
+ const cudnnNanPropagation_t maxpoolingNanOpt,
508
+ int nbDims,
509
+ const int windowDimA[],
510
+ const int paddingA[],
511
+ const int strideA[]);
512
+
513
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
514
+ cudnnGetPoolingNdDescriptor(const cudnnPoolingDescriptor_t poolingDesc,
515
+ int nbDimsRequested,
516
+ cudnnPoolingMode_t *mode,
517
+ cudnnNanPropagation_t *maxpoolingNanOpt,
518
+ int *nbDims,
519
+ int windowDimA[],
520
+ int paddingA[],
521
+ int strideA[]);
522
+
523
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
524
+ cudnnGetPoolingNdForwardOutputDim(const cudnnPoolingDescriptor_t poolingDesc,
525
+ const cudnnTensorDescriptor_t inputTensorDesc,
526
+ int nbDims,
527
+ int outputTensorDimA[]);
528
+
529
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
530
+ cudnnGetPooling2dForwardOutputDim(const cudnnPoolingDescriptor_t poolingDesc,
531
+ const cudnnTensorDescriptor_t inputTensorDesc,
532
+ int *n,
533
+ int *c,
534
+ int *h,
535
+ int *w);
536
+
537
+ /* Destroy an instance of pooling descriptor */
538
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
539
+ cudnnDestroyPoolingDescriptor(cudnnPoolingDescriptor_t poolingDesc);
540
+
541
+ /* Pooling functions: All of the form "output = alpha * Op(inputs) + beta * output" */
542
+
543
+ /* Function to perform forward pooling */
544
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
545
+ cudnnPoolingForward(cudnnHandle_t handle,
546
+ const cudnnPoolingDescriptor_t poolingDesc,
547
+ const void *alpha,
548
+ const cudnnTensorDescriptor_t xDesc,
549
+ const void *x,
550
+ const void *beta,
551
+ const cudnnTensorDescriptor_t yDesc,
552
+ void *y);
553
+
554
+ /* Activation functions: All of the form "output = alpha * Op(inputs) + beta * output" */
555
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
556
+ cudnnCreateActivationDescriptor(cudnnActivationDescriptor_t *activationDesc);
557
+
558
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
559
+ cudnnSetActivationDescriptor(cudnnActivationDescriptor_t activationDesc,
560
+ cudnnActivationMode_t mode,
561
+ cudnnNanPropagation_t reluNanOpt,
562
+ double coef); /* ceiling for clipped RELU, alpha for ELU */
563
+
564
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
565
+ cudnnGetActivationDescriptor(const cudnnActivationDescriptor_t activationDesc,
566
+ cudnnActivationMode_t *mode,
567
+ cudnnNanPropagation_t *reluNanOpt,
568
+ double *coef); /* ceiling for clipped RELU, alpha for ELU */
569
+
570
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
571
+ cudnnSetActivationDescriptorSwishBeta(cudnnActivationDescriptor_t activationDesc, double swish_beta);
572
+
573
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
574
+ cudnnGetActivationDescriptorSwishBeta(cudnnActivationDescriptor_t activationDesc, double *swish_beta);
575
+
576
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
577
+ cudnnDestroyActivationDescriptor(cudnnActivationDescriptor_t activationDesc);
578
+
579
+ /* Function to perform forward activation */
580
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
581
+ cudnnActivationForward(cudnnHandle_t handle,
582
+ cudnnActivationDescriptor_t activationDesc,
583
+ const void *alpha,
584
+ const cudnnTensorDescriptor_t xDesc,
585
+ const void *x,
586
+ const void *beta,
587
+ const cudnnTensorDescriptor_t yDesc,
588
+ void *y);
589
+
590
+ /*
591
+ * Create an instance of LRN (Local Response Normalization) descriptor
592
+ * Uses lrnN=5, lrnAlpha=1e-4, lrnBeta=0.75, lrnK=2.0 as defaults from Krizhevsky'12 ImageNet paper
593
+ */
594
+ cudnnStatus_t CUDNNWINAPI
595
+ cudnnCreateLRNDescriptor(cudnnLRNDescriptor_t *normDesc);
596
+
597
+ #define CUDNN_LRN_MIN_N 1 /* minimum allowed lrnN */
598
+ #define CUDNN_LRN_MAX_N 16 /* maximum allowed lrnN */
599
+ #define CUDNN_LRN_MIN_K 1e-5 /* minimum allowed lrnK */
600
+ #define CUDNN_LRN_MIN_BETA 0.01 /* minimum allowed lrnBeta */
601
+
602
+ /* LRN layer mode */
603
+ typedef enum {
604
+ CUDNN_LRN_CROSS_CHANNEL_DIM1 = 0, /* Normalize across tensor's dimA[1] dimension */
605
+ } cudnnLRNMode_t;
606
+
607
+ /*
608
+ * Uses a window [center-lookBehind, center+lookAhead], where
609
+ * lookBehind = floor( (lrnN-1)/2 ), lookAhead = lrnN-lookBehind-1.
610
+ * Values of double parameters cast to tensor data type.
611
+ */
612
+ cudnnStatus_t CUDNNWINAPI
613
+ cudnnSetLRNDescriptor(cudnnLRNDescriptor_t normDesc, unsigned lrnN, double lrnAlpha, double lrnBeta, double lrnK);
614
+ /*
615
+ * Retrieve the settings currently stored in an LRN layer descriptor
616
+ * Any of the provided pointers can be NULL (no corresponding value will be returned)
617
+ */
618
+ cudnnStatus_t CUDNNWINAPI
619
+ cudnnGetLRNDescriptor(cudnnLRNDescriptor_t normDesc, unsigned *lrnN, double *lrnAlpha, double *lrnBeta, double *lrnK);
620
+
621
+ /* Destroy an instance of LRN descriptor */
622
+ cudnnStatus_t CUDNNWINAPI
623
+ cudnnDestroyLRNDescriptor(cudnnLRNDescriptor_t lrnDesc);
624
+
625
+ /* LRN functions: output = alpha * normalize(x) + beta * old_y */
626
+
627
+ /* LRN cross-channel forward computation. Double parameters cast to tensor data type */
628
+ cudnnStatus_t CUDNNWINAPI
629
+ cudnnLRNCrossChannelForward(cudnnHandle_t handle,
630
+ cudnnLRNDescriptor_t normDesc,
631
+ cudnnLRNMode_t lrnMode,
632
+ const void *alpha,
633
+ const cudnnTensorDescriptor_t xDesc,
634
+ const void *x,
635
+ const void *beta,
636
+ const cudnnTensorDescriptor_t yDesc,
637
+ void *y);
638
+
639
+ typedef enum {
640
+ CUDNN_DIVNORM_PRECOMPUTED_MEANS = 0,
641
+ } cudnnDivNormMode_t;
642
+
643
+ /* LCN/divisive normalization functions: y = alpha * normalize(x) + beta * y */
644
+ cudnnStatus_t CUDNNWINAPI
645
+ cudnnDivisiveNormalizationForward(cudnnHandle_t handle,
646
+ cudnnLRNDescriptor_t normDesc,
647
+ cudnnDivNormMode_t mode,
648
+ const void *alpha,
649
+ const cudnnTensorDescriptor_t xDesc, /* same desc for means, temp, temp2 */
650
+ const void *x,
651
+ const void *means, /* if NULL, means are assumed to be zero */
652
+ void *temp,
653
+ void *temp2,
654
+ const void *beta,
655
+ const cudnnTensorDescriptor_t yDesc,
656
+ void *y);
657
+
658
+ typedef enum {
659
+ /* bnScale, bnBias tensor dims are 1xCxHxWx.. (one value per CHW...-slice, normalized over N slice) */
660
+ CUDNN_BATCHNORM_PER_ACTIVATION = 0,
661
+
662
+ /* bnScale, bnBias tensor dims are 1xCx1x1 (one value per C-dim normalized over Nx1xHxW subtensors) */
663
+ CUDNN_BATCHNORM_SPATIAL = 1,
664
+
665
+ /*
666
+ * bnScale, bnBias tensor dims are 1xCx1x1 (one value per C-dim normalized over Nx1xHxW subtensors).
667
+ * May be faster than CUDNN_BATCHNORM_SPATIAL but imposes some limits on the range of values
668
+ */
669
+ CUDNN_BATCHNORM_SPATIAL_PERSISTENT = 2,
670
+ } cudnnBatchNormMode_t CUDNN_DEPRECATED;
671
+
672
+ #define CUDNN_BN_MIN_EPSILON 0.0 /* Minimum epsilon allowed to be used in the Batch Normalization formula */
673
+
674
+ /*
675
+ * Derives a tensor descriptor from layer data descriptor for BatchNormalization
676
+ * scale, invVariance, bnBias, bnScale tensors. Use this tensor desc for
677
+ * bnScaleBiasMeanVarDesc and bnScaleBiasDiffDesc in Batch Normalization forward and backward functions.
678
+ */
679
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
680
+ cudnnDeriveBNTensorDescriptor(cudnnTensorDescriptor_t derivedBnDesc,
681
+ const cudnnTensorDescriptor_t xDesc,
682
+ cudnnBatchNormMode_t mode);
683
+
684
+ typedef enum {
685
+ CUDNN_BATCHNORM_OPS_BN = 0, /* do batch normalization only */
686
+ CUDNN_BATCHNORM_OPS_BN_ACTIVATION = 1, /* do batchNorm, then activation */
687
+ CUDNN_BATCHNORM_OPS_BN_ADD_ACTIVATION = 2, /* do batchNorm, then elemWiseAdd, then activation */
688
+ } cudnnBatchNormOps_t CUDNN_DEPRECATED;
689
+
690
+ /*
691
+ * Performs Batch Normalization during Inference:
692
+ * y[i] = bnScale[k]*(x[i]-estimatedMean[k])/sqrt(epsilon+estimatedVariance[k]) + bnBias[k]
693
+ * with bnScale, bnBias, runningMean, runningInvVariance tensors indexed
694
+ * according to spatial or per-activation mode. Refer to cudnnBatchNormalizationForwardTraining
695
+ * above for notes on function arguments.
696
+ */
697
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
698
+ cudnnBatchNormalizationForwardInference(cudnnHandle_t handle,
699
+ cudnnBatchNormMode_t mode,
700
+ const void *alpha, /* alpha[0] = result blend factor */
701
+ const void *beta, /* beta[0] = dest layer blend factor */
702
+ const cudnnTensorDescriptor_t xDesc,
703
+ const void *x, /* NxCxHxW */
704
+ const cudnnTensorDescriptor_t yDesc,
705
+ void *y, /* NxCxHxW */
706
+ const cudnnTensorDescriptor_t bnScaleBiasMeanVarDesc,
707
+ const void *bnScale,
708
+ const void *bnBias,
709
+ const void *estimatedMean,
710
+ const void *estimatedVariance,
711
+ double epsilon);
712
+
713
+ typedef enum {
714
+ /* bnScale, bnBias tensor dims are 1xCxHxWx.. (one value per CHW...-slice, normalized over N slice) */
715
+ CUDNN_NORM_PER_ACTIVATION = 0,
716
+
717
+ /* bnScale, bnBias tensor dims are 1xCx1x1 (one value per C-dim normalized over Nx1xHxW subtensors) */
718
+ CUDNN_NORM_PER_CHANNEL = 1,
719
+ } cudnnNormMode_t CUDNN_DEPRECATED;
720
+
721
+ typedef enum { CUDNN_NORM_ALGO_STANDARD = 0, CUDNN_NORM_ALGO_PERSIST = 1 } cudnnNormAlgo_t CUDNN_DEPRECATED;
722
+
723
+ /*
724
+ * Derives a tensor descriptor from layer data descriptor for Normalization
725
+ * scale, invVariance, bnBias, bnScale tensors. Use this tensor desc for
726
+ * normScaleBiasMeanVarDesc and normScaleBiasDiffDesc in Normalization forward and backward functions.
727
+ */
728
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
729
+ cudnnDeriveNormTensorDescriptor(cudnnTensorDescriptor_t derivedNormScaleBiasDesc,
730
+ cudnnTensorDescriptor_t derivedNormMeanVarDesc,
731
+ const cudnnTensorDescriptor_t xDesc,
732
+ cudnnNormMode_t mode,
733
+ int groupCnt); /* Place hold for future work, should be set to 1 now*/
734
+
735
+ typedef enum {
736
+ CUDNN_NORM_OPS_NORM = 0, /* do normalization only */
737
+ CUDNN_NORM_OPS_NORM_ACTIVATION = 1, /* do Norm, then activation */
738
+ CUDNN_NORM_OPS_NORM_ADD_ACTIVATION = 2, /* do Norm, then elemWiseAdd, then activation */
739
+ } cudnnNormOps_t CUDNN_DEPRECATED;
740
+
741
+ /*
742
+ * Performs Normalization during Inference:
743
+ * y[i] = normScale[k]*(x[i]-estimatedMean[k])/sqrt(epsilon+estimatedVariance[k]) + normBias[k]
744
+ * with normScale, normBias, runningMean, runningInvVariance tensors indexed
745
+ * according to per-channel or per-activation mode. Refer to cudnnNormalizationForwardTraining
746
+ * above for notes on function arguments.
747
+ */
748
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
749
+ cudnnNormalizationForwardInference(cudnnHandle_t handle,
750
+ cudnnNormMode_t mode,
751
+ cudnnNormOps_t normOps,
752
+ cudnnNormAlgo_t algo,
753
+ const void *alpha, /* alpha[0] = result blend factor */
754
+ const void *beta, /* beta[0] = dest layer blend factor */
755
+ const cudnnTensorDescriptor_t xDesc,
756
+ const void *x, /* NxCxHxW */
757
+ const cudnnTensorDescriptor_t normScaleBiasDesc,
758
+ const void *normScale,
759
+ const void *normBias,
760
+ const cudnnTensorDescriptor_t normMeanVarDesc,
761
+ const void *estimatedMean,
762
+ const void *estimatedVariance,
763
+ const cudnnTensorDescriptor_t zDesc,
764
+ const void *z,
765
+ cudnnActivationDescriptor_t activationDesc,
766
+ const cudnnTensorDescriptor_t yDesc,
767
+ void *y, /* NxCxHxW */
768
+ double epsilon,
769
+ int groupCnt); /* Place hold for future work*/
770
+
771
+ /* APIs for spatial transformer network*/
772
+ typedef enum {
773
+ CUDNN_SAMPLER_BILINEAR = 0,
774
+ } cudnnSamplerType_t;
775
+
776
+ cudnnStatus_t CUDNNWINAPI
777
+ cudnnCreateSpatialTransformerDescriptor(cudnnSpatialTransformerDescriptor_t *stDesc);
778
+
779
+ cudnnStatus_t CUDNNWINAPI
780
+ cudnnSetSpatialTransformerNdDescriptor(cudnnSpatialTransformerDescriptor_t stDesc,
781
+ cudnnSamplerType_t samplerType,
782
+ cudnnDataType_t dataType,
783
+ const int nbDims,
784
+ const int dimA[]);
785
+
786
+ cudnnStatus_t CUDNNWINAPI
787
+ cudnnDestroySpatialTransformerDescriptor(cudnnSpatialTransformerDescriptor_t stDesc);
788
+
789
+ cudnnStatus_t CUDNNWINAPI
790
+ cudnnSpatialTfGridGeneratorForward(cudnnHandle_t handle,
791
+ const cudnnSpatialTransformerDescriptor_t stDesc,
792
+ const void *theta,
793
+ void *grid);
794
+
795
+ cudnnStatus_t CUDNNWINAPI
796
+ cudnnSpatialTfSamplerForward(cudnnHandle_t handle,
797
+ cudnnSpatialTransformerDescriptor_t stDesc,
798
+ const void *alpha,
799
+ const cudnnTensorDescriptor_t xDesc,
800
+ const void *x,
801
+ const void *grid,
802
+ const void *beta,
803
+ cudnnTensorDescriptor_t yDesc,
804
+ void *y);
805
+
806
+ typedef struct cudnnDropoutStruct *cudnnDropoutDescriptor_t;
807
+
808
+ cudnnStatus_t CUDNNWINAPI
809
+ cudnnCreateDropoutDescriptor(cudnnDropoutDescriptor_t *dropoutDesc);
810
+
811
+ cudnnStatus_t CUDNNWINAPI
812
+ cudnnDestroyDropoutDescriptor(cudnnDropoutDescriptor_t dropoutDesc);
813
+
814
+ /*helper function to determine size of the states to be passed to cudnnSetDropoutDescriptor */
815
+ cudnnStatus_t CUDNNWINAPI
816
+ cudnnDropoutGetStatesSize(cudnnHandle_t handle, size_t *sizeInBytes);
817
+
818
+ /*helper function to determine size of the reserve space to be passed to dropout forward/backward calls */
819
+ cudnnStatus_t CUDNNWINAPI
820
+ cudnnDropoutGetReserveSpaceSize(cudnnTensorDescriptor_t xdesc, size_t *sizeInBytes);
821
+
822
+ cudnnStatus_t CUDNNWINAPI
823
+ cudnnSetDropoutDescriptor(cudnnDropoutDescriptor_t dropoutDesc,
824
+ cudnnHandle_t handle,
825
+ float dropout,
826
+ void *states,
827
+ size_t stateSizeInBytes,
828
+ unsigned long long seed);
829
+
830
+ /* Restores the dropout descriptor to a previously saved-off state */
831
+ cudnnStatus_t CUDNNWINAPI
832
+ cudnnRestoreDropoutDescriptor(cudnnDropoutDescriptor_t dropoutDesc,
833
+ cudnnHandle_t handle,
834
+ float dropout,
835
+ void *states,
836
+ size_t stateSizeInBytes,
837
+ unsigned long long seed);
838
+
839
+ cudnnStatus_t CUDNNWINAPI
840
+ cudnnGetDropoutDescriptor(cudnnDropoutDescriptor_t dropoutDesc,
841
+ cudnnHandle_t handle,
842
+ float *dropout,
843
+ void **states,
844
+ unsigned long long *seed);
845
+
846
+ cudnnStatus_t CUDNNWINAPI
847
+ cudnnDropoutForward(cudnnHandle_t handle,
848
+ const cudnnDropoutDescriptor_t dropoutDesc,
849
+ const cudnnTensorDescriptor_t xdesc,
850
+ const void *x,
851
+ const cudnnTensorDescriptor_t ydesc,
852
+ void *y,
853
+ void *reserveSpace,
854
+ size_t reserveSpaceSizeInBytes);
855
+
856
+ /* TODO: move these enums out to the appropriate submodule */
857
+ typedef enum {
858
+ CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM = 0,
859
+ CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM = 1,
860
+ CUDNN_CONVOLUTION_FWD_ALGO_GEMM = 2,
861
+ CUDNN_CONVOLUTION_FWD_ALGO_DIRECT = 3,
862
+ CUDNN_CONVOLUTION_FWD_ALGO_FFT = 4,
863
+ CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING = 5,
864
+ CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD = 6,
865
+ CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD_NONFUSED = 7,
866
+ CUDNN_CONVOLUTION_FWD_ALGO_COUNT = 8
867
+ } cudnnConvolutionFwdAlgo_t;
868
+
869
+ typedef enum {
870
+ CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0 = 0, /* non-deterministic */
871
+ CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1 = 1,
872
+ CUDNN_CONVOLUTION_BWD_FILTER_ALGO_FFT = 2,
873
+ CUDNN_CONVOLUTION_BWD_FILTER_ALGO_3 = 3, /* non-deterministic */
874
+ CUDNN_CONVOLUTION_BWD_FILTER_ALGO_WINOGRAD = 4, /* not implemented */
875
+ CUDNN_CONVOLUTION_BWD_FILTER_ALGO_WINOGRAD_NONFUSED = 5,
876
+ CUDNN_CONVOLUTION_BWD_FILTER_ALGO_FFT_TILING = 6,
877
+ CUDNN_CONVOLUTION_BWD_FILTER_ALGO_COUNT = 7
878
+ } cudnnConvolutionBwdFilterAlgo_t;
879
+
880
+ typedef enum {
881
+ CUDNN_CONVOLUTION_BWD_DATA_ALGO_0 = 0, /* non-deterministic */
882
+ CUDNN_CONVOLUTION_BWD_DATA_ALGO_1 = 1,
883
+ CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT = 2,
884
+ CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT_TILING = 3,
885
+ CUDNN_CONVOLUTION_BWD_DATA_ALGO_WINOGRAD = 4,
886
+ CUDNN_CONVOLUTION_BWD_DATA_ALGO_WINOGRAD_NONFUSED = 5,
887
+ CUDNN_CONVOLUTION_BWD_DATA_ALGO_COUNT = 6
888
+ } cudnnConvolutionBwdDataAlgo_t;
889
+
890
+ typedef enum { CUDNN_CTC_LOSS_ALGO_DETERMINISTIC = 0, CUDNN_CTC_LOSS_ALGO_NON_DETERMINISTIC = 1 } cudnnCTCLossAlgo_t;
891
+
892
+ /*
893
+ * \brief Cross-library version checker.
894
+ * This function is implemented differently in each sub-library. Each sublib
895
+ * checks whether its own version matches that of its dependencies.
896
+ * \returns CUDNN_STATUS_SUCCESS if the version check passes,
897
+ * CUDNN_STATUS_SUBLIBRARY_VERSION_MISMATCH if the versions are inconsistent.
898
+ */
899
+ cudnnStatus_t CUDNNWINAPI
900
+ cudnnOpsVersionCheck(void);
901
+
902
+ /* Function to perform backward softmax */
903
+ cudnnStatus_t CUDNNWINAPI
904
+ cudnnSoftmaxBackward(cudnnHandle_t handle,
905
+ cudnnSoftmaxAlgorithm_t algo,
906
+ cudnnSoftmaxMode_t mode,
907
+ const void *alpha,
908
+ const cudnnTensorDescriptor_t yDesc,
909
+ const void *y,
910
+ const cudnnTensorDescriptor_t dyDesc,
911
+ const void *dy,
912
+ const void *beta,
913
+ const cudnnTensorDescriptor_t dxDesc,
914
+ void *dx);
915
+
916
+ /* Function to perform backward pooling */
917
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
918
+ cudnnPoolingBackward(cudnnHandle_t handle,
919
+ const cudnnPoolingDescriptor_t poolingDesc,
920
+ const void *alpha,
921
+ const cudnnTensorDescriptor_t yDesc,
922
+ const void *y,
923
+ const cudnnTensorDescriptor_t dyDesc,
924
+ const void *dy,
925
+ const cudnnTensorDescriptor_t xDesc,
926
+ const void *x,
927
+ const void *beta,
928
+ const cudnnTensorDescriptor_t dxDesc,
929
+ void *dx);
930
+
931
+ /* Function to perform backward activation */
932
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
933
+ cudnnActivationBackward(cudnnHandle_t handle,
934
+ cudnnActivationDescriptor_t activationDesc,
935
+ const void *alpha,
936
+ const cudnnTensorDescriptor_t yDesc,
937
+ const void *y,
938
+ const cudnnTensorDescriptor_t dyDesc,
939
+ const void *dy,
940
+ const cudnnTensorDescriptor_t xDesc,
941
+ const void *x,
942
+ const void *beta,
943
+ const cudnnTensorDescriptor_t dxDesc,
944
+ void *dx);
945
+
946
+ /* LRN cross-channel backward computation. Double parameters cast to tensor data type */
947
+ cudnnStatus_t CUDNNWINAPI
948
+ cudnnLRNCrossChannelBackward(cudnnHandle_t handle,
949
+ cudnnLRNDescriptor_t normDesc,
950
+ cudnnLRNMode_t lrnMode,
951
+ const void *alpha,
952
+ const cudnnTensorDescriptor_t yDesc,
953
+ const void *y,
954
+ const cudnnTensorDescriptor_t dyDesc,
955
+ const void *dy,
956
+ const cudnnTensorDescriptor_t xDesc,
957
+ const void *x,
958
+ const void *beta,
959
+ const cudnnTensorDescriptor_t dxDesc,
960
+ void *dx);
961
+
962
+ cudnnStatus_t CUDNNWINAPI
963
+ cudnnDivisiveNormalizationBackward(cudnnHandle_t handle,
964
+ cudnnLRNDescriptor_t normDesc,
965
+ cudnnDivNormMode_t mode,
966
+ const void *alpha,
967
+ const cudnnTensorDescriptor_t xDesc, /* same desc for x, means, dy, temp, temp2 */
968
+ const void *x,
969
+ const void *means, /* if NULL, means are assumed to be zero */
970
+ const void *dy,
971
+ void *temp,
972
+ void *temp2,
973
+ const void *beta,
974
+ const cudnnTensorDescriptor_t dXdMeansDesc, /* same desc for dx, dMeans */
975
+ void *dx, /* output x differential */
976
+ void *dMeans); /* output means differential, can be NULL */
977
+
978
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
979
+ cudnnGetBatchNormalizationForwardTrainingExWorkspaceSize(cudnnHandle_t handle,
980
+ cudnnBatchNormMode_t mode,
981
+ cudnnBatchNormOps_t bnOps,
982
+ const cudnnTensorDescriptor_t xDesc,
983
+ const cudnnTensorDescriptor_t zDesc,
984
+ const cudnnTensorDescriptor_t yDesc,
985
+ const cudnnTensorDescriptor_t bnScaleBiasMeanVarDesc,
986
+ const cudnnActivationDescriptor_t activationDesc,
987
+ size_t *sizeInBytes);
988
+
989
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
990
+ cudnnGetBatchNormalizationBackwardExWorkspaceSize(cudnnHandle_t handle,
991
+ cudnnBatchNormMode_t mode,
992
+ cudnnBatchNormOps_t bnOps,
993
+ const cudnnTensorDescriptor_t xDesc,
994
+ const cudnnTensorDescriptor_t yDesc,
995
+ const cudnnTensorDescriptor_t dyDesc,
996
+ const cudnnTensorDescriptor_t dzDesc,
997
+ const cudnnTensorDescriptor_t dxDesc,
998
+ const cudnnTensorDescriptor_t dBnScaleBiasDesc,
999
+ const cudnnActivationDescriptor_t activationDesc,
1000
+ size_t *sizeInBytes);
1001
+
1002
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
1003
+ cudnnGetBatchNormalizationTrainingExReserveSpaceSize(cudnnHandle_t handle,
1004
+ cudnnBatchNormMode_t mode,
1005
+ cudnnBatchNormOps_t bnOps,
1006
+ const cudnnActivationDescriptor_t activationDesc,
1007
+ const cudnnTensorDescriptor_t xDesc,
1008
+ size_t *sizeInBytes);
1009
+
1010
+ /* Computes y = BN(x). Also accumulates moving averages of mean and inverse variances */
1011
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
1012
+ cudnnBatchNormalizationForwardTraining(
1013
+ cudnnHandle_t handle,
1014
+ cudnnBatchNormMode_t mode,
1015
+
1016
+ const void *alpha, /* alpha[0] = result blend factor */
1017
+ const void *beta, /* beta[0] = dest layer blend factor */
1018
+
1019
+ const cudnnTensorDescriptor_t xDesc,
1020
+ const void *x, /* NxCxHxW */
1021
+ const cudnnTensorDescriptor_t yDesc,
1022
+ void *y, /* NxCxHxW */
1023
+
1024
+ /* Shared desc for the next 6 tensors in the argument list.
1025
+ Data type to be set as follows:
1026
+ type = (typeOf(x) == double) ? double : float
1027
+ Dimensions for this descriptor depend on normalization mode
1028
+ - Spatial Normalization : tensors are expected to have dims 1xCx1x1
1029
+ (normalization is performed across NxHxW)
1030
+ - Per-Activation Normalization : tensors are expected to have dims of 1xCxHxW
1031
+ (normalization is performed across N) */
1032
+ const cudnnTensorDescriptor_t bnScaleBiasMeanVarDesc,
1033
+
1034
+ /* 'Gamma' and 'Beta' respectively in Ioffe and Szegedy's paper's notation */
1035
+ const void *bnScale,
1036
+ const void *bnBias,
1037
+
1038
+ /* MUST use factor=1 in the very first call of a complete training cycle.
1039
+ Use a factor=1/(1+n) at N-th call to the function to get
1040
+ Cumulative Moving Average (CMA) behavior
1041
+ CMA[n] = (x[1]+...+x[n])/n
1042
+ Since CMA[n+1] = (n*CMA[n]+x[n+1])/(n+1) =
1043
+ ((n+1)*CMA[n]-CMA[n])/(n+1) + x[n+1]/(n+1) =
1044
+ CMA[n]*(1-1/(n+1)) + x[n+1]*1/(n+1) */
1045
+ double exponentialAverageFactor,
1046
+
1047
+ /* Used in Training phase only.
1048
+ runningMean = newMean*factor + runningMean*(1-factor) */
1049
+ void *resultRunningMean,
1050
+ /* Output in training mode, input in inference. Is the moving average
1051
+ of variance[x] (factor is applied in the same way as for runningMean) */
1052
+ void *resultRunningVariance,
1053
+
1054
+ /* Has to be >= CUDNN_BN_MIN_EPSILON. Should be the same in forward and backward functions. */
1055
+ double epsilon,
1056
+
1057
+ /* Optionally save intermediate results from the forward pass here
1058
+ - can be reused to speed up backward pass. NULL if unused */
1059
+ void *resultSaveMean,
1060
+ void *resultSaveInvVariance);
1061
+
1062
+ /* Computes y = relu(BN(x) + z). Also accumulates moving averages of mean and inverse variances */
1063
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
1064
+ cudnnBatchNormalizationForwardTrainingEx(
1065
+ cudnnHandle_t handle,
1066
+ cudnnBatchNormMode_t mode,
1067
+ cudnnBatchNormOps_t bnOps,
1068
+
1069
+ const void *alpha, /* alpha[0] = result blend factor */
1070
+ const void *beta, /* beta[0] = dest layer blend factor */
1071
+
1072
+ const cudnnTensorDescriptor_t xDesc,
1073
+ const void *xData,
1074
+ const cudnnTensorDescriptor_t zDesc,
1075
+ const void *zData,
1076
+ const cudnnTensorDescriptor_t yDesc,
1077
+ void *yData,
1078
+
1079
+ const cudnnTensorDescriptor_t bnScaleBiasMeanVarDesc,
1080
+ const void *bnScale,
1081
+ const void *bnBias,
1082
+
1083
+ double exponentialAverageFactor,
1084
+ void *resultRunningMean,
1085
+ void *resultRunningVariance,
1086
+
1087
+ /* Has to be >= CUDNN_BN_MIN_EPSILON. Should be the same in forward and backward functions. */
1088
+ double epsilon,
1089
+
1090
+ /* Optionally save intermediate results from the forward pass here
1091
+ - can be reused to speed up backward pass. NULL if unused */
1092
+ void *resultSaveMean,
1093
+ void *resultSaveInvVariance,
1094
+
1095
+ cudnnActivationDescriptor_t activationDesc,
1096
+ void *workspace,
1097
+ size_t workSpaceSizeInBytes,
1098
+ void *reserveSpace,
1099
+ size_t reserveSpaceSizeInBytes);
1100
+
1101
+ /* Performs backward pass of Batch Normalization layer. Returns x gradient,
1102
+ * bnScale gradient and bnBias gradient */
1103
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
1104
+ cudnnBatchNormalizationBackward(cudnnHandle_t handle,
1105
+ cudnnBatchNormMode_t mode,
1106
+ const void *alphaDataDiff,
1107
+ const void *betaDataDiff,
1108
+ const void *alphaParamDiff,
1109
+ const void *betaParamDiff,
1110
+ const cudnnTensorDescriptor_t xDesc, /* same desc for x, dx, dy */
1111
+ const void *x,
1112
+ const cudnnTensorDescriptor_t dyDesc,
1113
+ const void *dy,
1114
+ const cudnnTensorDescriptor_t dxDesc,
1115
+ void *dx,
1116
+ /* Shared tensor desc for the 4 tensors below */
1117
+ const cudnnTensorDescriptor_t dBnScaleBiasDesc,
1118
+ const void *bnScale, /* bnBias doesn't affect backpropagation */
1119
+ /* scale and bias diff are not backpropagated below this layer */
1120
+ void *dBnScaleResult,
1121
+ void *dBnBiasResult,
1122
+ /* Same epsilon as forward pass */
1123
+ double epsilon,
1124
+
1125
+ /* Optionally cached intermediate results from
1126
+ forward pass */
1127
+ const void *savedMean,
1128
+ const void *savedInvVariance);
1129
+
1130
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
1131
+ cudnnBatchNormalizationBackwardEx(cudnnHandle_t handle,
1132
+ cudnnBatchNormMode_t mode,
1133
+ cudnnBatchNormOps_t bnOps,
1134
+
1135
+ const void *alphaDataDiff,
1136
+ const void *betaDataDiff,
1137
+ const void *alphaParamDiff,
1138
+ const void *betaParamDiff,
1139
+ const cudnnTensorDescriptor_t xDesc,
1140
+ const void *xData,
1141
+ const cudnnTensorDescriptor_t yDesc,
1142
+ const void *yData,
1143
+ const cudnnTensorDescriptor_t dyDesc,
1144
+ const void *dyData,
1145
+ const cudnnTensorDescriptor_t dzDesc,
1146
+ void *dzData,
1147
+ const cudnnTensorDescriptor_t dxDesc,
1148
+ void *dxData,
1149
+
1150
+ /* Shared tensor desc for the 4 tensors below */
1151
+ const cudnnTensorDescriptor_t dBnScaleBiasDesc,
1152
+ const void *bnScaleData,
1153
+ const void *bnBiasData, /* needed if there is activation */
1154
+ void *dBnScaleData,
1155
+ void *dBnBiasData,
1156
+ double epsilon, /* Same epsilon as forward pass */
1157
+
1158
+ /* Optionally cached intermediate results from
1159
+ forward pass */
1160
+ const void *savedMean,
1161
+ const void *savedInvVariance,
1162
+ cudnnActivationDescriptor_t activationDesc,
1163
+ void *workSpace,
1164
+ size_t workSpaceSizeInBytes,
1165
+ void *reserveSpace,
1166
+ size_t reserveSpaceSizeInBytes);
1167
+
1168
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
1169
+ cudnnGetNormalizationForwardTrainingWorkspaceSize(cudnnHandle_t handle,
1170
+ cudnnNormMode_t mode,
1171
+ cudnnNormOps_t normOps,
1172
+ cudnnNormAlgo_t algo,
1173
+ const cudnnTensorDescriptor_t xDesc,
1174
+ const cudnnTensorDescriptor_t zDesc,
1175
+ const cudnnTensorDescriptor_t yDesc,
1176
+ const cudnnTensorDescriptor_t normScaleBiasDesc,
1177
+ const cudnnActivationDescriptor_t activationDesc,
1178
+ const cudnnTensorDescriptor_t normMeanVarDesc,
1179
+ size_t *sizeInBytes,
1180
+ int groupCnt); /* Place hold for future work, should be set to 1 now*/
1181
+
1182
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
1183
+ cudnnGetNormalizationBackwardWorkspaceSize(cudnnHandle_t handle,
1184
+ cudnnNormMode_t mode,
1185
+ cudnnNormOps_t normOps,
1186
+ cudnnNormAlgo_t algo,
1187
+ const cudnnTensorDescriptor_t xDesc,
1188
+ const cudnnTensorDescriptor_t yDesc,
1189
+ const cudnnTensorDescriptor_t dyDesc,
1190
+ const cudnnTensorDescriptor_t dzDesc,
1191
+ const cudnnTensorDescriptor_t dxDesc,
1192
+ const cudnnTensorDescriptor_t dNormScaleBiasDesc,
1193
+ const cudnnActivationDescriptor_t activationDesc,
1194
+ const cudnnTensorDescriptor_t normMeanVarDesc,
1195
+ size_t *sizeInBytes,
1196
+ int groupCnt); /* Place hold for future work, should be set to 1 now*/
1197
+
1198
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
1199
+ cudnnGetNormalizationTrainingReserveSpaceSize(cudnnHandle_t handle,
1200
+ cudnnNormMode_t mode,
1201
+ cudnnNormOps_t normOps,
1202
+ cudnnNormAlgo_t algo,
1203
+ const cudnnActivationDescriptor_t activationDesc,
1204
+ const cudnnTensorDescriptor_t xDesc,
1205
+ size_t *sizeInBytes,
1206
+ int groupCnt); /* Place hold for future work, should be set to 1 now*/
1207
+
1208
+ /* Computes y = relu(Norm(x) + z). Also accumulates moving averages of mean and inverse variances */
1209
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
1210
+ cudnnNormalizationForwardTraining(cudnnHandle_t handle,
1211
+ cudnnNormMode_t mode,
1212
+ cudnnNormOps_t normOps,
1213
+ cudnnNormAlgo_t algo,
1214
+ const void *alpha, /* alpha[0] = result blend factor */
1215
+ const void *beta, /* beta[0] = dest layer blend factor */
1216
+ const cudnnTensorDescriptor_t xDesc,
1217
+ const void *xData,
1218
+ const cudnnTensorDescriptor_t normScaleBiasDesc,
1219
+ const void *normScale,
1220
+ const void *normBias,
1221
+ double exponentialAverageFactor,
1222
+ const cudnnTensorDescriptor_t normMeanVarDesc,
1223
+ void *resultRunningMean,
1224
+ void *resultRunningVariance,
1225
+ /* Has to be >= 0. Should be the same in forward and backward functions. */
1226
+ double epsilon,
1227
+ /* Optionally save intermediate results from the forward pass here
1228
+ - can be reused to speed up backward pass. NULL if unused */
1229
+ void *resultSaveMean,
1230
+ void *resultSaveInvVariance,
1231
+ cudnnActivationDescriptor_t activationDesc,
1232
+ const cudnnTensorDescriptor_t zDesc,
1233
+ const void *zData,
1234
+ const cudnnTensorDescriptor_t yDesc,
1235
+ void *yData,
1236
+ void *workspace,
1237
+ size_t workSpaceSizeInBytes,
1238
+ void *reserveSpace,
1239
+ size_t reserveSpaceSizeInBytes,
1240
+ int groupCnt); /* Place hold for future work, should be set to 1 now*/
1241
+
1242
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
1243
+ cudnnNormalizationBackward(cudnnHandle_t handle,
1244
+ cudnnNormMode_t mode,
1245
+ cudnnNormOps_t normOps,
1246
+ cudnnNormAlgo_t algo,
1247
+ const void *alphaDataDiff,
1248
+ const void *betaDataDiff,
1249
+ const void *alphaParamDiff,
1250
+ const void *betaParamDiff,
1251
+ const cudnnTensorDescriptor_t xDesc,
1252
+ const void *xData,
1253
+ const cudnnTensorDescriptor_t yDesc,
1254
+ const void *yData,
1255
+ const cudnnTensorDescriptor_t dyDesc,
1256
+ const void *dyData,
1257
+ const cudnnTensorDescriptor_t dzDesc,
1258
+ void *dzData,
1259
+ const cudnnTensorDescriptor_t dxDesc,
1260
+ void *dxData,
1261
+ /* Shared tensor desc for the 4 tensors below */
1262
+ const cudnnTensorDescriptor_t dNormScaleBiasDesc,
1263
+ const void *normScaleData,
1264
+ const void *normBiasData, /* needed if there is activation */
1265
+ void *dNormScaleData,
1266
+ void *dNormBiasData,
1267
+ double epsilon, /* Same epsilon as forward pass */
1268
+ const cudnnTensorDescriptor_t normMeanVarDesc,
1269
+ /* Optionally cached intermediate results from
1270
+ forward pass */
1271
+ const void *savedMean,
1272
+ const void *savedInvVariance,
1273
+ cudnnActivationDescriptor_t activationDesc,
1274
+ void *workSpace,
1275
+ size_t workSpaceSizeInBytes,
1276
+ void *reserveSpace,
1277
+ size_t reserveSpaceSizeInBytes,
1278
+ int groupCnt); /* Place hold for future work, should be set to 1 now*/
1279
+
1280
+ cudnnStatus_t CUDNNWINAPI
1281
+ cudnnSpatialTfGridGeneratorBackward(cudnnHandle_t handle,
1282
+ const cudnnSpatialTransformerDescriptor_t stDesc,
1283
+ const void *dgrid,
1284
+ void *dtheta);
1285
+
1286
+ cudnnStatus_t CUDNNWINAPI
1287
+ cudnnSpatialTfSamplerBackward(cudnnHandle_t handle,
1288
+ cudnnSpatialTransformerDescriptor_t stDesc,
1289
+ const void *alpha,
1290
+ const cudnnTensorDescriptor_t xDesc,
1291
+ const void *x,
1292
+ const void *beta,
1293
+ const cudnnTensorDescriptor_t dxDesc,
1294
+ void *dx,
1295
+ const void *alphaDgrid,
1296
+ const cudnnTensorDescriptor_t dyDesc,
1297
+ const void *dy,
1298
+ const void *grid,
1299
+ const void *betaDgrid,
1300
+ void *dgrid);
1301
+
1302
+ cudnnStatus_t CUDNNWINAPI
1303
+ cudnnDropoutBackward(cudnnHandle_t handle,
1304
+ const cudnnDropoutDescriptor_t dropoutDesc,
1305
+ const cudnnTensorDescriptor_t dydesc,
1306
+ const void *dy,
1307
+ const cudnnTensorDescriptor_t dxdesc,
1308
+ void *dx,
1309
+ void *reserveSpace,
1310
+ size_t reserveSpaceSizeInBytes);
1311
+
1312
+ #if defined(__cplusplus)
1313
+ }
1314
+ #endif
1315
+
1316
+ #endif /* CUDNN_OPS_H_ */
.venv/lib/python3.12/site-packages/nvidia/cudnn/include/cudnn_v9.h ADDED
@@ -0,0 +1,68 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * Copyright 2014-2023 NVIDIA Corporation. All rights reserved.
3
+ *
4
+ * NOTICE TO LICENSEE:
5
+ *
6
+ * This source code and/or documentation ("Licensed Deliverables") are
7
+ * subject to NVIDIA intellectual property rights under U.S. and
8
+ * international Copyright laws.
9
+ *
10
+ * These Licensed Deliverables contained herein is PROPRIETARY and
11
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
12
+ * conditions of a form of NVIDIA software license agreement by and
13
+ * between NVIDIA and Licensee ("License Agreement") or electronically
14
+ * accepted by Licensee. Notwithstanding any terms or conditions to
15
+ * the contrary in the License Agreement, reproduction or disclosure
16
+ * of the Licensed Deliverables to any third party without the express
17
+ * written consent of NVIDIA is prohibited.
18
+ *
19
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
20
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
21
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
22
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
23
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
24
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
25
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
26
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
27
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
28
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
29
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
30
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
31
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
32
+ * OF THESE LICENSED DELIVERABLES.
33
+ *
34
+ * U.S. Government End Users. These Licensed Deliverables are a
35
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
36
+ * 1995), consisting of "commercial computer software" and "commercial
37
+ * computer software documentation" as such terms are used in 48
38
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
39
+ * only as a commercial end item. Consistent with 48 C.F.R.12.212 and
40
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
41
+ * U.S. Government End Users acquire the Licensed Deliverables with
42
+ * only those rights set forth herein.
43
+ *
44
+ * Any use of the Licensed Deliverables in individual and commercial
45
+ * software must include, in the user documentation and internal
46
+ * comments to the code, the above Disclaimer and U.S. Government End
47
+ * Users Notice.
48
+ */
49
+
50
+ /* cudnn : Neural Networks Library */
51
+
52
+ #if !defined(CUDNN_H_)
53
+ #define CUDNN_H_
54
+ #if defined(__cplusplus)
55
+ extern "C" {
56
+ #endif
57
+
58
+ #include <cuda_runtime_api.h>
59
+ #include "cudnn_version.h"
60
+ #include "cudnn_graph.h"
61
+ #include "cudnn_ops.h"
62
+ #include "cudnn_adv.h"
63
+ #include "cudnn_cnn.h"
64
+
65
+ #if defined(__cplusplus)
66
+ }
67
+ #endif
68
+ #endif /* CUDNN_H_ */
.venv/lib/python3.12/site-packages/nvidia/cudnn/include/cudnn_version.h ADDED
@@ -0,0 +1,70 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * Copyright 2014-2023 NVIDIA Corporation. All rights reserved.
3
+ *
4
+ * NOTICE TO LICENSEE:
5
+ *
6
+ * This source code and/or documentation ("Licensed Deliverables") are
7
+ * subject to NVIDIA intellectual property rights under U.S. and
8
+ * international Copyright laws.
9
+ *
10
+ * These Licensed Deliverables contained herein is PROPRIETARY and
11
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
12
+ * conditions of a form of NVIDIA software license agreement by and
13
+ * between NVIDIA and Licensee ("License Agreement") or electronically
14
+ * accepted by Licensee. Notwithstanding any terms or conditions to
15
+ * the contrary in the License Agreement, reproduction or disclosure
16
+ * of the Licensed Deliverables to any third party without the express
17
+ * written consent of NVIDIA is prohibited.
18
+ *
19
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
20
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
21
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
22
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
23
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
24
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
25
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
26
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
27
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
28
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
29
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
30
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
31
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
32
+ * OF THESE LICENSED DELIVERABLES.
33
+ *
34
+ * U.S. Government End Users. These Licensed Deliverables are a
35
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
36
+ * 1995), consisting of "commercial computer software" and "commercial
37
+ * computer software documentation" as such terms are used in 48
38
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
39
+ * only as a commercial end item. Consistent with 48 C.F.R.12.212 and
40
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
41
+ * U.S. Government End Users acquire the Licensed Deliverables with
42
+ * only those rights set forth herein.
43
+ *
44
+ * Any use of the Licensed Deliverables in individual and commercial
45
+ * software must include, in the user documentation and internal
46
+ * comments to the code, the above Disclaimer and U.S. Government End
47
+ * Users Notice.
48
+ */
49
+
50
+ /**
51
+ * \file: The master cuDNN version file.
52
+ */
53
+
54
+ #ifndef CUDNN_VERSION_H_
55
+ #define CUDNN_VERSION_H_
56
+
57
+ #define CUDNN_MAJOR 9
58
+ #define CUDNN_MINOR 10
59
+ #define CUDNN_PATCHLEVEL 2
60
+
61
+ #define CUDNN_VERSION (CUDNN_MAJOR * 10000 + CUDNN_MINOR * 100 + CUDNN_PATCHLEVEL)
62
+
63
+ /* cannot use constexpr here since this is a C-only file */
64
+ /* Below is the max SM version this cuDNN library is aware of and supports natively */
65
+
66
+ #define CUDNN_MAX_SM_MAJOR_NUMBER 12
67
+ #define CUDNN_MAX_SM_MINOR_NUMBER 0
68
+ #define CUDNN_MAX_DEVICE_VERSION (CUDNN_MAX_SM_MAJOR_NUMBER * 100 + CUDNN_MAX_SM_MINOR_NUMBER * 10)
69
+
70
+ #endif /* CUDNN_VERSION_H */
.venv/lib/python3.12/site-packages/nvidia/cudnn/include/cudnn_version_v9.h ADDED
@@ -0,0 +1,70 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * Copyright 2014-2023 NVIDIA Corporation. All rights reserved.
3
+ *
4
+ * NOTICE TO LICENSEE:
5
+ *
6
+ * This source code and/or documentation ("Licensed Deliverables") are
7
+ * subject to NVIDIA intellectual property rights under U.S. and
8
+ * international Copyright laws.
9
+ *
10
+ * These Licensed Deliverables contained herein is PROPRIETARY and
11
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
12
+ * conditions of a form of NVIDIA software license agreement by and
13
+ * between NVIDIA and Licensee ("License Agreement") or electronically
14
+ * accepted by Licensee. Notwithstanding any terms or conditions to
15
+ * the contrary in the License Agreement, reproduction or disclosure
16
+ * of the Licensed Deliverables to any third party without the express
17
+ * written consent of NVIDIA is prohibited.
18
+ *
19
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
20
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
21
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
22
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
23
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
24
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
25
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
26
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
27
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
28
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
29
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
30
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
31
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
32
+ * OF THESE LICENSED DELIVERABLES.
33
+ *
34
+ * U.S. Government End Users. These Licensed Deliverables are a
35
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
36
+ * 1995), consisting of "commercial computer software" and "commercial
37
+ * computer software documentation" as such terms are used in 48
38
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
39
+ * only as a commercial end item. Consistent with 48 C.F.R.12.212 and
40
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
41
+ * U.S. Government End Users acquire the Licensed Deliverables with
42
+ * only those rights set forth herein.
43
+ *
44
+ * Any use of the Licensed Deliverables in individual and commercial
45
+ * software must include, in the user documentation and internal
46
+ * comments to the code, the above Disclaimer and U.S. Government End
47
+ * Users Notice.
48
+ */
49
+
50
+ /**
51
+ * \file: The master cuDNN version file.
52
+ */
53
+
54
+ #ifndef CUDNN_VERSION_H_
55
+ #define CUDNN_VERSION_H_
56
+
57
+ #define CUDNN_MAJOR 9
58
+ #define CUDNN_MINOR 10
59
+ #define CUDNN_PATCHLEVEL 2
60
+
61
+ #define CUDNN_VERSION (CUDNN_MAJOR * 10000 + CUDNN_MINOR * 100 + CUDNN_PATCHLEVEL)
62
+
63
+ /* cannot use constexpr here since this is a C-only file */
64
+ /* Below is the max SM version this cuDNN library is aware of and supports natively */
65
+
66
+ #define CUDNN_MAX_SM_MAJOR_NUMBER 12
67
+ #define CUDNN_MAX_SM_MINOR_NUMBER 0
68
+ #define CUDNN_MAX_DEVICE_VERSION (CUDNN_MAX_SM_MAJOR_NUMBER * 100 + CUDNN_MAX_SM_MINOR_NUMBER * 10)
69
+
70
+ #endif /* CUDNN_VERSION_H */
.venv/lib/python3.12/site-packages/nvidia_nccl_cu12-2.27.3.dist-info/licenses/License.txt ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
3
+
4
+ Redistribution and use in source and binary forms, with or without
5
+ modification, are permitted provided that the following conditions
6
+ are met:
7
+ * Redistributions of source code must retain the above copyright
8
+ notice, this list of conditions and the following disclaimer.
9
+ * Redistributions in binary form must reproduce the above copyright
10
+ notice, this list of conditions and the following disclaimer in the
11
+ documentation and/or other materials provided with the distribution.
12
+ * Neither the name of NVIDIA CORPORATION, Lawrence Berkeley National
13
+ Laboratory, the U.S. Department of Energy, nor the names of their
14
+ contributors may be used to endorse or promote products derived
15
+ from this software without specific prior written permission.
16
+
17
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
18
+ EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19
+ IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
20
+ PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
21
+ CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
22
+ EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
23
+ PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
24
+ PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
25
+ OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
26
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
27
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28
+
29
+ The U.S. Department of Energy funded the development of this software
30
+ under subcontract 7078610 with Lawrence Berkeley National Laboratory.
31
+
32
+
33
+ This code also includes files from the NVIDIA Tools Extension SDK project.
34
+
35
+ See:
36
+
37
+ https://github.com/NVIDIA/NVTX
38
+
39
+ for more information and license details.
.venv/lib/python3.12/site-packages/sklearn/__check_build/__init__.py ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Module to give helpful messages to the user that did not
2
+ compile scikit-learn properly.
3
+ """
4
+
5
+ # Authors: The scikit-learn developers
6
+ # SPDX-License-Identifier: BSD-3-Clause
7
+
8
+ import os
9
+
10
+ INPLACE_MSG = """
11
+ It appears that you are importing a local scikit-learn source tree. For
12
+ this, you need to have an inplace install. Maybe you are in the source
13
+ directory and you need to try from another location."""
14
+
15
+ STANDARD_MSG = """
16
+ If you have used an installer, please check that it is suited for your
17
+ Python version, your operating system and your platform."""
18
+
19
+
20
+ def raise_build_error(e):
21
+ # Raise a comprehensible error and list the contents of the
22
+ # directory to help debugging on the mailing list.
23
+ local_dir = os.path.split(__file__)[0]
24
+ msg = STANDARD_MSG
25
+ if local_dir == "sklearn/__check_build":
26
+ # Picking up the local install: this will work only if the
27
+ # install is an 'inplace build'
28
+ msg = INPLACE_MSG
29
+ dir_content = list()
30
+ for i, filename in enumerate(os.listdir(local_dir)):
31
+ if (i + 1) % 3:
32
+ dir_content.append(filename.ljust(26))
33
+ else:
34
+ dir_content.append(filename + "\n")
35
+ raise ImportError(
36
+ """%s
37
+ ___________________________________________________________________________
38
+ Contents of %s:
39
+ %s
40
+ ___________________________________________________________________________
41
+ It seems that scikit-learn has not been built correctly.
42
+
43
+ If you have installed scikit-learn from source, please do not forget
44
+ to build the package before using it. For detailed instructions, see:
45
+ https://scikit-learn.org/dev/developers/advanced_installation.html#building-from-source
46
+ %s"""
47
+ % (e, local_dir, "".join(dir_content).strip(), msg)
48
+ )
49
+
50
+
51
+ try:
52
+ from ._check_build import check_build # noqa: F401
53
+ except ImportError as e:
54
+ raise_build_error(e)
.venv/lib/python3.12/site-packages/sklearn/__check_build/_check_build.cpython-312-x86_64-linux-gnu.so ADDED
Binary file (45.3 kB). View file
 
.venv/lib/python3.12/site-packages/sklearn/__check_build/_check_build.pyx ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ def check_build():
2
+ return
.venv/lib/python3.12/site-packages/sklearn/__check_build/meson.build ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ py.extension_module(
2
+ '_check_build',
3
+ cython_gen.process('_check_build.pyx'),
4
+ install: true,
5
+ subdir: 'sklearn/__check_build',
6
+ )
.venv/lib/python3.12/site-packages/sklearn/__pycache__/__init__.cpython-312.pyc ADDED
Binary file (3.12 kB). View file
 
.venv/lib/python3.12/site-packages/sklearn/__pycache__/_built_with_meson.cpython-312.pyc ADDED
Binary file (193 Bytes). View file
 
.venv/lib/python3.12/site-packages/sklearn/__pycache__/_config.cpython-312.pyc ADDED
Binary file (14.2 kB). View file
 
.venv/lib/python3.12/site-packages/sklearn/__pycache__/_distributor_init.cpython-312.pyc ADDED
Binary file (550 Bytes). View file
 
.venv/lib/python3.12/site-packages/sklearn/__pycache__/base.cpython-312.pyc ADDED
Binary file (51.3 kB). View file
 
.venv/lib/python3.12/site-packages/sklearn/__pycache__/exceptions.cpython-312.pyc ADDED
Binary file (9.49 kB). View file
 
.venv/lib/python3.12/site-packages/sklearn/_build_utils/__init__.py ADDED
File without changes
.venv/lib/python3.12/site-packages/sklearn/_build_utils/tempita.py ADDED
@@ -0,0 +1,62 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+
3
+ # Authors: The scikit-learn developers
4
+ # SPDX-License-Identifier: BSD-3-Clause
5
+
6
+ import argparse
7
+ import os
8
+
9
+ from Cython import Tempita as tempita
10
+
11
+ # XXX: If this import ever fails (does it really?), vendor either
12
+ # cython.tempita or numpy/npy_tempita.
13
+
14
+
15
+ def process_tempita(fromfile, outfile=None):
16
+ """Process tempita templated file and write out the result.
17
+
18
+ The template file is expected to end in `.c.tp` or `.pyx.tp`:
19
+ E.g. processing `template.c.in` generates `template.c`.
20
+
21
+ """
22
+ with open(fromfile, "r", encoding="utf-8") as f:
23
+ template_content = f.read()
24
+
25
+ template = tempita.Template(template_content)
26
+ content = template.substitute()
27
+
28
+ with open(outfile, "w", encoding="utf-8") as f:
29
+ f.write(content)
30
+
31
+
32
+ def main():
33
+ parser = argparse.ArgumentParser()
34
+ parser.add_argument("infile", type=str, help="Path to the input file")
35
+ parser.add_argument("-o", "--outdir", type=str, help="Path to the output directory")
36
+ parser.add_argument(
37
+ "-i",
38
+ "--ignore",
39
+ type=str,
40
+ help=(
41
+ "An ignored input - may be useful to add a "
42
+ "dependency between custom targets"
43
+ ),
44
+ )
45
+ args = parser.parse_args()
46
+
47
+ if not args.infile.endswith(".tp"):
48
+ raise ValueError(f"Unexpected extension: {args.infile}")
49
+
50
+ if not args.outdir:
51
+ raise ValueError("Missing `--outdir` argument to tempita.py")
52
+
53
+ outdir_abs = os.path.join(os.getcwd(), args.outdir)
54
+ outfile = os.path.join(
55
+ outdir_abs, os.path.splitext(os.path.split(args.infile)[1])[0]
56
+ )
57
+
58
+ process_tempita(args.infile, outfile)
59
+
60
+
61
+ if __name__ == "__main__":
62
+ main()
.venv/lib/python3.12/site-packages/sklearn/_build_utils/version.py ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """Extract version number from __init__.py"""
3
+
4
+ # Authors: The scikit-learn developers
5
+ # SPDX-License-Identifier: BSD-3-Clause
6
+
7
+ import os
8
+
9
+ sklearn_init = os.path.join(os.path.dirname(__file__), "../__init__.py")
10
+
11
+ data = open(sklearn_init).readlines()
12
+ version_line = next(line for line in data if line.startswith("__version__"))
13
+
14
+ version = version_line.strip().split(" = ")[1].replace('"', "").replace("'", "")
15
+
16
+ print(version)
.venv/lib/python3.12/site-packages/sklearn/_loss/__init__.py ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ The :mod:`sklearn._loss` module includes loss function classes suitable for
3
+ fitting classification and regression tasks.
4
+ """
5
+
6
+ # Authors: The scikit-learn developers
7
+ # SPDX-License-Identifier: BSD-3-Clause
8
+
9
+ from .loss import (
10
+ AbsoluteError,
11
+ HalfBinomialLoss,
12
+ HalfGammaLoss,
13
+ HalfMultinomialLoss,
14
+ HalfPoissonLoss,
15
+ HalfSquaredError,
16
+ HalfTweedieLoss,
17
+ HalfTweedieLossIdentity,
18
+ HuberLoss,
19
+ PinballLoss,
20
+ )
21
+
22
+ __all__ = [
23
+ "AbsoluteError",
24
+ "HalfBinomialLoss",
25
+ "HalfGammaLoss",
26
+ "HalfMultinomialLoss",
27
+ "HalfPoissonLoss",
28
+ "HalfSquaredError",
29
+ "HalfTweedieLoss",
30
+ "HalfTweedieLossIdentity",
31
+ "HuberLoss",
32
+ "PinballLoss",
33
+ ]
.venv/lib/python3.12/site-packages/sklearn/_loss/_loss.pxd ADDED
@@ -0,0 +1,101 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Fused types for input like y_true, raw_prediction, sample_weights.
2
+ ctypedef fused floating_in:
3
+ double
4
+ float
5
+
6
+
7
+ # Fused types for output like gradient and hessian
8
+ # We use a different fused types for input (floating_in) and output (floating_out), such
9
+ # that input and output can have different dtypes in the same function call. A single
10
+ # fused type can only take on one single value (type) for all arguments in one function
11
+ # call.
12
+ ctypedef fused floating_out:
13
+ double
14
+ float
15
+
16
+
17
+ # Struct to return 2 doubles
18
+ ctypedef struct double_pair:
19
+ double val1
20
+ double val2
21
+
22
+
23
+ # C base class for loss functions
24
+ cdef class CyLossFunction:
25
+ cdef double cy_loss(self, double y_true, double raw_prediction) noexcept nogil
26
+ cdef double cy_gradient(self, double y_true, double raw_prediction) noexcept nogil
27
+ cdef double_pair cy_grad_hess(self, double y_true, double raw_prediction) noexcept nogil
28
+
29
+
30
+ cdef class CyHalfSquaredError(CyLossFunction):
31
+ cdef double cy_loss(self, double y_true, double raw_prediction) noexcept nogil
32
+ cdef double cy_gradient(self, double y_true, double raw_prediction) noexcept nogil
33
+ cdef double_pair cy_grad_hess(self, double y_true, double raw_prediction) noexcept nogil
34
+
35
+
36
+ cdef class CyAbsoluteError(CyLossFunction):
37
+ cdef double cy_loss(self, double y_true, double raw_prediction) noexcept nogil
38
+ cdef double cy_gradient(self, double y_true, double raw_prediction) noexcept nogil
39
+ cdef double_pair cy_grad_hess(self, double y_true, double raw_prediction) noexcept nogil
40
+
41
+
42
+ cdef class CyPinballLoss(CyLossFunction):
43
+ cdef readonly double quantile # readonly makes it accessible from Python
44
+ cdef double cy_loss(self, double y_true, double raw_prediction) noexcept nogil
45
+ cdef double cy_gradient(self, double y_true, double raw_prediction) noexcept nogil
46
+ cdef double_pair cy_grad_hess(self, double y_true, double raw_prediction) noexcept nogil
47
+
48
+
49
+ cdef class CyHuberLoss(CyLossFunction):
50
+ cdef public double delta # public makes it accessible from Python
51
+ cdef double cy_loss(self, double y_true, double raw_prediction) noexcept nogil
52
+ cdef double cy_gradient(self, double y_true, double raw_prediction) noexcept nogil
53
+ cdef double_pair cy_grad_hess(self, double y_true, double raw_prediction) noexcept nogil
54
+
55
+
56
+ cdef class CyHalfPoissonLoss(CyLossFunction):
57
+ cdef double cy_loss(self, double y_true, double raw_prediction) noexcept nogil
58
+ cdef double cy_gradient(self, double y_true, double raw_prediction) noexcept nogil
59
+ cdef double_pair cy_grad_hess(self, double y_true, double raw_prediction) noexcept nogil
60
+
61
+
62
+ cdef class CyHalfGammaLoss(CyLossFunction):
63
+ cdef double cy_loss(self, double y_true, double raw_prediction) noexcept nogil
64
+ cdef double cy_gradient(self, double y_true, double raw_prediction) noexcept nogil
65
+ cdef double_pair cy_grad_hess(self, double y_true, double raw_prediction) noexcept nogil
66
+
67
+
68
+ cdef class CyHalfTweedieLoss(CyLossFunction):
69
+ cdef readonly double power # readonly makes it accessible from Python
70
+ cdef double cy_loss(self, double y_true, double raw_prediction) noexcept nogil
71
+ cdef double cy_gradient(self, double y_true, double raw_prediction) noexcept nogil
72
+ cdef double_pair cy_grad_hess(self, double y_true, double raw_prediction) noexcept nogil
73
+
74
+
75
+ cdef class CyHalfTweedieLossIdentity(CyLossFunction):
76
+ cdef readonly double power # readonly makes it accessible from Python
77
+ cdef double cy_loss(self, double y_true, double raw_prediction) noexcept nogil
78
+ cdef double cy_gradient(self, double y_true, double raw_prediction) noexcept nogil
79
+ cdef double_pair cy_grad_hess(self, double y_true, double raw_prediction) noexcept nogil
80
+
81
+
82
+ cdef class CyHalfBinomialLoss(CyLossFunction):
83
+ cdef double cy_loss(self, double y_true, double raw_prediction) noexcept nogil
84
+ cdef double cy_gradient(self, double y_true, double raw_prediction) noexcept nogil
85
+ cdef double_pair cy_grad_hess(self, double y_true, double raw_prediction) noexcept nogil
86
+
87
+
88
+ cdef class CyExponentialLoss(CyLossFunction):
89
+ cdef double cy_loss(self, double y_true, double raw_prediction) noexcept nogil
90
+ cdef double cy_gradient(self, double y_true, double raw_prediction) noexcept nogil
91
+ cdef double_pair cy_grad_hess(self, double y_true, double raw_prediction) noexcept nogil
92
+
93
+
94
+ cdef class CyHalfMultinomialLoss():
95
+ cdef void cy_gradient(
96
+ self,
97
+ const floating_in y_true,
98
+ const floating_in[::1] raw_prediction,
99
+ const floating_in sample_weight,
100
+ floating_out[::1] gradient_out,
101
+ ) noexcept nogil
.venv/lib/python3.12/site-packages/sklearn/_loss/_loss.pyx.tp ADDED
@@ -0,0 +1,1505 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {{py:
2
+
3
+ """
4
+ Template file to easily generate loops over samples using Tempita
5
+ (https://github.com/cython/cython/blob/master/Cython/Tempita/_tempita.py).
6
+
7
+ Generated file: _loss.pyx
8
+
9
+ Each loss class is generated by a cdef functions on single samples.
10
+ The keywords between double braces are substituted during the build.
11
+ """
12
+
13
+ doc_HalfSquaredError = (
14
+ """Half Squared Error with identity link.
15
+
16
+ Domain:
17
+ y_true and y_pred all real numbers
18
+
19
+ Link:
20
+ y_pred = raw_prediction
21
+ """
22
+ )
23
+
24
+ doc_AbsoluteError = (
25
+ """Absolute Error with identity link.
26
+
27
+ Domain:
28
+ y_true and y_pred all real numbers
29
+
30
+ Link:
31
+ y_pred = raw_prediction
32
+ """
33
+ )
34
+
35
+ doc_PinballLoss = (
36
+ """Quantile Loss aka Pinball Loss with identity link.
37
+
38
+ Domain:
39
+ y_true and y_pred all real numbers
40
+ quantile in (0, 1)
41
+
42
+ Link:
43
+ y_pred = raw_prediction
44
+
45
+ Note: 2 * cPinballLoss(quantile=0.5) equals cAbsoluteError()
46
+ """
47
+ )
48
+
49
+ doc_HuberLoss = (
50
+ """Huber Loss with identity link.
51
+
52
+ Domain:
53
+ y_true and y_pred all real numbers
54
+ delta in positive real numbers
55
+
56
+ Link:
57
+ y_pred = raw_prediction
58
+ """
59
+ )
60
+
61
+ doc_HalfPoissonLoss = (
62
+ """Half Poisson deviance loss with log-link.
63
+
64
+ Domain:
65
+ y_true in non-negative real numbers
66
+ y_pred in positive real numbers
67
+
68
+ Link:
69
+ y_pred = exp(raw_prediction)
70
+
71
+ Half Poisson deviance with log-link is
72
+ y_true * log(y_true/y_pred) + y_pred - y_true
73
+ = y_true * log(y_true) - y_true * raw_prediction
74
+ + exp(raw_prediction) - y_true
75
+
76
+ Dropping constant terms, this gives:
77
+ exp(raw_prediction) - y_true * raw_prediction
78
+ """
79
+ )
80
+
81
+ doc_HalfGammaLoss = (
82
+ """Half Gamma deviance loss with log-link.
83
+
84
+ Domain:
85
+ y_true and y_pred in positive real numbers
86
+
87
+ Link:
88
+ y_pred = exp(raw_prediction)
89
+
90
+ Half Gamma deviance with log-link is
91
+ log(y_pred/y_true) + y_true/y_pred - 1
92
+ = raw_prediction - log(y_true) + y_true * exp(-raw_prediction) - 1
93
+
94
+ Dropping constant terms, this gives:
95
+ raw_prediction + y_true * exp(-raw_prediction)
96
+ """
97
+ )
98
+
99
+ doc_HalfTweedieLoss = (
100
+ """Half Tweedie deviance loss with log-link.
101
+
102
+ Domain:
103
+ y_true in real numbers if p <= 0
104
+ y_true in non-negative real numbers if 0 < p < 2
105
+ y_true in positive real numbers if p >= 2
106
+ y_pred and power in positive real numbers
107
+
108
+ Link:
109
+ y_pred = exp(raw_prediction)
110
+
111
+ Half Tweedie deviance with log-link and p=power is
112
+ max(y_true, 0)**(2-p) / (1-p) / (2-p)
113
+ - y_true * y_pred**(1-p) / (1-p)
114
+ + y_pred**(2-p) / (2-p)
115
+ = max(y_true, 0)**(2-p) / (1-p) / (2-p)
116
+ - y_true * exp((1-p) * raw_prediction) / (1-p)
117
+ + exp((2-p) * raw_prediction) / (2-p)
118
+
119
+ Dropping constant terms, this gives:
120
+ exp((2-p) * raw_prediction) / (2-p)
121
+ - y_true * exp((1-p) * raw_prediction) / (1-p)
122
+
123
+ Notes:
124
+ - Poisson with p=1 and Gamma with p=2 have different terms dropped such
125
+ that cHalfTweedieLoss is not continuous in p=power at p=1 and p=2.
126
+ - While the Tweedie distribution only exists for p<=0 or p>=1, the range
127
+ 0<p<1 still gives a strictly consistent scoring function for the
128
+ expectation.
129
+ """
130
+ )
131
+
132
+ doc_HalfTweedieLossIdentity = (
133
+ """Half Tweedie deviance loss with identity link.
134
+
135
+ Domain:
136
+ y_true in real numbers if p <= 0
137
+ y_true in non-negative real numbers if 0 < p < 2
138
+ y_true in positive real numbers if p >= 2
139
+ y_pred and power in positive real numbers, y_pred may be negative for p=0.
140
+
141
+ Link:
142
+ y_pred = raw_prediction
143
+
144
+ Half Tweedie deviance with identity link and p=power is
145
+ max(y_true, 0)**(2-p) / (1-p) / (2-p)
146
+ - y_true * y_pred**(1-p) / (1-p)
147
+ + y_pred**(2-p) / (2-p)
148
+
149
+ Notes:
150
+ - Here, we do not drop constant terms in contrast to the version with log-link.
151
+ """
152
+ )
153
+
154
+ doc_HalfBinomialLoss = (
155
+ """Half Binomial deviance loss with logit link.
156
+
157
+ Domain:
158
+ y_true in [0, 1]
159
+ y_pred in (0, 1), i.e. boundaries excluded
160
+
161
+ Link:
162
+ y_pred = expit(raw_prediction)
163
+ """
164
+ )
165
+
166
+ doc_ExponentialLoss = (
167
+ """"Exponential loss with (half) logit link
168
+
169
+ Domain:
170
+ y_true in [0, 1]
171
+ y_pred in (0, 1), i.e. boundaries excluded
172
+
173
+ Link:
174
+ y_pred = expit(2 * raw_prediction)
175
+ """
176
+ )
177
+
178
+ # loss class name, docstring, param,
179
+ # cy_loss, cy_loss_grad,
180
+ # cy_grad, cy_grad_hess,
181
+ class_list = [
182
+ ("CyHalfSquaredError", doc_HalfSquaredError, None,
183
+ "closs_half_squared_error", None,
184
+ "cgradient_half_squared_error", "cgrad_hess_half_squared_error"),
185
+ ("CyAbsoluteError", doc_AbsoluteError, None,
186
+ "closs_absolute_error", None,
187
+ "cgradient_absolute_error", "cgrad_hess_absolute_error"),
188
+ ("CyPinballLoss", doc_PinballLoss, "quantile",
189
+ "closs_pinball_loss", None,
190
+ "cgradient_pinball_loss", "cgrad_hess_pinball_loss"),
191
+ ("CyHuberLoss", doc_HuberLoss, "delta",
192
+ "closs_huber_loss", None,
193
+ "cgradient_huber_loss", "cgrad_hess_huber_loss"),
194
+ ("CyHalfPoissonLoss", doc_HalfPoissonLoss, None,
195
+ "closs_half_poisson", "closs_grad_half_poisson",
196
+ "cgradient_half_poisson", "cgrad_hess_half_poisson"),
197
+ ("CyHalfGammaLoss", doc_HalfGammaLoss, None,
198
+ "closs_half_gamma", "closs_grad_half_gamma",
199
+ "cgradient_half_gamma", "cgrad_hess_half_gamma"),
200
+ ("CyHalfTweedieLoss", doc_HalfTweedieLoss, "power",
201
+ "closs_half_tweedie", "closs_grad_half_tweedie",
202
+ "cgradient_half_tweedie", "cgrad_hess_half_tweedie"),
203
+ ("CyHalfTweedieLossIdentity", doc_HalfTweedieLossIdentity, "power",
204
+ "closs_half_tweedie_identity", "closs_grad_half_tweedie_identity",
205
+ "cgradient_half_tweedie_identity", "cgrad_hess_half_tweedie_identity"),
206
+ ("CyHalfBinomialLoss", doc_HalfBinomialLoss, None,
207
+ "closs_half_binomial", "closs_grad_half_binomial",
208
+ "cgradient_half_binomial", "cgrad_hess_half_binomial"),
209
+ ("CyExponentialLoss", doc_ExponentialLoss, None,
210
+ "closs_exponential", "closs_grad_exponential",
211
+ "cgradient_exponential", "cgrad_hess_exponential"),
212
+ ]
213
+ }}
214
+
215
+ # Design:
216
+ # See https://github.com/scikit-learn/scikit-learn/issues/15123 for reasons.
217
+ # a) Merge link functions into loss functions for speed and numerical
218
+ # stability, i.e. use raw_prediction instead of y_pred in signature.
219
+ # b) Pure C functions (nogil) calculate single points (single sample)
220
+ # c) Wrap C functions in a loop to get Python functions operating on ndarrays.
221
+ # - Write loops manually---use Tempita for this.
222
+ # Reason: There is still some performance overhead when using a wrapper
223
+ # function "wrap" that carries out the loop and gets as argument a function
224
+ # pointer to one of the C functions from b), e.g.
225
+ # wrap(closs_half_poisson, y_true, ...)
226
+ # - Pass n_threads as argument to prange and propagate option to all callers.
227
+ # d) Provide classes (Cython extension types) per loss (names start with Cy) in
228
+ # order to have semantical structured objects.
229
+ # - Member functions for single points just call the C function from b).
230
+ # These are used e.g. in SGD `_plain_sgd`.
231
+ # - Member functions operating on ndarrays, see c), looping over calls to C
232
+ # functions from b).
233
+ # e) Provide convenience Python classes that compose from these extension types
234
+ # elsewhere (see loss.py)
235
+ # - Example: loss.gradient calls CyLoss.gradient but does some input
236
+ # checking like None -> np.empty().
237
+ #
238
+ # Note: We require 1-dim ndarrays to be contiguous.
239
+
240
+ from cython.parallel import parallel, prange
241
+ import numpy as np
242
+
243
+ from libc.math cimport exp, fabs, log, log1p, pow
244
+ from libc.stdlib cimport malloc, free
245
+
246
+
247
+ # -------------------------------------
248
+ # Helper functions
249
+ # -------------------------------------
250
+ # Numerically stable version of log(1 + exp(x)) for double precision, see Eq. (10) of
251
+ # https://cran.r-project.org/web/packages/Rmpfr/vignettes/log1mexp-note.pdf
252
+ # Note: The only important cutoff is at x = 18. All others are to save computation
253
+ # time. Compared to the reference, we add the additional case distinction x <= -2 in
254
+ # order to use log instead of log1p for improved performance. As with the other
255
+ # cutoffs, this is accurate within machine precision of double.
256
+ cdef inline double log1pexp(double x) noexcept nogil:
257
+ if x <= -37:
258
+ return exp(x)
259
+ elif x <= -2:
260
+ return log1p(exp(x))
261
+ elif x <= 18:
262
+ return log(1. + exp(x))
263
+ elif x <= 33.3:
264
+ return x + exp(-x)
265
+ else:
266
+ return x
267
+
268
+
269
+ cdef inline double_pair sum_exp_minus_max(
270
+ const int i,
271
+ const floating_in[:, :] raw_prediction, # IN
272
+ floating_out *p # OUT
273
+ ) noexcept nogil:
274
+ # Thread local buffers are used to store part of the results via p.
275
+ # The results are stored as follows:
276
+ # p[k] = exp(raw_prediction_i_k - max_value) for k = 0 to n_classes-1
277
+ # return.val1 = max_value = max(raw_prediction_i_k, k = 0 to n_classes-1)
278
+ # return.val2 = sum_exps = sum(p[k], k = 0 to n_classes-1) = sum of exponentials
279
+ # len(p) must be n_classes
280
+ # Notes:
281
+ # - We return the max value and sum of exps (stored in p) as a double_pair.
282
+ # - i needs to be passed (and stays constant) because otherwise Cython does
283
+ # not generate optimal code, see
284
+ # https://github.com/scikit-learn/scikit-learn/issues/17299
285
+ # - We do not normalize p by calculating p[k] = p[k] / sum_exps.
286
+ # This helps to save one loop over k.
287
+ cdef:
288
+ int k
289
+ int n_classes = raw_prediction.shape[1]
290
+ double_pair max_value_and_sum_exps # val1 = max_value, val2 = sum_exps
291
+
292
+ max_value_and_sum_exps.val1 = raw_prediction[i, 0]
293
+ max_value_and_sum_exps.val2 = 0
294
+ for k in range(1, n_classes):
295
+ # Compute max value of array for numerical stability
296
+ if max_value_and_sum_exps.val1 < raw_prediction[i, k]:
297
+ max_value_and_sum_exps.val1 = raw_prediction[i, k]
298
+
299
+ for k in range(n_classes):
300
+ p[k] = exp(raw_prediction[i, k] - max_value_and_sum_exps.val1)
301
+ max_value_and_sum_exps.val2 += p[k]
302
+
303
+ return max_value_and_sum_exps
304
+
305
+
306
+ # -------------------------------------
307
+ # Single point inline C functions
308
+ # -------------------------------------
309
+ # Half Squared Error
310
+ cdef inline double closs_half_squared_error(
311
+ double y_true,
312
+ double raw_prediction
313
+ ) noexcept nogil:
314
+ return 0.5 * (raw_prediction - y_true) * (raw_prediction - y_true)
315
+
316
+
317
+ cdef inline double cgradient_half_squared_error(
318
+ double y_true,
319
+ double raw_prediction
320
+ ) noexcept nogil:
321
+ return raw_prediction - y_true
322
+
323
+
324
+ cdef inline double_pair cgrad_hess_half_squared_error(
325
+ double y_true,
326
+ double raw_prediction
327
+ ) noexcept nogil:
328
+ cdef double_pair gh
329
+ gh.val1 = raw_prediction - y_true # gradient
330
+ gh.val2 = 1. # hessian
331
+ return gh
332
+
333
+
334
+ # Absolute Error
335
+ cdef inline double closs_absolute_error(
336
+ double y_true,
337
+ double raw_prediction
338
+ ) noexcept nogil:
339
+ return fabs(raw_prediction - y_true)
340
+
341
+
342
+ cdef inline double cgradient_absolute_error(
343
+ double y_true,
344
+ double raw_prediction
345
+ ) noexcept nogil:
346
+ return 1. if raw_prediction > y_true else -1.
347
+
348
+
349
+ cdef inline double_pair cgrad_hess_absolute_error(
350
+ double y_true,
351
+ double raw_prediction
352
+ ) noexcept nogil:
353
+ cdef double_pair gh
354
+ # Note that exact hessian = 0 almost everywhere. Optimization routines like
355
+ # in HGBT, however, need a hessian > 0. Therefore, we assign 1.
356
+ gh.val1 = 1. if raw_prediction > y_true else -1. # gradient
357
+ gh.val2 = 1. # hessian
358
+ return gh
359
+
360
+
361
+ # Quantile Loss / Pinball Loss
362
+ cdef inline double closs_pinball_loss(
363
+ double y_true,
364
+ double raw_prediction,
365
+ double quantile
366
+ ) noexcept nogil:
367
+ return (quantile * (y_true - raw_prediction) if y_true >= raw_prediction
368
+ else (1. - quantile) * (raw_prediction - y_true))
369
+
370
+
371
+ cdef inline double cgradient_pinball_loss(
372
+ double y_true,
373
+ double raw_prediction,
374
+ double quantile
375
+ ) noexcept nogil:
376
+ return -quantile if y_true >=raw_prediction else 1. - quantile
377
+
378
+
379
+ cdef inline double_pair cgrad_hess_pinball_loss(
380
+ double y_true,
381
+ double raw_prediction,
382
+ double quantile
383
+ ) noexcept nogil:
384
+ cdef double_pair gh
385
+ # Note that exact hessian = 0 almost everywhere. Optimization routines like
386
+ # in HGBT, however, need a hessian > 0. Therefore, we assign 1.
387
+ gh.val1 = -quantile if y_true >=raw_prediction else 1. - quantile # gradient
388
+ gh.val2 = 1. # hessian
389
+ return gh
390
+
391
+
392
+ # Huber Loss
393
+ cdef inline double closs_huber_loss(
394
+ double y_true,
395
+ double raw_prediction,
396
+ double delta,
397
+ ) noexcept nogil:
398
+ cdef double abserr = fabs(y_true - raw_prediction)
399
+ if abserr <= delta:
400
+ return 0.5 * abserr**2
401
+ else:
402
+ return delta * (abserr - 0.5 * delta)
403
+
404
+
405
+ cdef inline double cgradient_huber_loss(
406
+ double y_true,
407
+ double raw_prediction,
408
+ double delta,
409
+ ) noexcept nogil:
410
+ cdef double res = raw_prediction - y_true
411
+ if fabs(res) <= delta:
412
+ return res
413
+ else:
414
+ return delta if res >=0 else -delta
415
+
416
+
417
+ cdef inline double_pair cgrad_hess_huber_loss(
418
+ double y_true,
419
+ double raw_prediction,
420
+ double delta,
421
+ ) noexcept nogil:
422
+ cdef double_pair gh
423
+ gh.val2 = raw_prediction - y_true # used as temporary
424
+ if fabs(gh.val2) <= delta:
425
+ gh.val1 = gh.val2 # gradient
426
+ gh.val2 = 1 # hessian
427
+ else:
428
+ gh.val1 = delta if gh.val2 >=0 else -delta # gradient
429
+ gh.val2 = 0 # hessian
430
+ return gh
431
+
432
+
433
+ # Half Poisson Deviance with Log-Link, dropping constant terms
434
+ cdef inline double closs_half_poisson(
435
+ double y_true,
436
+ double raw_prediction
437
+ ) noexcept nogil:
438
+ return exp(raw_prediction) - y_true * raw_prediction
439
+
440
+
441
+ cdef inline double cgradient_half_poisson(
442
+ double y_true,
443
+ double raw_prediction
444
+ ) noexcept nogil:
445
+ # y_pred - y_true
446
+ return exp(raw_prediction) - y_true
447
+
448
+
449
+ cdef inline double_pair closs_grad_half_poisson(
450
+ double y_true,
451
+ double raw_prediction
452
+ ) noexcept nogil:
453
+ cdef double_pair lg
454
+ lg.val2 = exp(raw_prediction) # used as temporary
455
+ lg.val1 = lg.val2 - y_true * raw_prediction # loss
456
+ lg.val2 -= y_true # gradient
457
+ return lg
458
+
459
+
460
+ cdef inline double_pair cgrad_hess_half_poisson(
461
+ double y_true,
462
+ double raw_prediction
463
+ ) noexcept nogil:
464
+ cdef double_pair gh
465
+ gh.val2 = exp(raw_prediction) # hessian
466
+ gh.val1 = gh.val2 - y_true # gradient
467
+ return gh
468
+
469
+
470
+ # Half Gamma Deviance with Log-Link, dropping constant terms
471
+ cdef inline double closs_half_gamma(
472
+ double y_true,
473
+ double raw_prediction
474
+ ) noexcept nogil:
475
+ return raw_prediction + y_true * exp(-raw_prediction)
476
+
477
+
478
+ cdef inline double cgradient_half_gamma(
479
+ double y_true,
480
+ double raw_prediction
481
+ ) noexcept nogil:
482
+ return 1. - y_true * exp(-raw_prediction)
483
+
484
+
485
+ cdef inline double_pair closs_grad_half_gamma(
486
+ double y_true,
487
+ double raw_prediction
488
+ ) noexcept nogil:
489
+ cdef double_pair lg
490
+ lg.val2 = exp(-raw_prediction) # used as temporary
491
+ lg.val1 = raw_prediction + y_true * lg.val2 # loss
492
+ lg.val2 = 1. - y_true * lg.val2 # gradient
493
+ return lg
494
+
495
+
496
+ cdef inline double_pair cgrad_hess_half_gamma(
497
+ double y_true,
498
+ double raw_prediction
499
+ ) noexcept nogil:
500
+ cdef double_pair gh
501
+ gh.val2 = exp(-raw_prediction) # used as temporary
502
+ gh.val1 = 1. - y_true * gh.val2 # gradient
503
+ gh.val2 *= y_true # hessian
504
+ return gh
505
+
506
+
507
+ # Half Tweedie Deviance with Log-Link, dropping constant terms
508
+ # Note that by dropping constants this is no longer continuous in parameter power.
509
+ cdef inline double closs_half_tweedie(
510
+ double y_true,
511
+ double raw_prediction,
512
+ double power
513
+ ) noexcept nogil:
514
+ if power == 0.:
515
+ return closs_half_squared_error(y_true, exp(raw_prediction))
516
+ elif power == 1.:
517
+ return closs_half_poisson(y_true, raw_prediction)
518
+ elif power == 2.:
519
+ return closs_half_gamma(y_true, raw_prediction)
520
+ else:
521
+ return (exp((2. - power) * raw_prediction) / (2. - power)
522
+ - y_true * exp((1. - power) * raw_prediction) / (1. - power))
523
+
524
+
525
+ cdef inline double cgradient_half_tweedie(
526
+ double y_true,
527
+ double raw_prediction,
528
+ double power
529
+ ) noexcept nogil:
530
+ cdef double exp1
531
+ if power == 0.:
532
+ exp1 = exp(raw_prediction)
533
+ return exp1 * (exp1 - y_true)
534
+ elif power == 1.:
535
+ return cgradient_half_poisson(y_true, raw_prediction)
536
+ elif power == 2.:
537
+ return cgradient_half_gamma(y_true, raw_prediction)
538
+ else:
539
+ return (exp((2. - power) * raw_prediction)
540
+ - y_true * exp((1. - power) * raw_prediction))
541
+
542
+
543
+ cdef inline double_pair closs_grad_half_tweedie(
544
+ double y_true,
545
+ double raw_prediction,
546
+ double power
547
+ ) noexcept nogil:
548
+ cdef double_pair lg
549
+ cdef double exp1, exp2
550
+ if power == 0.:
551
+ exp1 = exp(raw_prediction)
552
+ lg.val1 = closs_half_squared_error(y_true, exp1) # loss
553
+ lg.val2 = exp1 * (exp1 - y_true) # gradient
554
+ elif power == 1.:
555
+ return closs_grad_half_poisson(y_true, raw_prediction)
556
+ elif power == 2.:
557
+ return closs_grad_half_gamma(y_true, raw_prediction)
558
+ else:
559
+ exp1 = exp((1. - power) * raw_prediction)
560
+ exp2 = exp((2. - power) * raw_prediction)
561
+ lg.val1 = exp2 / (2. - power) - y_true * exp1 / (1. - power) # loss
562
+ lg.val2 = exp2 - y_true * exp1 # gradient
563
+ return lg
564
+
565
+
566
+ cdef inline double_pair cgrad_hess_half_tweedie(
567
+ double y_true,
568
+ double raw_prediction,
569
+ double power
570
+ ) noexcept nogil:
571
+ cdef double_pair gh
572
+ cdef double exp1, exp2
573
+ if power == 0.:
574
+ exp1 = exp(raw_prediction)
575
+ gh.val1 = exp1 * (exp1 - y_true) # gradient
576
+ gh.val2 = exp1 * (2 * exp1 - y_true) # hessian
577
+ elif power == 1.:
578
+ return cgrad_hess_half_poisson(y_true, raw_prediction)
579
+ elif power == 2.:
580
+ return cgrad_hess_half_gamma(y_true, raw_prediction)
581
+ else:
582
+ exp1 = exp((1. - power) * raw_prediction)
583
+ exp2 = exp((2. - power) * raw_prediction)
584
+ gh.val1 = exp2 - y_true * exp1 # gradient
585
+ gh.val2 = (2. - power) * exp2 - (1. - power) * y_true * exp1 # hessian
586
+ return gh
587
+
588
+
589
+ # Half Tweedie Deviance with identity link, without dropping constant terms!
590
+ # Therefore, best loss value is zero.
591
+ cdef inline double closs_half_tweedie_identity(
592
+ double y_true,
593
+ double raw_prediction,
594
+ double power
595
+ ) noexcept nogil:
596
+ cdef double tmp
597
+ if power == 0.:
598
+ return closs_half_squared_error(y_true, raw_prediction)
599
+ elif power == 1.:
600
+ if y_true == 0:
601
+ return raw_prediction
602
+ else:
603
+ return y_true * log(y_true/raw_prediction) + raw_prediction - y_true
604
+ elif power == 2.:
605
+ return log(raw_prediction/y_true) + y_true/raw_prediction - 1.
606
+ else:
607
+ tmp = pow(raw_prediction, 1. - power)
608
+ tmp = raw_prediction * tmp / (2. - power) - y_true * tmp / (1. - power)
609
+ if y_true > 0:
610
+ tmp += pow(y_true, 2. - power) / ((1. - power) * (2. - power))
611
+ return tmp
612
+
613
+
614
+ cdef inline double cgradient_half_tweedie_identity(
615
+ double y_true,
616
+ double raw_prediction,
617
+ double power
618
+ ) noexcept nogil:
619
+ if power == 0.:
620
+ return raw_prediction - y_true
621
+ elif power == 1.:
622
+ return 1. - y_true / raw_prediction
623
+ elif power == 2.:
624
+ return (raw_prediction - y_true) / (raw_prediction * raw_prediction)
625
+ else:
626
+ return pow(raw_prediction, -power) * (raw_prediction - y_true)
627
+
628
+
629
+ cdef inline double_pair closs_grad_half_tweedie_identity(
630
+ double y_true,
631
+ double raw_prediction,
632
+ double power
633
+ ) noexcept nogil:
634
+ cdef double_pair lg
635
+ cdef double tmp
636
+ if power == 0.:
637
+ lg.val2 = raw_prediction - y_true # gradient
638
+ lg.val1 = 0.5 * lg.val2 * lg.val2 # loss
639
+ elif power == 1.:
640
+ if y_true == 0:
641
+ lg.val1 = raw_prediction
642
+ else:
643
+ lg.val1 = (y_true * log(y_true/raw_prediction) # loss
644
+ + raw_prediction - y_true)
645
+ lg.val2 = 1. - y_true / raw_prediction # gradient
646
+ elif power == 2.:
647
+ lg.val1 = log(raw_prediction/y_true) + y_true/raw_prediction - 1. # loss
648
+ tmp = raw_prediction * raw_prediction
649
+ lg.val2 = (raw_prediction - y_true) / tmp # gradient
650
+ else:
651
+ tmp = pow(raw_prediction, 1. - power)
652
+ lg.val1 = (raw_prediction * tmp / (2. - power) # loss
653
+ - y_true * tmp / (1. - power))
654
+ if y_true > 0:
655
+ lg.val1 += (pow(y_true, 2. - power)
656
+ / ((1. - power) * (2. - power)))
657
+ lg.val2 = tmp * (1. - y_true / raw_prediction) # gradient
658
+ return lg
659
+
660
+
661
+ cdef inline double_pair cgrad_hess_half_tweedie_identity(
662
+ double y_true,
663
+ double raw_prediction,
664
+ double power
665
+ ) noexcept nogil:
666
+ cdef double_pair gh
667
+ cdef double tmp
668
+ if power == 0.:
669
+ gh.val1 = raw_prediction - y_true # gradient
670
+ gh.val2 = 1. # hessian
671
+ elif power == 1.:
672
+ gh.val1 = 1. - y_true / raw_prediction # gradient
673
+ gh.val2 = y_true / (raw_prediction * raw_prediction) # hessian
674
+ elif power == 2.:
675
+ tmp = raw_prediction * raw_prediction
676
+ gh.val1 = (raw_prediction - y_true) / tmp # gradient
677
+ gh.val2 = (-1. + 2. * y_true / raw_prediction) / tmp # hessian
678
+ else:
679
+ tmp = pow(raw_prediction, -power)
680
+ gh.val1 = tmp * (raw_prediction - y_true) # gradient
681
+ gh.val2 = tmp * ((1. - power) + power * y_true / raw_prediction) # hessian
682
+ return gh
683
+
684
+
685
+ # Half Binomial deviance with logit-link, aka log-loss or binary cross entropy
686
+ cdef inline double closs_half_binomial(
687
+ double y_true,
688
+ double raw_prediction
689
+ ) noexcept nogil:
690
+ # log1p(exp(raw_prediction)) - y_true * raw_prediction
691
+ return log1pexp(raw_prediction) - y_true * raw_prediction
692
+
693
+
694
+ cdef inline double cgradient_half_binomial(
695
+ double y_true,
696
+ double raw_prediction
697
+ ) noexcept nogil:
698
+ # gradient = y_pred - y_true = expit(raw_prediction) - y_true
699
+ # Numerically more stable, see http://fa.bianp.net/blog/2019/evaluate_logistic/
700
+ # if raw_prediction < 0:
701
+ # exp_tmp = exp(raw_prediction)
702
+ # return ((1 - y_true) * exp_tmp - y_true) / (1 + exp_tmp)
703
+ # else:
704
+ # exp_tmp = exp(-raw_prediction)
705
+ # return ((1 - y_true) - y_true * exp_tmp) / (1 + exp_tmp)
706
+ # Note that optimal speed would be achieved, at the cost of precision, by
707
+ # return expit(raw_prediction) - y_true
708
+ # i.e. no "if else" and an own inline implementation of expit instead of
709
+ # from scipy.special.cython_special cimport expit
710
+ # The case distinction raw_prediction < 0 in the stable implementation does not
711
+ # provide significant better precision apart from protecting overflow of exp(..).
712
+ # The branch (if else), however, can incur runtime costs of up to 30%.
713
+ # Instead, we help branch prediction by almost always ending in the first if clause
714
+ # and making the second branch (else) a bit simpler. This has the exact same
715
+ # precision but is faster than the stable implementation.
716
+ # As branching criteria, we use the same cutoff as in log1pexp. Note that the
717
+ # maximal value to get gradient = -1 with y_true = 1 is -37.439198610162731
718
+ # (based on mpmath), and scipy.special.logit(np.finfo(float).eps) ~ -36.04365.
719
+ cdef double exp_tmp
720
+ if raw_prediction > -37:
721
+ exp_tmp = exp(-raw_prediction)
722
+ return ((1 - y_true) - y_true * exp_tmp) / (1 + exp_tmp)
723
+ else:
724
+ # expit(raw_prediction) = exp(raw_prediction) for raw_prediction <= -37
725
+ return exp(raw_prediction) - y_true
726
+
727
+
728
+ cdef inline double_pair closs_grad_half_binomial(
729
+ double y_true,
730
+ double raw_prediction
731
+ ) noexcept nogil:
732
+ cdef double_pair lg
733
+ # Same if else conditions as in log1pexp.
734
+ if raw_prediction <= -37:
735
+ lg.val2 = exp(raw_prediction) # used as temporary
736
+ lg.val1 = lg.val2 - y_true * raw_prediction # loss
737
+ lg.val2 -= y_true # gradient
738
+ elif raw_prediction <= -2:
739
+ lg.val2 = exp(raw_prediction) # used as temporary
740
+ lg.val1 = log1p(lg.val2) - y_true * raw_prediction # loss
741
+ lg.val2 = ((1 - y_true) * lg.val2 - y_true) / (1 + lg.val2) # gradient
742
+ elif raw_prediction <= 18:
743
+ lg.val2 = exp(-raw_prediction) # used as temporary
744
+ # log1p(exp(x)) = log(1 + exp(x)) = x + log1p(exp(-x))
745
+ lg.val1 = log1p(lg.val2) + (1 - y_true) * raw_prediction # loss
746
+ lg.val2 = ((1 - y_true) - y_true * lg.val2) / (1 + lg.val2) # gradient
747
+ else:
748
+ lg.val2 = exp(-raw_prediction) # used as temporary
749
+ lg.val1 = lg.val2 + (1 - y_true) * raw_prediction # loss
750
+ lg.val2 = ((1 - y_true) - y_true * lg.val2) / (1 + lg.val2) # gradient
751
+ return lg
752
+
753
+
754
+ cdef inline double_pair cgrad_hess_half_binomial(
755
+ double y_true,
756
+ double raw_prediction
757
+ ) noexcept nogil:
758
+ # with y_pred = expit(raw)
759
+ # hessian = y_pred * (1 - y_pred) = exp( raw) / (1 + exp( raw))**2
760
+ # = exp(-raw) / (1 + exp(-raw))**2
761
+ cdef double_pair gh
762
+ # See comment in cgradient_half_binomial.
763
+ if raw_prediction > -37:
764
+ gh.val2 = exp(-raw_prediction) # used as temporary
765
+ gh.val1 = ((1 - y_true) - y_true * gh.val2) / (1 + gh.val2) # gradient
766
+ gh.val2 = gh.val2 / (1 + gh.val2)**2 # hessian
767
+ else:
768
+ gh.val2 = exp(raw_prediction) # = 1. order Taylor in exp(raw_prediction)
769
+ gh.val1 = gh.val2 - y_true
770
+ return gh
771
+
772
+
773
+ # Exponential loss with (half) logit-link, aka boosting loss
774
+ cdef inline double closs_exponential(
775
+ double y_true,
776
+ double raw_prediction
777
+ ) noexcept nogil:
778
+ cdef double tmp = exp(raw_prediction)
779
+ return y_true / tmp + (1 - y_true) * tmp
780
+
781
+
782
+ cdef inline double cgradient_exponential(
783
+ double y_true,
784
+ double raw_prediction
785
+ ) noexcept nogil:
786
+ cdef double tmp = exp(raw_prediction)
787
+ return -y_true / tmp + (1 - y_true) * tmp
788
+
789
+
790
+ cdef inline double_pair closs_grad_exponential(
791
+ double y_true,
792
+ double raw_prediction
793
+ ) noexcept nogil:
794
+ cdef double_pair lg
795
+ lg.val2 = exp(raw_prediction) # used as temporary
796
+
797
+ lg.val1 = y_true / lg.val2 + (1 - y_true) * lg.val2 # loss
798
+ lg.val2 = -y_true / lg.val2 + (1 - y_true) * lg.val2 # gradient
799
+ return lg
800
+
801
+
802
+ cdef inline double_pair cgrad_hess_exponential(
803
+ double y_true,
804
+ double raw_prediction
805
+ ) noexcept nogil:
806
+ # Note that hessian = loss
807
+ cdef double_pair gh
808
+ gh.val2 = exp(raw_prediction) # used as temporary
809
+
810
+ gh.val1 = -y_true / gh.val2 + (1 - y_true) * gh.val2 # gradient
811
+ gh.val2 = y_true / gh.val2 + (1 - y_true) * gh.val2 # hessian
812
+ return gh
813
+
814
+
815
+ # ---------------------------------------------------
816
+ # Extension Types for Loss Functions of 1-dim targets
817
+ # ---------------------------------------------------
818
+ cdef class CyLossFunction:
819
+ """Base class for convex loss functions."""
820
+
821
+ def __reduce__(self):
822
+ return (self.__class__, ())
823
+
824
+ cdef double cy_loss(self, double y_true, double raw_prediction) noexcept nogil:
825
+ """Compute the loss for a single sample.
826
+
827
+ Parameters
828
+ ----------
829
+ y_true : double
830
+ Observed, true target value.
831
+ raw_prediction : double
832
+ Raw prediction value (in link space).
833
+
834
+ Returns
835
+ -------
836
+ double
837
+ The loss evaluated at `y_true` and `raw_prediction`.
838
+ """
839
+ pass
840
+
841
+ cdef double cy_gradient(self, double y_true, double raw_prediction) noexcept nogil:
842
+ """Compute gradient of loss w.r.t. raw_prediction for a single sample.
843
+
844
+ Parameters
845
+ ----------
846
+ y_true : double
847
+ Observed, true target value.
848
+ raw_prediction : double
849
+ Raw prediction value (in link space).
850
+
851
+ Returns
852
+ -------
853
+ double
854
+ The derivative of the loss function w.r.t. `raw_prediction`.
855
+ """
856
+ pass
857
+
858
+ cdef double_pair cy_grad_hess(
859
+ self, double y_true, double raw_prediction
860
+ ) noexcept nogil:
861
+ """Compute gradient and hessian.
862
+
863
+ Gradient and hessian of loss w.r.t. raw_prediction for a single sample.
864
+
865
+ This is usually diagonal in raw_prediction_i and raw_prediction_j.
866
+ Therefore, we return the diagonal element i=j.
867
+
868
+ For a loss with a non-canonical link, this might implement the diagonal
869
+ of the Fisher matrix (=expected hessian) instead of the hessian.
870
+
871
+ Parameters
872
+ ----------
873
+ y_true : double
874
+ Observed, true target value.
875
+ raw_prediction : double
876
+ Raw prediction value (in link space).
877
+
878
+ Returns
879
+ -------
880
+ double_pair
881
+ Gradient and hessian of the loss function w.r.t. `raw_prediction`.
882
+ """
883
+ pass
884
+
885
+ def loss(
886
+ self,
887
+ const floating_in[::1] y_true, # IN
888
+ const floating_in[::1] raw_prediction, # IN
889
+ const floating_in[::1] sample_weight, # IN
890
+ floating_out[::1] loss_out, # OUT
891
+ int n_threads=1
892
+ ):
893
+ """Compute the point-wise loss value for each input.
894
+
895
+ The point-wise loss is written to `loss_out` and no array is returned.
896
+
897
+ Parameters
898
+ ----------
899
+ y_true : array of shape (n_samples,)
900
+ Observed, true target values.
901
+ raw_prediction : array of shape (n_samples,)
902
+ Raw prediction values (in link space).
903
+ sample_weight : array of shape (n_samples,) or None
904
+ Sample weights.
905
+ loss_out : array of shape (n_samples,)
906
+ A location into which the result is stored.
907
+ n_threads : int
908
+ Number of threads used by OpenMP (if any).
909
+ """
910
+ pass
911
+
912
+ def gradient(
913
+ self,
914
+ const floating_in[::1] y_true, # IN
915
+ const floating_in[::1] raw_prediction, # IN
916
+ const floating_in[::1] sample_weight, # IN
917
+ floating_out[::1] gradient_out, # OUT
918
+ int n_threads=1
919
+ ):
920
+ """Compute gradient of loss w.r.t raw_prediction for each input.
921
+
922
+ The gradient is written to `gradient_out` and no array is returned.
923
+
924
+ Parameters
925
+ ----------
926
+ y_true : array of shape (n_samples,)
927
+ Observed, true target values.
928
+ raw_prediction : array of shape (n_samples,)
929
+ Raw prediction values (in link space).
930
+ sample_weight : array of shape (n_samples,) or None
931
+ Sample weights.
932
+ gradient_out : array of shape (n_samples,)
933
+ A location into which the result is stored.
934
+ n_threads : int
935
+ Number of threads used by OpenMP (if any).
936
+ """
937
+ pass
938
+
939
+ def loss_gradient(
940
+ self,
941
+ const floating_in[::1] y_true, # IN
942
+ const floating_in[::1] raw_prediction, # IN
943
+ const floating_in[::1] sample_weight, # IN
944
+ floating_out[::1] loss_out, # OUT
945
+ floating_out[::1] gradient_out, # OUT
946
+ int n_threads=1
947
+ ):
948
+ """Compute loss and gradient of loss w.r.t raw_prediction.
949
+
950
+ The loss and gradient are written to `loss_out` and `gradient_out` and no arrays
951
+ are returned.
952
+
953
+ Parameters
954
+ ----------
955
+ y_true : array of shape (n_samples,)
956
+ Observed, true target values.
957
+ raw_prediction : array of shape (n_samples,)
958
+ Raw prediction values (in link space).
959
+ sample_weight : array of shape (n_samples,) or None
960
+ Sample weights.
961
+ loss_out : array of shape (n_samples,) or None
962
+ A location into which the element-wise loss is stored.
963
+ gradient_out : array of shape (n_samples,)
964
+ A location into which the gradient is stored.
965
+ n_threads : int
966
+ Number of threads used by OpenMP (if any).
967
+ """
968
+ self.loss(y_true, raw_prediction, sample_weight, loss_out, n_threads)
969
+ self.gradient(y_true, raw_prediction, sample_weight, gradient_out, n_threads)
970
+
971
+ def gradient_hessian(
972
+ self,
973
+ const floating_in[::1] y_true, # IN
974
+ const floating_in[::1] raw_prediction, # IN
975
+ const floating_in[::1] sample_weight, # IN
976
+ floating_out[::1] gradient_out, # OUT
977
+ floating_out[::1] hessian_out, # OUT
978
+ int n_threads=1
979
+ ):
980
+ """Compute gradient and hessian of loss w.r.t raw_prediction.
981
+
982
+ The gradient and hessian are written to `gradient_out` and `hessian_out` and no
983
+ arrays are returned.
984
+
985
+ Parameters
986
+ ----------
987
+ y_true : array of shape (n_samples,)
988
+ Observed, true target values.
989
+ raw_prediction : array of shape (n_samples,)
990
+ Raw prediction values (in link space).
991
+ sample_weight : array of shape (n_samples,) or None
992
+ Sample weights.
993
+ gradient_out : array of shape (n_samples,)
994
+ A location into which the gradient is stored.
995
+ hessian_out : array of shape (n_samples,)
996
+ A location into which the hessian is stored.
997
+ n_threads : int
998
+ Number of threads used by OpenMP (if any).
999
+ """
1000
+ pass
1001
+
1002
+
1003
+ {{for name, docstring, param, closs, closs_grad, cgrad, cgrad_hess, in class_list}}
1004
+ {{py:
1005
+ if param is None:
1006
+ with_param = ""
1007
+ else:
1008
+ with_param = ", self." + param
1009
+ }}
1010
+
1011
+ cdef class {{name}}(CyLossFunction):
1012
+ """{{docstring}}"""
1013
+
1014
+ {{if param is not None}}
1015
+ def __init__(self, {{param}}):
1016
+ self.{{param}} = {{param}}
1017
+ {{endif}}
1018
+
1019
+ {{if param is not None}}
1020
+ def __reduce__(self):
1021
+ return (self.__class__, (self.{{param}},))
1022
+ {{endif}}
1023
+
1024
+ cdef inline double cy_loss(self, double y_true, double raw_prediction) noexcept nogil:
1025
+ return {{closs}}(y_true, raw_prediction{{with_param}})
1026
+
1027
+ cdef inline double cy_gradient(self, double y_true, double raw_prediction) noexcept nogil:
1028
+ return {{cgrad}}(y_true, raw_prediction{{with_param}})
1029
+
1030
+ cdef inline double_pair cy_grad_hess(self, double y_true, double raw_prediction) noexcept nogil:
1031
+ return {{cgrad_hess}}(y_true, raw_prediction{{with_param}})
1032
+
1033
+ def loss(
1034
+ self,
1035
+ const floating_in[::1] y_true, # IN
1036
+ const floating_in[::1] raw_prediction, # IN
1037
+ const floating_in[::1] sample_weight, # IN
1038
+ floating_out[::1] loss_out, # OUT
1039
+ int n_threads=1
1040
+ ):
1041
+ cdef:
1042
+ int i
1043
+ int n_samples = y_true.shape[0]
1044
+
1045
+ if sample_weight is None:
1046
+ for i in prange(
1047
+ n_samples, schedule='static', nogil=True, num_threads=n_threads
1048
+ ):
1049
+ loss_out[i] = {{closs}}(y_true[i], raw_prediction[i]{{with_param}})
1050
+ else:
1051
+ for i in prange(
1052
+ n_samples, schedule='static', nogil=True, num_threads=n_threads
1053
+ ):
1054
+ loss_out[i] = sample_weight[i] * {{closs}}(y_true[i], raw_prediction[i]{{with_param}})
1055
+
1056
+ {{if closs_grad is not None}}
1057
+ def loss_gradient(
1058
+ self,
1059
+ const floating_in[::1] y_true, # IN
1060
+ const floating_in[::1] raw_prediction, # IN
1061
+ const floating_in[::1] sample_weight, # IN
1062
+ floating_out[::1] loss_out, # OUT
1063
+ floating_out[::1] gradient_out, # OUT
1064
+ int n_threads=1
1065
+ ):
1066
+ cdef:
1067
+ int i
1068
+ int n_samples = y_true.shape[0]
1069
+ double_pair dbl2
1070
+
1071
+ if sample_weight is None:
1072
+ for i in prange(
1073
+ n_samples, schedule='static', nogil=True, num_threads=n_threads
1074
+ ):
1075
+ dbl2 = {{closs_grad}}(y_true[i], raw_prediction[i]{{with_param}})
1076
+ loss_out[i] = dbl2.val1
1077
+ gradient_out[i] = dbl2.val2
1078
+ else:
1079
+ for i in prange(
1080
+ n_samples, schedule='static', nogil=True, num_threads=n_threads
1081
+ ):
1082
+ dbl2 = {{closs_grad}}(y_true[i], raw_prediction[i]{{with_param}})
1083
+ loss_out[i] = sample_weight[i] * dbl2.val1
1084
+ gradient_out[i] = sample_weight[i] * dbl2.val2
1085
+
1086
+ {{endif}}
1087
+
1088
+ def gradient(
1089
+ self,
1090
+ const floating_in[::1] y_true, # IN
1091
+ const floating_in[::1] raw_prediction, # IN
1092
+ const floating_in[::1] sample_weight, # IN
1093
+ floating_out[::1] gradient_out, # OUT
1094
+ int n_threads=1
1095
+ ):
1096
+ cdef:
1097
+ int i
1098
+ int n_samples = y_true.shape[0]
1099
+
1100
+ if sample_weight is None:
1101
+ for i in prange(
1102
+ n_samples, schedule='static', nogil=True, num_threads=n_threads
1103
+ ):
1104
+ gradient_out[i] = {{cgrad}}(y_true[i], raw_prediction[i]{{with_param}})
1105
+ else:
1106
+ for i in prange(
1107
+ n_samples, schedule='static', nogil=True, num_threads=n_threads
1108
+ ):
1109
+ gradient_out[i] = sample_weight[i] * {{cgrad}}(y_true[i], raw_prediction[i]{{with_param}})
1110
+
1111
+ def gradient_hessian(
1112
+ self,
1113
+ const floating_in[::1] y_true, # IN
1114
+ const floating_in[::1] raw_prediction, # IN
1115
+ const floating_in[::1] sample_weight, # IN
1116
+ floating_out[::1] gradient_out, # OUT
1117
+ floating_out[::1] hessian_out, # OUT
1118
+ int n_threads=1
1119
+ ):
1120
+ cdef:
1121
+ int i
1122
+ int n_samples = y_true.shape[0]
1123
+ double_pair dbl2
1124
+
1125
+ if sample_weight is None:
1126
+ for i in prange(
1127
+ n_samples, schedule='static', nogil=True, num_threads=n_threads
1128
+ ):
1129
+ dbl2 = {{cgrad_hess}}(y_true[i], raw_prediction[i]{{with_param}})
1130
+ gradient_out[i] = dbl2.val1
1131
+ hessian_out[i] = dbl2.val2
1132
+ else:
1133
+ for i in prange(
1134
+ n_samples, schedule='static', nogil=True, num_threads=n_threads
1135
+ ):
1136
+ dbl2 = {{cgrad_hess}}(y_true[i], raw_prediction[i]{{with_param}})
1137
+ gradient_out[i] = sample_weight[i] * dbl2.val1
1138
+ hessian_out[i] = sample_weight[i] * dbl2.val2
1139
+
1140
+ {{endfor}}
1141
+
1142
+
1143
+ # The multinomial deviance loss is also known as categorical cross-entropy or
1144
+ # multinomial log-likelihood.
1145
+ # Here, we do not inherit from CyLossFunction as its cy_gradient method deviates
1146
+ # from the API.
1147
+ cdef class CyHalfMultinomialLoss():
1148
+ """Half Multinomial deviance loss with multinomial logit link.
1149
+
1150
+ Domain:
1151
+ y_true in {0, 1, 2, 3, .., n_classes - 1}
1152
+ y_pred in (0, 1)**n_classes, i.e. interval with boundaries excluded
1153
+
1154
+ Link:
1155
+ y_pred = softmax(raw_prediction)
1156
+
1157
+ Note: Label encoding is built-in, i.e. {0, 1, 2, 3, .., n_classes - 1} is
1158
+ mapped to (y_true == k) for k = 0 .. n_classes - 1 which is either 0 or 1.
1159
+ """
1160
+
1161
+ # Here we deviate from the CyLossFunction API. SAG/SAGA needs direct access to
1162
+ # sample-wise gradients which we provide here.
1163
+ cdef inline void cy_gradient(
1164
+ self,
1165
+ const floating_in y_true,
1166
+ const floating_in[::1] raw_prediction, # IN
1167
+ const floating_in sample_weight,
1168
+ floating_out[::1] gradient_out, # OUT
1169
+ ) noexcept nogil:
1170
+ """Compute gradient of loss w.r.t. `raw_prediction` for a single sample.
1171
+
1172
+ The gradient of the multinomial logistic loss with respect to a class k,
1173
+ and for one sample is:
1174
+ grad_k = - sw * (p[k] - (y==k))
1175
+
1176
+ where:
1177
+ p[k] = proba[k] = exp(raw_prediction[k] - logsumexp(raw_prediction))
1178
+ sw = sample_weight
1179
+
1180
+ Parameters
1181
+ ----------
1182
+ y_true : double
1183
+ Observed, true target value.
1184
+ raw_prediction : array of shape (n_classes,)
1185
+ Raw prediction values (in link space).
1186
+ sample_weight : double
1187
+ Sample weight.
1188
+ gradient_out : array of shape (n_classs,)
1189
+ A location into which the gradient is stored.
1190
+
1191
+ Returns
1192
+ -------
1193
+ gradient : double
1194
+ The derivative of the loss function w.r.t. `raw_prediction`.
1195
+ """
1196
+ cdef:
1197
+ int k
1198
+ int n_classes = raw_prediction.shape[0]
1199
+ double_pair max_value_and_sum_exps
1200
+ const floating_in[:, :] raw = raw_prediction[None, :]
1201
+
1202
+ max_value_and_sum_exps = sum_exp_minus_max(0, raw, &gradient_out[0])
1203
+ for k in range(n_classes):
1204
+ # gradient_out[k] = p_k = y_pred_k = prob of class k
1205
+ gradient_out[k] /= max_value_and_sum_exps.val2
1206
+ # gradient_k = (p_k - (y_true == k)) * sw
1207
+ gradient_out[k] = (gradient_out[k] - (y_true == k)) * sample_weight
1208
+
1209
+ def _test_cy_gradient(
1210
+ self,
1211
+ const floating_in[::1] y_true, # IN
1212
+ const floating_in[:, ::1] raw_prediction, # IN
1213
+ const floating_in[::1] sample_weight, # IN
1214
+ ):
1215
+ """For testing only."""
1216
+ cdef:
1217
+ int i, k
1218
+ int n_samples = y_true.shape[0]
1219
+ int n_classes = raw_prediction.shape[1]
1220
+ floating_in [:, ::1] gradient_out
1221
+ gradient = np.empty((n_samples, n_classes), dtype=np.float64)
1222
+ gradient_out = gradient
1223
+
1224
+ for i in range(n_samples):
1225
+ self.cy_gradient(
1226
+ y_true=y_true[i],
1227
+ raw_prediction=raw_prediction[i, :],
1228
+ sample_weight=1.0 if sample_weight is None else sample_weight[i],
1229
+ gradient_out=gradient_out[i, :],
1230
+ )
1231
+ return gradient
1232
+
1233
+ # Note that we do not assume memory alignment/contiguity of 2d arrays.
1234
+ # There seems to be little benefit in doing so. Benchmarks proofing the
1235
+ # opposite are welcome.
1236
+ def loss(
1237
+ self,
1238
+ const floating_in[::1] y_true, # IN
1239
+ const floating_in[:, :] raw_prediction, # IN
1240
+ const floating_in[::1] sample_weight, # IN
1241
+ floating_out[::1] loss_out, # OUT
1242
+ int n_threads=1
1243
+ ):
1244
+ cdef:
1245
+ int i, k
1246
+ int n_samples = y_true.shape[0]
1247
+ int n_classes = raw_prediction.shape[1]
1248
+ floating_in max_value, sum_exps
1249
+ floating_in* p # temporary buffer
1250
+ double_pair max_value_and_sum_exps
1251
+
1252
+ # We assume n_samples > n_classes. In this case having the inner loop
1253
+ # over n_classes is a good default.
1254
+ # TODO: If every memoryview is contiguous and raw_prediction is
1255
+ # f-contiguous, can we write a better algo (loops) to improve
1256
+ # performance?
1257
+ if sample_weight is None:
1258
+ # inner loop over n_classes
1259
+ with nogil, parallel(num_threads=n_threads):
1260
+ # Define private buffer variables as each thread might use its
1261
+ # own.
1262
+ p = <floating_in *> malloc(sizeof(floating_in) * (n_classes))
1263
+
1264
+ for i in prange(n_samples, schedule='static'):
1265
+ max_value_and_sum_exps = sum_exp_minus_max(i, raw_prediction, p)
1266
+ max_value = max_value_and_sum_exps.val1
1267
+ sum_exps = max_value_and_sum_exps.val2
1268
+ loss_out[i] = log(sum_exps) + max_value
1269
+
1270
+ # label encoded y_true
1271
+ k = int(y_true[i])
1272
+ loss_out[i] -= raw_prediction[i, k]
1273
+
1274
+ free(p)
1275
+ else:
1276
+ with nogil, parallel(num_threads=n_threads):
1277
+ p = <floating_in *> malloc(sizeof(floating_in) * (n_classes))
1278
+
1279
+ for i in prange(n_samples, schedule='static'):
1280
+ max_value_and_sum_exps = sum_exp_minus_max(i, raw_prediction, p)
1281
+ max_value = max_value_and_sum_exps.val1
1282
+ sum_exps = max_value_and_sum_exps.val2
1283
+ loss_out[i] = log(sum_exps) + max_value
1284
+
1285
+ # label encoded y_true
1286
+ k = int(y_true[i])
1287
+ loss_out[i] -= raw_prediction[i, k]
1288
+
1289
+ loss_out[i] *= sample_weight[i]
1290
+
1291
+ free(p)
1292
+
1293
+ def loss_gradient(
1294
+ self,
1295
+ const floating_in[::1] y_true, # IN
1296
+ const floating_in[:, :] raw_prediction, # IN
1297
+ const floating_in[::1] sample_weight, # IN
1298
+ floating_out[::1] loss_out, # OUT
1299
+ floating_out[:, :] gradient_out, # OUT
1300
+ int n_threads=1
1301
+ ):
1302
+ cdef:
1303
+ int i, k
1304
+ int n_samples = y_true.shape[0]
1305
+ int n_classes = raw_prediction.shape[1]
1306
+ floating_in max_value, sum_exps
1307
+ floating_in* p # temporary buffer
1308
+ double_pair max_value_and_sum_exps
1309
+
1310
+ if sample_weight is None:
1311
+ # inner loop over n_classes
1312
+ with nogil, parallel(num_threads=n_threads):
1313
+ # Define private buffer variables as each thread might use its
1314
+ # own.
1315
+ p = <floating_in *> malloc(sizeof(floating_in) * (n_classes))
1316
+
1317
+ for i in prange(n_samples, schedule='static'):
1318
+ max_value_and_sum_exps = sum_exp_minus_max(i, raw_prediction, p)
1319
+ max_value = max_value_and_sum_exps.val1
1320
+ sum_exps = max_value_and_sum_exps.val2
1321
+ loss_out[i] = log(sum_exps) + max_value
1322
+
1323
+ for k in range(n_classes):
1324
+ # label decode y_true
1325
+ if y_true[i] == k:
1326
+ loss_out[i] -= raw_prediction[i, k]
1327
+ p[k] /= sum_exps # p_k = y_pred_k = prob of class k
1328
+ # gradient_k = p_k - (y_true == k)
1329
+ gradient_out[i, k] = p[k] - (y_true[i] == k)
1330
+
1331
+ free(p)
1332
+ else:
1333
+ with nogil, parallel(num_threads=n_threads):
1334
+ p = <floating_in *> malloc(sizeof(floating_in) * (n_classes))
1335
+
1336
+ for i in prange(n_samples, schedule='static'):
1337
+ max_value_and_sum_exps = sum_exp_minus_max(i, raw_prediction, p)
1338
+ max_value = max_value_and_sum_exps.val1
1339
+ sum_exps = max_value_and_sum_exps.val2
1340
+ loss_out[i] = log(sum_exps) + max_value
1341
+
1342
+ for k in range(n_classes):
1343
+ # label decode y_true
1344
+ if y_true[i] == k:
1345
+ loss_out[i] -= raw_prediction[i, k]
1346
+ p[k] /= sum_exps # p_k = y_pred_k = prob of class k
1347
+ # gradient_k = (p_k - (y_true == k)) * sw
1348
+ gradient_out[i, k] = (p[k] - (y_true[i] == k)) * sample_weight[i]
1349
+
1350
+ loss_out[i] *= sample_weight[i]
1351
+
1352
+ free(p)
1353
+
1354
+ def gradient(
1355
+ self,
1356
+ const floating_in[::1] y_true, # IN
1357
+ const floating_in[:, :] raw_prediction, # IN
1358
+ const floating_in[::1] sample_weight, # IN
1359
+ floating_out[:, :] gradient_out, # OUT
1360
+ int n_threads=1
1361
+ ):
1362
+ cdef:
1363
+ int i, k
1364
+ int n_samples = y_true.shape[0]
1365
+ int n_classes = raw_prediction.shape[1]
1366
+ floating_in sum_exps
1367
+ floating_in* p # temporary buffer
1368
+ double_pair max_value_and_sum_exps
1369
+
1370
+ if sample_weight is None:
1371
+ # inner loop over n_classes
1372
+ with nogil, parallel(num_threads=n_threads):
1373
+ # Define private buffer variables as each thread might use its
1374
+ # own.
1375
+ p = <floating_in *> malloc(sizeof(floating_in) * (n_classes))
1376
+
1377
+ for i in prange(n_samples, schedule='static'):
1378
+ max_value_and_sum_exps = sum_exp_minus_max(i, raw_prediction, p)
1379
+ sum_exps = max_value_and_sum_exps.val2
1380
+
1381
+ for k in range(n_classes):
1382
+ p[k] /= sum_exps # p_k = y_pred_k = prob of class k
1383
+ # gradient_k = y_pred_k - (y_true == k)
1384
+ gradient_out[i, k] = p[k] - (y_true[i] == k)
1385
+
1386
+ free(p)
1387
+ else:
1388
+ with nogil, parallel(num_threads=n_threads):
1389
+ p = <floating_in *> malloc(sizeof(floating_in) * (n_classes))
1390
+
1391
+ for i in prange(n_samples, schedule='static'):
1392
+ max_value_and_sum_exps = sum_exp_minus_max(i, raw_prediction, p)
1393
+ sum_exps = max_value_and_sum_exps.val2
1394
+
1395
+ for k in range(n_classes):
1396
+ p[k] /= sum_exps # p_k = y_pred_k = prob of class k
1397
+ # gradient_k = (p_k - (y_true == k)) * sw
1398
+ gradient_out[i, k] = (p[k] - (y_true[i] == k)) * sample_weight[i]
1399
+
1400
+ free(p)
1401
+
1402
+ def gradient_hessian(
1403
+ self,
1404
+ const floating_in[::1] y_true, # IN
1405
+ const floating_in[:, :] raw_prediction, # IN
1406
+ const floating_in[::1] sample_weight, # IN
1407
+ floating_out[:, :] gradient_out, # OUT
1408
+ floating_out[:, :] hessian_out, # OUT
1409
+ int n_threads=1
1410
+ ):
1411
+ cdef:
1412
+ int i, k
1413
+ int n_samples = y_true.shape[0]
1414
+ int n_classes = raw_prediction.shape[1]
1415
+ floating_in sum_exps
1416
+ floating_in* p # temporary buffer
1417
+ double_pair max_value_and_sum_exps
1418
+
1419
+ if sample_weight is None:
1420
+ # inner loop over n_classes
1421
+ with nogil, parallel(num_threads=n_threads):
1422
+ # Define private buffer variables as each thread might use its
1423
+ # own.
1424
+ p = <floating_in *> malloc(sizeof(floating_in) * (n_classes))
1425
+
1426
+ for i in prange(n_samples, schedule='static'):
1427
+ max_value_and_sum_exps = sum_exp_minus_max(i, raw_prediction, p)
1428
+ sum_exps = max_value_and_sum_exps.val2
1429
+
1430
+ for k in range(n_classes):
1431
+ p[k] /= sum_exps # p_k = y_pred_k = prob of class k
1432
+ # hessian_k = p_k * (1 - p_k)
1433
+ # gradient_k = p_k - (y_true == k)
1434
+ gradient_out[i, k] = p[k] - (y_true[i] == k)
1435
+ hessian_out[i, k] = p[k] * (1. - p[k])
1436
+
1437
+ free(p)
1438
+ else:
1439
+ with nogil, parallel(num_threads=n_threads):
1440
+ p = <floating_in *> malloc(sizeof(floating_in) * (n_classes))
1441
+
1442
+ for i in prange(n_samples, schedule='static'):
1443
+ max_value_and_sum_exps = sum_exp_minus_max(i, raw_prediction, p)
1444
+ sum_exps = max_value_and_sum_exps.val2
1445
+
1446
+ for k in range(n_classes):
1447
+ p[k] /= sum_exps # p_k = y_pred_k = prob of class k
1448
+ # gradient_k = (p_k - (y_true == k)) * sw
1449
+ # hessian_k = p_k * (1 - p_k) * sw
1450
+ gradient_out[i, k] = (p[k] - (y_true[i] == k)) * sample_weight[i]
1451
+ hessian_out[i, k] = (p[k] * (1. - p[k])) * sample_weight[i]
1452
+
1453
+ free(p)
1454
+
1455
+ # This method simplifies the implementation of hessp in linear models,
1456
+ # i.e. the matrix-vector product of the full hessian, not only of the
1457
+ # diagonal (in the classes) approximation as implemented above.
1458
+ def gradient_proba(
1459
+ self,
1460
+ const floating_in[::1] y_true, # IN
1461
+ const floating_in[:, :] raw_prediction, # IN
1462
+ const floating_in[::1] sample_weight, # IN
1463
+ floating_out[:, :] gradient_out, # OUT
1464
+ floating_out[:, :] proba_out, # OUT
1465
+ int n_threads=1
1466
+ ):
1467
+ cdef:
1468
+ int i, k
1469
+ int n_samples = y_true.shape[0]
1470
+ int n_classes = raw_prediction.shape[1]
1471
+ floating_in sum_exps
1472
+ floating_in* p # temporary buffer
1473
+ double_pair max_value_and_sum_exps
1474
+
1475
+ if sample_weight is None:
1476
+ # inner loop over n_classes
1477
+ with nogil, parallel(num_threads=n_threads):
1478
+ # Define private buffer variables as each thread might use its
1479
+ # own.
1480
+ p = <floating_in *> malloc(sizeof(floating_in) * (n_classes))
1481
+
1482
+ for i in prange(n_samples, schedule='static'):
1483
+ max_value_and_sum_exps = sum_exp_minus_max(i, raw_prediction, p)
1484
+ sum_exps = max_value_and_sum_exps.val2
1485
+
1486
+ for k in range(n_classes):
1487
+ proba_out[i, k] = p[k] / sum_exps # y_pred_k = prob of class k
1488
+ # gradient_k = y_pred_k - (y_true == k)
1489
+ gradient_out[i, k] = proba_out[i, k] - (y_true[i] == k)
1490
+
1491
+ free(p)
1492
+ else:
1493
+ with nogil, parallel(num_threads=n_threads):
1494
+ p = <floating_in *> malloc(sizeof(floating_in) * (n_classes))
1495
+
1496
+ for i in prange(n_samples, schedule='static'):
1497
+ max_value_and_sum_exps = sum_exp_minus_max(i, raw_prediction, p)
1498
+ sum_exps = max_value_and_sum_exps.val2
1499
+
1500
+ for k in range(n_classes):
1501
+ proba_out[i, k] = p[k] / sum_exps # y_pred_k = prob of class k
1502
+ # gradient_k = (p_k - (y_true == k)) * sw
1503
+ gradient_out[i, k] = (proba_out[i, k] - (y_true[i] == k)) * sample_weight[i]
1504
+
1505
+ free(p)
.venv/lib/python3.12/site-packages/sklearn/_loss/link.py ADDED
@@ -0,0 +1,282 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Module contains classes for invertible (and differentiable) link functions.
3
+ """
4
+
5
+ # Authors: The scikit-learn developers
6
+ # SPDX-License-Identifier: BSD-3-Clause
7
+
8
+ from abc import ABC, abstractmethod
9
+ from dataclasses import dataclass
10
+
11
+ import numpy as np
12
+ from scipy.special import expit, logit
13
+ from scipy.stats import gmean
14
+
15
+ from ..utils.extmath import softmax
16
+
17
+
18
+ @dataclass
19
+ class Interval:
20
+ low: float
21
+ high: float
22
+ low_inclusive: bool
23
+ high_inclusive: bool
24
+
25
+ def __post_init__(self):
26
+ """Check that low <= high"""
27
+ if self.low > self.high:
28
+ raise ValueError(
29
+ f"One must have low <= high; got low={self.low}, high={self.high}."
30
+ )
31
+
32
+ def includes(self, x):
33
+ """Test whether all values of x are in interval range.
34
+
35
+ Parameters
36
+ ----------
37
+ x : ndarray
38
+ Array whose elements are tested to be in interval range.
39
+
40
+ Returns
41
+ -------
42
+ result : bool
43
+ """
44
+ if self.low_inclusive:
45
+ low = np.greater_equal(x, self.low)
46
+ else:
47
+ low = np.greater(x, self.low)
48
+
49
+ if not np.all(low):
50
+ return False
51
+
52
+ if self.high_inclusive:
53
+ high = np.less_equal(x, self.high)
54
+ else:
55
+ high = np.less(x, self.high)
56
+
57
+ # Note: np.all returns numpy.bool_
58
+ return bool(np.all(high))
59
+
60
+
61
+ def _inclusive_low_high(interval, dtype=np.float64):
62
+ """Generate values low and high to be within the interval range.
63
+
64
+ This is used in tests only.
65
+
66
+ Returns
67
+ -------
68
+ low, high : tuple
69
+ The returned values low and high lie within the interval.
70
+ """
71
+ eps = 10 * np.finfo(dtype).eps
72
+ if interval.low == -np.inf:
73
+ low = -1e10
74
+ elif interval.low < 0:
75
+ low = interval.low * (1 - eps) + eps
76
+ else:
77
+ low = interval.low * (1 + eps) + eps
78
+
79
+ if interval.high == np.inf:
80
+ high = 1e10
81
+ elif interval.high < 0:
82
+ high = interval.high * (1 + eps) - eps
83
+ else:
84
+ high = interval.high * (1 - eps) - eps
85
+
86
+ return low, high
87
+
88
+
89
+ class BaseLink(ABC):
90
+ """Abstract base class for differentiable, invertible link functions.
91
+
92
+ Convention:
93
+ - link function g: raw_prediction = g(y_pred)
94
+ - inverse link h: y_pred = h(raw_prediction)
95
+
96
+ For (generalized) linear models, `raw_prediction = X @ coef` is the so
97
+ called linear predictor, and `y_pred = h(raw_prediction)` is the predicted
98
+ conditional (on X) expected value of the target `y_true`.
99
+
100
+ The methods are not implemented as staticmethods in case a link function needs
101
+ parameters.
102
+ """
103
+
104
+ is_multiclass = False # used for testing only
105
+
106
+ # Usually, raw_prediction may be any real number and y_pred is an open
107
+ # interval.
108
+ # interval_raw_prediction = Interval(-np.inf, np.inf, False, False)
109
+ interval_y_pred = Interval(-np.inf, np.inf, False, False)
110
+
111
+ @abstractmethod
112
+ def link(self, y_pred, out=None):
113
+ """Compute the link function g(y_pred).
114
+
115
+ The link function maps (predicted) target values to raw predictions,
116
+ i.e. `g(y_pred) = raw_prediction`.
117
+
118
+ Parameters
119
+ ----------
120
+ y_pred : array
121
+ Predicted target values.
122
+ out : array
123
+ A location into which the result is stored. If provided, it must
124
+ have a shape that the inputs broadcast to. If not provided or None,
125
+ a freshly-allocated array is returned.
126
+
127
+ Returns
128
+ -------
129
+ out : array
130
+ Output array, element-wise link function.
131
+ """
132
+
133
+ @abstractmethod
134
+ def inverse(self, raw_prediction, out=None):
135
+ """Compute the inverse link function h(raw_prediction).
136
+
137
+ The inverse link function maps raw predictions to predicted target
138
+ values, i.e. `h(raw_prediction) = y_pred`.
139
+
140
+ Parameters
141
+ ----------
142
+ raw_prediction : array
143
+ Raw prediction values (in link space).
144
+ out : array
145
+ A location into which the result is stored. If provided, it must
146
+ have a shape that the inputs broadcast to. If not provided or None,
147
+ a freshly-allocated array is returned.
148
+
149
+ Returns
150
+ -------
151
+ out : array
152
+ Output array, element-wise inverse link function.
153
+ """
154
+
155
+
156
+ class IdentityLink(BaseLink):
157
+ """The identity link function g(x)=x."""
158
+
159
+ def link(self, y_pred, out=None):
160
+ if out is not None:
161
+ np.copyto(out, y_pred)
162
+ return out
163
+ else:
164
+ return y_pred
165
+
166
+ inverse = link
167
+
168
+
169
+ class LogLink(BaseLink):
170
+ """The log link function g(x)=log(x)."""
171
+
172
+ interval_y_pred = Interval(0, np.inf, False, False)
173
+
174
+ def link(self, y_pred, out=None):
175
+ return np.log(y_pred, out=out)
176
+
177
+ def inverse(self, raw_prediction, out=None):
178
+ return np.exp(raw_prediction, out=out)
179
+
180
+
181
+ class LogitLink(BaseLink):
182
+ """The logit link function g(x)=logit(x)."""
183
+
184
+ interval_y_pred = Interval(0, 1, False, False)
185
+
186
+ def link(self, y_pred, out=None):
187
+ return logit(y_pred, out=out)
188
+
189
+ def inverse(self, raw_prediction, out=None):
190
+ return expit(raw_prediction, out=out)
191
+
192
+
193
+ class HalfLogitLink(BaseLink):
194
+ """Half the logit link function g(x)=1/2 * logit(x).
195
+
196
+ Used for the exponential loss.
197
+ """
198
+
199
+ interval_y_pred = Interval(0, 1, False, False)
200
+
201
+ def link(self, y_pred, out=None):
202
+ out = logit(y_pred, out=out)
203
+ out *= 0.5
204
+ return out
205
+
206
+ def inverse(self, raw_prediction, out=None):
207
+ return expit(2 * raw_prediction, out)
208
+
209
+
210
+ class MultinomialLogit(BaseLink):
211
+ """The symmetric multinomial logit function.
212
+
213
+ Convention:
214
+ - y_pred.shape = raw_prediction.shape = (n_samples, n_classes)
215
+
216
+ Notes:
217
+ - The inverse link h is the softmax function.
218
+ - The sum is over the second axis, i.e. axis=1 (n_classes).
219
+
220
+ We have to choose additional constraints in order to make
221
+
222
+ y_pred[k] = exp(raw_pred[k]) / sum(exp(raw_pred[k]), k=0..n_classes-1)
223
+
224
+ for n_classes classes identifiable and invertible.
225
+ We choose the symmetric side constraint where the geometric mean response
226
+ is set as reference category, see [2]:
227
+
228
+ The symmetric multinomial logit link function for a single data point is
229
+ then defined as
230
+
231
+ raw_prediction[k] = g(y_pred[k]) = log(y_pred[k]/gmean(y_pred))
232
+ = log(y_pred[k]) - mean(log(y_pred)).
233
+
234
+ Note that this is equivalent to the definition in [1] and implies mean
235
+ centered raw predictions:
236
+
237
+ sum(raw_prediction[k], k=0..n_classes-1) = 0.
238
+
239
+ For linear models with raw_prediction = X @ coef, this corresponds to
240
+ sum(coef[k], k=0..n_classes-1) = 0, i.e. the sum over classes for every
241
+ feature is zero.
242
+
243
+ Reference
244
+ ---------
245
+ .. [1] Friedman, Jerome; Hastie, Trevor; Tibshirani, Robert. "Additive
246
+ logistic regression: a statistical view of boosting" Ann. Statist.
247
+ 28 (2000), no. 2, 337--407. doi:10.1214/aos/1016218223.
248
+ https://projecteuclid.org/euclid.aos/1016218223
249
+
250
+ .. [2] Zahid, Faisal Maqbool and Gerhard Tutz. "Ridge estimation for
251
+ multinomial logit models with symmetric side constraints."
252
+ Computational Statistics 28 (2013): 1017-1034.
253
+ http://epub.ub.uni-muenchen.de/11001/1/tr067.pdf
254
+ """
255
+
256
+ is_multiclass = True
257
+ interval_y_pred = Interval(0, 1, False, False)
258
+
259
+ def symmetrize_raw_prediction(self, raw_prediction):
260
+ return raw_prediction - np.mean(raw_prediction, axis=1)[:, np.newaxis]
261
+
262
+ def link(self, y_pred, out=None):
263
+ # geometric mean as reference category
264
+ gm = gmean(y_pred, axis=1)
265
+ return np.log(y_pred / gm[:, np.newaxis], out=out)
266
+
267
+ def inverse(self, raw_prediction, out=None):
268
+ if out is None:
269
+ return softmax(raw_prediction, copy=True)
270
+ else:
271
+ np.copyto(out, raw_prediction)
272
+ softmax(out, copy=False)
273
+ return out
274
+
275
+
276
+ _LINKS = {
277
+ "identity": IdentityLink,
278
+ "log": LogLink,
279
+ "logit": LogitLink,
280
+ "half_logit": HalfLogitLink,
281
+ "multinomial_logit": MultinomialLogit,
282
+ }
.venv/lib/python3.12/site-packages/sklearn/_loss/loss.py ADDED
@@ -0,0 +1,1181 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ This module contains loss classes suitable for fitting.
3
+
4
+ It is not part of the public API.
5
+ Specific losses are used for regression, binary classification or multiclass
6
+ classification.
7
+ """
8
+
9
+ # Authors: The scikit-learn developers
10
+ # SPDX-License-Identifier: BSD-3-Clause
11
+
12
+ # Goals:
13
+ # - Provide a common private module for loss functions/classes.
14
+ # - To be used in:
15
+ # - LogisticRegression
16
+ # - PoissonRegressor, GammaRegressor, TweedieRegressor
17
+ # - HistGradientBoostingRegressor, HistGradientBoostingClassifier
18
+ # - GradientBoostingRegressor, GradientBoostingClassifier
19
+ # - SGDRegressor, SGDClassifier
20
+ # - Replace link module of GLMs.
21
+
22
+ import numbers
23
+
24
+ import numpy as np
25
+ from scipy.special import xlogy
26
+
27
+ from ..utils import check_scalar
28
+ from ..utils.stats import _weighted_percentile
29
+ from ._loss import (
30
+ CyAbsoluteError,
31
+ CyExponentialLoss,
32
+ CyHalfBinomialLoss,
33
+ CyHalfGammaLoss,
34
+ CyHalfMultinomialLoss,
35
+ CyHalfPoissonLoss,
36
+ CyHalfSquaredError,
37
+ CyHalfTweedieLoss,
38
+ CyHalfTweedieLossIdentity,
39
+ CyHuberLoss,
40
+ CyPinballLoss,
41
+ )
42
+ from .link import (
43
+ HalfLogitLink,
44
+ IdentityLink,
45
+ Interval,
46
+ LogitLink,
47
+ LogLink,
48
+ MultinomialLogit,
49
+ )
50
+
51
+
52
+ # Note: The shape of raw_prediction for multiclass classifications are
53
+ # - GradientBoostingClassifier: (n_samples, n_classes)
54
+ # - HistGradientBoostingClassifier: (n_classes, n_samples)
55
+ #
56
+ # Note: Instead of inheritance like
57
+ #
58
+ # class BaseLoss(BaseLink, CyLossFunction):
59
+ # ...
60
+ #
61
+ # # Note: Naturally, we would inherit in the following order
62
+ # # class HalfSquaredError(IdentityLink, CyHalfSquaredError, BaseLoss)
63
+ # # But because of https://github.com/cython/cython/issues/4350 we set BaseLoss as
64
+ # # the last one. This, of course, changes the MRO.
65
+ # class HalfSquaredError(IdentityLink, CyHalfSquaredError, BaseLoss):
66
+ #
67
+ # we use composition. This way we improve maintainability by avoiding the above
68
+ # mentioned Cython edge case and have easier to understand code (which method calls
69
+ # which code).
70
+ class BaseLoss:
71
+ """Base class for a loss function of 1-dimensional targets.
72
+
73
+ Conventions:
74
+
75
+ - y_true.shape = sample_weight.shape = (n_samples,)
76
+ - y_pred.shape = raw_prediction.shape = (n_samples,)
77
+ - If is_multiclass is true (multiclass classification), then
78
+ y_pred.shape = raw_prediction.shape = (n_samples, n_classes)
79
+ Note that this corresponds to the return value of decision_function.
80
+
81
+ y_true, y_pred, sample_weight and raw_prediction must either be all float64
82
+ or all float32.
83
+ gradient and hessian must be either both float64 or both float32.
84
+
85
+ Note that y_pred = link.inverse(raw_prediction).
86
+
87
+ Specific loss classes can inherit specific link classes to satisfy
88
+ BaseLink's abstractmethods.
89
+
90
+ Parameters
91
+ ----------
92
+ sample_weight : {None, ndarray}
93
+ If sample_weight is None, the hessian might be constant.
94
+ n_classes : {None, int}
95
+ The number of classes for classification, else None.
96
+
97
+ Attributes
98
+ ----------
99
+ closs: CyLossFunction
100
+ link : BaseLink
101
+ interval_y_true : Interval
102
+ Valid interval for y_true
103
+ interval_y_pred : Interval
104
+ Valid Interval for y_pred
105
+ differentiable : bool
106
+ Indicates whether or not loss function is differentiable in
107
+ raw_prediction everywhere.
108
+ need_update_leaves_values : bool
109
+ Indicates whether decision trees in gradient boosting need to uptade
110
+ leave values after having been fit to the (negative) gradients.
111
+ approx_hessian : bool
112
+ Indicates whether the hessian is approximated or exact. If,
113
+ approximated, it should be larger or equal to the exact one.
114
+ constant_hessian : bool
115
+ Indicates whether the hessian is one for this loss.
116
+ is_multiclass : bool
117
+ Indicates whether n_classes > 2 is allowed.
118
+ """
119
+
120
+ # For gradient boosted decision trees:
121
+ # This variable indicates whether the loss requires the leaves values to
122
+ # be updated once the tree has been trained. The trees are trained to
123
+ # predict a Newton-Raphson step (see grower._finalize_leaf()). But for
124
+ # some losses (e.g. least absolute deviation) we need to adjust the tree
125
+ # values to account for the "line search" of the gradient descent
126
+ # procedure. See the original paper Greedy Function Approximation: A
127
+ # Gradient Boosting Machine by Friedman
128
+ # (https://statweb.stanford.edu/~jhf/ftp/trebst.pdf) for the theory.
129
+ differentiable = True
130
+ need_update_leaves_values = False
131
+ is_multiclass = False
132
+
133
+ def __init__(self, closs, link, n_classes=None):
134
+ self.closs = closs
135
+ self.link = link
136
+ self.approx_hessian = False
137
+ self.constant_hessian = False
138
+ self.n_classes = n_classes
139
+ self.interval_y_true = Interval(-np.inf, np.inf, False, False)
140
+ self.interval_y_pred = self.link.interval_y_pred
141
+
142
+ def in_y_true_range(self, y):
143
+ """Return True if y is in the valid range of y_true.
144
+
145
+ Parameters
146
+ ----------
147
+ y : ndarray
148
+ """
149
+ return self.interval_y_true.includes(y)
150
+
151
+ def in_y_pred_range(self, y):
152
+ """Return True if y is in the valid range of y_pred.
153
+
154
+ Parameters
155
+ ----------
156
+ y : ndarray
157
+ """
158
+ return self.interval_y_pred.includes(y)
159
+
160
+ def loss(
161
+ self,
162
+ y_true,
163
+ raw_prediction,
164
+ sample_weight=None,
165
+ loss_out=None,
166
+ n_threads=1,
167
+ ):
168
+ """Compute the pointwise loss value for each input.
169
+
170
+ Parameters
171
+ ----------
172
+ y_true : C-contiguous array of shape (n_samples,)
173
+ Observed, true target values.
174
+ raw_prediction : C-contiguous array of shape (n_samples,) or array of \
175
+ shape (n_samples, n_classes)
176
+ Raw prediction values (in link space).
177
+ sample_weight : None or C-contiguous array of shape (n_samples,)
178
+ Sample weights.
179
+ loss_out : None or C-contiguous array of shape (n_samples,)
180
+ A location into which the result is stored. If None, a new array
181
+ might be created.
182
+ n_threads : int, default=1
183
+ Might use openmp thread parallelism.
184
+
185
+ Returns
186
+ -------
187
+ loss : array of shape (n_samples,)
188
+ Element-wise loss function.
189
+ """
190
+ if loss_out is None:
191
+ loss_out = np.empty_like(y_true)
192
+ # Be graceful to shape (n_samples, 1) -> (n_samples,)
193
+ if raw_prediction.ndim == 2 and raw_prediction.shape[1] == 1:
194
+ raw_prediction = raw_prediction.squeeze(1)
195
+
196
+ self.closs.loss(
197
+ y_true=y_true,
198
+ raw_prediction=raw_prediction,
199
+ sample_weight=sample_weight,
200
+ loss_out=loss_out,
201
+ n_threads=n_threads,
202
+ )
203
+ return loss_out
204
+
205
+ def loss_gradient(
206
+ self,
207
+ y_true,
208
+ raw_prediction,
209
+ sample_weight=None,
210
+ loss_out=None,
211
+ gradient_out=None,
212
+ n_threads=1,
213
+ ):
214
+ """Compute loss and gradient w.r.t. raw_prediction for each input.
215
+
216
+ Parameters
217
+ ----------
218
+ y_true : C-contiguous array of shape (n_samples,)
219
+ Observed, true target values.
220
+ raw_prediction : C-contiguous array of shape (n_samples,) or array of \
221
+ shape (n_samples, n_classes)
222
+ Raw prediction values (in link space).
223
+ sample_weight : None or C-contiguous array of shape (n_samples,)
224
+ Sample weights.
225
+ loss_out : None or C-contiguous array of shape (n_samples,)
226
+ A location into which the loss is stored. If None, a new array
227
+ might be created.
228
+ gradient_out : None or C-contiguous array of shape (n_samples,) or array \
229
+ of shape (n_samples, n_classes)
230
+ A location into which the gradient is stored. If None, a new array
231
+ might be created.
232
+ n_threads : int, default=1
233
+ Might use openmp thread parallelism.
234
+
235
+ Returns
236
+ -------
237
+ loss : array of shape (n_samples,)
238
+ Element-wise loss function.
239
+
240
+ gradient : array of shape (n_samples,) or (n_samples, n_classes)
241
+ Element-wise gradients.
242
+ """
243
+ if loss_out is None:
244
+ if gradient_out is None:
245
+ loss_out = np.empty_like(y_true)
246
+ gradient_out = np.empty_like(raw_prediction)
247
+ else:
248
+ loss_out = np.empty_like(y_true, dtype=gradient_out.dtype)
249
+ elif gradient_out is None:
250
+ gradient_out = np.empty_like(raw_prediction, dtype=loss_out.dtype)
251
+
252
+ # Be graceful to shape (n_samples, 1) -> (n_samples,)
253
+ if raw_prediction.ndim == 2 and raw_prediction.shape[1] == 1:
254
+ raw_prediction = raw_prediction.squeeze(1)
255
+ if gradient_out.ndim == 2 and gradient_out.shape[1] == 1:
256
+ gradient_out = gradient_out.squeeze(1)
257
+
258
+ self.closs.loss_gradient(
259
+ y_true=y_true,
260
+ raw_prediction=raw_prediction,
261
+ sample_weight=sample_weight,
262
+ loss_out=loss_out,
263
+ gradient_out=gradient_out,
264
+ n_threads=n_threads,
265
+ )
266
+ return loss_out, gradient_out
267
+
268
+ def gradient(
269
+ self,
270
+ y_true,
271
+ raw_prediction,
272
+ sample_weight=None,
273
+ gradient_out=None,
274
+ n_threads=1,
275
+ ):
276
+ """Compute gradient of loss w.r.t raw_prediction for each input.
277
+
278
+ Parameters
279
+ ----------
280
+ y_true : C-contiguous array of shape (n_samples,)
281
+ Observed, true target values.
282
+ raw_prediction : C-contiguous array of shape (n_samples,) or array of \
283
+ shape (n_samples, n_classes)
284
+ Raw prediction values (in link space).
285
+ sample_weight : None or C-contiguous array of shape (n_samples,)
286
+ Sample weights.
287
+ gradient_out : None or C-contiguous array of shape (n_samples,) or array \
288
+ of shape (n_samples, n_classes)
289
+ A location into which the result is stored. If None, a new array
290
+ might be created.
291
+ n_threads : int, default=1
292
+ Might use openmp thread parallelism.
293
+
294
+ Returns
295
+ -------
296
+ gradient : array of shape (n_samples,) or (n_samples, n_classes)
297
+ Element-wise gradients.
298
+ """
299
+ if gradient_out is None:
300
+ gradient_out = np.empty_like(raw_prediction)
301
+
302
+ # Be graceful to shape (n_samples, 1) -> (n_samples,)
303
+ if raw_prediction.ndim == 2 and raw_prediction.shape[1] == 1:
304
+ raw_prediction = raw_prediction.squeeze(1)
305
+ if gradient_out.ndim == 2 and gradient_out.shape[1] == 1:
306
+ gradient_out = gradient_out.squeeze(1)
307
+
308
+ self.closs.gradient(
309
+ y_true=y_true,
310
+ raw_prediction=raw_prediction,
311
+ sample_weight=sample_weight,
312
+ gradient_out=gradient_out,
313
+ n_threads=n_threads,
314
+ )
315
+ return gradient_out
316
+
317
+ def gradient_hessian(
318
+ self,
319
+ y_true,
320
+ raw_prediction,
321
+ sample_weight=None,
322
+ gradient_out=None,
323
+ hessian_out=None,
324
+ n_threads=1,
325
+ ):
326
+ """Compute gradient and hessian of loss w.r.t raw_prediction.
327
+
328
+ Parameters
329
+ ----------
330
+ y_true : C-contiguous array of shape (n_samples,)
331
+ Observed, true target values.
332
+ raw_prediction : C-contiguous array of shape (n_samples,) or array of \
333
+ shape (n_samples, n_classes)
334
+ Raw prediction values (in link space).
335
+ sample_weight : None or C-contiguous array of shape (n_samples,)
336
+ Sample weights.
337
+ gradient_out : None or C-contiguous array of shape (n_samples,) or array \
338
+ of shape (n_samples, n_classes)
339
+ A location into which the gradient is stored. If None, a new array
340
+ might be created.
341
+ hessian_out : None or C-contiguous array of shape (n_samples,) or array \
342
+ of shape (n_samples, n_classes)
343
+ A location into which the hessian is stored. If None, a new array
344
+ might be created.
345
+ n_threads : int, default=1
346
+ Might use openmp thread parallelism.
347
+
348
+ Returns
349
+ -------
350
+ gradient : arrays of shape (n_samples,) or (n_samples, n_classes)
351
+ Element-wise gradients.
352
+
353
+ hessian : arrays of shape (n_samples,) or (n_samples, n_classes)
354
+ Element-wise hessians.
355
+ """
356
+ if gradient_out is None:
357
+ if hessian_out is None:
358
+ gradient_out = np.empty_like(raw_prediction)
359
+ hessian_out = np.empty_like(raw_prediction)
360
+ else:
361
+ gradient_out = np.empty_like(hessian_out)
362
+ elif hessian_out is None:
363
+ hessian_out = np.empty_like(gradient_out)
364
+
365
+ # Be graceful to shape (n_samples, 1) -> (n_samples,)
366
+ if raw_prediction.ndim == 2 and raw_prediction.shape[1] == 1:
367
+ raw_prediction = raw_prediction.squeeze(1)
368
+ if gradient_out.ndim == 2 and gradient_out.shape[1] == 1:
369
+ gradient_out = gradient_out.squeeze(1)
370
+ if hessian_out.ndim == 2 and hessian_out.shape[1] == 1:
371
+ hessian_out = hessian_out.squeeze(1)
372
+
373
+ self.closs.gradient_hessian(
374
+ y_true=y_true,
375
+ raw_prediction=raw_prediction,
376
+ sample_weight=sample_weight,
377
+ gradient_out=gradient_out,
378
+ hessian_out=hessian_out,
379
+ n_threads=n_threads,
380
+ )
381
+ return gradient_out, hessian_out
382
+
383
+ def __call__(self, y_true, raw_prediction, sample_weight=None, n_threads=1):
384
+ """Compute the weighted average loss.
385
+
386
+ Parameters
387
+ ----------
388
+ y_true : C-contiguous array of shape (n_samples,)
389
+ Observed, true target values.
390
+ raw_prediction : C-contiguous array of shape (n_samples,) or array of \
391
+ shape (n_samples, n_classes)
392
+ Raw prediction values (in link space).
393
+ sample_weight : None or C-contiguous array of shape (n_samples,)
394
+ Sample weights.
395
+ n_threads : int, default=1
396
+ Might use openmp thread parallelism.
397
+
398
+ Returns
399
+ -------
400
+ loss : float
401
+ Mean or averaged loss function.
402
+ """
403
+ return np.average(
404
+ self.loss(
405
+ y_true=y_true,
406
+ raw_prediction=raw_prediction,
407
+ sample_weight=None,
408
+ loss_out=None,
409
+ n_threads=n_threads,
410
+ ),
411
+ weights=sample_weight,
412
+ )
413
+
414
+ def fit_intercept_only(self, y_true, sample_weight=None):
415
+ """Compute raw_prediction of an intercept-only model.
416
+
417
+ This can be used as initial estimates of predictions, i.e. before the
418
+ first iteration in fit.
419
+
420
+ Parameters
421
+ ----------
422
+ y_true : array-like of shape (n_samples,)
423
+ Observed, true target values.
424
+ sample_weight : None or array of shape (n_samples,)
425
+ Sample weights.
426
+
427
+ Returns
428
+ -------
429
+ raw_prediction : numpy scalar or array of shape (n_classes,)
430
+ Raw predictions of an intercept-only model.
431
+ """
432
+ # As default, take weighted average of the target over the samples
433
+ # axis=0 and then transform into link-scale (raw_prediction).
434
+ y_pred = np.average(y_true, weights=sample_weight, axis=0)
435
+ eps = 10 * np.finfo(y_pred.dtype).eps
436
+
437
+ if self.interval_y_pred.low == -np.inf:
438
+ a_min = None
439
+ elif self.interval_y_pred.low_inclusive:
440
+ a_min = self.interval_y_pred.low
441
+ else:
442
+ a_min = self.interval_y_pred.low + eps
443
+
444
+ if self.interval_y_pred.high == np.inf:
445
+ a_max = None
446
+ elif self.interval_y_pred.high_inclusive:
447
+ a_max = self.interval_y_pred.high
448
+ else:
449
+ a_max = self.interval_y_pred.high - eps
450
+
451
+ if a_min is None and a_max is None:
452
+ return self.link.link(y_pred)
453
+ else:
454
+ return self.link.link(np.clip(y_pred, a_min, a_max))
455
+
456
+ def constant_to_optimal_zero(self, y_true, sample_weight=None):
457
+ """Calculate term dropped in loss.
458
+
459
+ With this term added, the loss of perfect predictions is zero.
460
+ """
461
+ return np.zeros_like(y_true)
462
+
463
+ def init_gradient_and_hessian(self, n_samples, dtype=np.float64, order="F"):
464
+ """Initialize arrays for gradients and hessians.
465
+
466
+ Unless hessians are constant, arrays are initialized with undefined values.
467
+
468
+ Parameters
469
+ ----------
470
+ n_samples : int
471
+ The number of samples, usually passed to `fit()`.
472
+ dtype : {np.float64, np.float32}, default=np.float64
473
+ The dtype of the arrays gradient and hessian.
474
+ order : {'C', 'F'}, default='F'
475
+ Order of the arrays gradient and hessian. The default 'F' makes the arrays
476
+ contiguous along samples.
477
+
478
+ Returns
479
+ -------
480
+ gradient : C-contiguous array of shape (n_samples,) or array of shape \
481
+ (n_samples, n_classes)
482
+ Empty array (allocated but not initialized) to be used as argument
483
+ gradient_out.
484
+ hessian : C-contiguous array of shape (n_samples,), array of shape
485
+ (n_samples, n_classes) or shape (1,)
486
+ Empty (allocated but not initialized) array to be used as argument
487
+ hessian_out.
488
+ If constant_hessian is True (e.g. `HalfSquaredError`), the array is
489
+ initialized to ``1``.
490
+ """
491
+ if dtype not in (np.float32, np.float64):
492
+ raise ValueError(
493
+ "Valid options for 'dtype' are np.float32 and np.float64. "
494
+ f"Got dtype={dtype} instead."
495
+ )
496
+
497
+ if self.is_multiclass:
498
+ shape = (n_samples, self.n_classes)
499
+ else:
500
+ shape = (n_samples,)
501
+ gradient = np.empty(shape=shape, dtype=dtype, order=order)
502
+
503
+ if self.constant_hessian:
504
+ # If the hessians are constant, we consider them equal to 1.
505
+ # - This is correct for HalfSquaredError
506
+ # - For AbsoluteError, hessians are actually 0, but they are
507
+ # always ignored anyway.
508
+ hessian = np.ones(shape=(1,), dtype=dtype)
509
+ else:
510
+ hessian = np.empty(shape=shape, dtype=dtype, order=order)
511
+
512
+ return gradient, hessian
513
+
514
+
515
+ # Note: Naturally, we would inherit in the following order
516
+ # class HalfSquaredError(IdentityLink, CyHalfSquaredError, BaseLoss)
517
+ # But because of https://github.com/cython/cython/issues/4350 we
518
+ # set BaseLoss as the last one. This, of course, changes the MRO.
519
+ class HalfSquaredError(BaseLoss):
520
+ """Half squared error with identity link, for regression.
521
+
522
+ Domain:
523
+ y_true and y_pred all real numbers
524
+
525
+ Link:
526
+ y_pred = raw_prediction
527
+
528
+ For a given sample x_i, half squared error is defined as::
529
+
530
+ loss(x_i) = 0.5 * (y_true_i - raw_prediction_i)**2
531
+
532
+ The factor of 0.5 simplifies the computation of gradients and results in a
533
+ unit hessian (and is consistent with what is done in LightGBM). It is also
534
+ half the Normal distribution deviance.
535
+ """
536
+
537
+ def __init__(self, sample_weight=None):
538
+ super().__init__(closs=CyHalfSquaredError(), link=IdentityLink())
539
+ self.constant_hessian = sample_weight is None
540
+
541
+
542
+ class AbsoluteError(BaseLoss):
543
+ """Absolute error with identity link, for regression.
544
+
545
+ Domain:
546
+ y_true and y_pred all real numbers
547
+
548
+ Link:
549
+ y_pred = raw_prediction
550
+
551
+ For a given sample x_i, the absolute error is defined as::
552
+
553
+ loss(x_i) = |y_true_i - raw_prediction_i|
554
+
555
+ Note that the exact hessian = 0 almost everywhere (except at one point, therefore
556
+ differentiable = False). Optimization routines like in HGBT, however, need a
557
+ hessian > 0. Therefore, we assign 1.
558
+ """
559
+
560
+ differentiable = False
561
+ need_update_leaves_values = True
562
+
563
+ def __init__(self, sample_weight=None):
564
+ super().__init__(closs=CyAbsoluteError(), link=IdentityLink())
565
+ self.approx_hessian = True
566
+ self.constant_hessian = sample_weight is None
567
+
568
+ def fit_intercept_only(self, y_true, sample_weight=None):
569
+ """Compute raw_prediction of an intercept-only model.
570
+
571
+ This is the weighted median of the target, i.e. over the samples
572
+ axis=0.
573
+ """
574
+ if sample_weight is None:
575
+ return np.median(y_true, axis=0)
576
+ else:
577
+ return _weighted_percentile(y_true, sample_weight, 50)
578
+
579
+
580
+ class PinballLoss(BaseLoss):
581
+ """Quantile loss aka pinball loss, for regression.
582
+
583
+ Domain:
584
+ y_true and y_pred all real numbers
585
+ quantile in (0, 1)
586
+
587
+ Link:
588
+ y_pred = raw_prediction
589
+
590
+ For a given sample x_i, the pinball loss is defined as::
591
+
592
+ loss(x_i) = rho_{quantile}(y_true_i - raw_prediction_i)
593
+
594
+ rho_{quantile}(u) = u * (quantile - 1_{u<0})
595
+ = -u *(1 - quantile) if u < 0
596
+ u * quantile if u >= 0
597
+
598
+ Note: 2 * PinballLoss(quantile=0.5) equals AbsoluteError().
599
+
600
+ Note that the exact hessian = 0 almost everywhere (except at one point, therefore
601
+ differentiable = False). Optimization routines like in HGBT, however, need a
602
+ hessian > 0. Therefore, we assign 1.
603
+
604
+ Additional Attributes
605
+ ---------------------
606
+ quantile : float
607
+ The quantile level of the quantile to be estimated. Must be in range (0, 1).
608
+ """
609
+
610
+ differentiable = False
611
+ need_update_leaves_values = True
612
+
613
+ def __init__(self, sample_weight=None, quantile=0.5):
614
+ check_scalar(
615
+ quantile,
616
+ "quantile",
617
+ target_type=numbers.Real,
618
+ min_val=0,
619
+ max_val=1,
620
+ include_boundaries="neither",
621
+ )
622
+ super().__init__(
623
+ closs=CyPinballLoss(quantile=float(quantile)),
624
+ link=IdentityLink(),
625
+ )
626
+ self.approx_hessian = True
627
+ self.constant_hessian = sample_weight is None
628
+
629
+ def fit_intercept_only(self, y_true, sample_weight=None):
630
+ """Compute raw_prediction of an intercept-only model.
631
+
632
+ This is the weighted median of the target, i.e. over the samples
633
+ axis=0.
634
+ """
635
+ if sample_weight is None:
636
+ return np.percentile(y_true, 100 * self.closs.quantile, axis=0)
637
+ else:
638
+ return _weighted_percentile(
639
+ y_true, sample_weight, 100 * self.closs.quantile
640
+ )
641
+
642
+
643
+ class HuberLoss(BaseLoss):
644
+ """Huber loss, for regression.
645
+
646
+ Domain:
647
+ y_true and y_pred all real numbers
648
+ quantile in (0, 1)
649
+
650
+ Link:
651
+ y_pred = raw_prediction
652
+
653
+ For a given sample x_i, the Huber loss is defined as::
654
+
655
+ loss(x_i) = 1/2 * abserr**2 if abserr <= delta
656
+ delta * (abserr - delta/2) if abserr > delta
657
+
658
+ abserr = |y_true_i - raw_prediction_i|
659
+ delta = quantile(abserr, self.quantile)
660
+
661
+ Note: HuberLoss(quantile=1) equals HalfSquaredError and HuberLoss(quantile=0)
662
+ equals delta * (AbsoluteError() - delta/2).
663
+
664
+ Additional Attributes
665
+ ---------------------
666
+ quantile : float
667
+ The quantile level which defines the breaking point `delta` to distinguish
668
+ between absolute error and squared error. Must be in range (0, 1).
669
+
670
+ Reference
671
+ ---------
672
+ .. [1] Friedman, J.H. (2001). :doi:`Greedy function approximation: A gradient
673
+ boosting machine <10.1214/aos/1013203451>`.
674
+ Annals of Statistics, 29, 1189-1232.
675
+ """
676
+
677
+ differentiable = False
678
+ need_update_leaves_values = True
679
+
680
+ def __init__(self, sample_weight=None, quantile=0.9, delta=0.5):
681
+ check_scalar(
682
+ quantile,
683
+ "quantile",
684
+ target_type=numbers.Real,
685
+ min_val=0,
686
+ max_val=1,
687
+ include_boundaries="neither",
688
+ )
689
+ self.quantile = quantile # This is better stored outside of Cython.
690
+ super().__init__(
691
+ closs=CyHuberLoss(delta=float(delta)),
692
+ link=IdentityLink(),
693
+ )
694
+ self.approx_hessian = True
695
+ self.constant_hessian = False
696
+
697
+ def fit_intercept_only(self, y_true, sample_weight=None):
698
+ """Compute raw_prediction of an intercept-only model.
699
+
700
+ This is the weighted median of the target, i.e. over the samples
701
+ axis=0.
702
+ """
703
+ # See formula before algo 4 in Friedman (2001), but we apply it to y_true,
704
+ # not to the residual y_true - raw_prediction. An estimator like
705
+ # HistGradientBoostingRegressor might then call it on the residual, e.g.
706
+ # fit_intercept_only(y_true - raw_prediction).
707
+ if sample_weight is None:
708
+ median = np.percentile(y_true, 50, axis=0)
709
+ else:
710
+ median = _weighted_percentile(y_true, sample_weight, 50)
711
+ diff = y_true - median
712
+ term = np.sign(diff) * np.minimum(self.closs.delta, np.abs(diff))
713
+ return median + np.average(term, weights=sample_weight)
714
+
715
+
716
+ class HalfPoissonLoss(BaseLoss):
717
+ """Half Poisson deviance loss with log-link, for regression.
718
+
719
+ Domain:
720
+ y_true in non-negative real numbers
721
+ y_pred in positive real numbers
722
+
723
+ Link:
724
+ y_pred = exp(raw_prediction)
725
+
726
+ For a given sample x_i, half the Poisson deviance is defined as::
727
+
728
+ loss(x_i) = y_true_i * log(y_true_i/exp(raw_prediction_i))
729
+ - y_true_i + exp(raw_prediction_i)
730
+
731
+ Half the Poisson deviance is actually the negative log-likelihood up to
732
+ constant terms (not involving raw_prediction) and simplifies the
733
+ computation of the gradients.
734
+ We also skip the constant term `y_true_i * log(y_true_i) - y_true_i`.
735
+ """
736
+
737
+ def __init__(self, sample_weight=None):
738
+ super().__init__(closs=CyHalfPoissonLoss(), link=LogLink())
739
+ self.interval_y_true = Interval(0, np.inf, True, False)
740
+
741
+ def constant_to_optimal_zero(self, y_true, sample_weight=None):
742
+ term = xlogy(y_true, y_true) - y_true
743
+ if sample_weight is not None:
744
+ term *= sample_weight
745
+ return term
746
+
747
+
748
+ class HalfGammaLoss(BaseLoss):
749
+ """Half Gamma deviance loss with log-link, for regression.
750
+
751
+ Domain:
752
+ y_true and y_pred in positive real numbers
753
+
754
+ Link:
755
+ y_pred = exp(raw_prediction)
756
+
757
+ For a given sample x_i, half Gamma deviance loss is defined as::
758
+
759
+ loss(x_i) = log(exp(raw_prediction_i)/y_true_i)
760
+ + y_true/exp(raw_prediction_i) - 1
761
+
762
+ Half the Gamma deviance is actually proportional to the negative log-
763
+ likelihood up to constant terms (not involving raw_prediction) and
764
+ simplifies the computation of the gradients.
765
+ We also skip the constant term `-log(y_true_i) - 1`.
766
+ """
767
+
768
+ def __init__(self, sample_weight=None):
769
+ super().__init__(closs=CyHalfGammaLoss(), link=LogLink())
770
+ self.interval_y_true = Interval(0, np.inf, False, False)
771
+
772
+ def constant_to_optimal_zero(self, y_true, sample_weight=None):
773
+ term = -np.log(y_true) - 1
774
+ if sample_weight is not None:
775
+ term *= sample_weight
776
+ return term
777
+
778
+
779
+ class HalfTweedieLoss(BaseLoss):
780
+ """Half Tweedie deviance loss with log-link, for regression.
781
+
782
+ Domain:
783
+ y_true in real numbers for power <= 0
784
+ y_true in non-negative real numbers for 0 < power < 2
785
+ y_true in positive real numbers for 2 <= power
786
+ y_pred in positive real numbers
787
+ power in real numbers
788
+
789
+ Link:
790
+ y_pred = exp(raw_prediction)
791
+
792
+ For a given sample x_i, half Tweedie deviance loss with p=power is defined
793
+ as::
794
+
795
+ loss(x_i) = max(y_true_i, 0)**(2-p) / (1-p) / (2-p)
796
+ - y_true_i * exp(raw_prediction_i)**(1-p) / (1-p)
797
+ + exp(raw_prediction_i)**(2-p) / (2-p)
798
+
799
+ Taking the limits for p=0, 1, 2 gives HalfSquaredError with a log link,
800
+ HalfPoissonLoss and HalfGammaLoss.
801
+
802
+ We also skip constant terms, but those are different for p=0, 1, 2.
803
+ Therefore, the loss is not continuous in `power`.
804
+
805
+ Note furthermore that although no Tweedie distribution exists for
806
+ 0 < power < 1, it still gives a strictly consistent scoring function for
807
+ the expectation.
808
+ """
809
+
810
+ def __init__(self, sample_weight=None, power=1.5):
811
+ super().__init__(
812
+ closs=CyHalfTweedieLoss(power=float(power)),
813
+ link=LogLink(),
814
+ )
815
+ if self.closs.power <= 0:
816
+ self.interval_y_true = Interval(-np.inf, np.inf, False, False)
817
+ elif self.closs.power < 2:
818
+ self.interval_y_true = Interval(0, np.inf, True, False)
819
+ else:
820
+ self.interval_y_true = Interval(0, np.inf, False, False)
821
+
822
+ def constant_to_optimal_zero(self, y_true, sample_weight=None):
823
+ if self.closs.power == 0:
824
+ return HalfSquaredError().constant_to_optimal_zero(
825
+ y_true=y_true, sample_weight=sample_weight
826
+ )
827
+ elif self.closs.power == 1:
828
+ return HalfPoissonLoss().constant_to_optimal_zero(
829
+ y_true=y_true, sample_weight=sample_weight
830
+ )
831
+ elif self.closs.power == 2:
832
+ return HalfGammaLoss().constant_to_optimal_zero(
833
+ y_true=y_true, sample_weight=sample_weight
834
+ )
835
+ else:
836
+ p = self.closs.power
837
+ term = np.power(np.maximum(y_true, 0), 2 - p) / (1 - p) / (2 - p)
838
+ if sample_weight is not None:
839
+ term *= sample_weight
840
+ return term
841
+
842
+
843
class HalfTweedieLossIdentity(BaseLoss):
    """Half Tweedie deviance loss with identity link, for regression.

    Domain:
    y_true in real numbers for power <= 0
    y_true in non-negative real numbers for 0 < power < 2
    y_true in positive real numbers for 2 <= power
    y_pred in positive real numbers for power != 0
    y_pred in real numbers for power = 0
    power in real numbers

    Link:
    y_pred = raw_prediction

    For a given sample x_i, half the Tweedie deviance with p=power reads::

        loss(x_i) = max(y_true_i, 0)**(2-p) / (1-p) / (2-p)
                    - y_true_i * raw_prediction_i**(1-p) / (1-p)
                    + raw_prediction_i**(2-p) / (2-p)

    The minimum value of this loss is 0.

    Although no Tweedie distribution exists for 0 < power < 1, the loss is
    still a strictly consistent scoring function for the expectation.
    """

    def __init__(self, sample_weight=None, power=1.5):
        super().__init__(
            closs=CyHalfTweedieLossIdentity(power=float(power)),
            link=IdentityLink(),
        )
        p = self.closs.power
        # Valid y_true range depends on the Tweedie power parameter.
        if p <= 0:
            self.interval_y_true = Interval(-np.inf, np.inf, False, False)
        elif p < 2:
            self.interval_y_true = Interval(0, np.inf, True, False)
        else:
            self.interval_y_true = Interval(0, np.inf, False, False)

        # With the identity link, predictions must be strictly positive
        # except for power == 0 (squared error), where any real is allowed.
        if p == 0:
            self.interval_y_pred = Interval(-np.inf, np.inf, False, False)
        else:
            self.interval_y_pred = Interval(0, np.inf, False, False)
887
+
888
+
889
class HalfBinomialLoss(BaseLoss):
    """Half Binomial deviance loss with logit link, for binary classification.

    This is also known as binary cross entropy, log-loss and logistic loss.

    Domain:
    y_true in [0, 1], i.e. regression on the unit interval
    y_pred in (0, 1), i.e. boundaries excluded

    Link:
    y_pred = expit(raw_prediction)

    For a given sample x_i, half the Binomial deviance is the negative
    log-likelihood of the Binomial/Bernoulli distribution::

        loss(x_i) = log(1 + exp(raw_pred_i)) - y_true_i * raw_pred_i

    See The Elements of Statistical Learning, by Hastie, Tibshirani, Friedman,
    section 4.4.1 (about logistic regression).

    The formulation covers classification, y = {0, 1}, as well as logistic
    regression on y = [0, 1]. Adding `constant_to_optimal_zero` to the loss
    yields half the Bernoulli/binomial deviance.

    Substituting the predicted probability y_pred = expit(raw_prediction)
    recovers the familiar form::

        loss(x_i) = - y_true_i * log(y_pred_i) - (1 - y_true_i) * log(1 - y_pred_i)
    """

    def __init__(self, sample_weight=None):
        super().__init__(
            closs=CyHalfBinomialLoss(),
            link=LogitLink(),
            n_classes=2,
        )
        self.interval_y_true = Interval(0, 1, True, True)

    def constant_to_optimal_zero(self, y_true, sample_weight=None):
        # Entropy term; it vanishes for hard labels y_true in {0, 1}.
        term = xlogy(y_true, y_true) + xlogy(1 - y_true, 1 - y_true)
        return term if sample_weight is None else term * sample_weight

    def predict_proba(self, raw_prediction):
        """Predict probabilities.

        Parameters
        ----------
        raw_prediction : array of shape (n_samples,) or (n_samples, 1)
            Raw prediction values (in link space).

        Returns
        -------
        proba : array of shape (n_samples, 2)
            Element-wise class probabilities.
        """
        # Accept both (n_samples,) and (n_samples, 1) shaped inputs.
        if raw_prediction.ndim == 2 and raw_prediction.shape[1] == 1:
            raw_prediction = raw_prediction.squeeze(1)
        p_pos = self.link.inverse(raw_prediction)
        proba = np.empty((raw_prediction.shape[0], 2), dtype=raw_prediction.dtype)
        proba[:, 1] = p_pos
        proba[:, 0] = 1 - p_pos
        return proba
956
+
957
+
958
class HalfMultinomialLoss(BaseLoss):
    """Categorical cross-entropy loss, for multiclass classification.

    Domain:
    y_true in {0, 1, 2, 3, .., n_classes - 1}
    y_pred has n_classes elements, each element in (0, 1)

    Link:
    y_pred = softmax(raw_prediction)

    Note: y_true is assumed to be label encoded already. The inverse link is
    the softmax; the full link function is the symmetric multinomial logit.

    For a given sample x_i, the categorical cross-entropy is the negative
    log-likelihood of the multinomial distribution and generalizes binary
    cross-entropy to more than 2 classes::

        loss_i = log(sum(exp(raw_pred_{i, k}), k=0..n_classes-1))
                - sum(y_true_{i, k} * raw_pred_{i, k}, k=0..n_classes-1)

    See [1].

    For the hessian, only the diagonal part in the classes is computed: if
    the full hessian for classes k and l and sample i is H_i_k_l, we compute
    H_i_k_k, i.e. k=l.

    Reference
    ---------
    .. [1] :arxiv:`Simon, Noah, J. Friedman and T. Hastie.
        "A Blockwise Descent Algorithm for Group-penalized Multiresponse and
        Multinomial Regression".
        <1311.6529>`
    """

    is_multiclass = True

    def __init__(self, sample_weight=None, n_classes=3):
        super().__init__(
            closs=CyHalfMultinomialLoss(),
            link=MultinomialLogit(),
            n_classes=n_classes,
        )
        self.interval_y_true = Interval(0, np.inf, True, False)
        self.interval_y_pred = Interval(0, 1, False, False)

    def in_y_true_range(self, y):
        """Return True if y is in the valid range of y_true.

        Parameters
        ----------
        y : ndarray
        """
        # Values must be non-negative finite numbers AND integral.
        return self.interval_y_true.includes(y) and np.all(y.astype(int) == y)

    def fit_intercept_only(self, y_true, sample_weight=None):
        """Compute raw_prediction of an intercept-only model.

        This is the (multinomial logit) link of the weighted class
        frequencies, i.e. an average over the samples axis=0.
        """
        eps = np.finfo(y_true.dtype).eps
        freq = np.zeros(self.n_classes, dtype=y_true.dtype)
        for k in range(self.n_classes):
            # Weighted class frequency, clipped away from 0 and 1 so the
            # link stays finite.
            avg = np.average(y_true == k, weights=sample_weight, axis=0)
            freq[k] = np.clip(avg, eps, 1 - eps)
        return self.link.link(freq[None, :]).reshape(-1)

    def predict_proba(self, raw_prediction):
        """Predict probabilities.

        Parameters
        ----------
        raw_prediction : array of shape (n_samples, n_classes)
            Raw prediction values (in link space).

        Returns
        -------
        proba : array of shape (n_samples, n_classes)
            Element-wise class probabilities.
        """
        return self.link.inverse(raw_prediction)

    def gradient_proba(
        self,
        y_true,
        raw_prediction,
        sample_weight=None,
        gradient_out=None,
        proba_out=None,
        n_threads=1,
    ):
        """Compute gradient and class probabilities for raw_prediction.

        Parameters
        ----------
        y_true : C-contiguous array of shape (n_samples,)
            Observed, true target values.
        raw_prediction : array of shape (n_samples, n_classes)
            Raw prediction values (in link space).
        sample_weight : None or C-contiguous array of shape (n_samples,)
            Sample weights.
        gradient_out : None or array of shape (n_samples, n_classes)
            A location into which the gradient is stored. If None, a new array
            might be created.
        proba_out : None or array of shape (n_samples, n_classes)
            A location into which the class probabilities are stored. If None,
            a new array might be created.
        n_threads : int, default=1
            Might use openmp thread parallelism.

        Returns
        -------
        gradient : array of shape (n_samples, n_classes)
            Element-wise gradients.

        proba : array of shape (n_samples, n_classes)
            Element-wise class probabilities.
        """
        # Allocate any missing output buffer, matching the shape/dtype of an
        # already provided buffer when possible, otherwise raw_prediction.
        if gradient_out is None:
            template = raw_prediction if proba_out is None else proba_out
            gradient_out = np.empty_like(template)
        if proba_out is None:
            proba_out = np.empty_like(gradient_out)

        self.closs.gradient_proba(
            y_true=y_true,
            raw_prediction=raw_prediction,
            sample_weight=sample_weight,
            gradient_out=gradient_out,
            proba_out=proba_out,
            n_threads=n_threads,
        )
        return gradient_out, proba_out
1095
+
1096
+
1097
class ExponentialLoss(BaseLoss):
    """Exponential loss with (half) logit link, for binary classification.

    This is also known as boosting loss.

    Domain:
    y_true in [0, 1], i.e. regression on the unit interval
    y_pred in (0, 1), i.e. boundaries excluded

    Link:
    y_pred = expit(2 * raw_prediction)

    For a given sample x_i, the exponential loss is defined as::

        loss(x_i) = y_true_i * exp(-raw_pred_i) + (1 - y_true_i) * exp(raw_pred_i)

    See:
    - J. Friedman, T. Hastie, R. Tibshirani.
      "Additive logistic regression: a statistical view of boosting (With discussion
      and a rejoinder by the authors)." Ann. Statist. 28 (2) 337 - 407, April 2000.
      https://doi.org/10.1214/aos/1016218223
    - A. Buja, W. Stuetzle, Y. Shen. (2005).
      "Loss Functions for Binary Class Probability Estimation and Classification:
      Structure and Applications."

    The formulation covers classification, y = {0, 1}, as well as
    "exponential logistic" regression on y = [0, 1].
    This is a proper scoring rule, but without its canonical link.

    Substituting the predicted probability y_pred = expit(2 * raw_prediction)
    into the loss gives::

        loss(x_i) = y_true_i * sqrt((1 - y_pred_i) / y_pred_i)
                    + (1 - y_true_i) * sqrt(y_pred_i / (1 - y_pred_i))
    """

    def __init__(self, sample_weight=None):
        super().__init__(
            closs=CyExponentialLoss(),
            link=HalfLogitLink(),
            n_classes=2,
        )
        self.interval_y_true = Interval(0, 1, True, True)

    def constant_to_optimal_zero(self, y_true, sample_weight=None):
        # Non-zero only for soft labels, i.e. y_true strictly inside (0, 1).
        term = -2 * np.sqrt(y_true * (1 - y_true))
        return term if sample_weight is None else term * sample_weight

    def predict_proba(self, raw_prediction):
        """Predict probabilities.

        Parameters
        ----------
        raw_prediction : array of shape (n_samples,) or (n_samples, 1)
            Raw prediction values (in link space).

        Returns
        -------
        proba : array of shape (n_samples, 2)
            Element-wise class probabilities.
        """
        # Accept both (n_samples,) and (n_samples, 1) shaped inputs.
        if raw_prediction.ndim == 2 and raw_prediction.shape[1] == 1:
            raw_prediction = raw_prediction.squeeze(1)
        p_pos = self.link.inverse(raw_prediction)
        proba = np.empty((raw_prediction.shape[0], 2), dtype=raw_prediction.dtype)
        proba[:, 1] = p_pos
        proba[:, 0] = 1 - p_pos
        return proba
1168
+
1169
+
1170
# Mapping from public loss-name string to the corresponding loss class.
# Estimators use this to instantiate a loss from a string parameter.
_LOSSES = {
    "squared_error": HalfSquaredError,
    "absolute_error": AbsoluteError,
    "pinball_loss": PinballLoss,
    "huber_loss": HuberLoss,
    "poisson_loss": HalfPoissonLoss,
    "gamma_loss": HalfGammaLoss,
    "tweedie_loss": HalfTweedieLoss,
    "binomial_loss": HalfBinomialLoss,
    "multinomial_loss": HalfMultinomialLoss,
    "exponential_loss": ExponentialLoss,
}
.venv/lib/python3.12/site-packages/sklearn/_loss/meson.build ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# _loss.pyx is generated, so this copy step is needed to make Cython
# compilation work (the .pxd must sit next to the generated .pyx).
_loss_cython_tree = [
  fs.copyfile('_loss.pxd')
]

# Generate _loss.pyx from the Tempita template _loss.pyx.tp.
_loss_pyx = custom_target(
  '_loss_pyx',
  output: '_loss.pyx',
  input: '_loss.pyx.tp',
  command: [tempita, '@INPUT@', '-o', '@OUTDIR@'],
  # TODO in principle this should go in py.extension_module below. This is a
  # temporary work-around for a dependency issue with .pyx.tp files. For more
  # details, see https://github.com/mesonbuild/meson/issues/13212
  depends: _loss_cython_tree,
)

py.extension_module(
  '_loss',
  cython_gen.process(_loss_pyx),
  dependencies: [openmp_dep],
  install: true,
  subdir: 'sklearn/_loss',
)